Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-22 12:20:34 +00:00)
feat: Sync collection (#3368)
* feat: sync collection
* feat: sync collection
* perf: website selector
* update doc
```diff
@@ -1,4 +1,7 @@
-import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  DatasetCollectionTypeEnum,
+  TrainingModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
 import {
@@ -24,6 +27,7 @@ import { getLLMModel, getVectorModel } from '../../ai/model';
 import { pushDataListToTrainingQueue } from '../training/controller';
 import { MongoImage } from '../../../common/file/image/schema';
 import { hashStr } from '@fastgpt/global/common/string/tools';
+import { addDays } from 'date-fns';
 
 export const createCollectionAndInsertData = async ({
   dataset,
```
```diff
@@ -72,6 +76,17 @@ export const createCollectionAndInsertData = async ({
 
     hashRawText: hashStr(rawText),
     rawTextLength: rawText.length,
+    nextSyncTime: (() => {
+      if (!dataset.autoSync) return undefined;
+      if (
+        [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes(
+          createCollectionParams.type
+        )
+      ) {
+        return addDays(new Date(), 1);
+      }
+      return undefined;
+    })(),
     session
   });
 
```
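The added `nextSyncTime` IIFE only schedules a follow-up sync when the dataset has `autoSync` enabled and the collection is backed by a re-fetchable source (a web link or an API file); every other collection keeps `nextSyncTime` undefined. A minimal sketch of the same rule as a named helper — `getNextSyncTime` and `syncableTypes` are purely illustrative and are not part of this commit:

```ts
import { addDays } from 'date-fns';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';

// Illustrative helper only (not in the diff): mirrors the IIFE above.
const syncableTypes: string[] = [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile];

export const getNextSyncTime = (autoSync: boolean | undefined, type: string): Date | undefined => {
  // Only datasets with autoSync on, and only re-fetchable sources, get a schedule.
  if (!autoSync || !syncableTypes.includes(type)) return undefined;
  return addDays(new Date(), 1);
};
```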
```diff
@@ -155,10 +170,8 @@ export async function createOneCollection({
 
   fileId,
   rawLink,
-
   externalFileId,
   externalFileUrl,
-
   apiFileId,
 
   hashRawText,
@@ -166,7 +179,10 @@ export async function createOneCollection({
   metadata = {},
   session,
   tags,
-  createTime
+
+  createTime,
+  updateTime,
+  nextSyncTime
 }: CreateOneCollectionParams) {
   // Create collection tags
   const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -197,7 +213,10 @@ export async function createOneCollection({
        rawTextLength,
        hashRawText,
        tags: collectionTags,
-        createTime
+
+        createTime,
+        updateTime,
+        nextSyncTime
      }
    ],
    { session }
```
```diff
@@ -1,4 +1,4 @@
-import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
+import { connectionMongo, getMongoModel } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
 import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
@@ -10,100 +10,95 @@ import {
 
 export const DatasetColCollectionName = 'dataset_collections';
 
-const DatasetCollectionSchema = new Schema(
-  {
-    parentId: {
-      type: Schema.Types.ObjectId,
-      ref: DatasetColCollectionName,
-      default: null
-    },
-    teamId: {
-      type: Schema.Types.ObjectId,
-      ref: TeamCollectionName,
-      required: true
-    },
-    tmbId: {
-      type: Schema.Types.ObjectId,
-      ref: TeamMemberCollectionName,
-      required: true
-    },
-    datasetId: {
-      type: Schema.Types.ObjectId,
-      ref: DatasetCollectionName,
-      required: true
-    },
-    type: {
-      type: String,
-      enum: Object.keys(DatasetCollectionTypeMap),
-      required: true
-    },
-    name: {
-      type: String,
-      required: true
-    },
-    createTime: {
-      type: Date,
-      default: () => new Date()
-    },
-    updateTime: {
-      type: Date,
-      default: () => new Date()
-    },
-    forbid: {
-      type: Boolean,
-      default: false
-    },
-
-    // chunk filed
-    trainingType: {
-      type: String,
-      enum: Object.keys(TrainingTypeMap)
-    },
-    chunkSize: {
-      type: Number,
-      required: true
-    },
-    chunkSplitter: {
-      type: String
-    },
-    qaPrompt: {
-      type: String
-    },
-    ocrParse: Boolean,
-
-    tags: {
-      type: [String],
-      default: []
-    },
-
-    // local file collection
-    fileId: {
-      type: Schema.Types.ObjectId,
-      ref: 'dataset.files'
-    },
-    // web link collection
-    rawLink: String,
-    // api collection
-    apiFileId: String,
-    // external collection
-    externalFileId: String,
-    externalFileUrl: String, // external import url
-
-    // metadata
-    rawTextLength: Number,
-    hashRawText: String,
-    metadata: {
-      type: Object,
-      default: {}
-    }
-  },
-  {
-    // Auto update updateTime
-    timestamps: {
-      updatedAt: 'updateTime'
-    }
-  }
-);
+const DatasetCollectionSchema = new Schema({
+  parentId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetColCollectionName,
+    default: null
+  },
+  teamId: {
+    type: Schema.Types.ObjectId,
+    ref: TeamCollectionName,
+    required: true
+  },
+  tmbId: {
+    type: Schema.Types.ObjectId,
+    ref: TeamMemberCollectionName,
+    required: true
+  },
+  datasetId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetCollectionName,
+    required: true
+  },
+  type: {
+    type: String,
+    enum: Object.keys(DatasetCollectionTypeMap),
+    required: true
+  },
+  name: {
+    type: String,
+    required: true
+  },
+  createTime: {
+    type: Date,
+    default: () => new Date()
+  },
+  updateTime: {
+    type: Date,
+    default: () => new Date()
+  },
+  forbid: {
+    type: Boolean,
+    default: false
+  },
+
+  // chunk filed
+  trainingType: {
+    type: String,
+    enum: Object.keys(TrainingTypeMap)
+  },
+  chunkSize: {
+    type: Number,
+    required: true
+  },
+  chunkSplitter: {
+    type: String
+  },
+  qaPrompt: {
+    type: String
+  },
+  ocrParse: Boolean,
+
+  tags: {
+    type: [String],
+    default: []
+  },
+
+  // local file collection
+  fileId: {
+    type: Schema.Types.ObjectId,
+    ref: 'dataset.files'
+  },
+  // web link collection
+  rawLink: String,
+  // api collection
+  apiFileId: String,
+  // external collection
+  externalFileId: String,
+  externalFileUrl: String, // external import url
+
+  // next sync time
+  nextSyncTime: Date,
+
+  // metadata
+  rawTextLength: Number,
+  hashRawText: String,
+  metadata: {
+    type: Object,
+    default: {}
+  }
+});
 
 try {
   // auth file
```
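One side effect worth noting: the old schema relied on Mongoose's `timestamps: { updatedAt: 'updateTime' }` option to bump `updateTime` automatically, and that option is gone from the new definition. Writes that should refresh the timestamp now have to set it themselves, which is presumably why the sync path further down passes `updateTime: new Date()` explicitly. A minimal sketch of the pattern — `forbidCollection` and its filter are placeholders, not code from this commit:

```ts
import { MongoDatasetCollection } from './schema'; // import path as used in the files above

// Without the schema-level timestamps option, updateTime must be set by the caller.
export const forbidCollection = async (collectionId: string) => {
  await MongoDatasetCollection.updateOne(
    { _id: collectionId },
    { $set: { forbid: true, updateTime: new Date() } }
  );
};
```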
```diff
@@ -122,6 +117,16 @@ try {
   // create time filter
   DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, createTime: 1 });
 
+  // next sync time filter
+  DatasetCollectionSchema.index(
+    { type: 1, nextSyncTime: -1 },
+    {
+      partialFilterExpression: {
+        nextSyncTime: { $exists: true }
+      }
+    }
+  );
+
   // Get collection by external file id
   DatasetCollectionSchema.index(
     { datasetId: 1, externalFileId: 1 },
```
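Because of the `partialFilterExpression`, the new index only contains documents that actually have a `nextSyncTime`, so collections that never auto-sync add no index overhead. The query shape it is presumably meant to serve (the scheduled job itself is not part of this diff) looks roughly like this:

```ts
import { MongoDatasetCollection } from './schema'; // import path as used in the files above
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';

// Illustrative scan for collections that are due for a sync (not code from this commit).
export const findDueCollections = async (limit = 100) =>
  MongoDatasetCollection.find({
    type: { $in: [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile] },
    nextSyncTime: { $lte: new Date() }
  })
    .sort({ nextSyncTime: -1 })
    .limit(limit)
    .lean();
```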
```diff
@@ -163,6 +163,10 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
     ...sourceReadType
   });
 
+  if (!rawText) {
+    return DatasetCollectionSyncResultEnum.failed;
+  }
+
   // Check if the original text is the same: skip if same
   const hashRawText = hashStr(rawText);
   if (collection.hashRawText && hashRawText === collection.hashRawText) {
@@ -178,28 +182,30 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
     createCollectionParams: {
       teamId: collection.teamId,
       tmbId: collection.tmbId,
-      datasetId: collection.datasetId._id,
       name: collection.name,
+      datasetId: collection.datasetId._id,
+      parentId: collection.parentId,
       type: collection.type,
 
+      trainingType: collection.trainingType,
+      chunkSize: collection.chunkSize,
+      chunkSplitter: collection.chunkSplitter,
+      qaPrompt: collection.qaPrompt,
+
       fileId: collection.fileId,
       rawLink: collection.rawLink,
       externalFileId: collection.externalFileId,
       externalFileUrl: collection.externalFileUrl,
       apiFileId: collection.apiFileId,
 
-      rawTextLength: rawText.length,
       hashRawText,
+      rawTextLength: rawText.length,
 
+      metadata: collection.metadata,
+
       tags: collection.tags,
       createTime: collection.createTime,
-
-      parentId: collection.parentId,
-      trainingType: collection.trainingType,
-      chunkSize: collection.chunkSize,
-      chunkSplitter: collection.chunkSplitter,
-      qaPrompt: collection.qaPrompt,
-      metadata: collection.metadata
+      updateTime: new Date()
     }
   });
 
```
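Taken together, the two hunks above make `syncCollection` bail out when the source yields no text, skip the re-import when the content hash is unchanged, and otherwise re-import the content while preserving the collection's original `createTime` and stamping a fresh `updateTime`. The change detection itself is a plain string-hash comparison; a condensed sketch, assuming the same `hashStr` utility (the helper name is illustrative):

```ts
import { hashStr } from '@fastgpt/global/common/string/tools';

// Re-import only when the raw text actually changed (illustrative helper, not in the diff).
export const rawTextChanged = (rawText: string, previousHash?: string) =>
  !previousHash || hashStr(rawText) !== previousHash;
```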
```diff
@@ -91,17 +91,7 @@ const DatasetSchema = new Schema({
     type: Object
   },
 
-  syncSchedule: {
-    cronString: {
-      type: String
-    },
-    timezone: {
-      type: String
-    }
-  },
-  syncNextTime: {
-    type: Date
-  },
+  autoSync: Boolean,
 
   // abandoned
   externalReadUrl: {
@@ -112,7 +102,6 @@ const DatasetSchema = new Schema({
 
 try {
   DatasetSchema.index({ teamId: 1 });
-  DatasetSchema.index({ syncSchedule: 1, syncNextTime: -1 });
 } catch (error) {
   console.log(error);
 }
```
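On the dataset side, the per-dataset cron configuration (`syncSchedule` / `syncNextTime`) and its index are dropped in favor of a single `autoSync` flag; scheduling now lives on each collection's `nextSyncTime`. A minimal sketch of flipping the flag — the `MongoDataset` export name is assumed here, since this hunk only shows the schema definition:

```ts
import { MongoDataset } from './schema'; // assumed export alongside DatasetSchema

// Illustrative only: enable or disable automatic syncing for one dataset.
export const setDatasetAutoSync = async (datasetId: string, autoSync: boolean) => {
  await MongoDataset.updateOne({ _id: datasetId }, { $set: { autoSync } });
};
```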
```diff
@@ -165,7 +165,8 @@ export async function pushDataListToTrainingQueue({
         a: item.a,
         chunkIndex: item.chunkIndex ?? 0,
         weight: weight ?? 0,
-        indexes: item.indexes
+        indexes: item.indexes,
+        retryCount: 5
       })),
       {
         session,
```
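Training queue items are now inserted with `retryCount: 5`. The consumer is not shown in this diff, but the usual pattern is for the worker to decrement the counter after a failed attempt and stop retrying once it reaches zero; a sketch under that assumption, with `MongoDatasetTraining` and its import path assumed rather than taken from this commit:

```ts
import { MongoDatasetTraining } from './schema'; // assumed model and path; the real consumer is not in this diff

// Illustrative only: hand the item back to the queue with one fewer retry left.
export const markTrainingAttemptFailed = async (trainingId: string) => {
  await MongoDatasetTraining.updateOne(
    { _id: trainingId, retryCount: { $gt: 0 } },
    { $inc: { retryCount: -1 } }
  );
};
```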