feat: Sync collection (#3368)

* feat: sync collection

* feat: sync collection

* perf: website selector

* update doc
Archer committed 2024-12-11 15:03:41 +08:00 (committed by GitHub)
parent 048f5a2d53
commit d5752ddbaa
40 changed files with 365 additions and 191 deletions

View File

@@ -1,4 +1,7 @@
-import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  DatasetCollectionTypeEnum,
+  TrainingModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
 import {
@@ -24,6 +27,7 @@ import { getLLMModel, getVectorModel } from '../../ai/model';
 import { pushDataListToTrainingQueue } from '../training/controller';
 import { MongoImage } from '../../../common/file/image/schema';
 import { hashStr } from '@fastgpt/global/common/string/tools';
+import { addDays } from 'date-fns';
 export const createCollectionAndInsertData = async ({
   dataset,
@@ -72,6 +76,17 @@ export const createCollectionAndInsertData = async ({
       hashRawText: hashStr(rawText),
       rawTextLength: rawText.length,
+      nextSyncTime: (() => {
+        if (!dataset.autoSync) return undefined;
+        if (
+          [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes(
+            createCollectionParams.type
+          )
+        ) {
+          return addDays(new Date(), 1);
+        }
+        return undefined;
+      })(),
       session
     });
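
The IIFE above is the scheduling rule for newly created collections: only link and apiFile collections on a dataset with autoSync enabled get a next sync time, one day out. A minimal standalone sketch of the same rule follows; the helper name computeNextSyncTime is hypothetical and not part of this commit.

import { addDays } from 'date-fns';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';

// Hypothetical helper mirroring the rule above: only link and apiFile
// collections on a dataset with autoSync enabled get a next sync time,
// scheduled one day out; every other collection type stays unscheduled.
export const computeNextSyncTime = ({
  autoSync,
  type
}: {
  autoSync?: boolean;
  type: DatasetCollectionTypeEnum;
}): Date | undefined => {
  if (!autoSync) return undefined;
  if ([DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes(type)) {
    return addDays(new Date(), 1);
  }
  return undefined;
};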
@@ -155,10 +170,8 @@ export async function createOneCollection({
   fileId,
   rawLink,
   externalFileId,
   externalFileUrl,
   apiFileId,
   hashRawText,
@@ -166,7 +179,10 @@ export async function createOneCollection({
   metadata = {},
   session,
   tags,
-  createTime
+  createTime,
+  updateTime,
+  nextSyncTime
 }: CreateOneCollectionParams) {
   // Create collection tags
   const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -197,7 +213,10 @@ export async function createOneCollection({
         rawTextLength,
         hashRawText,
         tags: collectionTags,
-        createTime
+        createTime,
+        updateTime,
+        nextSyncTime
       }
     ],
     { session }

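createOneCollection now also accepts explicit createTime, updateTime and nextSyncTime values and writes them straight into the inserted document. A rough sketch of the assumed additions to CreateOneCollectionParams (the real type lives in @fastgpt/global/core/dataset/api.d and is not shown in this excerpt):

// Assumed shape of the new optional fields on CreateOneCollectionParams;
// when omitted, the schema defaults (createTime/updateTime = now) apply.
type CollectionTimeFields = {
  createTime?: Date;
  updateTime?: Date;
  nextSyncTime?: Date;
};
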
View File

@@ -1,4 +1,4 @@
-import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
+import { connectionMongo, getMongoModel } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
 import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
@@ -10,100 +10,95 @@ import {
 export const DatasetColCollectionName = 'dataset_collections';
-const DatasetCollectionSchema = new Schema(
-  {
-    parentId: {
-      type: Schema.Types.ObjectId,
-      ref: DatasetColCollectionName,
-      default: null
-    },
-    teamId: {
-      type: Schema.Types.ObjectId,
-      ref: TeamCollectionName,
-      required: true
-    },
-    tmbId: {
-      type: Schema.Types.ObjectId,
-      ref: TeamMemberCollectionName,
-      required: true
-    },
-    datasetId: {
-      type: Schema.Types.ObjectId,
-      ref: DatasetCollectionName,
-      required: true
-    },
-    type: {
-      type: String,
-      enum: Object.keys(DatasetCollectionTypeMap),
-      required: true
-    },
-    name: {
-      type: String,
-      required: true
-    },
-    createTime: {
-      type: Date,
-      default: () => new Date()
-    },
-    updateTime: {
-      type: Date,
-      default: () => new Date()
-    },
-    forbid: {
-      type: Boolean,
-      default: false
-    },
-    // chunk filed
-    trainingType: {
-      type: String,
-      enum: Object.keys(TrainingTypeMap)
-    },
-    chunkSize: {
-      type: Number,
-      required: true
-    },
-    chunkSplitter: {
-      type: String
-    },
-    qaPrompt: {
-      type: String
-    },
-    ocrParse: Boolean,
-    tags: {
-      type: [String],
-      default: []
-    },
-    // local file collection
-    fileId: {
-      type: Schema.Types.ObjectId,
-      ref: 'dataset.files'
-    },
-    // web link collection
-    rawLink: String,
-    // api collection
-    apiFileId: String,
-    // external collection
-    externalFileId: String,
-    externalFileUrl: String, // external import url
-    // metadata
-    rawTextLength: Number,
-    hashRawText: String,
-    metadata: {
-      type: Object,
-      default: {}
-    }
+const DatasetCollectionSchema = new Schema({
+  parentId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetColCollectionName,
+    default: null
   },
-  {
-    // Auto update updateTime
-    timestamps: {
-      updatedAt: 'updateTime'
-    }
+  teamId: {
+    type: Schema.Types.ObjectId,
+    ref: TeamCollectionName,
+    required: true
+  },
+  tmbId: {
+    type: Schema.Types.ObjectId,
+    ref: TeamMemberCollectionName,
+    required: true
+  },
+  datasetId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetCollectionName,
+    required: true
+  },
+  type: {
+    type: String,
+    enum: Object.keys(DatasetCollectionTypeMap),
+    required: true
+  },
+  name: {
+    type: String,
+    required: true
+  },
+  createTime: {
+    type: Date,
+    default: () => new Date()
+  },
+  updateTime: {
+    type: Date,
+    default: () => new Date()
+  },
+  forbid: {
+    type: Boolean,
+    default: false
+  },
+  // chunk filed
+  trainingType: {
+    type: String,
+    enum: Object.keys(TrainingTypeMap)
+  },
+  chunkSize: {
+    type: Number,
+    required: true
+  },
+  chunkSplitter: {
+    type: String
+  },
+  qaPrompt: {
+    type: String
+  },
+  ocrParse: Boolean,
+  tags: {
+    type: [String],
+    default: []
+  },
+  // local file collection
+  fileId: {
+    type: Schema.Types.ObjectId,
+    ref: 'dataset.files'
+  },
+  // web link collection
+  rawLink: String,
+  // api collection
+  apiFileId: String,
+  // external collection
+  externalFileId: String,
+  externalFileUrl: String, // external import url
+  // next sync time
+  nextSyncTime: Date,
+  // metadata
+  rawTextLength: Number,
+  hashRawText: String,
+  metadata: {
+    type: Object,
+    default: {}
   }
-);
+});
 try {
   // auth file
@@ -122,6 +117,16 @@ try {
   // create time filter
   DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, createTime: 1 });
+  // next sync time filter
+  DatasetCollectionSchema.index(
+    { type: 1, nextSyncTime: -1 },
+    {
+      partialFilterExpression: {
+        nextSyncTime: { $exists: true }
+      }
+    }
+  );
   // Get collection by external file id
   DatasetCollectionSchema.index(
     { datasetId: 1, externalFileId: 1 },

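Two things change in this schema: the Mongoose timestamps option that auto-updated updateTime is removed (callers such as syncCollection now set updateTime explicitly), and the new nextSyncTime field gets a partial index on { type, nextSyncTime } so that only documents which actually carry a nextSyncTime are indexed. The sketch below shows a hypothetical poller such an index could serve; it is not part of this diff, and the model import path is assumed.

import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { MongoDatasetCollection } from './schema'; // import path assumed

// Hypothetical scheduled job: pick up link/apiFile collections whose
// nextSyncTime has passed. The filter shape matches the partial index above,
// so the query never has to scan collections without a nextSyncTime.
export const findCollectionsDueForSync = (limit = 100) =>
  MongoDatasetCollection.find({
    type: { $in: [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile] },
    nextSyncTime: { $lte: new Date() }
  })
    .sort({ nextSyncTime: -1 })
    .limit(limit)
    .lean();
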
View File

@@ -163,6 +163,10 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
     ...sourceReadType
   });
+  if (!rawText) {
+    return DatasetCollectionSyncResultEnum.failed;
+  }
   // Check if the original text is the same: skip if same
   const hashRawText = hashStr(rawText);
   if (collection.hashRawText && hashRawText === collection.hashRawText) {
@@ -178,28 +182,30 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
       createCollectionParams: {
         teamId: collection.teamId,
         tmbId: collection.tmbId,
-        datasetId: collection.datasetId._id,
         name: collection.name,
+        datasetId: collection.datasetId._id,
+        parentId: collection.parentId,
         type: collection.type,
+        trainingType: collection.trainingType,
+        chunkSize: collection.chunkSize,
+        chunkSplitter: collection.chunkSplitter,
+        qaPrompt: collection.qaPrompt,
         fileId: collection.fileId,
         rawLink: collection.rawLink,
         externalFileId: collection.externalFileId,
         externalFileUrl: collection.externalFileUrl,
         apiFileId: collection.apiFileId,
-        rawTextLength: rawText.length,
         hashRawText,
+        rawTextLength: rawText.length,
+        metadata: collection.metadata,
         tags: collection.tags,
         createTime: collection.createTime,
-        parentId: collection.parentId,
-        trainingType: collection.trainingType,
-        chunkSize: collection.chunkSize,
-        chunkSplitter: collection.chunkSplitter,
-        qaPrompt: collection.qaPrompt,
-        metadata: collection.metadata
+        updateTime: new Date()
       }
     });

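Taken together, the sync path is now: re-read the source, fail fast on empty text, skip when the content hash is unchanged, and otherwise rebuild the collection with an explicit updateTime. A condensed sketch of that decision logic, with result strings simplified (the real code returns DatasetCollectionSyncResultEnum values and rebuilds inside a mongo session):

import { hashStr } from '@fastgpt/global/common/string/tools';

// Condensed sketch of the sync decision logic shown above.
const syncOnce = async (
  collection: { hashRawText?: string },
  readSource: () => Promise<string>
) => {
  const rawText = await readSource();
  if (!rawText) return 'failed';

  // Unchanged source: skip re-chunking and re-embedding entirely.
  if (collection.hashRawText && hashStr(rawText) === collection.hashRawText) {
    return 'sameRaw';
  }

  // Changed source: recreate the collection, persisting hashRawText,
  // rawTextLength and an explicit updateTime: new Date().
  return 'success';
};
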
View File

@@ -91,17 +91,7 @@ const DatasetSchema = new Schema({
     type: Object
   },
-  syncSchedule: {
-    cronString: {
-      type: String
-    },
-    timezone: {
-      type: String
-    }
-  },
-  syncNextTime: {
-    type: Date
-  },
+  autoSync: Boolean,
   // abandoned
   externalReadUrl: {
@@ -112,7 +102,6 @@ const DatasetSchema = new Schema({
 try {
   DatasetSchema.index({ teamId: 1 });
-  DatasetSchema.index({ syncSchedule: 1, syncNextTime: -1 });
 } catch (error) {
   console.log(error);
 }

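On the dataset itself, the cron-based syncSchedule/syncNextTime pair and its index are replaced by a single autoSync flag; scheduling now lives on each collection's nextSyncTime. If existing data had to be carried over, a one-off migration might look like the hypothetical sketch below; it is not part of this commit, and MongoDataset plus its import path are assumed.

import { MongoDataset } from '../schema'; // model name and path assumed

// Hypothetical migration: datasets that used a cron schedule keep syncing
// by switching on the new flag; the abandoned fields are dropped.
// strict: false lets the update touch paths the schema no longer defines.
export const migrateAutoSync = () =>
  MongoDataset.updateMany(
    { syncSchedule: { $exists: true } },
    {
      $set: { autoSync: true },
      $unset: { syncSchedule: 1, syncNextTime: 1 }
    },
    { strict: false }
  );
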
View File

@@ -165,7 +165,8 @@ export async function pushDataListToTrainingQueue({
         a: item.a,
         chunkIndex: item.chunkIndex ?? 0,
         weight: weight ?? 0,
-        indexes: item.indexes
+        indexes: item.indexes,
+        retryCount: 5
       })),
       {
         session,
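
Each queued training item now starts with retryCount: 5. The consumer side is not shown in this excerpt; a hypothetical failure handler using that counter might decrement it on each error and drop the item once it is exhausted. MongoDatasetTraining and its import path are assumed here.

import { MongoDatasetTraining } from './schema'; // model name and path assumed

// Hypothetical failure handler: spend one retry, or give up when none are left.
export const handleTrainingError = async (dataId: string) => {
  const stillRetryable = await MongoDatasetTraining.findOneAndUpdate(
    { _id: dataId, retryCount: { $gt: 0 } },
    { $inc: { retryCount: -1 } },
    { new: true }
  );
  if (!stillRetryable) {
    // No retries left (or the item is gone): remove it from the queue.
    await MongoDatasetTraining.deleteOne({ _id: dataId });
  }
};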