feat: Sync collection (#3368)

* feat: sync collection

* feat: sync collection

* perf: website selector

* update doc
Archer committed 2024-12-11 15:03:41 +08:00 (committed by GitHub)
parent 048f5a2d53
commit d5752ddbaa
40 changed files with 365 additions and 191 deletions

View File

@@ -1,4 +1,7 @@
-import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  DatasetCollectionTypeEnum,
+  TrainingModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
 import {
@@ -24,6 +27,7 @@ import { getLLMModel, getVectorModel } from '../../ai/model';
 import { pushDataListToTrainingQueue } from '../training/controller';
 import { MongoImage } from '../../../common/file/image/schema';
 import { hashStr } from '@fastgpt/global/common/string/tools';
+import { addDays } from 'date-fns';
 export const createCollectionAndInsertData = async ({
   dataset,
@@ -72,6 +76,17 @@ export const createCollectionAndInsertData = async ({
       hashRawText: hashStr(rawText),
       rawTextLength: rawText.length,
+      nextSyncTime: (() => {
+        if (!dataset.autoSync) return undefined;
+        if (
+          [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes(
+            createCollectionParams.type
+          )
+        ) {
+          return addDays(new Date(), 1);
+        }
+        return undefined;
+      })(),
       session
     });
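
The IIFE above is the scheduling rule for newly created collections: only link and apiFile collections on a dataset with autoSync enabled get a next sync time, one day out. A minimal standalone sketch of the same rule follows; the helper name computeNextSyncTime is hypothetical and not part of this commit.

import { addDays } from 'date-fns';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';

// Hypothetical helper mirroring the rule above: only link and apiFile
// collections on a dataset with autoSync enabled get a next sync time,
// scheduled one day out; every other collection type stays unscheduled.
export const computeNextSyncTime = ({
  autoSync,
  type
}: {
  autoSync?: boolean;
  type: DatasetCollectionTypeEnum;
}): Date | undefined => {
  if (!autoSync) return undefined;
  if ([DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes(type)) {
    return addDays(new Date(), 1);
  }
  return undefined;
};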
@@ -155,10 +170,8 @@ export async function createOneCollection({
   fileId,
   rawLink,
   externalFileId,
   externalFileUrl,
   apiFileId,
   hashRawText,
@@ -166,7 +179,10 @@ export async function createOneCollection({
   metadata = {},
   session,
   tags,
-  createTime
+  createTime,
+  updateTime,
+  nextSyncTime
 }: CreateOneCollectionParams) {
   // Create collection tags
   const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -197,7 +213,10 @@ export async function createOneCollection({
         rawTextLength,
         hashRawText,
         tags: collectionTags,
-        createTime
+        createTime,
+        updateTime,
+        nextSyncTime
       }
     ],
     { session }

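createOneCollection now also accepts explicit createTime, updateTime and nextSyncTime values and writes them straight into the inserted document. A rough sketch of the assumed additions to CreateOneCollectionParams (the real type lives in @fastgpt/global/core/dataset/api.d and is not shown in this excerpt):

// Assumed shape of the new optional fields on CreateOneCollectionParams;
// when omitted, the schema defaults (createTime/updateTime = now) apply.
type CollectionTimeFields = {
  createTime?: Date;
  updateTime?: Date;
  nextSyncTime?: Date;
};
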
View File

@@ -1,4 +1,4 @@
-import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
+import { connectionMongo, getMongoModel } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
 import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
@@ -10,100 +10,95 @@ import {
 export const DatasetColCollectionName = 'dataset_collections';
-const DatasetCollectionSchema = new Schema(
-  {
-    parentId: {
-      type: Schema.Types.ObjectId,
-      ref: DatasetColCollectionName,
-      default: null
-    },
-    teamId: {
-      type: Schema.Types.ObjectId,
-      ref: TeamCollectionName,
-      required: true
-    },
-    tmbId: {
-      type: Schema.Types.ObjectId,
-      ref: TeamMemberCollectionName,
-      required: true
-    },
-    datasetId: {
-      type: Schema.Types.ObjectId,
-      ref: DatasetCollectionName,
-      required: true
-    },
-    type: {
-      type: String,
-      enum: Object.keys(DatasetCollectionTypeMap),
-      required: true
-    },
-    name: {
-      type: String,
-      required: true
-    },
-    createTime: {
-      type: Date,
-      default: () => new Date()
-    },
-    updateTime: {
-      type: Date,
-      default: () => new Date()
-    },
-    forbid: {
-      type: Boolean,
-      default: false
-    },
-    // chunk filed
-    trainingType: {
-      type: String,
-      enum: Object.keys(TrainingTypeMap)
-    },
-    chunkSize: {
-      type: Number,
-      required: true
-    },
-    chunkSplitter: {
-      type: String
-    },
-    qaPrompt: {
-      type: String
-    },
-    ocrParse: Boolean,
-    tags: {
-      type: [String],
-      default: []
-    },
-    // local file collection
-    fileId: {
-      type: Schema.Types.ObjectId,
-      ref: 'dataset.files'
-    },
-    // web link collection
-    rawLink: String,
-    // api collection
-    apiFileId: String,
-    // external collection
-    externalFileId: String,
-    externalFileUrl: String, // external import url
-    // metadata
-    rawTextLength: Number,
-    hashRawText: String,
-    metadata: {
-      type: Object,
-      default: {}
-    }
+const DatasetCollectionSchema = new Schema({
+  parentId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetColCollectionName,
+    default: null
   },
-  {
-    // Auto update updateTime
-    timestamps: {
-      updatedAt: 'updateTime'
-    }
+  teamId: {
+    type: Schema.Types.ObjectId,
+    ref: TeamCollectionName,
+    required: true
+  },
+  tmbId: {
+    type: Schema.Types.ObjectId,
+    ref: TeamMemberCollectionName,
+    required: true
+  },
+  datasetId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetCollectionName,
+    required: true
+  },
+  type: {
+    type: String,
+    enum: Object.keys(DatasetCollectionTypeMap),
+    required: true
+  },
+  name: {
+    type: String,
+    required: true
+  },
+  createTime: {
+    type: Date,
+    default: () => new Date()
+  },
+  updateTime: {
+    type: Date,
+    default: () => new Date()
+  },
+  forbid: {
+    type: Boolean,
+    default: false
+  },
+  // chunk filed
+  trainingType: {
+    type: String,
+    enum: Object.keys(TrainingTypeMap)
+  },
+  chunkSize: {
+    type: Number,
+    required: true
+  },
+  chunkSplitter: {
+    type: String
+  },
+  qaPrompt: {
+    type: String
+  },
+  ocrParse: Boolean,
+  tags: {
+    type: [String],
+    default: []
+  },
+  // local file collection
+  fileId: {
+    type: Schema.Types.ObjectId,
+    ref: 'dataset.files'
+  },
+  // web link collection
+  rawLink: String,
+  // api collection
+  apiFileId: String,
+  // external collection
+  externalFileId: String,
+  externalFileUrl: String, // external import url
+  // next sync time
+  nextSyncTime: Date,
+  // metadata
+  rawTextLength: Number,
+  hashRawText: String,
+  metadata: {
+    type: Object,
+    default: {}
   }
-);
+});
 try {
   // auth file
@@ -122,6 +117,16 @@ try {
   // create time filter
   DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, createTime: 1 });
+  // next sync time filter
+  DatasetCollectionSchema.index(
+    { type: 1, nextSyncTime: -1 },
+    {
+      partialFilterExpression: {
+        nextSyncTime: { $exists: true }
+      }
+    }
+  );
   // Get collection by external file id
   DatasetCollectionSchema.index(
     { datasetId: 1, externalFileId: 1 },

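Two things change in this schema: the Mongoose timestamps option that auto-updated updateTime is removed (callers such as syncCollection now set updateTime explicitly), and the new nextSyncTime field gets a partial index on { type, nextSyncTime } so that only documents which actually carry a nextSyncTime are indexed. The sketch below shows a hypothetical poller such an index could serve; it is not part of this diff, and the model import path is assumed.

import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { MongoDatasetCollection } from './schema'; // import path assumed

// Hypothetical scheduled job: pick up link/apiFile collections whose
// nextSyncTime has passed. The filter shape matches the partial index above,
// so the query never has to scan collections without a nextSyncTime.
export const findCollectionsDueForSync = (limit = 100) =>
  MongoDatasetCollection.find({
    type: { $in: [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile] },
    nextSyncTime: { $lte: new Date() }
  })
    .sort({ nextSyncTime: -1 })
    .limit(limit)
    .lean();
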
View File

@@ -163,6 +163,10 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
     ...sourceReadType
   });
+  if (!rawText) {
+    return DatasetCollectionSyncResultEnum.failed;
+  }
   // Check if the original text is the same: skip if same
   const hashRawText = hashStr(rawText);
   if (collection.hashRawText && hashRawText === collection.hashRawText) {
@@ -178,28 +182,30 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
       createCollectionParams: {
         teamId: collection.teamId,
         tmbId: collection.tmbId,
-        datasetId: collection.datasetId._id,
         name: collection.name,
+        datasetId: collection.datasetId._id,
+        parentId: collection.parentId,
         type: collection.type,
+        trainingType: collection.trainingType,
+        chunkSize: collection.chunkSize,
+        chunkSplitter: collection.chunkSplitter,
+        qaPrompt: collection.qaPrompt,
         fileId: collection.fileId,
         rawLink: collection.rawLink,
         externalFileId: collection.externalFileId,
         externalFileUrl: collection.externalFileUrl,
         apiFileId: collection.apiFileId,
-        rawTextLength: rawText.length,
         hashRawText,
+        rawTextLength: rawText.length,
+        metadata: collection.metadata,
         tags: collection.tags,
         createTime: collection.createTime,
-        parentId: collection.parentId,
-        trainingType: collection.trainingType,
-        chunkSize: collection.chunkSize,
-        chunkSplitter: collection.chunkSplitter,
-        qaPrompt: collection.qaPrompt,
-        metadata: collection.metadata
+        updateTime: new Date()
       }
     });

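Taken together, the sync path is now: re-read the source, fail fast on empty text, skip when the content hash is unchanged, and otherwise rebuild the collection with an explicit updateTime. A condensed sketch of that decision logic, with result strings simplified (the real code returns DatasetCollectionSyncResultEnum values and rebuilds inside a mongo session):

import { hashStr } from '@fastgpt/global/common/string/tools';

// Condensed sketch of the sync decision logic shown above.
const syncOnce = async (
  collection: { hashRawText?: string },
  readSource: () => Promise<string>
) => {
  const rawText = await readSource();
  if (!rawText) return 'failed';

  // Unchanged source: skip re-chunking and re-embedding entirely.
  if (collection.hashRawText && hashStr(rawText) === collection.hashRawText) {
    return 'sameRaw';
  }

  // Changed source: recreate the collection, persisting hashRawText,
  // rawTextLength and an explicit updateTime: new Date().
  return 'success';
};
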
View File

@@ -91,17 +91,7 @@ const DatasetSchema = new Schema({
     type: Object
   },
-  syncSchedule: {
-    cronString: {
-      type: String
-    },
-    timezone: {
-      type: String
-    }
-  },
-  syncNextTime: {
-    type: Date
-  },
+  autoSync: Boolean,
   // abandoned
   externalReadUrl: {
@@ -112,7 +102,6 @@ const DatasetSchema = new Schema({
 try {
   DatasetSchema.index({ teamId: 1 });
-  DatasetSchema.index({ syncSchedule: 1, syncNextTime: -1 });
 } catch (error) {
   console.log(error);
 }

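On the dataset itself, the cron-based syncSchedule/syncNextTime pair and its index are replaced by a single autoSync flag; scheduling now lives on each collection's nextSyncTime. If existing data had to be carried over, a one-off migration might look like the hypothetical sketch below; it is not part of this commit, and MongoDataset plus its import path are assumed.

import { MongoDataset } from '../schema'; // model name and path assumed

// Hypothetical migration: datasets that used a cron schedule keep syncing
// by switching on the new flag; the abandoned fields are dropped.
// strict: false lets the update touch paths the schema no longer defines.
export const migrateAutoSync = () =>
  MongoDataset.updateMany(
    { syncSchedule: { $exists: true } },
    {
      $set: { autoSync: true },
      $unset: { syncSchedule: 1, syncNextTime: 1 }
    },
    { strict: false }
  );
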
View File

@@ -165,7 +165,8 @@ export async function pushDataListToTrainingQueue({
         a: item.a,
         chunkIndex: item.chunkIndex ?? 0,
         weight: weight ?? 0,
-        indexes: item.indexes
+        indexes: item.indexes,
+        retryCount: 5
       })),
       {
         session,
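
Each queued training item now starts with retryCount: 5. The consumer side is not shown in this excerpt; a hypothetical failure handler using that counter might decrement it on each error and drop the item once it is exhausted. MongoDatasetTraining and its import path are assumed here.

import { MongoDatasetTraining } from './schema'; // model name and path assumed

// Hypothetical failure handler: spend one retry, or give up when none are left.
export const handleTrainingError = async (dataId: string) => {
  const stillRetryable = await MongoDatasetTraining.findOneAndUpdate(
    { _id: dataId, retryCount: { $gt: 0 } },
    { $inc: { retryCount: -1 } },
    { new: true }
  );
  if (!stillRetryable) {
    // No retries left (or the item is gone): remove it from the queue.
    await MongoDatasetTraining.deleteOne({ _id: dataId });
  }
};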