Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 05:12:39 +00:00)
feat: Sync collection (#3368)
* feat: sync collection
* feat: sync collection
* perf: website selector
* update doc
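In short: datasets gain an autoSync switch, collections gain a nextSyncTime field (backed by a partial index), and the sync result enum gains a failed state, so link and API-file collections can be re-fetched on a daily poll. Below is a minimal sketch of such a poller built only on the fields this commit introduces; the pollSyncCollections name, the import paths, the batch size and the nextSyncTime bump on non-success results are illustrative assumptions, not part of this commit.

// Hypothetical daily poller over the new autoSync / nextSyncTime fields (sketch only).
import { addDays } from 'date-fns';
import {
  DatasetCollectionTypeEnum,
  DatasetCollectionSyncResultEnum
} from '@fastgpt/global/core/dataset/constants';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { syncCollection } from '@fastgpt/service/core/dataset/collection/utils';

export const pollSyncCollections = async (batchSize = 100) => {
  // Served by the partial index { type: 1, nextSyncTime: -1 } added in this commit:
  // only link / apiFile collections whose nextSyncTime has passed are scanned.
  const collections = await MongoDatasetCollection.find({
    type: { $in: [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile] },
    nextSyncTime: { $lte: new Date() }
  })
    .sort({ nextSyncTime: -1 })
    .limit(batchSize)
    .populate('datasetId');

  for (const collection of collections) {
    try {
      const result = await syncCollection(collection as any);

      // On success the collection is re-created and createCollectionAndInsertData
      // assigns the next nextSyncTime (addDays(new Date(), 1)) when dataset.autoSync
      // is on; for sameRaw / failed results, push the schedule forward manually so
      // the same collection is not rescanned on every poll (illustrative choice).
      if (result !== DatasetCollectionSyncResultEnum.success) {
        await MongoDatasetCollection.updateOne(
          { _id: collection._id },
          { $set: { nextSyncTime: addDays(new Date(), 1) } }
        );
      }
    } catch (error) {
      console.log('sync collection error', error);
    }
  }
};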
packages/global/core/dataset/api.d.ts (5 changes)
@@ -17,6 +17,9 @@ export type DatasetUpdateBody = {
   externalReadUrl?: DatasetSchemaType['externalReadUrl'];
   defaultPermission?: DatasetSchemaType['defaultPermission'];
   apiServer?: DatasetSchemaType['apiServer'];
+
+  // sync schedule
+  autoSync?: boolean;
 };

 /* ================= collection ===================== */
@@ -47,6 +50,8 @@ export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
   tags?: string[];

   createTime?: Date;
+  updateTime?: Date;
+  nextSyncTime?: Date;
 };

 export type ApiCreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
@@ -82,7 +82,8 @@ export const DatasetCollectionTypeMap = {

 export enum DatasetCollectionSyncResultEnum {
   sameRaw = 'sameRaw',
-  success = 'success'
+  success = 'success',
+  failed = 'failed'
 }

 export const DatasetCollectionSyncResultMap = {
   [DatasetCollectionSyncResultEnum.sameRaw]: {
@@ -90,6 +91,9 @@ export const DatasetCollectionSyncResultMap = {
   },
   [DatasetCollectionSyncResultEnum.success]: {
     label: i18nT('common:core.dataset.collection.sync.result.success')
   },
+  [DatasetCollectionSyncResultEnum.failed]: {
+    label: i18nT('dataset:sync_collection_failed')
+  }
 };
packages/global/core/dataset/type.d.ts (9 changes)
@@ -34,8 +34,7 @@ export type DatasetSchemaType = {
   inheritPermission: boolean;
   apiServer?: APIFileServer;

-  syncSchedule?: { cronString: string; timezone: string };
-  syncNextTime?: Date;
+  autoSync?: boolean;

   // abandon
   externalReadUrl?: string;
@@ -65,11 +64,13 @@ export type DatasetCollectionSchemaType = {
   fileId?: string; // local file id
   rawLink?: string; // link url
   externalFileId?: string; //external file id
+  apiFileId?: string; // api file id
+  externalFileUrl?: string; // external import url
+
+  nextSyncTime?: Date;

   rawTextLength?: number;
   hashRawText?: string;
-  externalFileUrl?: string; // external import url
-  apiFileId?: string; // api file id
   metadata?: {
     webPageSelector?: string;
     relatedImgId?: string; // The id of the associated image collections
@@ -1,4 +1,7 @@
-import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  DatasetCollectionTypeEnum,
+  TrainingModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
 import {
@@ -24,6 +27,7 @@ import { getLLMModel, getVectorModel } from '../../ai/model';
 import { pushDataListToTrainingQueue } from '../training/controller';
 import { MongoImage } from '../../../common/file/image/schema';
 import { hashStr } from '@fastgpt/global/common/string/tools';
+import { addDays } from 'date-fns';

 export const createCollectionAndInsertData = async ({
   dataset,
@@ -72,6 +76,17 @@ export const createCollectionAndInsertData = async ({

     hashRawText: hashStr(rawText),
     rawTextLength: rawText.length,
+    nextSyncTime: (() => {
+      if (!dataset.autoSync) return undefined;
+      if (
+        [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes(
+          createCollectionParams.type
+        )
+      ) {
+        return addDays(new Date(), 1);
+      }
+      return undefined;
+    })(),
     session
   });

@@ -155,10 +170,8 @@ export async function createOneCollection({

   fileId,
   rawLink,
-
   externalFileId,
   externalFileUrl,
-
   apiFileId,

   hashRawText,
@@ -166,7 +179,10 @@ export async function createOneCollection({
   metadata = {},
   session,
   tags,
-  createTime
+  createTime,
+  updateTime,
+  nextSyncTime
 }: CreateOneCollectionParams) {
   // Create collection tags
   const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -197,7 +213,10 @@ export async function createOneCollection({
       rawTextLength,
       hashRawText,
       tags: collectionTags,
-      createTime
+      createTime,
+      updateTime,
+      nextSyncTime
     }
   ],
   { session }
@@ -1,4 +1,4 @@
-import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
+import { connectionMongo, getMongoModel } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
 import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
@@ -10,100 +10,95 @@ import {

 export const DatasetColCollectionName = 'dataset_collections';

-const DatasetCollectionSchema = new Schema(
-  {
+const DatasetCollectionSchema = new Schema({
   parentId: {
     type: Schema.Types.ObjectId,
     ref: DatasetColCollectionName,
     default: null
   },
   teamId: {
     type: Schema.Types.ObjectId,
     ref: TeamCollectionName,
     required: true
   },
   tmbId: {
     type: Schema.Types.ObjectId,
     ref: TeamMemberCollectionName,
     required: true
   },
   datasetId: {
     type: Schema.Types.ObjectId,
     ref: DatasetCollectionName,
     required: true
   },
   type: {
     type: String,
     enum: Object.keys(DatasetCollectionTypeMap),
     required: true
   },
   name: {
     type: String,
     required: true
   },
   createTime: {
     type: Date,
     default: () => new Date()
   },
   updateTime: {
     type: Date,
     default: () => new Date()
   },
   forbid: {
     type: Boolean,
     default: false
   },

   // chunk filed
   trainingType: {
     type: String,
     enum: Object.keys(TrainingTypeMap)
   },
   chunkSize: {
     type: Number,
     required: true
   },
   chunkSplitter: {
     type: String
   },
   qaPrompt: {
     type: String
   },
   ocrParse: Boolean,

   tags: {
     type: [String],
     default: []
   },

   // local file collection
   fileId: {
     type: Schema.Types.ObjectId,
     ref: 'dataset.files'
   },
   // web link collection
   rawLink: String,
   // api collection
   apiFileId: String,
   // external collection
   externalFileId: String,
   externalFileUrl: String, // external import url

+  // next sync time
+  nextSyncTime: Date,
+
   // metadata
   rawTextLength: Number,
   hashRawText: String,
   metadata: {
     type: Object,
     default: {}
   }
-  },
-  {
-    // Auto update updateTime
-    timestamps: {
-      updatedAt: 'updateTime'
-    }
-  }
-);
+});

 try {
   // auth file
@@ -122,6 +117,16 @@ try {
   // create time filter
   DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, createTime: 1 });

+  // next sync time filter
+  DatasetCollectionSchema.index(
+    { type: 1, nextSyncTime: -1 },
+    {
+      partialFilterExpression: {
+        nextSyncTime: { $exists: true }
+      }
+    }
+  );
+
   // Get collection by external file id
   DatasetCollectionSchema.index(
     { datasetId: 1, externalFileId: 1 },
@@ -163,6 +163,10 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
     ...sourceReadType
   });

+  if (!rawText) {
+    return DatasetCollectionSyncResultEnum.failed;
+  }
+
   // Check if the original text is the same: skip if same
   const hashRawText = hashStr(rawText);
   if (collection.hashRawText && hashRawText === collection.hashRawText) {
@@ -178,28 +182,30 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
     createCollectionParams: {
       teamId: collection.teamId,
       tmbId: collection.tmbId,
-      datasetId: collection.datasetId._id,
       name: collection.name,
+      datasetId: collection.datasetId._id,
+      parentId: collection.parentId,
       type: collection.type,
+
+      trainingType: collection.trainingType,
+      chunkSize: collection.chunkSize,
+      chunkSplitter: collection.chunkSplitter,
+      qaPrompt: collection.qaPrompt,
+
       fileId: collection.fileId,
       rawLink: collection.rawLink,
       externalFileId: collection.externalFileId,
       externalFileUrl: collection.externalFileUrl,
       apiFileId: collection.apiFileId,
-      rawTextLength: rawText.length,
+
       hashRawText,
+      rawTextLength: rawText.length,
+
+      metadata: collection.metadata,
+
       tags: collection.tags,
       createTime: collection.createTime,
-      parentId: collection.parentId,
-      trainingType: collection.trainingType,
-      chunkSize: collection.chunkSize,
-      chunkSplitter: collection.chunkSplitter,
-      qaPrompt: collection.qaPrompt,
-      metadata: collection.metadata
+      updateTime: new Date()
     }
   });
@@ -91,17 +91,7 @@ const DatasetSchema = new Schema({
     type: Object
   },

-  syncSchedule: {
-    cronString: {
-      type: String
-    },
-    timezone: {
-      type: String
-    }
-  },
-  syncNextTime: {
-    type: Date
-  },
+  autoSync: Boolean,

   // abandoned
   externalReadUrl: {
@@ -112,7 +102,6 @@ const DatasetSchema = new Schema({

 try {
   DatasetSchema.index({ teamId: 1 });
-  DatasetSchema.index({ syncSchedule: 1, syncNextTime: -1 });
 } catch (error) {
   console.log(error);
 }
@@ -165,7 +165,8 @@ export async function pushDataListToTrainingQueue({
       a: item.a,
       chunkIndex: item.chunkIndex ?? 0,
       weight: weight ?? 0,
-      indexes: item.indexes
+      indexes: item.indexes,
+      retryCount: 5
     })),
     {
       session,
@@ -42,7 +42,7 @@ export const cronString2Fields = (cronString?: string) => {
 };

 export const cronString2Label = (
-  cronString: string,
+  cronString = '',
   t: any // i18nT
 ) => {
   const cronField = cronString2Fields(cronString);
@@ -3,6 +3,8 @@
   "add_file": "Import",
   "api_file": "API Dataset",
   "api_url": "API Url",
   "chunk_max_tokens": "max_tokens",
+  "close_auto_sync": "Are you sure you want to turn off automatic sync?",
   "collection.Create update time": "Creation/Update Time",
   "collection.Training type": "Training",
   "collection_not_support_retraining": "This collection type does not support retuning parameters",
@@ -12,6 +14,7 @@
   "collection_tags": "Collection Tags",
   "common_dataset": "General Dataset",
   "common_dataset_desc": "Build a Dataset by importing files, web links, or manual input.",
+  "config_sync_schedule": "Configure scheduled synchronization",
   "confirm_to_rebuild_embedding_tip": "Are you sure you want to switch the index for the Dataset?\nSwitching the index is a significant operation that requires re-indexing all data in your Dataset, which may take a long time. Please ensure your account has sufficient remaining points.\n\nAdditionally, you need to update the applications that use this Dataset to avoid conflicts with other indexed model Datasets.",
   "core.dataset.import.Adjust parameters": "Adjust parameters",
   "custom_data_process_params": "Custom",
@@ -36,7 +39,9 @@
   "ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.",
   "import.Auto mode Estimated Price Tips": "The text understanding model needs to be called, which requires more points: {{price}} points/1K tokens",
   "import.Embedding Estimated Price Tips": "Only use the index model and consume a small amount of AI points: {{price}} points/1K tokens",
+  "is_open_schedule": "Enable scheduled synchronization",
   "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
+  "open_auto_sync": "After scheduled synchronization is turned on, the system will try to synchronize the collection from time to time every day. During the collection synchronization period, the collection data will not be searched.",
   "permission.des.manage": "Can manage the entire knowledge base data and information",
   "permission.des.read": "View knowledge base content",
   "permission.des.write": "Ability to add and change knowledge base content",
@@ -47,6 +52,9 @@
   "retrain_task_submitted": "The retraining task has been submitted",
   "same_api_collection": "The same API set exists",
   "start_sync_website_tip": "Confirm to start synchronizing data? \nThe old data will be deleted and retrieved again, please confirm!",
+  "sync_collection_failed": "Synchronization collection error, please check whether the source file can be accessed normally",
+  "sync_schedule": "Timing synchronization",
+  "sync_schedule_tip": "Only existing collections will be synchronized. \nIncludes linked collections and all collections in the API knowledge base. \nThe system will poll for updates every day, and the specific update time cannot be determined.",
   "tag.Add New": "Add New",
   "tag.Add_new_tag": "Add New Tag",
   "tag.Edit_tag": "Edit Tag",
@@ -59,6 +67,7 @@
   "tag.total_tags": "Total {{total}} tags",
   "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "The Dataset has indexes that are being trained or rebuilt",
   "training_mode": "Chunk mode",
   "vector_model_max_tokens_tip": "Each chunk of data has a maximum length of 3000 tokens",
   "website_dataset": "Website Sync",
   "website_dataset_desc": "Website sync allows you to build a Dataset directly using a web link."
 }
@@ -3,6 +3,8 @@
   "add_file": "添加文件",
   "api_file": "API 文件库",
   "api_url": "接口地址",
   "chunk_max_tokens": "分块上限",
+  "close_auto_sync": "确认关闭自动同步功能?",
   "collection.Create update time": "创建/更新时间",
   "collection.Training type": "训练模式",
   "collection_not_support_retraining": "该集合类型不支持重新调整参数",
@@ -12,6 +14,7 @@
   "collection_tags": "集合标签",
   "common_dataset": "通用知识库",
   "common_dataset_desc": "可通过导入文件、网页链接或手动录入形式构建知识库",
+  "config_sync_schedule": "配置定时同步",
   "confirm_to_rebuild_embedding_tip": "确认为知识库切换索引?\n切换索引是一个非常重量的操作,需要对您知识库内所有数据进行重新索引,时间可能较长,请确保账号内剩余积分充足。\n\n此外,你还需要注意修改选择该知识库的应用,避免它们与其他索引模型知识库混用。",
   "core.dataset.import.Adjust parameters": "调整参数",
   "custom_data_process_params": "自定义",
@@ -36,7 +39,9 @@
   "ideal_chunk_length_tips": "按结束符号进行分段,并将多个分段组成一个分块,该值决定了分块的预估大小,如果会有上下浮动。",
   "import.Auto mode Estimated Price Tips": "需调用文本理解模型,需要消耗较多AI 积分:{{price}} 积分/1K tokens",
   "import.Embedding Estimated Price Tips": "仅使用索引模型,消耗少量 AI 积分:{{price}} 积分/1K tokens",
+  "is_open_schedule": "启用定时同步",
   "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",
+  "open_auto_sync": "开启定时同步后,系统将会每天不定时尝试同步集合,集合同步期间,会出现无法搜索到该集合数据现象。",
   "permission.des.manage": "可管理整个知识库数据和信息",
   "permission.des.read": "可查看知识库内容",
   "permission.des.write": "可增加和变更知识库内容",
@@ -47,6 +52,9 @@
   "retrain_task_submitted": "重新训练任务已提交",
   "same_api_collection": "存在相同的 API 集合",
   "start_sync_website_tip": "确认开始同步数据?将会删除旧数据后重新获取,请确认!",
+  "sync_collection_failed": "同步集合错误,请检查是否能正常访问源文件",
+  "sync_schedule": "定时同步",
+  "sync_schedule_tip": "仅会同步已存在的集合。包括链接集合以及 API 知识库里所有集合。系统会每天进行轮询更新,无法确定具体的更新时间。",
   "tag.Add New": "新建",
   "tag.Add_new_tag": "新建标签",
   "tag.Edit_tag": "编辑标签",
@@ -59,6 +67,7 @@
   "tag.total_tags": "共{{total}}个标签",
   "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "知识库有训练中或正在重建的索引",
   "training_mode": "处理方式",
   "vector_model_max_tokens_tip": "每个分块数据,最大长度为 3000 tokens",
   "website_dataset": "Web 站点同步",
   "website_dataset_desc": "Web 站点同步允许你直接使用一个网页链接构建知识库"
 }
@@ -3,6 +3,8 @@
   "add_file": "新增文件",
   "api_file": "API 檔案庫",
   "api_url": "介面位址",
   "chunk_max_tokens": "分塊上限",
+  "close_auto_sync": "確認關閉自動同步功能?",
   "collection.Create update time": "建立/更新時間",
   "collection.Training type": "分段模式",
   "collection_not_support_retraining": "此集合類型不支援重新調整參數",
@@ -12,6 +14,7 @@
   "collection_tags": "集合標籤",
   "common_dataset": "通用資料集",
   "common_dataset_desc": "可透過匯入檔案、網頁連結或手動輸入的方式建立資料集",
+  "config_sync_schedule": "配置定時同步",
   "confirm_to_rebuild_embedding_tip": "確定要為資料集切換索引嗎?\n切換索引是一個重要的操作,需要對您資料集內所有資料重新建立索引,可能需要較長時間,請確保帳號內剩餘點數充足。\n\n此外,您還需要注意修改使用此資料集的應用程式,避免與其他索引模型資料集混用。",
   "core.dataset.import.Adjust parameters": "調整參數",
   "custom_data_process_params": "自訂",
@@ -36,7 +39,9 @@
   "ideal_chunk_length_tips": "依結束符號進行分段,並將多個分段組成一個分塊,此值決定了分塊的預估大小,可能會有上下浮動。",
   "import.Auto mode Estimated Price Tips": "需呼叫文字理解模型,將消耗較多 AI 點數:{{price}} 點數 / 1K tokens",
   "import.Embedding Estimated Price Tips": "僅使用索引模型,消耗少量 AI 點數:{{price}} 點數 / 1K tokens",
+  "is_open_schedule": "啟用定時同步",
   "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",
+  "open_auto_sync": "開啟定時同步後,系統將每天不定時嘗試同步集合,集合同步期間,會出現無法搜尋到該集合資料現象。",
   "permission.des.manage": "可管理整個資料集的資料和資訊",
   "permission.des.read": "可檢視資料集內容",
   "permission.des.write": "可新增和變更資料集內容",
@@ -47,6 +52,9 @@
   "retrain_task_submitted": "重新訓練任務已提交",
   "same_api_collection": "存在相同的 API 集合",
   "start_sync_website_tip": "確認開始同步資料?\n將會刪除舊資料後重新獲取,請確認!",
+  "sync_collection_failed": "同步集合錯誤,請檢查是否能正常存取來源文件",
+  "sync_schedule": "定時同步",
+  "sync_schedule_tip": "只會同步已存在的集合。\n包括連結集合以及 API 知識庫裡所有集合。\n系統會每天進行輪詢更新,無法確定特定的更新時間。",
   "tag.Add New": "新增",
   "tag.Add_new_tag": "新增標籤",
   "tag.Edit_tag": "編輯標籤",
@@ -59,6 +67,7 @@
   "tag.total_tags": "共 {{total}} 個標籤",
   "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "資料集有索引正在訓練或重建中",
   "training_mode": "分段模式",
   "vector_model_max_tokens_tip": "每個分塊數據,最大長度為 3000 tokens",
   "website_dataset": "網站同步",
   "website_dataset_desc": "網站同步功能讓您可以直接使用網頁連結建立資料集"
 }