feat: dataset index prefix (#5061)

This commit is contained in:
Archer
2025-06-18 17:26:53 +08:00
committed by GitHub
parent 6b2ea696c5
commit 36fafd2149
34 changed files with 371 additions and 259 deletions

View File

@@ -10,6 +10,7 @@ export type CreateDatasetDataProps = {
a?: string;
imageId?: string;
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
indexPrefix?: string;
};
export type UpdateDatasetDataProps = {
@@ -21,6 +22,7 @@ export type UpdateDatasetDataProps = {
dataId?: string; // pg data id
})[];
imageId?: string;
indexPrefix?: string;
};
export type PatchIndexesProps =

View File

@@ -7,9 +7,9 @@ export type PushDataToTrainingQueueProps = {
datasetId: string;
collectionId: string;
data: PushDatasetDataChunkProps[];
mode?: TrainingModeEnum;
data: PushDatasetDataChunkProps[];
prompt?: string;
agentModel: string;
vectorModel: string;

View File

@@ -36,6 +36,7 @@ export type ChunkSettingsType = {
// Index enhance
imageIndex?: boolean;
autoIndexes?: boolean;
indexPrefixTitle?: boolean;
// Chunk setting
chunkSettingMode?: ChunkSettingModeEnum; // 系统参数/自定义参数
@@ -184,8 +185,6 @@ export type DatasetTrainingSchemaType = {
expireAt: Date;
lockTime: Date;
mode: TrainingModeEnum;
model?: string;
prompt?: string;
dataId?: string;
q: string;
a: string;

View File

@@ -103,6 +103,7 @@ export const createCollectionAndInsertData = async ({
delete formatCreateCollectionParams.chunkSize;
delete formatCreateCollectionParams.chunkSplitter;
delete formatCreateCollectionParams.indexSize;
delete formatCreateCollectionParams.indexPrefixTitle;
}
}
if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
@@ -223,7 +224,6 @@ export const createCollectionAndInsertData = async ({
vlmModel: dataset.vlmModel,
indexSize,
mode: trainingMode,
prompt: formatCreateCollectionParams.qaPrompt,
billId: traingBillId,
data: chunks.map((item, index) => ({
...item,

View File

@@ -32,6 +32,7 @@ export const ChunkSettings = {
imageIndex: Boolean,
autoIndexes: Boolean,
indexPrefixTitle: Boolean,
chunkSettingMode: {
type: String,

View File

@@ -27,23 +27,6 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
} catch (error) {}
};
export const pushDataListToTrainingQueueByCollectionId = async ({
collectionId,
...props
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
const {
dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
} = await getCollectionWithDataset(collectionId);
return pushDataListToTrainingQueue({
...props,
datasetId,
collectionId,
vectorModel,
agentModel,
vlmModel
});
};
export async function pushDataListToTrainingQueue({
teamId,
tmbId,
@@ -53,7 +36,6 @@ export async function pushDataListToTrainingQueue({
vectorModel,
vlmModel,
data,
prompt,
billId,
mode = TrainingModeEnum.chunk,
indexSize,
@@ -149,8 +131,6 @@ export async function pushDataListToTrainingQueue({
collectionId: collectionId,
billId,
mode: formatTrainingMode(item, mode),
prompt,
model,
...(item.q && { q: item.q }),
...(item.a && { a: item.a }),
...(item.imageId && { imageId: item.imageId }),

View File

@@ -10,6 +10,7 @@ import {
TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { DatasetDataCollectionName } from '../data/schema';
export const DatasetTrainingCollectionName = 'dataset_trainings';
@@ -54,8 +55,6 @@ const TrainingDataSchema = new Schema({
default: 5
},
model: String,
prompt: String,
q: {
type: String,
default: ''
@@ -74,7 +73,10 @@ const TrainingDataSchema = new Schema({
type: Number,
default: 0
},
dataId: Schema.Types.ObjectId,
dataId: {
type: Schema.Types.ObjectId,
ref: DatasetDataCollectionName
},
indexes: {
type: [
{
@@ -105,6 +107,12 @@ TrainingDataSchema.virtual('collection', {
foreignField: '_id',
justOne: true
});
TrainingDataSchema.virtual('data', {
ref: DatasetDataCollectionName,
localField: 'dataId',
foreignField: '_id',
justOne: true
});
try {
// lock training data(teamId); delete training data

View File

@@ -111,6 +111,8 @@
"import_param_setting": "Parameter settings",
"import_select_file": "Select a file",
"import_select_link": "Enter link",
"index_prefix_title": "Index add title",
"index_prefix_title_tips": "Automatically add title names to all indexes",
"index_size": "Index size",
"index_size_tips": "When vectorized, the system will automatically further segment the blocks according to this size.",
"input_required_field_to_select_baseurl": "Please enter the required information first",

View File

@@ -111,6 +111,8 @@
"import_param_setting": "参数设置",
"import_select_file": "选择文件",
"import_select_link": "输入链接",
"index_prefix_title": "将标题加入索引",
"index_prefix_title_tips": "自动给所有索引加标题名",
"index_size": "索引大小",
"index_size_tips": "向量化时内容的长度,系统会自动按该大小对分块进行进一步的分割。",
"input_required_field_to_select_baseurl": "请先输入必填信息",

View File

@@ -110,6 +110,8 @@
"import_param_setting": "參數設定",
"import_select_file": "選擇文件",
"import_select_link": "輸入連結",
"index_prefix_title": "將標題加入索引",
"index_prefix_title_tips": "自動給所有索引加標題名",
"index_size": "索引大小",
"index_size_tips": "向量化時內容的長度,系統會自動按該大小對分塊進行進一步的分割。",
"input_required_field_to_select_baseurl": "請先輸入必填信息",