Add image index and pdf parse (#3956)

* feat: think tag parse
* feat: parse think tag test
* feat: pdf parse ux
* feat: doc2x parse
* perf: rewrite training mode setting
* feat: image parse queue
* perf: image index
* feat: image parse process
* feat: add init sh
* fix: ts
2025-07-23 13:03:50 +00:00 · 2025-03-03 23:08:29 +08:00
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions
--- a/packages/service/core/dataset/collection/schema.ts
+++ b/packages/service/core/dataset/collection/schema.ts
@@ -1,7 +1,10 @@
 import { connectionMongo, getMongoModel } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
-import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
+import {
+  DatasetCollectionTypeMap,
+  DatasetCollectionDataProcessModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import { DatasetCollectionName } from '../schema';
 import {
  TeamCollectionName,
@@ -31,6 +34,8 @@ const DatasetCollectionSchema = new Schema({
    ref: DatasetCollectionName,
    required: true
  },
+
+  // Basic info
  type: {
    type: String,
    enum: Object.keys(DatasetCollectionTypeMap),
@@ -40,6 +45,11 @@ const DatasetCollectionSchema = new Schema({
    type: String,
    required: true
  },
+  tags: {
+    type: [String],
+    default: []
+  },
+
  createTime: {
    type: Date,
    default: () => new Date()
@@ -48,33 +58,8 @@ const DatasetCollectionSchema = new Schema({
    type: Date,
    default: () => new Date()
  },
-  forbid: {
-    type: Boolean,
-    default: false
-  },
-
-  // chunk filed
-  trainingType: {
-    type: String,
-    enum: Object.keys(TrainingTypeMap)
-  },
-  chunkSize: {
-    type: Number,
-    required: true
-  },
-  chunkSplitter: {
-    type: String
-  },
-  qaPrompt: {
-    type: String
-  },
-  ocrParse: Boolean,
-
-  tags: {
-    type: [String],
-    default: []
-  },

+  // Metadata
  // local file collection
  fileId: {
    type: Schema.Types.ObjectId,
@@ -82,22 +67,39 @@ const DatasetCollectionSchema = new Schema({
  },
  // web link collection
  rawLink: String,
-  // api collection
+  // Api collection
  apiFileId: String,
-  // external collection
+  // external collection(Abandoned)
  externalFileId: String,
  externalFileUrl: String, // external import url

-  // next sync time
-  nextSyncTime: Date,
-
-  // metadata
  rawTextLength: Number,
  hashRawText: String,
  metadata: {
    type: Object,
    default: {}
-  }
+  },
+
+  forbid: Boolean,
+  // next sync time
+  nextSyncTime: Date,
+
+  // Parse settings
+  customPdfParse: Boolean,
+
+  // Chunk settings
+  imageIndex: Boolean,
+  autoIndexes: Boolean,
+  trainingType: {
+    type: String,
+    enum: Object.values(DatasetCollectionDataProcessModeEnum)
+  },
+  chunkSize: {
+    type: Number,
+    required: true
+  },
+  chunkSplitter: String,
+  qaPrompt: String
 });

 DatasetCollectionSchema.virtual('dataset', {