perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split

* update max size computed

* perf: i18n

* remove table
Archer
2025-05-26 18:57:22 +08:00
committed by GitHub
parent 874300a56a
commit c25cd48e72
23 changed files with 859 additions and 164 deletions
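The change threads two new trigger settings (chunkTriggerType, chunkTriggerMinSize) and two paragraph-split settings (paragraphChunkDeep, paragraphChunkMinSize) from the collection params down into rawText2Chunks. A minimal sketch of the trigger decision, using only the enum members and defaults visible in the diffs below (the helper name shouldChunk is hypothetical, not FastGPT code):

enum ChunkTriggerConfigTypeEnum {
  minSize = 'minSize',
  maxSize = 'maxSize',
  forceChunk = 'forceChunk'
}

// Mirrors the two guards added to rawText2Chunks below: returns false
// when the whole text should be stored as a single chunk.
function shouldChunk(
  rawText: string,
  trigger: ChunkTriggerConfigTypeEnum,
  chunkTriggerMinSize = 1000, // default from the diff
  maxSize?: number // getLLMMaxChunkSize(model) at the call site
): boolean {
  const len = rawText.trim().length;
  // maxSize trigger: only chunk past ~70% of the model's max chunk size
  // (16000 chars when no model limit is available).
  if (trigger === ChunkTriggerConfigTypeEnum.maxSize) {
    if (len < (maxSize ? maxSize * 0.7 : 16000)) return false;
  }
  // Every trigger except forceChunk also respects the manual minimum.
  if (trigger !== ChunkTriggerConfigTypeEnum.forceChunk && len < chunkTriggerMinSize) {
    return false;
  }
  return true;
}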

View File

@@ -34,6 +34,7 @@ import { getTrainingModeByCollection } from './utils';
import {
computeChunkSize,
computeChunkSplitter,
computeParagraphChunkDeep,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
@@ -74,6 +75,8 @@ export const createCollectionAndInsertData = async ({
llmModel: getLLMModel(dataset.agentModel)
});
const chunkSplitter = computeChunkSplitter(createCollectionParams);
const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
delete createCollectionParams.chunkTriggerType;
delete createCollectionParams.chunkTriggerMinSize;
@@ -87,7 +90,11 @@ export const createCollectionAndInsertData = async ({
// 1. split chunks
const chunks = rawText2Chunks({
rawText,
chunkTriggerType: createCollectionParams.chunkTriggerType,
chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize,
chunkSize,
paragraphChunkDeep,
paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [],
@@ -112,6 +119,7 @@ export const createCollectionAndInsertData = async ({
const { _id: collectionId } = await createOneCollection({
...createCollectionParams,
trainingType,
paragraphChunkDeep,
chunkSize,
chunkSplitter,
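computeParagraphChunkDeep is imported from @fastgpt/global/core/dataset/training/utils, but its body is not part of this diff. A plausible reconstruction, assuming it follows computeChunkSize and computeChunkSplitter in deriving a value from the collection's chunk settings (the enum names and the default depth of 5 are assumptions, not confirmed by the diff):

// Hypothetical sketch -- the real helper may differ.
enum ChunkSettingModeEnum { auto = 'auto', custom = 'custom' }
enum DataChunkSplitModeEnum { paragraph = 'paragraph', size = 'size', char = 'char' }

const computeParagraphChunkDeep = (params: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  paragraphChunkDeep?: number;
}): number => {
  // Auto mode: use a default paragraph depth.
  if (params.chunkSettingMode === ChunkSettingModeEnum.auto) return 5;
  // Custom paragraph splitting: honor the user-supplied depth.
  if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) {
    return params.paragraphChunkDeep ?? 5;
  }
  // Size/character split modes do not recurse into paragraphs.
  return 0;
};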
@@ -212,46 +220,19 @@ export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
tmbId: string;
session?: ClientSession;
};
-export async function createOneCollection({
-  teamId,
-  tmbId,
-  name,
-  parentId,
-  datasetId,
-  type,
+export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) {
+  const {
+    teamId,
+    parentId,
+    datasetId,
+    tags,
-  createTime,
-  updateTime,
-  hashRawText,
-  rawTextLength,
-  metadata = {},
-  tags,
-  nextSyncTime,
-  fileId,
-  rawLink,
-  externalFileId,
-  externalFileUrl,
-  apiFileId,
-  // Parse settings
-  customPdfParse,
-  imageIndex,
-  autoIndexes,
-  // Chunk settings
-  trainingType,
-  chunkSettingMode,
-  chunkSplitMode,
-  chunkSize,
-  indexSize,
-  chunkSplitter,
-  qaPrompt,
-  session
-}: CreateOneCollectionParams) {
+    fileId,
+    rawLink,
+    externalFileId,
+    externalFileUrl,
+    apiFileId
+  } = props;
// Create collection tags
const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -259,41 +240,18 @@ export async function createOneCollection({
const [collection] = await MongoDatasetCollection.create(
[
{
+      ...props,
       teamId,
       tmbId,
       parentId: parentId || null,
       datasetId,
-      name,
-      type,
-      rawTextLength,
-      hashRawText,
       tags: collectionTags,
-      metadata,
-      createTime,
-      updateTime,
-      nextSyncTime,
       ...(fileId ? { fileId } : {}),
       ...(rawLink ? { rawLink } : {}),
       ...(externalFileId ? { externalFileId } : {}),
       ...(externalFileUrl ? { externalFileUrl } : {}),
-      ...(apiFileId ? { apiFileId } : {}),
-      // Parse settings
-      customPdfParse,
-      imageIndex,
-      autoIndexes,
-      // Chunk settings
-      trainingType,
-      chunkSettingMode,
-      chunkSplitMode,
-      chunkSize,
-      indexSize,
-      chunkSplitter,
-      qaPrompt
+      ...(apiFileId ? { apiFileId } : {})
}
],
{ session, ordered: true }
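The refactor above replaces a thirty-odd-field destructuring with a rest spread: only the fields that need extra handling (tags, the file-source ids, session) are pulled out of props, and everything else passes straight through to MongoDatasetCollection.create. A generic illustration of the pattern (names here are hypothetical, not FastGPT code):

type Params = { teamId: string; tags?: string[]; chunkSize?: number };

function create({ tags, ...props }: Params) {
  // New optional fields added to Params (e.g. paragraphChunkDeep) flow
  // through the spread without this function having to name them.
  return { ...props, tags: tags ?? [] };
}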

View File

@@ -1,5 +1,8 @@
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
-import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  ChunkTriggerConfigTypeEnum,
+  DatasetSourceReadTypeEnum
+} from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
@@ -179,11 +182,17 @@ export const readApiServerFileContent = async ({
export const rawText2Chunks = ({
rawText,
chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
chunkTriggerMinSize = 1000,
backupParse,
chunkSize = 512,
...splitProps
}: {
rawText: string;
chunkTriggerType?: ChunkTriggerConfigTypeEnum;
chunkTriggerMinSize?: number; // maxSize from agent model, not store
backupParse?: boolean;
tableParse?: boolean;
} & TextSplitProps): {
@@ -209,6 +218,28 @@ export const rawText2Chunks = ({
};
};
// Chunk condition
// 1. Max-size trigger: split only when the text exceeds the max size (default: 0.7 * the model's max chunk size)
if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
const textLength = rawText.trim().length;
const maxSize = splitProps.maxSize ? splitProps.maxSize * 0.7 : 16000;
if (textLength < maxSize) {
return [
{
q: rawText,
a: ''
}
];
}
}
// 2. Min-size trigger: split only when the text exceeds the manually configured minimum size
if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) {
const textLength = rawText.trim().length;
if (textLength < chunkTriggerMinSize) {
return [{ q: rawText, a: '' }];
}
}
if (backupParse) {
return parseDatasetBackup2Chunks(rawText).chunks;
}
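Taken together, the two trigger conditions mean a short document is stored as a single { q, a } pair and never reaches the splitter, while forceChunk always falls through to it. A usage sketch (imports and the remaining TextSplitProps plumbing are elided; only parameters visible in this diff are passed):

// Below the minimum size: returned whole.
rawText2Chunks({
  rawText: 'A note well under a thousand characters.',
  chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize: 1000,
  chunkSize: 512
});
// -> [{ q: 'A note well under a thousand characters.', a: '' }]

// forceChunk: skips both guards and always splits.
rawText2Chunks({
  rawText: 'A note well under a thousand characters.',
  chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
  chunkSize: 512
});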

View File

@@ -47,7 +47,6 @@ export const ChunkSettings = {
},
paragraphChunkDeep: Number,
paragraphChunkMinSize: Number,
-  paragraphChunkMaxSize: Number,
chunkSize: Number,
chunkSplitter: String,

View File

@@ -658,7 +658,7 @@ export async function searchDatasetData(
tokenLen: 0
};
} catch (error) {
-    addLog.error('multiQueryRecall error', error);
+    addLog.error('Full text search error', error);
return {
fullTextRecallResults: [],
tokenLen: 0

View File

@@ -1,5 +1,6 @@
import iconv from 'iconv-lite';
import { type ReadRawTextByBuffer, type ReadFileResponse } from '../type';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
const rawEncodingList = [
'ascii',
@@ -34,7 +35,10 @@ export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): Read
}
})();
const { text, imageList } = matchMdImg(content);
return {
-    rawText: content
+    rawText: text,
+    imageList
};
};
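matchMdImg is only consumed here; judging from the destructured return, it lifts markdown image references (e.g. inline base64 data URLs) out of the text so readFileRawText can return them as a separate imageList instead of leaving blobs inline. A usage sketch (the exact placeholder left in the text is an implementation detail not shown in this diff):

import { matchMdImg } from '@fastgpt/global/common/string/markdown';

const { text, imageList } = matchMdImg(
  'Intro ![diagram](data:image/png;base64,iVBORw0...)'
);
// text: markdown with the image reference rewritten/lifted out
// imageList: the extracted images, now surfaced in ReadFileResponse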