perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split * update max size computed * perf: i18n * remove table
2025-07-23 13:03:50 +00:00 · 2025-05-26 18:57:22 +08:00
parent 874300a56a
commit c25cd48e72
23 changed files with 859 additions and 164 deletions
--- a/packages/global/common/string/textSplitter.ts
+++ b/packages/global/common/string/textSplitter.ts
@@ -7,6 +7,10 @@ export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
 type SplitProps = {
  text: string;
  chunkSize: number;
+
+  paragraphChunkDeep?: number; // Paragraph depth: how many Markdown heading levels (#, ##, ...) get a split rule
+  paragraphChunkMinSize?: number; // Minimum paragraph size; paragraphs smaller than this are merged
+
  maxSize?: number;
  overlapRatio?: number;
  customReg?: string[];
@@ -108,6 +112,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
  let {
    text = '',
    chunkSize,
+    paragraphChunkDeep = 5,
+    paragraphChunkMinSize = 100,
    maxSize = defaultMaxChunkSize,
    overlapRatio = 0.15,
    customReg = []
@@ -123,7 +129,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
  text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
    return match.replace(/\n/g, codeBlockMarker);
  });
-  // 2. 表格处理 - 单独提取表格出来，进行表头合并
+  // 2. Markdown 表格处理 - 单独提取表格出来，进行表头合并
  const tableReg =
    /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n?)*)(?:\n|$)/g;
  const tableDataList = text.match(tableReg);
@@ -143,25 +149,40 @@ const commonSplit = (props: SplitProps): SplitResponse => {
  text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');

  // The larger maxLen is, the next sentence is less likely to trigger splitting
-  const markdownIndex = 4;
-  const forbidOverlapIndex = 8;
+  const customRegLen = customReg.length;
+  const markdownIndex = paragraphChunkDeep - 1;
+  const forbidOverlapIndex = customRegLen + markdownIndex + 4;
+
+  const markdownHeaderRules = ((deep?: number): { reg: RegExp; maxLen: number }[] => {
+    if (!deep || deep === 0) return [];
+
+    const maxDeep = Math.min(deep, 8); // Cap at 8 levels. NOTE(review): markdownIndex is not capped, so deep > 8 misaligns step indices
+    const rules: { reg: RegExp; maxLen: number }[] = [];
+
+    for (let i = 1; i <= maxDeep; i++) {
+      const hashSymbols = '#'.repeat(i);
+      rules.push({
+        reg: new RegExp(`^(${hashSymbols}\\s[^\\n]+\\n)`, 'gm'),
+        maxLen: chunkSize
+      });
+    }
+
+    return rules;
+  })(paragraphChunkDeep);

  const stepReges: { reg: RegExp | string; maxLen: number }[] = [
    ...customReg.map((text) => ({
      reg: text.replaceAll('\\n', '\n'),
      maxLen: chunkSize
    })),
-    { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize },
-    { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize },
-    { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize },
-    { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
-    { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
+    ...markdownHeaderRules,

    { reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
+    // Markdown table (pipe syntax, not an HTML <table> tag): keep it intact where possible
    {
      reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
-      maxLen: Math.min(chunkSize * 1.5, maxSize)
-    }, // Table 尽可能保证完整性
+      maxLen: chunkSize
+    }, // Markdown Table 尽可能保证完整性
    { reg: /(\n{2,})/g, maxLen: chunkSize },
    { reg: /([\n])/g, maxLen: chunkSize },
    // ------ There's no overlap on the top
@@ -172,12 +193,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
    { reg: /([，]|,\s)/g, maxLen: chunkSize }
  ];

-  const customRegLen = customReg.length;
  const checkIsCustomStep = (step: number) => step < customRegLen;
  const checkIsMarkdownSplit = (step: number) =>
    step >= customRegLen && step <= markdownIndex + customRegLen;
-
-  const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
+  const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex;

  // if use markdown title split, Separate record title
  const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
@@ -301,6 +320,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
    const splitTexts = getSplitTexts({ text, step });

    const chunks: string[] = [];
+
    for (let i = 0; i < splitTexts.length; i++) {
      const item = splitTexts[i];

@@ -443,7 +463,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
 */
 export const splitText2Chunks = (props: SplitProps): SplitResponse => {
  let { text = '' } = props;
-  const start = Date.now();
  const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);

  const splitResult = splitWithCustomSign.map((item) => {
--- a/packages/global/core/dataset/training/utils.ts
+++ b/packages/global/core/dataset/training/utils.ts
@@ -120,7 +120,6 @@ export const computeChunkSize = (params: {

  return Math.min(params.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
 };
-
 export const computeChunkSplitter = (params: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
@@ -129,8 +128,21 @@ export const computeChunkSplitter = (params: {
  if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
    return undefined;
  }
-  if (params.chunkSplitMode === DataChunkSplitModeEnum.size) {
+  if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) {
    return undefined;
  }
  return params.chunkSplitter;
 };
+export const computeParagraphChunkDeep = (params: {
+  chunkSettingMode?: ChunkSettingModeEnum;
+  chunkSplitMode?: DataChunkSplitModeEnum;
+  paragraphChunkDeep?: number;
+}) => {
+  if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
+    return 5;
+  }
+  if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) {
+    return params.paragraphChunkDeep;
+  }
+  return 0;
+};
--- a/packages/global/core/dataset/type.d.ts
+++ b/packages/global/core/dataset/type.d.ts
@@ -9,7 +9,8 @@ import type {
  DatasetTypeEnum,
  SearchScoreTypeEnum,
  TrainingModeEnum,
-  ChunkSettingModeEnum
+  ChunkSettingModeEnum,
+  ChunkTriggerConfigTypeEnum
 } from './constants';
 import type { DatasetPermission } from '../../support/permission/dataset/controller';
 import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
@@ -37,11 +38,10 @@ export type ChunkSettingsType = {
  paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
  paragraphChunkDeep?: number; // Paragraph deep
  paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
-  paragraphChunkMaxSize?: number; // Paragraph max size, if too large, it will split
  // Size split
-  chunkSize?: number;
+  chunkSize?: number; // chunk/qa chunk size, Paragraph max chunk size.
  // Char split
-  chunkSplitter?: string;
+  chunkSplitter?: string; // chunk/qa chunk splitter
  indexSize?: number;

  qaPrompt?: string;