perf: text splitter (#4313)

* sync collection

* remove lock

* perf: text splitter

* update comment
Archer
2025-03-25 17:44:38 +08:00
committed by archer
parent 826a53dcb6
commit 37b4a1919b
8 changed files with 716 additions and 70 deletions

View File

@@ -30,10 +30,15 @@ weight: 799
6. Workflow node array-of-string types now automatically adapt to plain string input.
7. Workflow node array types now automatically JSON-parse string input.
8. AI proxy logging optimized: logs from failed retries are dropped and only the final error log is kept.
9. Minor chunking-algorithm adjustments (see the sketch after this list):
* Stronger continuity across split delimiters.
* When splitting code blocks, the LLM model context is used as the chunk size to keep code blocks intact wherever possible.
* When splitting tables, the LLM model context is used as the chunk size to keep tables intact wherever possible.
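
A minimal sketch of the adjusted call, based on the splitter diff in this commit; the sample document and the 16000 ceiling are illustrative assumptions, not values from the source:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// A document whose fenced code block and table should survive splitting intact.
const markdownDoc = '# Title\n\n```ts\nconst a = 1;\n```\n\n| a | b |\n| - | - |\n| 1 | 2 |\n';

const { chunks } = splitText2Chunks({
  text: markdownDoc,
  chunkSize: 512, // default target size for ordinary paragraphs
  maxSize: 16000 // hypothetical LLM-context-derived ceiling for code blocks and tables
});
```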
## 🐛 Fixes
1. Feishu and Yuque knowledge bases could not be synced.
2. During channel testing, if a custom model request URL was configured, requests went to the custom URL instead of the channel request URL.
3. Speech-recognition model tests failed when the model under test was not enabled.
4. When an administrator configured system plugins, authentication failed if the plugin contained other system apps.
5. Removing a custom TTS request URL incorrectly required the requestAuth field to be filled in.

View File

@@ -134,8 +134,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block
{ reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // Enlarge the chunk to keep it a complete paragraph where possible. (?![\*\-|>`0-9]): markdown special chars
{ reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
{
reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
maxLen: maxSize
}, // Table: keep the table intact where possible
{ reg: /(\n{2,})/g, maxLen: chunkSize },
{ reg: /([\n])/g, maxLen: chunkSize },
// ------ There's no overlap on the top
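
A hedged demo of the table rule added above: the regex is copied from the diff and captures a whole GFM table (header row, delimiter row, body rows) as one match, so the splitter can keep it in a single chunk up to maxSize.

```ts
// Regex copied verbatim from the stepReges entry above.
const tableReg =
  /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g;

const sample = '\n| name | age |\n| --- | --- |\n| Bob | 3 |\n';
console.log(tableReg.test(sample)); // true — the entire table is one match
```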
@@ -150,7 +153,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) =>
step >= customRegLen && step <= markdownIndex + customReg.length;
step >= customRegLen && step <= markdownIndex + customRegLen;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
// If splitting by markdown titles, record the title separately
@@ -159,7 +162,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
return [
{
text,
title: ''
title: '',
chunkMaxSize: chunkSize
}
];
}
@@ -167,7 +171,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const isCustomStep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step);
const { reg } = stepReges[step];
const { reg, maxLen } = stepReges[step];
const replaceText = (() => {
if (typeof reg === 'string') {
@@ -194,15 +198,19 @@ const commonSplit = (props: SplitProps): SplitResponse => {
})()
);
})();
const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim());
return splitTexts
.map((text) => {
const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
// If a piece did not match the regex, use the default chunk size; otherwise use this step's max size
const chunkMaxSize = text.match(reg) === null ? chunkSize : maxLen;
return {
text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
title: matchTitle
title: matchTitle,
chunkMaxSize
};
})
.filter((item) => !!item.title || !!item.text?.trim());
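
A small sketch of the per-piece size decision above, using an assumed code-block step: pieces that still contain the step's pattern keep the larger maxLen budget, while plain pieces fall back to the default chunkSize.

```ts
const codeBlockReg = /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g;
const chunkSize = 512;
const maxLen = 16000; // hypothetical maxSize for this step

const pickChunkMaxSize = (piece: string): number =>
  piece.match(codeBlockReg) === null ? chunkSize : maxLen;

console.log(pickChunkMaxSize('plain paragraph')); // 512
console.log(pickChunkMaxSize('\n```ts\nconst a = 1;\n```')); // 16000
```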
@@ -252,9 +260,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const isCustomStep = checkIsCustomStep(step);
const forbidConcat = isCustomStep; // when forbid is true, lastText is guaranteed to be empty
// oversize
// Over step
if (step >= stepReges.length) {
if (text.length < chunkSize * 3) {
if (text.length < maxSize) {
return [text];
}
// use slice-chunkSize to split text
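
A minimal sketch (not the exact source) of the fallback above: once every regex step is exhausted, text under maxSize is kept whole, and anything larger is sliced into fixed chunkSize windows.

```ts
const sliceSplit = (text: string, chunkSize: number, maxSize: number): string[] => {
  // Small enough for the model context: keep it as a single chunk.
  if (text.length < maxSize) return [text];

  // Otherwise fall back to plain fixed-width slicing.
  const chunks: string[] = [];
  for (let i = 0; i < text.length; i += chunkSize) {
    chunks.push(text.slice(i, i + chunkSize));
  }
  return chunks;
};
```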
@@ -268,19 +276,18 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// split text by special char
const splitTexts = getSplitTexts({ text, step });
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
const minChunkLen = chunkSize * 0.7;
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i];
const maxLen = item.chunkMaxSize; // max length of the current chunk
const lastTextLen = lastText.length;
const currentText = item.text;
const newText = lastText + currentText;
const newTextLen = newText.length;
// In Markdown mode, chunks are force-split down to the smallest size, and at the last title, all titles (including parent titles) are prepended to each small chunk
// In Markdown mode, chunks are force-split down to the smallest size, and at the deepest title level, all titles (including parent titles) are prepended to each small chunk
if (isMarkdownStep) {
// Split the new text; the resulting chunk count is always at least 1 (lastText is small)
const innerChunks = splitTextRecursively({
@@ -290,11 +297,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
parentTitle: parentTitle + item.title
});
// Title only, no content.
if (innerChunks.length === 0) {
chunks.push(`${parentTitle}${item.title}`);
continue;
}
// When merging at the deepest title level, the titles need to be prepended
chunks.push(
...innerChunks.map(
(chunk) =>
@@ -307,7 +316,16 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// newText is too large (at this point, lastText must be smaller than chunkSize)
if (newTextLen > maxLen) {
// If lastText is longer than minChunkLen, push it straight to chunks instead of carrying it into the next chunk. (large lastText)
const minChunkLen = maxLen * 0.8; // min length of the current chunk
const maxChunkLen = maxLen * 1.2; // max length of the current chunk
// The new text is not overly large; treat it directly as a complete chunk
if (newTextLen < maxChunkLen) {
chunks.push(newText);
lastText = getOneTextOverlapText({ text: newText, step }); // the next chunk will start with the overlap text
continue;
}
// The previous text block is already fairly large; make it a standalone chunk
if (lastTextLen > minChunkLen) {
chunks.push(lastText);
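
A sketch of the overflow decision above, using the diff's 0.8/1.2 factors; the action labels are illustrative, not names from the source.

```ts
const handleOverflow = (lastText: string, currentText: string, maxLen: number) => {
  const newText = lastText + currentText;
  const minChunkLen = maxLen * 0.8; // below this, a leftover is carried forward
  const maxChunkLen = maxLen * 1.2; // above this, the merge is too big to accept

  if (newText.length <= maxLen) return 'merge'; // fits: keep accumulating
  if (newText.length < maxChunkLen) return 'accept-oversized'; // slightly over: emit as one chunk
  if (lastText.length > minChunkLen) return 'flush-lastText'; // big leftover: emit it alone
  return 'split-current'; // genuinely large: recurse into the next step
};
```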
@@ -317,13 +335,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
continue;
}
// The new text is fairly large and needs further splitting
// The current text is fairly large and needs further splitting
// Split the new text; the resulting chunk count is always at least 1 (lastText is small)
// Split the new text block and append it to lastText
const innerChunks = splitTextRecursively({
text: newText,
text: currentText,
step: step + 1,
lastText: '',
lastText,
parentTitle: parentTitle + item.title
});
const lastChunk = innerChunks[innerChunks.length - 1];
@@ -351,11 +369,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// No overlap
if (forbidConcat) {
chunks.push(item.text);
chunks.push(currentText);
continue;
}
lastText += item.text;
lastText = newText;
}
/* If the last chunk is independent, it needs to be pushed to chunks. */
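
The real overlap comes from getOneTextOverlapText; a hypothetical sketch of the hand-off it performs — after a chunk is emitted, the next chunk is seeded with a tail slice of the previous one so context carries over:

```ts
// Hypothetical helper; the actual implementation lives in the splitter.
const overlapTail = (text: string, overlapLen: number): string =>
  overlapLen > 0 ? text.slice(-overlapLen) : '';

const prevChunk = 'Sentence one. Sentence two. Sentence three.';
const nextSeed = overlapTail(prevChunk, 15); // 'Sentence three.' seeds the next chunk
```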

View File

@@ -30,11 +30,11 @@ export async function text2Speech({
response_format: 'mp3',
speed
},
modelData.requestUrl && modelData.requestAuth
modelData.requestUrl
? {
path: modelData.requestUrl,
headers: {
Authorization: `Bearer ${modelData.requestAuth}`
...(modelData.requestAuth ? { Authorization: `Bearer ${modelData.requestAuth}` } : {})
}
}
: {}
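
A sketch of the fixed option builder above: the custom request path is honored even without requestAuth, and the Authorization header is attached only when a token is actually configured (the function name is illustrative):

```ts
const buildTTSRequestOptions = (requestUrl?: string, requestAuth?: string) =>
  requestUrl
    ? {
        path: requestUrl,
        headers: {
          // Only attach auth when a token was configured.
          ...(requestAuth ? { Authorization: `Bearer ${requestAuth}` } : {})
        }
      }
    : {};

console.log(buildTTSRequestOptions('https://tts.example.com/v1')); // path set, no auth header
```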

View File

@@ -65,6 +65,7 @@ export const llmCompletionsBodyFormat = <T extends CompletionsBodyType>(
const requestBody: T = {
...body,
model: modelData.model,
temperature:
typeof body.temperature === 'number'
? computedTemperature({
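
The hunk above is cut off by the diff view; a hedged reading is that temperature is forwarded only when the caller supplied a number, with computedTemperature (signature assumed, not shown here) rescaling it for the target model:

```ts
// Illustrative stand-in for computedTemperature; the real signature is not shown in this hunk.
const normalizeTemperature = (raw: unknown, rescale: (t: number) => number): number | undefined =>
  typeof raw === 'number' ? rescale(raw) : undefined;

console.log(normalizeTemperature(7, (t) => (t / 10) * 2)); // 1.4 — e.g. a 0-10 UI scale mapped to 0-2
console.log(normalizeTemperature(undefined, (t) => t)); // undefined — field omitted from the body
```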

View File

@@ -196,7 +196,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
isLoading={isLoadingOrgs}
>
<Box mb={3}>
<Path paths={paths} rootName={userInfo?.team?.teamName} onClick={setPath} />
<Path paths={paths} rootName={userInfo?.team?.teamName} />
</Box>
<Flex flex={'1 0 0'} h={0} w={'100%'} gap={'4'}>
<MemberScrollData flex="1">
@@ -420,7 +420,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
<ActionButton
icon="common/administrator"
text={t('account_team:manage_member')}
onClick={() => setManageMemberOrg(currentOrg ?? rootOrg)}
onClick={() => setManageMemberOrg(currentOrg)}
/>
{currentOrg && currentOrg?.path !== '' && (
<>

View File

@@ -94,7 +94,7 @@ async function handler(
per: WritePermissionVal
});
if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
if (fileAuthRes && String(fileAuthRes.tmbId) !== String(tmbId) && !fileAuthRes.isRoot) {
return Promise.reject(CommonErrEnum.unAuthFile);
}
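
A sketch of the fixed predicate above: write access is denied only when the caller is neither the file owner nor root; the old `||` form also rejected owners who were not root.

```ts
type FileAuth = { tmbId: string; isRoot: boolean };

const isDenied = (auth: FileAuth | null, tmbId: string): boolean =>
  !!auth && String(auth.tmbId) !== String(tmbId) && !auth.isRoot;

console.log(isDenied({ tmbId: 'a', isRoot: false }, 'a')); // false — owner passes
console.log(isDenied({ tmbId: 'b', isRoot: true }, 'a')); // false — root passes
console.log(isDenied({ tmbId: 'b', isRoot: false }, 'a')); // true  — denied
```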

View File

@@ -15,18 +15,19 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
const formatIndexes = async ({
indexes,
q,
a = '',
indexSize
indexSize,
maxIndexSize
}: {
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
q: string;
a?: string;
indexSize: number;
maxIndexSize: number;
}): Promise<
{
type: `${DatasetDataIndexTypeEnum}`;
@@ -46,9 +47,12 @@ const formatIndexes = async ({
}) => {
const qChunks = splitText2Chunks({
text: q,
chunkSize: indexSize
chunkSize: indexSize,
maxSize: maxIndexSize
}).chunks;
const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
const aChunks = a
? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
: [];
return [
...qChunks.map((text) => ({
@@ -100,7 +104,11 @@ const formatIndexes = async ({
// If oversize tokens, split it
const tokens = await countPromptTokens(item.text);
if (tokens > indexSize) {
const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
const splitText = splitText2Chunks({
text: item.text,
chunkSize: 512,
maxSize: maxIndexSize
}).chunks;
return splitText.map((text) => ({
text,
type: item.type
@@ -151,7 +159,8 @@ export async function insertData2Dataset({
indexes,
q,
a,
indexSize
indexSize,
maxIndexSize: embModel.maxToken
});
// insert to vector store
@@ -236,7 +245,13 @@ export async function updateData2Dataset({
if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// 2. Compute indexes
const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
const formatIndexesResult = await formatIndexes({
indexes,
q,
a,
indexSize,
maxIndexSize: getEmbeddingModel(model).maxToken
});
// 3. Patch indexes, create, update, delete
const patchResult: PatchIndexesProps[] = [];
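
A hedged sketch of the new maxIndexSize plumbing: the embedding model's token ceiling now caps index re-splitting, mirroring the oversize branch in the diff above (the helper name is illustrative):

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

const resplitOversizedIndex = (text: string, maxIndexSize: number): string[] =>
  splitText2Chunks({
    text,
    chunkSize: 512, // fixed re-split size, as in the diff
    maxSize: maxIndexSize // e.g. embModel.maxToken
  }).chunks;
```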

File diff suppressed because one or more lines are too long