perf: text splitter (#4313)

* sync collection * remove lock * perf: text splitter * update comment
2025-10-14 23:22:22 +00:00 · 2025-03-25 17:44:38 +08:00
parent 826a53dcb6
commit 37b4a1919b
8 changed files with 716 additions and 70 deletions
--- a/packages/global/common/string/textSplitter.ts
+++ b/packages/global/common/string/textSplitter.ts
@@ -134,8 +134,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
    { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
    { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },

-    { reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block
-    { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // 增大块，尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
+    { reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
+    {
+      reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
+      maxLen: maxSize
+    }, // Table 尽可能保证完整性
    { reg: /(\n{2,})/g, maxLen: chunkSize },
    { reg: /([\n])/g, maxLen: chunkSize },
    // ------ There's no overlap on the top
@@ -150,7 +153,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
  const checkIsCustomStep = (step: number) => step < customRegLen;
  const checkIsMarkdownSplit = (step: number) =>
    step >= customRegLen && step <= markdownIndex + customRegLen;
-  +customReg.length;
+
  const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;

  // if use markdown title split, Separate record title
@@ -159,7 +162,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
      return [
        {
          text,
-          title: ''
+          title: '',
+          chunkMaxSize: chunkSize
        }
      ];
    }
@@ -167,7 +171,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
    const isCustomStep = checkIsCustomStep(step);
    const isMarkdownSplit = checkIsMarkdownSplit(step);

-    const { reg } = stepReges[step];
+    const { reg, maxLen } = stepReges[step];

    const replaceText = (() => {
      if (typeof reg === 'string') {
@@ -194,15 +198,19 @@ const commonSplit = (props: SplitProps): SplitResponse => {
        })()
      );
    })();
+
    const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim());

    return splitTexts
      .map((text) => {
        const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
+        // 如果一个分块没有匹配到，则使用默认块大小，否则使用最大块大小
+        const chunkMaxSize = text.match(reg) === null ? chunkSize : maxLen;

        return {
          text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
-          title: matchTitle
+          title: matchTitle,
+          chunkMaxSize
        };
      })
      .filter((item) => !!item.title || !!item.text?.trim());
@@ -252,9 +260,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
    const isCustomStep = checkIsCustomStep(step);
    const forbidConcat = isCustomStep; // forbid=true时候，lastText肯定为空

-    // oversize
+    // Over step: step is past the last entry of stepReges
    if (step >= stepReges.length) {
-      if (text.length < chunkSize * 3) {
+      if (text.length < maxSize) {
        return [text];
      }
      // use slice-chunkSize to split text
@@ -268,19 +276,18 @@ const commonSplit = (props: SplitProps): SplitResponse => {
    // split text by special char
    const splitTexts = getSplitTexts({ text, step });

-    const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
-    const minChunkLen = chunkSize * 0.7;
-
    const chunks: string[] = [];
    for (let i = 0; i < splitTexts.length; i++) {
      const item = splitTexts[i];

+      const maxLen = item.chunkMaxSize; // 当前块最大长度
+
      const lastTextLen = lastText.length;
      const currentText = item.text;
      const newText = lastText + currentText;
      const newTextLen = newText.length;

-      // Markdown 模式下，会强制向下拆分最小块，并再最后一个标题时候，给小块都补充上所有标题（包含父级标题）
+      // Markdown 模式下，会强制向下拆分最小块，并在最后一个标题深度，给小块都补充上所有标题（包含父级标题）
      if (isMarkdownStep) {
        // split new Text, split chunks must will greater 1 (small lastText)
        const innerChunks = splitTextRecursively({
@@ -290,11 +297,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
          parentTitle: parentTitle + item.title
        });

+        // 只有标题，没有内容。
        if (innerChunks.length === 0) {
          chunks.push(`${parentTitle}${item.title}`);
          continue;
        }

+        // 在合并最深级标题时，需要补充标题
        chunks.push(
          ...innerChunks.map(
            (chunk) =>
@@ -307,7 +316,16 @@ const commonSplit = (props: SplitProps): SplitResponse => {

      // newText is too large(now, The lastText must be smaller than chunkSize)
      if (newTextLen > maxLen) {
-        // lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
+        const minChunkLen = maxLen * 0.8; // 当前块最小长度
+        const maxChunkLen = maxLen * 1.2; // 当前块最大长度
+
+        // 新文本没有非常大，直接认为它是一个新的块
+        if (newTextLen < maxChunkLen) {
+          chunks.push(newText);
+          lastText = getOneTextOverlapText({ text: newText, step }); // next chunk will start with the overlap text
+          continue;
+        }
+        // 上一个文本块已经挺大的，单独做一个块
        if (lastTextLen > minChunkLen) {
          chunks.push(lastText);

@@ -317,13 +335,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
          continue;
        }

-        // 说明是新的文本块比较大，需要进一步拆分
+        // 说明是当前文本比较大，需要进一步拆分

-        // split new Text, split chunks must will greater 1 (small lastText)
+        // 把新的文本块进行一个拆分，并追加到 lastText 中
        const innerChunks = splitTextRecursively({
-          text: newText,
+          text: currentText,
          step: step + 1,
-          lastText: '',
+          lastText,
          parentTitle: parentTitle + item.title
        });
        const lastChunk = innerChunks[innerChunks.length - 1];
@@ -351,11 +369,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {

      // Not overlap
      if (forbidConcat) {
-        chunks.push(item.text);
+        chunks.push(currentText);
        continue;
      }

-      lastText += item.text;
+      lastText = newText;
    }

    /* If the last chunk is independent, it needs to be push chunks. */
--- a/packages/service/core/ai/audio/speech.ts
+++ b/packages/service/core/ai/audio/speech.ts
@@ -30,11 +30,11 @@ export async function text2Speech({
      response_format: 'mp3',
      speed
    },
-    modelData.requestUrl && modelData.requestAuth
+    modelData.requestUrl
      ? {
          path: modelData.requestUrl,
          headers: {
-            Authorization: `Bearer ${modelData.requestAuth}`
+            ...(modelData.requestAuth ? { Authorization: `Bearer ${modelData.requestAuth}` } : {})
          }
        }
      : {}
--- a/packages/service/core/ai/utils.ts
+++ b/packages/service/core/ai/utils.ts
@@ -65,6 +65,7 @@ export const llmCompletionsBodyFormat = <T extends CompletionsBodyType>(

  const requestBody: T = {
    ...body,
+    model: modelData.model,
    temperature:
      typeof body.temperature === 'number'
        ? computedTemperature({