From 29edf1ea5fc51fe4e2ceb34a2345c38d127286ed Mon Sep 17 00:00:00 2001
From: Archer <545436317@qq.com>
Date: Sat, 9 Aug 2025 18:38:58 +0800
Subject: [PATCH] Perf: llm parse paragraph (#5420)

* feat: llm directory optimization (#5400)

* perf: llm parse

* doc

---------

Co-authored-by: colnii <1286949794@qq.com>
---
 packages/global/common/string/tools.ts        | 34 ++++++++++++++-----
 packages/web/i18n/en/dataset.json             |  6 ++--
 packages/web/i18n/zh-CN/dataset.json          |  6 ++--
 packages/web/i18n/zh-Hant/dataset.json        |  6 ++--
 .../detail/Form/CollectionChunkForm.tsx       | 11 ++++--
 .../core/dataset/queues/datasetParse.ts       | 22 ++++++++----
 6 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/packages/global/common/string/tools.ts b/packages/global/common/string/tools.ts
index dc4c845a9..e616e8141 100644
--- a/packages/global/common/string/tools.ts
+++ b/packages/global/common/string/tools.ts
@@ -83,19 +83,37 @@ export const getRegQueryStr = (text: string, flags = 'i') => {
 
 /* slice json str */
 export const sliceJsonStr = (str: string) => {
-  str = str.replace(/(\\n|\\)/g, '').replace(/  /g, '');
+  str = str
+    .trim()
+    .replace(/(\\n|\\)/g, '')
+    .replace(/  /g, '');
 
-  const jsonRegex = /{(?:[^{}]|{(?:[^{}]|{[^{}]*})*})*}/g;
-  const matches = str.match(jsonRegex);
+  // Find the first opening bracket ('{' or '['); it decides which closer to look for
+  let start = -1;
+  let openChar = '';
 
-  if (!matches) {
-    return '';
+  for (let i = 0; i < str.length; i++) {
+    if (str[i] === '{' || str[i] === '[') {
+      start = i;
+      openChar = str[i];
+      break;
+    }
   }
 
-  // 找到第一个完整的 JSON 字符串
-  const jsonStr = matches[0];
+  if (start === -1) return str;
 
-  return jsonStr;
+  // Cut at the LAST closer of that kind, scanning backwards (nesting is not validated)
+  const closeChar = openChar === '{' ? '}' : ']';
+
+  for (let i = str.length - 1; i >= start; i--) {
+    const ch = str[i];
+
+    if (ch === closeChar) {
+      return str.slice(start, i + 1);
+    }
+  }
+  // No closer found after the opener: fall back to the cleaned string as-is
+  return str;
 };
 
 export const sliceStrStartEnd = (str: string, start: number, end: number) => {
diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json
index d8c57c4fa..74a7270fb 100644
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -122,11 +122,13 @@
   "insert_images_success": "The new picture is successfully added, and you need to wait for the training to be completed before it will be displayed.",
   "is_open_schedule": "Enable scheduled synchronization",
   "keep_image": "Keep the picture",
-  "llm_paragraph_mode": "LLM recognition paragraph(Beta)",
+  "llm_paragraph_mode": "LLM paragraph recognition",
   "llm_paragraph_mode_auto": "automatic",
-  "llm_paragraph_mode_auto_desc": "Enable the model to automatically recognize the title when the file content does not contain a Markdown title.",
+  "llm_paragraph_mode_auto_desc": "Enable model recognition when the text content does not contain a Markdown title.",
   "llm_paragraph_mode_forbid": "Disabled",
   "llm_paragraph_mode_forbid_desc": "Force the disabling of the model's automatic paragraph recognition",
+  "llm_paragraph_mode_force": "Force Process",
+  "llm_paragraph_mode_force_desc": "Force the use of the model to automatically identify paragraphs and ignore paragraphs in the original text (if any)",
   "loading": "Loading...",
   "max_chunk_size": "Maximum chunk size",
   "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json
index af813c47f..155e2e31b 100644
--- a/packages/web/i18n/zh-CN/dataset.json
+++ b/packages/web/i18n/zh-CN/dataset.json
@@ -122,11 +122,13 @@
   "insert_images_success": "新增图片成功，需等待训练完成才会展示",
   "is_open_schedule": "启用定时同步",
   "keep_image": "保留图片",
-  "llm_paragraph_mode": "模型识别段落(Beta)",
+  "llm_paragraph_mode": "模型识别段落",
   "llm_paragraph_mode_auto": "自动",
-  "llm_paragraph_mode_auto_desc": "当文件内容不包含 Markdown 标题时，启用模型自动识别标题。",
+  "llm_paragraph_mode_auto_desc": "当文本内容不含 Markdown 标题时，启用模型识别。",
   "llm_paragraph_mode_forbid": "禁用",
   "llm_paragraph_mode_forbid_desc": "强制禁用模型自动识别段落",
+  "llm_paragraph_mode_force": "强制处理",
+  "llm_paragraph_mode_force_desc": "强制使用模型自动识别段落，并忽略原文本的段落（如有）",
   "loading": "加载中...",
   "max_chunk_size": "最大分块大小",
   "move.hint": "移动后，所选知识库/文件夹将继承新文件夹的权限设置，原先的权限设置失效。",
diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json
index 07bde6b26..ffe6fd4e4 100644
--- a/packages/web/i18n/zh-Hant/dataset.json
+++ b/packages/web/i18n/zh-Hant/dataset.json
@@ -122,11 +122,13 @@
   "insert_images_success": "新增圖片成功，需等待訓練完成才會展示",
   "is_open_schedule": "啟用定時同步",
   "keep_image": "保留圖片",
-  "llm_paragraph_mode": "模型識別段落(Beta)",
+  "llm_paragraph_mode": "模型識別段落",
   "llm_paragraph_mode_auto": "自動",
-  "llm_paragraph_mode_auto_desc": "當文件內容不包含 Markdown 標題時，啟用模型自動識別標題。",
+  "llm_paragraph_mode_auto_desc": "當文本內容不含 Markdown 標題時，啟用模型識別。",
   "llm_paragraph_mode_forbid": "禁用",
   "llm_paragraph_mode_forbid_desc": "強制禁用模型自動識別段落",
+  "llm_paragraph_mode_force": "強制處理",
+  "llm_paragraph_mode_force_desc": "強制使用模型自動識別段落，並忽略原文本的段落（如有）",
   "loading": "加載中...",
   "max_chunk_size": "最大分塊大小",
   "move.hint": "移動後，所選資料集／資料夾將繼承新資料夾的權限設定，原先的權限設定將失效。",
diff --git a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx
index effa0f43f..9cded766e 100644
--- a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx
@@ -387,15 +387,20 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
                               setValue('paragraphChunkAIMode', e);
                             }}
                             list={[
+                              {
+                                label: t('dataset:llm_paragraph_mode_auto'),
+                                value: ParagraphChunkAIModeEnum.auto,
+                                description: t('dataset:llm_paragraph_mode_auto_desc')
+                              },
                               {
                                 label: t('dataset:llm_paragraph_mode_forbid'),
                                 value: ParagraphChunkAIModeEnum.forbid,
                                 description: t('dataset:llm_paragraph_mode_forbid_desc')
                               },
                               {
-                                label: t('dataset:llm_paragraph_mode_auto'),
-                                value: ParagraphChunkAIModeEnum.auto,
-                                description: t('dataset:llm_paragraph_mode_auto_desc')
+                                label: t('dataset:llm_paragraph_mode_force'),
+                                value: ParagraphChunkAIModeEnum.force,
+                                description: t('dataset:llm_paragraph_mode_force_desc')
                               }
                             ]}
                           />
diff --git a/projects/app/src/service/core/dataset/queues/datasetParse.ts b/projects/app/src/service/core/dataset/queues/datasetParse.ts
index e2799253b..f1d479daa 100644
--- a/projects/app/src/service/core/dataset/queues/datasetParse.ts
+++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts
@@ -55,9 +55,13 @@ const requestLLMPargraph = async ({
     };
   }
 
-  // Check is markdown text(Include 1 group of title)
   if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
-    const isMarkdown = /^(#+)\s/.test(rawText);
+    // Text with more than one Markdown header line is returned as-is, without an LLM request.
+    // `m` makes `^` match at every line start; with `g` alone at most one match is possible.
+    const hasMarkdownHeaders = /^(#+)\s/m.test(rawText);
+    const hasMultipleHeaders = (rawText.match(/^(#+)\s/gm) || []).length > 1;
+    const isMarkdown = hasMarkdownHeaders && hasMultipleHeaders;
+
     if (isMarkdown) {
       return {
         resultText: rawText,
@@ -71,11 +75,15 @@ const requestLLMPargraph = async ({
     resultText: string;
     totalInputTokens: number;
     totalOutputTokens: number;
-  }>('/core/dataset/training/llmPargraph', {
-    rawText,
-    model,
-    billId
-  });
+  }>(
+    '/core/dataset/training/llmPargraph',
+    {
+      rawText,
+      model,
+      billId
+    },
+    { timeout: 600000 }
+  );
 
   return data;
 };