From 29edf1ea5fc51fe4e2ceb34a2345c38d127286ed Mon Sep 17 00:00:00 2001 From: Archer <545436317@qq.com> Date: Sat, 9 Aug 2025 18:38:58 +0800 Subject: [PATCH] Perf: llm parse paragraph (#5420) * feat: llm directory optimization (#5400) * perf: llm parse * doc --------- Co-authored-by: colnii <1286949794@qq.com> --- packages/global/common/string/tools.ts | 34 ++++++++++++++----- packages/web/i18n/en/dataset.json | 6 ++-- packages/web/i18n/zh-CN/dataset.json | 6 ++-- packages/web/i18n/zh-Hant/dataset.json | 6 ++-- .../detail/Form/CollectionChunkForm.tsx | 11 ++++-- .../core/dataset/queues/datasetParse.ts | 22 ++++++++---- 6 files changed, 61 insertions(+), 24 deletions(-) diff --git a/packages/global/common/string/tools.ts b/packages/global/common/string/tools.ts index dc4c845a9..e616e8141 100644 --- a/packages/global/common/string/tools.ts +++ b/packages/global/common/string/tools.ts @@ -83,19 +83,37 @@ export const getRegQueryStr = (text: string, flags = 'i') => { /* slice json str */ export const sliceJsonStr = (str: string) => { - str = str.replace(/(\\n|\\)/g, '').replace(/ /g, ''); + str = str + .trim() + .replace(/(\\n|\\)/g, '') + .replace(/ /g, ''); - const jsonRegex = /{(?:[^{}]|{(?:[^{}]|{[^{}]*})*})*}/g; - const matches = str.match(jsonRegex); + // Find first opening bracket + let start = -1; + let openChar = ''; - if (!matches) { - return ''; + for (let i = 0; i < str.length; i++) { + if (str[i] === '{' || str[i] === '[') { + start = i; + openChar = str[i]; + break; + } } - // 找到第一个完整的 JSON 字符串 - const jsonStr = matches[0]; + if (start === -1) return str; - return jsonStr; + // Find matching closing bracket from the end + const closeChar = openChar === '{' ? '}' : ']'; + + for (let i = str.length - 1; i >= start; i--) { + const ch = str[i]; + + if (ch === closeChar) { + return str.slice(start, i + 1); + } + } + + return str; }; export const sliceStrStartEnd = (str: string, start: number, end: number) => { diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json index d8c57c4fa..74a7270fb 100644 --- a/packages/web/i18n/en/dataset.json +++ b/packages/web/i18n/en/dataset.json @@ -122,11 +122,13 @@ "insert_images_success": "The new picture is successfully added, and you need to wait for the training to be completed before it will be displayed.", "is_open_schedule": "Enable scheduled synchronization", "keep_image": "Keep the picture", - "llm_paragraph_mode": "LLM recognition paragraph(Beta)", + "llm_paragraph_mode": "LLM recognition paragraph", "llm_paragraph_mode_auto": "automatic", - "llm_paragraph_mode_auto_desc": "Enable the model to automatically recognize the title when the file content does not contain a Markdown title.", + "llm_paragraph_mode_auto_desc": "Enable model recognition when the text content does not contain a Markdown title.", "llm_paragraph_mode_forbid": "Disabled", "llm_paragraph_mode_forbid_desc": "Force the disabling of the model's automatic paragraph recognition", + "llm_paragraph_mode_force": "Force Process", + "llm_paragraph_mode_force_desc": "Force the use of the model to automatically identify paragraphs and ignore paragraphs in the original text (if any)", "loading": "Loading...", "max_chunk_size": "Maximum chunk size", "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.", diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json index af813c47f..155e2e31b 100644 --- a/packages/web/i18n/zh-CN/dataset.json +++ b/packages/web/i18n/zh-CN/dataset.json @@ -122,11 +122,13 @@ "insert_images_success": "新增图片成功,需等待训练完成才会展示", "is_open_schedule": "启用定时同步", "keep_image": "保留图片", - "llm_paragraph_mode": "模型识别段落(Beta)", + "llm_paragraph_mode": "模型识别段落", "llm_paragraph_mode_auto": "自动", - "llm_paragraph_mode_auto_desc": "当文件内容不包含 Markdown 标题时,启用模型自动识别标题。", + "llm_paragraph_mode_auto_desc": "当文本内容不含 Markdown 标题时,启用模型识别。", "llm_paragraph_mode_forbid": "禁用", "llm_paragraph_mode_forbid_desc": "强制禁用模型自动识别段落", + "llm_paragraph_mode_force": "强制处理", + "llm_paragraph_mode_force_desc": "强制使用模型自动识别段落,并忽略原文本的段落(如有)", "loading": "加载中...", "max_chunk_size": "最大分块大小", "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。", diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json index 07bde6b26..ffe6fd4e4 100644 --- a/packages/web/i18n/zh-Hant/dataset.json +++ b/packages/web/i18n/zh-Hant/dataset.json @@ -122,11 +122,13 @@ "insert_images_success": "新增圖片成功,需等待訓練完成才會展示", "is_open_schedule": "啟用定時同步", "keep_image": "保留圖片", - "llm_paragraph_mode": "模型識別段落(Beta)", + "llm_paragraph_mode": "模型識別段落", "llm_paragraph_mode_auto": "自動", - "llm_paragraph_mode_auto_desc": "當文件內容不包含 Markdown 標題時,啟用模型自動識別標題。", + "llm_paragraph_mode_auto_desc": "當文本內容不含 Markdown 標題時,啟用模型識別。", "llm_paragraph_mode_forbid": "禁用", "llm_paragraph_mode_forbid_desc": "強制禁用模型自動識別段落", + "llm_paragraph_mode_force": "強制處理", + "llm_paragraph_mode_force_desc": "強制使用模型自動識別段落,並忽略原文本的段落(如有)", "loading": "加載中...", "max_chunk_size": "最大分塊大小", "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。", diff --git a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx index effa0f43f..9cded766e 100644 --- a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx @@ -387,15 +387,20 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn diff --git a/projects/app/src/service/core/dataset/queues/datasetParse.ts b/projects/app/src/service/core/dataset/queues/datasetParse.ts index e2799253b..f1d479daa 100644 --- a/projects/app/src/service/core/dataset/queues/datasetParse.ts +++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts @@ -55,9 +55,13 @@ const requestLLMPargraph = async ({ }; } - // Check is markdown text(Include 1 group of title) if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) { - const isMarkdown = /^(#+)\s/.test(rawText); + // Check if the text contains Markdown header structure + const hasMarkdownHeaders = /^(#+)\s/m.test(rawText); + const hasMultipleHeaders = (rawText.match(/^(#+)\s/g) || []).length > 1; + + const isMarkdown = hasMarkdownHeaders && hasMultipleHeaders; + if (isMarkdown) { return { resultText: rawText, @@ -71,11 +75,15 @@ const requestLLMPargraph = async ({ resultText: string; totalInputTokens: number; totalOutputTokens: number; - }>('/core/dataset/training/llmPargraph', { - rawText, - model, - billId - }); + }>( + '/core/dataset/training/llmPargraph', + { + rawText, + model, + billId + }, + { timeout: 600000 } + ); return data; };