Perf: llm parse paragraph (#5420)

* feat: llm directory optimization (#5400)

* perf: llm parse

* doc

---------

Co-authored-by: colnii <1286949794@qq.com>
This commit is contained in:
Archer
2025-08-09 18:38:58 +08:00
committed by GitHub
parent 1fc1e3fa80
commit 29edf1ea5f
6 changed files with 61 additions and 24 deletions

View File

@@ -83,19 +83,37 @@ export const getRegQueryStr = (text: string, flags = 'i') => {
/* slice json str */ /* slice json str */
/**
 * sliceJsonStr: cuts the JSON-looking portion out of a string.
 *
 * NOTE(review): every line of this hunk is rendered side by side, with the
 * removed text first and the added text second, so the lines are not
 * compilable as shown. The description below follows the added version.
 *
 * Added version:
 *  1. Trims `str`, deletes every backslash-n pair and every remaining
 *     backslash, then deletes every space character (spaces inside string
 *     values are removed too).
 *  2. Scans forward for the first `{` or `[`. When neither exists, the cleaned
 *     string is returned unchanged.
 *  3. Chooses the closer that pairs with that opener (`}` or `]`) and scans
 *     backward from the end of the string; the slice from the opener through
 *     the last closer found is returned.
 *  4. When no closer is found, the cleaned string is returned.
 * The backward scan is neither nesting-aware nor quote-aware: it stops at the
 * last occurrence of the closer, wherever that occurrence sits.
 *
 * Removed version: matched a brace-only regex (objects, up to three levels of
 * nested braces), returned the first match, and returned '' when nothing
 * matched. The no-match result therefore changes from '' to the cleaned string.
 *
 * @param str text to search
 * @returns the extracted slice, or the cleaned input when no opener/closer pair is found
 */
export const sliceJsonStr = (str: string) => { export const sliceJsonStr = (str: string) => {
str = str.replace(/(\\n|\\)/g, '').replace(/ /g, ''); str = str
.trim()
.replace(/(\\n|\\)/g, '')
.replace(/ /g, '');
const jsonRegex = /{(?:[^{}]|{(?:[^{}]|{[^{}]*})*})*}/g; // Find first opening bracket
const matches = str.match(jsonRegex); let start = -1;
let openChar = '';
if (!matches) { for (let i = 0; i < str.length; i++) {
return ''; if (str[i] === '{' || str[i] === '[') {
start = i;
openChar = str[i];
break;
}
} }
// Find the first complete JSON string if (start === -1) return str;
const jsonStr = matches[0];
return jsonStr; // Find matching closing bracket from the end
const closeChar = openChar === '{' ? '}' : ']';
for (let i = str.length - 1; i >= start; i--) {
const ch = str[i];
if (ch === closeChar) {
return str.slice(start, i + 1);
}
}
return str;
}; };
export const sliceStrStartEnd = (str: string, start: number, end: number) => { export const sliceStrStartEnd = (str: string, start: number, end: number) => {

View File

@@ -122,11 +122,13 @@
"insert_images_success": "The new picture is successfully added, and you need to wait for the training to be completed before it will be displayed.", "insert_images_success": "The new picture is successfully added, and you need to wait for the training to be completed before it will be displayed.",
"is_open_schedule": "Enable scheduled synchronization", "is_open_schedule": "Enable scheduled synchronization",
"keep_image": "Keep the picture", "keep_image": "Keep the picture",
"llm_paragraph_mode": "LLM recognition paragraph(Beta)", "llm_paragraph_mode": "LLM paragraph recognition",
"llm_paragraph_mode_auto": "automatic", "llm_paragraph_mode_auto": "automatic",
"llm_paragraph_mode_auto_desc": "Enable the model to automatically recognize the title when the file content does not contain a Markdown title.", "llm_paragraph_mode_auto_desc": "Enable model recognition when the text content does not contain a Markdown title.",
"llm_paragraph_mode_forbid": "Disabled", "llm_paragraph_mode_forbid": "Disabled",
"llm_paragraph_mode_forbid_desc": "Force the disabling of the model's automatic paragraph recognition", "llm_paragraph_mode_forbid_desc": "Force the disabling of the model's automatic paragraph recognition",
"llm_paragraph_mode_force": "Force Process",
"llm_paragraph_mode_force_desc": "Force the use of the model to automatically identify paragraphs and ignore paragraphs in the original text (if any)",
"loading": "Loading...", "loading": "Loading...",
"max_chunk_size": "Maximum chunk size", "max_chunk_size": "Maximum chunk size",
"move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.", "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",

View File

@@ -122,11 +122,13 @@
"insert_images_success": "新增图片成功,需等待训练完成才会展示", "insert_images_success": "新增图片成功,需等待训练完成才会展示",
"is_open_schedule": "启用定时同步", "is_open_schedule": "启用定时同步",
"keep_image": "保留图片", "keep_image": "保留图片",
"llm_paragraph_mode": "模型识别段落(Beta)", "llm_paragraph_mode": "模型识别段落",
"llm_paragraph_mode_auto": "自动", "llm_paragraph_mode_auto": "自动",
"llm_paragraph_mode_auto_desc": "当文内容不含 Markdown 标题时,启用模型自动识别标题。", "llm_paragraph_mode_auto_desc": "当文本内容不含 Markdown 标题时,启用模型识别。",
"llm_paragraph_mode_forbid": "禁用", "llm_paragraph_mode_forbid": "禁用",
"llm_paragraph_mode_forbid_desc": "强制禁用模型自动识别段落", "llm_paragraph_mode_forbid_desc": "强制禁用模型自动识别段落",
"llm_paragraph_mode_force": "强制处理",
"llm_paragraph_mode_force_desc": "强制使用模型自动识别段落,并忽略原文本的段落(如有)",
"loading": "加载中...", "loading": "加载中...",
"max_chunk_size": "最大分块大小", "max_chunk_size": "最大分块大小",
"move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。", "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",

View File

@@ -122,11 +122,13 @@
"insert_images_success": "新增圖片成功,需等待訓練完成才會展示", "insert_images_success": "新增圖片成功,需等待訓練完成才會展示",
"is_open_schedule": "啟用定時同步", "is_open_schedule": "啟用定時同步",
"keep_image": "保留圖片", "keep_image": "保留圖片",
"llm_paragraph_mode": "模型識別段落(Beta)", "llm_paragraph_mode": "模型識別段落",
"llm_paragraph_mode_auto": "自動", "llm_paragraph_mode_auto": "自動",
"llm_paragraph_mode_auto_desc": "當文內容不含 Markdown 標題時,啟用模型自動識別標題。", "llm_paragraph_mode_auto_desc": "當文本內容不含 Markdown 標題時,啟用模型識別。",
"llm_paragraph_mode_forbid": "禁用", "llm_paragraph_mode_forbid": "禁用",
"llm_paragraph_mode_forbid_desc": "強制禁用模型自動識別段落", "llm_paragraph_mode_forbid_desc": "強制禁用模型自動識別段落",
"llm_paragraph_mode_force": "強制處理",
"llm_paragraph_mode_force_desc": "強制使用模型自動識別段落,並忽略原文本的段落(如有)",
"loading": "加載中...", "loading": "加載中...",
"max_chunk_size": "最大分塊大小", "max_chunk_size": "最大分塊大小",
"move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。", "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",

View File

@@ -387,15 +387,20 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
setValue('paragraphChunkAIMode', e); setValue('paragraphChunkAIMode', e);
}} }}
list={[ list={[
{
label: t('dataset:llm_paragraph_mode_auto'),
value: ParagraphChunkAIModeEnum.auto,
description: t('dataset:llm_paragraph_mode_auto_desc')
},
{ {
label: t('dataset:llm_paragraph_mode_forbid'), label: t('dataset:llm_paragraph_mode_forbid'),
value: ParagraphChunkAIModeEnum.forbid, value: ParagraphChunkAIModeEnum.forbid,
description: t('dataset:llm_paragraph_mode_forbid_desc') description: t('dataset:llm_paragraph_mode_forbid_desc')
}, },
{ {
label: t('dataset:llm_paragraph_mode_auto'), label: t('dataset:llm_paragraph_mode_force'),
value: ParagraphChunkAIModeEnum.auto, value: ParagraphChunkAIModeEnum.force,
description: t('dataset:llm_paragraph_mode_auto_desc') description: t('dataset:llm_paragraph_mode_force_desc')
} }
]} ]}
/> />

View File

@@ -55,9 +55,13 @@ const requestLLMPargraph = async ({
}; };
} }
// Check is markdown text(Include 1 group of title)
if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) { if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
const isMarkdown = /^(#+)\s/.test(rawText); // Check if the text contains Markdown header structure
const hasMarkdownHeaders = /^(#+)\s/m.test(rawText);
const hasMultipleHeaders = (rawText.match(/^(#+)\s/g) || []).length > 1;
const isMarkdown = hasMarkdownHeaders && hasMultipleHeaders;
if (isMarkdown) { if (isMarkdown) {
return { return {
resultText: rawText, resultText: rawText,
@@ -71,11 +75,15 @@ const requestLLMPargraph = async ({
resultText: string; resultText: string;
totalInputTokens: number; totalInputTokens: number;
totalOutputTokens: number; totalOutputTokens: number;
}>('/core/dataset/training/llmPargraph', { }>(
'/core/dataset/training/llmPargraph',
{
rawText, rawText,
model, model,
billId billId
}); },
{ timeout: 600000 }
);
return data; return data;
}; };