Perf: llm parse paragraph (#5420)

* feat: llm directory optimization (#5400)

* perf: llm parse

* doc

---------

Co-authored-by: colnii <1286949794@qq.com>
This commit is contained in:
Archer
2025-08-09 18:38:58 +08:00
committed by GitHub
parent 1fc1e3fa80
commit 29edf1ea5f
6 changed files with 61 additions and 24 deletions

View File

@@ -387,15 +387,20 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
setValue('paragraphChunkAIMode', e);
}}
list={[
{
label: t('dataset:llm_paragraph_mode_auto'),
value: ParagraphChunkAIModeEnum.auto,
description: t('dataset:llm_paragraph_mode_auto_desc')
},
{
label: t('dataset:llm_paragraph_mode_forbid'),
value: ParagraphChunkAIModeEnum.forbid,
description: t('dataset:llm_paragraph_mode_forbid_desc')
},
{
label: t('dataset:llm_paragraph_mode_auto'),
value: ParagraphChunkAIModeEnum.auto,
description: t('dataset:llm_paragraph_mode_auto_desc')
label: t('dataset:llm_paragraph_mode_force'),
value: ParagraphChunkAIModeEnum.force,
description: t('dataset:llm_paragraph_mode_force_desc')
}
]}
/>

View File

@@ -55,9 +55,13 @@ const requestLLMPargraph = async ({
};
}
// Check whether the text is already Markdown (i.e. it contains heading lines)
if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
const isMarkdown = /^(#+)\s/.test(rawText);
// Check if the text contains Markdown header structure.
// A header is a line starting with one or more '#' followed by whitespace.
// Both flags matter here: `g` collects every match, and `m` makes `^` anchor
// at the start of each line. With `g` alone, `^` matches only at the very
// beginning of the string, so the count could never exceed 1 and the
// "multiple headers" condition below would always be false.
const markdownHeaders = rawText.match(/^(#+)\s/gm) || [];
const hasMarkdownHeaders = markdownHeaders.length > 0;
// Require more than one header so a single stray '#' line is not mistaken
// for an already-structured Markdown document.
const hasMultipleHeaders = markdownHeaders.length > 1;
const isMarkdown = hasMarkdownHeaders && hasMultipleHeaders;
if (isMarkdown) {
return {
resultText: rawText,
@@ -71,11 +75,15 @@ const requestLLMPargraph = async ({
resultText: string;
totalInputTokens: number;
totalOutputTokens: number;
}>('/core/dataset/training/llmPargraph', {
rawText,
model,
billId
});
}>(
'/core/dataset/training/llmPargraph',
{
rawText,
model,
billId
},
{ timeout: 600000 }
);
return data;
};