Perf: llm parse paragraph (#5420)

* feat: llm directory optimization (#5400) * perf: llm parse * doc --------- Co-authored-by: colnii <1286949794@qq.com>
2026-01-30 01:02:15 +08:00 · 2025-08-09 18:38:58 +08:00
parent 1fc1e3fa80
commit 29edf1ea5f
6 changed files with 61 additions and 24 deletions
--- a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx
@@ -387,15 +387,20 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
                              setValue('paragraphChunkAIMode', e);
                            }}
                            list={[
+                              {
+                                label: t('dataset:llm_paragraph_mode_auto'),
+                                value: ParagraphChunkAIModeEnum.auto,
+                                description: t('dataset:llm_paragraph_mode_auto_desc')
+                              },
                              {
                                label: t('dataset:llm_paragraph_mode_forbid'),
                                value: ParagraphChunkAIModeEnum.forbid,
                                description: t('dataset:llm_paragraph_mode_forbid_desc')
                              },
                              {
-                                label: t('dataset:llm_paragraph_mode_auto'),
-                                value: ParagraphChunkAIModeEnum.auto,
-                                description: t('dataset:llm_paragraph_mode_auto_desc')
+                                label: t('dataset:llm_paragraph_mode_force'),
+                                value: ParagraphChunkAIModeEnum.force,
+                                description: t('dataset:llm_paragraph_mode_force_desc')
                              }
                            ]}
                          />
--- a/projects/app/src/service/core/dataset/queues/datasetParse.ts
+++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts
@@ -55,9 +55,13 @@ const requestLLMPargraph = async ({
    };
  }

-  // Check is markdown text(Include 1 group of title)
  if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
-    const isMarkdown = /^(#+)\s/.test(rawText);
+    // Treat the text as Markdown only when more than one line starts with a heading marker ("# ", "## ", ...)
+    const hasMarkdownHeaders = /^(#+)\s/m.test(rawText);
+    const hasMultipleHeaders = (rawText.match(/^(#+)\s/gm) || []).length > 1; // `m` is required so `^` matches at every line start, not only at the start of the text
+
+    const isMarkdown = hasMarkdownHeaders && hasMultipleHeaders;
+
    if (isMarkdown) {
      return {
        resultText: rawText,
@@ -71,11 +75,15 @@ const requestLLMPargraph = async ({
    resultText: string;
    totalInputTokens: number;
    totalOutputTokens: number;
-  }>('/core/dataset/training/llmPargraph', {
-    rawText,
-    model,
-    billId
-  });
+  }>(
+    '/core/dataset/training/llmPargraph',
+    {
+      rawText,
+      model,
+      billId
+    },
+    { timeout: 600000 }
+  );

  return data;
 };