perf: backup import (#4866)

* i18n

* remove invalid code

* perf: backup import

* backup tip

* fix: indexsize invalid
Archer
2025-05-22 15:53:51 +08:00
committed by GitHub
parent dd3c251603
commit 88bd3aaa9e
67 changed files with 751 additions and 388 deletions

View File

@@ -146,7 +146,8 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
tmbId,
url: previewUrl,
relatedId: apiFileId,
customPdfParse
customPdfParse,
getFormatText: true
});
return {
title,
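
Here the API-dataset preview read starts passing getFormatText: true down to the shared file reader (the reader's signature change appears further down in this diff). The flag's exact semantics are not part of this diff; below is a minimal sketch of the assumed behaviour, namely that it chooses between the parser's formatted output and the plain-text fallback:

// Assumption, not shown in this diff: the buffer reader produces a plain rawText and,
// for some parsers, a formatText (e.g. markdown from customPdfParse); getFormatText
// decides which of the two is returned to the caller.
const pickText = (rawText: string, formatText: string | undefined, getFormatText?: boolean) =>
  getFormatText ? formatText || rawText : rawText;

pickText('plain text', '# markdown text', true); // => '# markdown text'
pickText('plain text', undefined, true); // => 'plain text' (falls back when no formatted output exists)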

View File

@@ -36,13 +36,14 @@ import {
computeChunkSplitter,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
export const createCollectionAndInsertData = async ({
dataset,
rawText,
relatedId,
createCollectionParams,
isQAImport = false,
backupParse = false,
billId,
session
}: {
@@ -50,8 +51,8 @@ export const createCollectionAndInsertData = async ({
rawText: string;
relatedId?: string;
createCollectionParams: CreateOneCollectionParams;
backupParse?: boolean;
isQAImport?: boolean;
billId?: string;
session?: ClientSession;
}) => {
@@ -81,7 +82,7 @@ export const createCollectionAndInsertData = async ({
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [],
isQAImport
backupParse
});
// 2. auth limit
@@ -157,6 +158,10 @@ export const createCollectionAndInsertData = async ({
billId: traingBillId,
data: chunks.map((item, index) => ({
...item,
indexes: item.indexes?.map((text) => ({
type: DatasetDataIndexTypeEnum.custom,
text
})),
chunkIndex: index
})),
session
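
With the new DatasetDataIndexTypeEnum import, any indexes strings a chunk carries (the extra columns of a backup CSV row) are turned into custom dataset indexes before the training data is queued. A minimal standalone sketch of that mapping; the sample chunk and the BackupChunk type are invented, while the import and the map call mirror the hunk:

import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';

// Illustrative shape matching the new rawText2Chunks output (q / a / indexes).
type BackupChunk = { q: string; a: string; indexes?: string[] };

const chunk: BackupChunk = {
  q: 'What is FastGPT?',
  a: 'An LLM knowledge-base platform.',
  indexes: ['fastgpt intro', 'knowledge base']
};

// Same mapping as in the hunk: every extra index string becomes a custom index object.
const trainingItem = {
  ...chunk,
  indexes: chunk.indexes?.map((text) => ({
    type: DatasetDataIndexTypeEnum.custom,
    text
  })),
  chunkIndex: 0
};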

View File

@@ -2,7 +2,6 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { parseCsvTable2Chunks } from './training/utils';
import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
@@ -13,18 +12,21 @@ import {
type YuqueServer
} from '@fastgpt/global/core/dataset/apiDataset';
import { useApiDatasetRequest } from './apiDataset/api';
import Papa from 'papaparse';
export const readFileRawTextByUrl = async ({
teamId,
tmbId,
url,
customPdfParse,
getFormatText,
relatedId
}: {
teamId: string;
tmbId: string;
url: string;
customPdfParse?: boolean;
getFormatText?: boolean;
relatedId: string; // externalFileId / apiFileId
}) => {
const response = await axios({
@@ -38,7 +40,7 @@ export const readFileRawTextByUrl = async ({
const { rawText } = await readRawContentByFileBuffer({
customPdfParse,
isQAImport: false,
getFormatText,
extension,
teamId,
tmbId,
@@ -62,21 +64,21 @@ export const readDatasetSourceRawText = async ({
tmbId,
type,
sourceId,
isQAImport,
selector,
externalFileId,
apiServer,
feishuServer,
yuqueServer,
customPdfParse
customPdfParse,
getFormatText
}: {
teamId: string;
tmbId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
customPdfParse?: boolean;
getFormatText?: boolean;
isQAImport?: boolean; // csv data
selector?: string; // link selector
externalFileId?: string; // external file dataset
apiServer?: APIFileServer; // api dataset
@@ -92,8 +94,8 @@ export const readDatasetSourceRawText = async ({
tmbId,
bucketName: BucketNameEnum.dataset,
fileId: sourceId,
isQAImport,
customPdfParse
customPdfParse,
getFormatText
});
return {
title: filename,
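
readDatasetSourceRawText drops isQAImport and threads getFormatText through instead, here into readFileContentFromMongo for files stored in GridFS. A hedged sketch of a caller after the change; the stub implementation, the ids, the 'fileLocal' value, and the return shape are illustrative, and only the parameter names come from this diff:

// Stand-in stub so the sketch runs on its own; the real function is the one changed above.
const readDatasetSourceRawText = async (params: {
  teamId: string;
  tmbId: string;
  type: string; // DatasetSourceReadTypeEnum in the repo
  sourceId: string;
  customPdfParse?: boolean;
  getFormatText?: boolean;
}) => ({ title: 'example.md', rawText: '# example' });

const { title, rawText } = await readDatasetSourceRawText({
  teamId: 'team-id', // illustrative ids
  tmbId: 'tmb-id',
  type: 'fileLocal', // assumed enum member for a dataset-bucket file
  sourceId: 'gridfs-file-id',
  customPdfParse: true,
  getFormatText: true // new flag; isQAImport is no longer accepted
});
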
@@ -183,16 +185,38 @@ export const readApiServerFileContent = async ({
export const rawText2Chunks = ({
rawText,
isQAImport,
backupParse,
chunkSize = 512,
...splitProps
}: {
rawText: string;
isQAImport?: boolean;
} & TextSplitProps) => {
if (isQAImport) {
const { chunks } = parseCsvTable2Chunks(rawText);
return chunks;
backupParse?: boolean;
tableParse?: boolean;
} & TextSplitProps): {
q: string;
a: string;
indexes?: string[];
}[] => {
const parseDatasetBackup2Chunks = (rawText: string) => {
const csvArr = Papa.parse(rawText).data as string[][];
console.log(rawText, csvArr);
const chunks = csvArr
.slice(1)
.map((item) => ({
q: item[0] || '',
a: item[1] || '',
indexes: item.slice(2)
}))
.filter((item) => item.q || item.a);
return {
chunks
};
};
if (backupParse) {
return parseDatasetBackup2Chunks(rawText).chunks;
}
const { chunks } = splitText2Chunks({
@@ -203,6 +227,7 @@ export const rawText2Chunks = ({
return chunks.map((item) => ({
q: item,
a: ''
a: '',
indexes: []
}));
};
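
Putting the new branch together: row 1 of the backup CSV is treated as a header, column 1 becomes q, column 2 becomes a, and any further columns become index texts; rows with neither q nor a are dropped. A self-contained sketch with an invented sample CSV, mirroring the Papa.parse / slice / map / filter steps above:

import Papa from 'papaparse';

// The sample backup CSV is invented; the header row is skipped as in the code above.
const backupCsv = [
  'q,a,index1',
  '"What is FastGPT?","An LLM knowledge-base platform.","fastgpt intro"',
  '"How do I restore a backup?","Upload the exported CSV file."'
].join('\n');

const rows = Papa.parse(backupCsv).data as string[][];

const chunks = rows
  .slice(1) // drop the header row
  .map((item) => ({
    q: item[0] || '',
    a: item[1] || '',
    indexes: item.slice(2) // remaining columns become custom index texts
  }))
  .filter((item) => item.q || item.a);

// chunks[0] => { q: 'What is FastGPT?', a: 'An LLM knowledge-base platform.', indexes: ['fastgpt intro'] }
// chunks[1] => { q: 'How do I restore a backup?', a: 'Upload the exported CSV file.', indexes: [] }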

View File

@@ -1,6 +1,5 @@
export enum ImportDataSourceEnum {
fileLocal = 'fileLocal',
fileLink = 'fileLink',
fileCustom = 'fileCustom',
tableLocal = 'tableLocal'
fileCustom = 'fileCustom'
}

View File

@@ -1,16 +0,0 @@
import Papa from 'papaparse';
export const parseCsvTable2Chunks = (rawText: string) => {
const csvArr = Papa.parse(rawText).data as string[][];
const chunks = csvArr
.map((item) => ({
q: item[0] || '',
a: item[1] || ''
}))
.filter((item) => item.q || item.a);
return {
chunks
};
};

View File

@@ -223,28 +223,29 @@ const toolChoice = async (props: ActionProps) => {
}
];
const body = llmCompletionsBodyFormat(
{
stream: true,
model: extractModel.model,
temperature: 0.01,
messages: filterMessages,
tools,
tool_choice: { type: 'function', function: { name: agentFunName } }
},
extractModel
);
const { response } = await createChatCompletion({
body: llmCompletionsBodyFormat(
{
stream: true,
model: extractModel.model,
temperature: 0.01,
messages: filterMessages,
tools,
tool_choice: { type: 'function', function: { name: agentFunName } }
},
extractModel
),
body,
userKey: externalProvider.openaiAccount
});
const { toolCalls, usage } = await formatLLMResponse(response);
const { text, toolCalls, usage } = await formatLLMResponse(response);
const arg: Record<string, any> = (() => {
try {
return json5.parse(toolCalls?.[0]?.function?.arguments || '');
} catch (error) {
console.log(agentFunction.parameters);
console.log(toolCalls?.[0]?.function);
console.log('body', body);
console.log('AI response', text, toolCalls?.[0]?.function);
console.log('Your model may not support tool_call', error);
return {};
}
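
The reason body is hoisted out of the createChatCompletion call shows up in the catch branch: the exact request can now be logged next to the plain-text answer that formatLLMResponse now also returns, which makes models that cannot do tool calls easy to spot. A self-contained sketch of that fallback; parseToolArgs is an illustrative name, not something from the repo:

import json5 from 'json5';

// Same parse-or-log fallback as the hunk above, with the request body kept in scope.
const parseToolArgs = (
  text: string,
  toolCalls: { function?: { name?: string; arguments?: string } }[] | undefined,
  body: unknown
): Record<string, any> => {
  try {
    return json5.parse(toolCalls?.[0]?.function?.arguments || '');
  } catch (error) {
    console.log('body', body);
    console.log('AI response', text, toolCalls?.[0]?.function);
    console.log('Your model may not support tool_call', error);
    return {};
  }
};

// e.g. a model that answers in plain text instead of emitting a tool call:
parseToolArgs('Sorry, I cannot call functions.', undefined, { model: 'some-model' }); // => {} after logging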

View File

@@ -211,12 +211,12 @@ export const getFileContentFromLinks = async ({
// Read file
const { rawText } = await readRawContentByFileBuffer({
extension,
isQAImport: false,
teamId,
tmbId,
buffer,
encoding,
customPdfParse
customPdfParse,
getFormatText: true
});
// Add to buffer