External dataset (#1497)

* perf: read rawText and chunk code
* perf: read raw text
* perf: read rawtext
* perf: token count
* log
2025-10-19 10:07:24 +00:00 · 2024-05-16 11:47:53 +08:00
parent d5073f98ab
commit c6d9b15897
36 changed files with 531 additions and 267 deletions
--- a/packages/service/common/file/read/utils.ts
+++ b/packages/service/common/file/read/utils.ts
@@ -5,6 +5,7 @@ import { addHours } from 'date-fns';

 import { WorkerNameEnum, runWorker } from '../../../worker/utils';
 import { ReadFileResponse } from '../../../worker/file/type';
+import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';

 export const initMarkdownText = ({
  teamId,
@@ -29,36 +30,44 @@ export const initMarkdownText = ({

 export const readFileRawContent = async ({
  extension,
-  csvFormat,
+  isQAImport,
  teamId,
  buffer,
  encoding,
  metadata
 }: {
-  csvFormat?: boolean;
+  isQAImport?: boolean;
  extension: string;
  teamId: string;
  buffer: Buffer;
  encoding: string;
  metadata?: Record<string, any>;
 }) => {
-  const result = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
+  let { rawText, formatText } = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
    extension,
-    csvFormat,
    encoding,
    buffer
  });

  // markdown data format
  if (['md', 'html', 'docx'].includes(extension)) {
-    result.rawText = await initMarkdownText({
+    rawText = await initMarkdownText({
      teamId: teamId,
-      md: result.rawText,
+      md: rawText,
      metadata: metadata
    });
  }

-  return result;
+  if (['csv', 'xlsx'].includes(extension)) {
+    // qa data
+    if (isQAImport) {
+      rawText = rawText || '';
+    } else {
+      rawText = formatText || '';
+    }
+  }
+
+  return { rawText };
 };

 export const htmlToMarkdown = async (html?: string | null) => {