Perf: read file woker (#1337)

* perf: read file worker * fix: Http node url input * fix: htm2md * fix: html2md * fix: ts * perf: Problem classification increases the matching order * feat: tool response answer
2025-07-23 05:12:39 +00:00 · 2024-04-30 18:12:20 +08:00
parent 1529c1e991
commit b5f0ac3e1d
35 changed files with 413 additions and 398 deletions
--- a/packages/service/common/file/read/utils.ts
+++ b/packages/service/common/file/read/utils.ts
@@ -1,16 +1,10 @@
-import { markdownProcess } from '@fastgpt/global/common/string/markdown';
+import { markdownProcess, simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
 import { uploadMongoImg } from '../image/controller';
 import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
 import { addHours } from 'date-fns';
-import { ReadFileByBufferParams } from './type';
-import { readFileRawText } from '../read/rawText';
-import { readMarkdown } from '../read/markdown';
-import { readHtmlRawText } from '../read/html';
-import { readPdfFile } from '../read/pdf';
-import { readWordFile } from '../read/word';
-import { readCsvRawText } from '../read/csv';
-import { readPptxRawText } from '../read/pptx';
-import { readXlsxRawText } from '../read/xlsx';
+
+import { WorkerNameEnum, runWorker } from '../../../worker/utils';
+import { ReadFileResponse } from '../../../worker/file/type';

 export const initMarkdownText = ({
  teamId,
@@ -36,46 +30,39 @@ export const initMarkdownText = ({
 export const readFileRawContent = async ({
  extension,
  csvFormat,
-  params
+  teamId,
+  buffer,
+  encoding,
+  metadata
 }: {
  csvFormat?: boolean;
  extension: string;
-  params: ReadFileByBufferParams;
+  teamId: string;
+  buffer: Buffer;
+  encoding: string;
+  metadata?: Record<string, any>;
 }) => {
-  switch (extension) {
-    case 'txt':
-      return readFileRawText(params);
-    case 'md':
-      return readMarkdown(params);
-    case 'html':
-      return readHtmlRawText(params);
-    case 'pdf':
-      return readPdfFile(params);
-    case 'docx':
-      return readWordFile(params);
-    case 'pptx':
-      return readPptxRawText(params);
-    case 'xlsx':
-      const xlsxResult = await readXlsxRawText(params);
-      if (csvFormat) {
-        return {
-          rawText: xlsxResult.formatText || ''
-        };
-      }
-      return {
-        rawText: xlsxResult.rawText
-      };
-    case 'csv':
-      const csvResult = await readCsvRawText(params);
-      if (csvFormat) {
-        return {
-          rawText: csvResult.formatText || ''
-        };
-      }
-      return {
-        rawText: csvResult.rawText
-      };
-    default:
-      return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
+  const result = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
+    extension,
+    csvFormat,
+    encoding,
+    buffer
+  });
+
+  // markdown data format
+  if (['md', 'html', 'docx'].includes(extension)) {
+    result.rawText = await initMarkdownText({
+      teamId: teamId,
+      md: result.rawText,
+      metadata: metadata
+    });
  }
+
+  return result;
+};
+
+export const htmlToMarkdown = async (html?: string | null) => {
+  const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
+
+  return simpleMarkdownText(md);
 };