Mirror of https://github.com/labring/FastGPT.git, synced 2025-07-27 00:17:31 +00:00
Perf: read file worker (#1337)
* perf: read file worker
* fix: Http node url input
* fix: htm2md
* fix: html2md
* fix: ts
* perf: Problem classification increases the matching order
* feat: tool response answer
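
The thrust of this commit: file parsing moves off the main event loop into a worker thread, dispatched through runWorker(WorkerNameEnum.readFile, ...). The runner itself is not part of this diff; the sketch below is a minimal, assumed worker_threads implementation — the script path and the { type, data } message protocol are guesses, not the repo's actual code.

import { Worker } from 'worker_threads';
import path from 'path';

// Hedged sketch: resolve a compiled worker script by name and run it once.
// The real runWorker/WorkerNameEnum live elsewhere in packages/service/worker.
export const runWorkerSketch = <T = unknown>(name: string, params?: Record<string, any>) =>
  new Promise<T>((resolve, reject) => {
    const worker = new Worker(path.join(__dirname, 'worker', `${name}.js`), {
      workerData: params // assumed hand-off shape
    });
    worker.on('message', (msg: { type: 'success' | 'error'; data: any }) => {
      // Assumed protocol: the worker posts { type, data } exactly once.
      if (msg.type === 'success') resolve(msg.data);
      else reject(msg.data);
      worker.terminate();
    });
    worker.on('error', (err) => {
      worker.terminate();
      reject(err);
    });
  });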
packages/service/common/file/read/csv.ts
@@ -1,21 +0,0 @@
import Papa from 'papaparse';
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { readFileRawText } from './rawText';

// Load the source file content
export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const { rawText } = readFileRawText(params);

  const csvArr = Papa.parse(rawText).data as string[][];

  const header = csvArr[0];

  const formatText = header
    ? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
    : '';

  return {
    rawText,
    formatText
  };
};
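
For reference, the header:value expansion above maps every row, including the header row onto itself. A quick illustration with assumed sample data (not from the diff):

import Papa from 'papaparse';

// Given "name,age\nAlice,30", Papa.parse(...).data is [['name','age'],['Alice','30']].
const rows = Papa.parse('name,age\nAlice,30').data as string[][];
const header = rows[0];
const formatText = rows
  .map((row) => row.map((cell, i) => `${header[i]}:${cell}`).join('\n'))
  .join('\n');
console.log(formatText); // name:name\nage:age\nname:Alice\nage:30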

packages/service/common/file/read/html.ts
@@ -1,23 +0,0 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { initMarkdownText } from './utils';
import { htmlToMarkdown } from '../../string/markdown';
import { readFileRawText } from './rawText';

export const readHtmlRawText = async (
  params: ReadFileByBufferParams
): Promise<ReadFileResponse> => {
  const { teamId, metadata } = params;
  const { rawText: html } = readFileRawText(params);

  const md = await htmlToMarkdown(html);

  const rawText = await initMarkdownText({
    teamId,
    md,
    metadata
  });

  return {
    rawText
  };
};

packages/service/common/file/read/markdown.ts
@@ -1,18 +0,0 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { initMarkdownText } from './utils';
import { readFileRawText } from './rawText';

export const readMarkdown = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const { teamId, metadata } = params;
  const { rawText: md } = readFileRawText(params);

  const rawText = await initMarkdownText({
    teamId,
    md,
    metadata
  });

  return {
    rawText
  };
};

packages/service/common/file/read/parseOffice.ts
@@ -1,119 +0,0 @@
import { getNanoid } from '@fastgpt/global/common/string/tools';
import fs from 'fs';
import decompress from 'decompress';
import { DOMParser } from '@xmldom/xmldom';
import { clearDirFiles } from '../utils';
import { addLog } from '../../system/log';

const DEFAULTDECOMPRESSSUBLOCATION = '/tmp';

function getNewFileName(ext: string) {
  return `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}.${ext}`;
}

const parseString = (xml: string) => {
  let parser = new DOMParser();
  return parser.parseFromString(xml, 'text/xml');
};

const parsePowerPoint = async ({
  filepath,
  decompressPath,
  encoding
}: {
  filepath: string;
  decompressPath: string;
  encoding: BufferEncoding;
}) => {
  // Regexes for the files that hold our content of interest
  const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+.xml/g;
  const slidesRegex = /ppt\/slides\/slide\d+.xml/g;

  /** The decompress location which contains the filename in it */
  const files = await decompress(filepath, decompressPath, {
    filter: (x) => !!x.path.match(allFilesRegex)
  });

  // Verify that at least the slide xml files exist in the extracted files list.
  if (
    files.length == 0 ||
    !files.map((file) => file.path).some((filename) => filename.match(slidesRegex))
  ) {
    return Promise.reject('解析 PPT 失败'); // "Failed to parse the PPT"
  }

  // Return an array of all the xml contents read using fs.readFileSync
  const xmlContentArray = files.map((file) =>
    fs.readFileSync(`${decompressPath}/${file.path}`, encoding)
  );

  let responseArr: string[] = [];

  xmlContentArray.forEach((xmlContent) => {
    /** Find paragraph nodes, identified by a:p tags */
    const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName('a:p');

    /** Store all the text content to respond */
    responseArr.push(
      Array.from(xmlParagraphNodesList)
        // Filter out paragraph nodes that do not contain any text nodes (identified by a:t tags)
        .filter((paragraphNode) => paragraphNode.getElementsByTagName('a:t').length != 0)
        .map((paragraphNode) => {
          /** Find text nodes with a:t tags */
          const xmlTextNodeList = paragraphNode.getElementsByTagName('a:t');
          return Array.from(xmlTextNodeList)
            .filter((textNode) => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)
            .map((textNode) => textNode.childNodes[0].nodeValue)
            .join('');
        })
        .join('\n')
    );
  });

  return responseArr.join('\n');
};

export const parseOffice = async ({
  buffer,
  encoding,
  extension
}: {
  buffer: Buffer;
  encoding: BufferEncoding;
  extension: string;
}) => {
  // Prepare file for processing:
  // create the temp file subdirectory if it does not exist
  if (!fs.existsSync(DEFAULTDECOMPRESSSUBLOCATION)) {
    fs.mkdirSync(DEFAULTDECOMPRESSSUBLOCATION, { recursive: true });
  }

  // temp file name
  const filepath = getNewFileName(extension);
  const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}`;
  // const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/test`;

  // write new file
  fs.writeFileSync(filepath, buffer, {
    encoding
  });

  const text = await (async () => {
    try {
      switch (extension) {
        case 'pptx':
          return parsePowerPoint({ filepath, decompressPath, encoding });
        default:
          return Promise.reject('只能读取 .pptx 文件'); // "Only .pptx files can be read"
      }
    } catch (error) {
      addLog.error(`Load ppt error`, { error });
    }
    return '';
  })();

  fs.unlinkSync(filepath);
  clearDirFiles(decompressPath);
  return text;
};
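
Context for the a:p/a:t traversal above: in OOXML DrawingML, slide text lives in a:t runs nested inside a:p paragraphs. A self-contained check of that traversal with @xmldom/xmldom — the sample XML is a stripped-down illustration, not a real slide:

import { DOMParser } from '@xmldom/xmldom';

// Minimal stand-in for a decompressed ppt/slides/slideN.xml payload.
const sampleSlide = `<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
  xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
  <a:p><a:r><a:t>Hello</a:t></a:r><a:r><a:t> world</a:t></a:r></a:p>
</p:sld>`;

const doc = new DOMParser().parseFromString(sampleSlide, 'text/xml');
const paragraphs = Array.from(doc.getElementsByTagName('a:p')).map((p) =>
  Array.from(p.getElementsByTagName('a:t'))
    .map((t) => (t.childNodes[0] && t.childNodes[0].nodeValue) || '')
    .join('')
);
console.log(paragraphs.join('\n')); // "Hello world"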

packages/service/common/file/read/pdf.ts
@@ -1,71 +0,0 @@
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
// @ts-ignore
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
import { ReadFileByBufferParams, ReadFileResponse } from './type';

type TokenType = {
  str: string;
  dir: string;
  width: number;
  height: number;
  transform: number[];
  fontName: string;
  hasEOL: boolean;
};

export const readPdfFile = async ({
  buffer
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();

    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.95;
    const footerThreshold = pageHeight * 0.05;

    // Drop tokens that sit in the header/footer bands of the page
    const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      return (
        !token.transform ||
        (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
      );
    });

    // Merge each empty-string token's hasEOL flag into the previous token, then drop it
    for (let i = 0; i < pageTexts.length; i++) {
      const item = pageTexts[i];
      if (item.str === '' && pageTexts[i - 1]) {
        pageTexts[i - 1].hasEOL = item.hasEOL;
        pageTexts.splice(i, 1);
        i--;
      }
    }

    page.cleanup();

    return pageTexts
      .map((token) => {
        const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);

        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  const loadingTask = pdfjs.getDocument(buffer.buffer);
  const doc = await loadingTask.promise;

  const pageTextPromises = [];
  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
    pageTextPromises.push(readPDFPage(doc, pageNo));
  }
  const pageTexts = await Promise.all(pageTextPromises);

  loadingTask.destroy();

  return {
    rawText: pageTexts.join(''),
    metadata: {}
  };
};
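
A note on the filter above: pdf.js reports each text item with a 6-element transform matrix [a, b, c, d, e, f], where f (index 5) is the item's y offset in page space. The predicate below restates the header/footer cut in isolation; the worked numbers assume a 792 pt US Letter page, and this is an illustration rather than repo code:

// Keep tokens in the middle 90% of the page; undefined transform passes through.
const keepToken = (y: number | undefined, pageHeight: number): boolean =>
  y === undefined || (y < pageHeight * 0.95 && y > pageHeight * 0.05);

console.log(keepToken(396, 792)); // true:  mid-page body text
console.log(keepToken(770, 792)); // false: inside the top 5% band (likely a running header)
console.log(keepToken(20, 792)); // false: inside the bottom 5% band (likely a footer)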

packages/service/common/file/read/pptx.ts
@@ -1,18 +0,0 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
// import { parseOfficeAsync } from 'officeparser';
import { parseOffice } from './parseOffice';

export const readPptxRawText = async ({
  buffer,
  encoding
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const result = await parseOffice({
    buffer,
    encoding: encoding as BufferEncoding,
    extension: 'pptx'
  });

  return {
    rawText: result
  };
};

packages/service/common/file/read/rawText.ts
@@ -1,28 +0,0 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import iconv from 'iconv-lite';

const rawEncodingList = [
  'ascii',
  'utf8',
  'utf-8',
  'utf16le',
  'utf-16le',
  'ucs2',
  'ucs-2',
  'base64',
  'base64url',
  'latin1',
  'binary',
  'hex'
];

// Load the source file content
export const readFileRawText = ({ buffer, encoding }: ReadFileByBufferParams): ReadFileResponse => {
  const content = rawEncodingList.includes(encoding)
    ? buffer.toString(encoding as BufferEncoding)
    : iconv.decode(buffer, 'gbk');

  return {
    rawText: content
  };
};
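
Anything outside rawEncodingList falls through to a GBK decode, a pragmatic default for Chinese-language text files that Node cannot decode natively. A small iconv-lite sketch (the sample string is illustrative):

import iconv from 'iconv-lite';

// '中文' encoded as GBK bytes; Buffer.toString('utf8') would mangle these.
const gbkBuffer = iconv.encode('中文', 'gbk');
console.log(gbkBuffer.toString('utf8')); // garbled replacement characters
console.log(iconv.decode(gbkBuffer, 'gbk')); // '中文'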
packages/service/common/file/read/type.d.ts (vendored, 12 lines)
@@ -1,12 +0,0 @@
export type ReadFileByBufferParams = {
  teamId: string;
  buffer: Buffer;
  encoding: string;
  metadata?: Record<string, any>;
};

export type ReadFileResponse = {
  rawText: string;
  formatText?: string;
  metadata?: Record<string, any>;
};

packages/service/common/file/read/utils.ts
@@ -1,16 +1,10 @@
-import { markdownProcess } from '@fastgpt/global/common/string/markdown';
+import { markdownProcess, simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
 import { uploadMongoImg } from '../image/controller';
 import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
 import { addHours } from 'date-fns';
-import { ReadFileByBufferParams } from './type';
-import { readFileRawText } from '../read/rawText';
-import { readMarkdown } from '../read/markdown';
-import { readHtmlRawText } from '../read/html';
-import { readPdfFile } from '../read/pdf';
-import { readWordFile } from '../read/word';
-import { readCsvRawText } from '../read/csv';
-import { readPptxRawText } from '../read/pptx';
-import { readXlsxRawText } from '../read/xlsx';
+
+import { WorkerNameEnum, runWorker } from '../../../worker/utils';
+import { ReadFileResponse } from '../../../worker/file/type';
 
 export const initMarkdownText = ({
   teamId,
@@ -36,46 +30,39 @@ export const initMarkdownText = ({
 export const readFileRawContent = async ({
   extension,
   csvFormat,
-  params
+  teamId,
+  buffer,
+  encoding,
+  metadata
 }: {
   csvFormat?: boolean;
   extension: string;
-  params: ReadFileByBufferParams;
+  teamId: string;
+  buffer: Buffer;
+  encoding: string;
+  metadata?: Record<string, any>;
 }) => {
-  switch (extension) {
-    case 'txt':
-      return readFileRawText(params);
-    case 'md':
-      return readMarkdown(params);
-    case 'html':
-      return readHtmlRawText(params);
-    case 'pdf':
-      return readPdfFile(params);
-    case 'docx':
-      return readWordFile(params);
-    case 'pptx':
-      return readPptxRawText(params);
-    case 'xlsx':
-      const xlsxResult = await readXlsxRawText(params);
-      if (csvFormat) {
-        return {
-          rawText: xlsxResult.formatText || ''
-        };
-      }
-      return {
-        rawText: xlsxResult.rawText
-      };
-    case 'csv':
-      const csvResult = await readCsvRawText(params);
-      if (csvFormat) {
-        return {
-          rawText: csvResult.formatText || ''
-        };
-      }
-      return {
-        rawText: csvResult.rawText
-      };
-    default:
-      return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
-  }
+  const result = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
+    extension,
+    csvFormat,
+    encoding,
+    buffer
+  });
+
+  // markdown data format
+  if (['md', 'html', 'docx'].includes(extension)) {
+    result.rawText = await initMarkdownText({
+      teamId: teamId,
+      md: result.rawText,
+      metadata: metadata
+    });
+  }
+
+  return result;
 };
+
+export const htmlToMarkdown = async (html?: string | null) => {
+  const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
+
+  return simpleMarkdownText(md);
+};
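
With the worker in place, readFileRawContent's callers hand over the raw buffer and metadata instead of a pre-built ReadFileByBufferParams; parsing runs in the worker, and only the markdown post-processing (initMarkdownText's image upload) stays on the main thread. A hypothetical call site, assuming an async context (the ids, path, and metadata are placeholders, not values from this commit):

import { readFileSync } from 'fs';

// Hypothetical usage of the readFileRawContent modified above.
const { rawText } = await readFileRawContent({
  extension: 'docx',
  csvFormat: false,
  teamId: 'team_123', // placeholder id
  buffer: readFileSync('/tmp/demo.docx'), // placeholder path
  encoding: 'utf-8',
  metadata: { relatedId: 'file_456' } // placeholder metadata
});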

packages/service/common/file/read/word.ts
@@ -1,35 +0,0 @@
import mammoth from 'mammoth';
import { htmlToMarkdown } from '../../string/markdown';
import { ReadFileByBufferParams, ReadFileResponse } from './type';
import { initMarkdownText } from './utils';

/**
 * read docx to markdown
 */
export const readWordFile = async ({
  teamId,
  buffer,
  metadata = {}
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  try {
    const { value: html } = await mammoth.convertToHtml({
      buffer
    });

    const md = await htmlToMarkdown(html);

    const rawText = await initMarkdownText({
      teamId,
      md,
      metadata
    });

    return {
      rawText,
      metadata: {}
    };
  } catch (error) {
    console.log('error doc read:', error);
    return Promise.reject('Can not read doc file, please convert to PDF');
  }
};

packages/service/common/file/read/xlsx.ts
@@ -1,45 +0,0 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import xlsx from 'node-xlsx';
import Papa from 'papaparse';

export const readXlsxRawText = async ({
  buffer
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const result = xlsx.parse(buffer, {
    skipHidden: false,
    defval: ''
  });

  const format2Csv = result.map(({ name, data }) => {
    return {
      title: `#${name}`,
      csvText: data.map((item) => item.join(',')).join('\n')
    };
  });

  const rawText = format2Csv.map((item) => item.csvText).join('\n');
  const formatText = format2Csv
    .map((item) => {
      const csvArr = Papa.parse(item.csvText).data as string[][];
      const header = csvArr[0];

      const formatText = header
        ? csvArr
            .map((item) =>
              item
                .map((item, i) => (item ? `${header[i]}:${item}` : ''))
                .filter(Boolean)
                .join('\n')
            )
            .join('\n')
        : '';

      return `${item.title}\n${formatText}`;
    })
    .join('\n');

  return {
    rawText: rawText,
    formatText
  };
};
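
xlsx.parse returns one { name, data } entry per sheet, where data is a row-major array of cell values; each sheet is flattened to CSV text and then, as in the CSV reader, expanded to header:value lines (prefixed with a #SheetName title in formatText). A sketch with an assumed workbook (the path and contents are placeholders):

import xlsx from 'node-xlsx';

// Assumed workbook: one sheet "Sheet1" with rows ['name','age'] and ['Bob',42].
// xlsx.parse => [{ name: 'Sheet1', data: [['name', 'age'], ['Bob', 42]] }]
const sheets = xlsx.parse('/tmp/demo.xlsx'); // placeholder path; a Buffer also works
const rawText = sheets
  .map(({ data }) => data.map((row) => row.join(',')).join('\n'))
  .join('\n');
console.log(rawText); // "name,age\nBob,42"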