Feat: pptx and xlsx loader (#1118)

* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support reading pptx

* feat: support xlsx (hedged sketches of possible xlsx and pptx loaders appear after the commit stats below and at the end of this diff)

* fix: rerank docker file
Archer authored 2024-04-01 19:01:26 +08:00, committed by GitHub
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions
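
The new pptx and xlsx loaders named in the commit message are not part of the excerpt below, which only shows the old reader files being removed. As a rough illustration of what a spreadsheet reader in the same File -> { rawText } style could look like, here is a minimal sketch using the SheetJS xlsx package; the package choice, the readXlsxRawText name, and the CSV-style output are assumptions, not the commit's actual implementation.

import * as XLSX from 'xlsx';
import { loadFile2Buffer } from '../utils';

/* Hypothetical xlsx reader: flatten every sheet to CSV-style text. */
export const readXlsxRawText = async ({ file }: { file: File }) => {
  // loadFile2Buffer is the same helper the pdf/word readers below rely on
  const buffer = await loadFile2Buffer({ file });
  const workbook = XLSX.read(new Uint8Array(buffer), { type: 'array' });

  // join all sheets, prefixing each with its name so context is kept
  const rawText = workbook.SheetNames.map((name) => {
    const csv = XLSX.utils.sheet_to_csv(workbook.Sheets[name]);
    return `${name}\n${csv}`;
  }).join('\n\n');

  return { rawText };
};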

View File

@@ -1,40 +0,0 @@
import Papa from 'papaparse';
import { readFileRawText } from './rawText';

/**
 * read csv to json
 * @returns {
 *   header: string[],
 *   data: string[][]
 * }
 */
export const readCsvContent = async ({ file }: { file: File }) => {
  try {
    const { rawText: textArr } = await readFileRawText(file);
    const csvArr = Papa.parse(textArr).data as string[][];
    if (csvArr.length === 0) {
      throw new Error('Failed to parse csv');
    }

    const header = csvArr.shift() as string[];

    // prefix each cell value with its column header
    const rawText = csvArr
      .map((item) =>
        item.map((value, index) => {
          if (!header[index]) return value;
          return `${header[index]}: ${value}`;
        })
      )
      .flat()
      .join('\n');

    return {
      rawText,
      header,
      data: csvArr.map((item) => item)
    };
  } catch (error) {
    return Promise.reject('Failed to parse csv file');
  }
};
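
For reference, the header-prefixing above turns every cell into a `header: value` line. A small usage example (the sample data is invented):

// Given a csv file containing:
//   name,age
//   Tom,18
const { rawText, header, data } = await readCsvContent({ file });
// header  -> ['name', 'age']
// data    -> [['Tom', '18']]
// rawText -> 'name: Tom\nage: 18'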

View File

@@ -1,21 +0,0 @@
import { htmlStr2Md } from '../../string/markdown';
import { readFileRawText } from './rawText';
import { markdownProcess } from '@fastgpt/global/common/string/markdown';

export const readHtmlFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const { rawText } = await readFileRawText(file);
  const md = htmlStr2Md(rawText);

  const simpleMd = await markdownProcess({
    rawText: md,
    uploadImgController
  });

  return { rawText: simpleMd };
};
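
Several readers in this diff (html, md, word) take an optional uploadImgController that receives a base64 image found in the markdown and is expected to return a URL to substitute for it. A minimal sketch of such a controller, assuming a hypothetical upload endpoint that is not part of this diff:

// Hypothetical controller: post a base64 data URI to an assumed endpoint
// and return the hosted URL that will replace the inline image.
const uploadImgController = async (base64: string): Promise<string> => {
  const res = await fetch('/api/common/file/uploadImage', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ base64 })
  });
  const { url } = await res.json();
  return url;
};

// usage (inside an async context, given a File named file):
const { rawText } = await readHtmlFile({ file, uploadImgController });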

View File

@@ -1,49 +0,0 @@
import { loadFile2Buffer } from '../utils';
import { readCsvContent } from './csv';
import { readHtmlFile } from './html';
import { readMdFile } from './md';
import { readPdfFile } from './pdf';
import { readFileRawText } from './rawText';
import { readWordFile } from './word';

export const readFileRawContent = async ({
  file,
  uploadBase64Controller
}: {
  file: File;
  uploadBase64Controller?: (base64: string) => Promise<string>;
}): Promise<{
  rawText: string;
}> => {
  const extension = file?.name?.split('.')?.pop()?.toLowerCase();

  switch (extension) {
    case 'txt':
      return readFileRawText(file);
    case 'md':
      return readMdFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    case 'html':
      return readHtmlFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    case 'csv':
      return readCsvContent({ file });
    case 'pdf': {
      const pdf = await loadFile2Buffer({ file });
      return readPdfFile({ pdf });
    }
    case 'docx':
      return readWordFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    default:
      return {
        rawText: ''
      };
  }
};
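
The dispatcher that replaces this one is not shown in the excerpt either. As a hedged sketch, the two new formats could be routed ahead of the existing switch like this; readPptxRawText, readXlsxRawText, and the ./pptx and ./xlsx module paths are illustrative assumptions matching the sketches near the top and bottom of this diff, not the commit's real code.

import { readFileRawContent } from './index'; // the dispatcher shown above
import { readPptxRawText } from './pptx'; // hypothetical module
import { readXlsxRawText } from './xlsx'; // hypothetical module

export const readAnyFileRawContent = async ({
  file,
  uploadBase64Controller
}: {
  file: File;
  uploadBase64Controller?: (base64: string) => Promise<string>;
}): Promise<{ rawText: string }> => {
  const extension = file?.name?.split('.')?.pop()?.toLowerCase();

  switch (extension) {
    case 'pptx':
      return readPptxRawText({ file });
    case 'xlsx':
      return readXlsxRawText({ file });
    default:
      // fall back to the original dispatcher for txt/md/html/csv/pdf/docx
      return readFileRawContent({ file, uploadBase64Controller });
  }
};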

View File

@@ -1,17 +0,0 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { readFileRawText } from './rawText';

export const readMdFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const { rawText: md } = await readFileRawText(file);

  const simpleMd = await markdownProcess({
    rawText: md,
    uploadImgController
  });

  return { rawText: simpleMd };
};

View File

@@ -1,64 +0,0 @@
/* read file to txt */
import * as pdfjsLib from 'pdfjs-dist';

type TokenType = {
  str: string;
  dir: string;
  width: number;
  height: number;
  transform: number[];
  fontName: string;
  hasEOL: boolean;
};

export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => {
  pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';

  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();

    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.95;
    const footerThreshold = pageHeight * 0.05;

    // transform[5] is the token's y position; drop tokens in the top/bottom 5%
    // of the page, which are treated as running headers and footers
    const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      return (
        !token.transform ||
        (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
      );
    });

    // merge empty-string tokens into the previous token, carrying over the hasEOL flag
    for (let i = 0; i < pageTexts.length; i++) {
      const item = pageTexts[i];
      if (item.str === '' && pageTexts[i - 1]) {
        pageTexts[i - 1].hasEOL = item.hasEOL;
        pageTexts.splice(i, 1);
        i--;
      }
    }

    page.cleanup();

    return pageTexts
      .map((token) => {
        // keep a newline only where the token both ends a line and ends a sentence
        const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  const doc = await pdfjsLib.getDocument(pdf).promise;
  const pageTextPromises = [];
  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
    pageTextPromises.push(readPDFPage(doc, pageNo));
  }
  const pageTexts = await Promise.all(pageTextPromises);

  return {
    rawText: pageTexts.join('')
  };
};

View File

@@ -1,36 +0,0 @@
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';

/**
 * read file raw text
 */
export const readFileRawText = (file: File) => {
  return new Promise<{ rawText: string }>((resolve, reject) => {
    try {
      const reader = new FileReader();
      reader.onload = () => {
        //@ts-ignore
        const encode = detectFileEncoding(reader.result);

        // read the file again, this time with the detected encoding
        const reader2 = new FileReader();
        reader2.onload = () => {
          resolve({
            rawText: reader2.result as string
          });
        };
        reader2.onerror = (err) => {
          console.log('Error reading file with detected encoding:', err);
          reject('Read file error with detected encoding');
        };
        reader2.readAsText(file, encode);
      };
      reader.onerror = (err) => {
        console.log('error txt read:', err);
        reject('Read file error');
      };
      reader.readAsBinaryString(file);
    } catch (error) {
      reject(error);
    }
  });
};

View File

@@ -1,28 +0,0 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { htmlStr2Md } from '../../string/markdown';
import { loadFile2Buffer } from '../utils';
import mammoth from 'mammoth';

export const readWordFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const buffer = await loadFile2Buffer({ file });
  const { value: html } = await mammoth.convertToHtml({
    arrayBuffer: buffer
  });
  const md = htmlStr2Md(html);

  const rawText = await markdownProcess({
    rawText: md,
    uploadImgController
  });

  return {
    rawText
  };
};
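
Finally, the new pptx reader is also outside this excerpt. One common approach, and only a plausible shape for it, is to treat the .pptx as a zip archive and collect the <a:t> text runs from each slide's XML. A minimal sketch assuming the jszip package; the package choice and the readPptxRawText name are assumptions, not the commit's implementation.

import JSZip from 'jszip';
import { loadFile2Buffer } from '../utils';

/* Hypothetical pptx reader: unzip the archive and pull text runs from the slides. */
export const readPptxRawText = async ({ file }: { file: File }) => {
  const buffer = await loadFile2Buffer({ file });
  const zip = await JSZip.loadAsync(buffer);

  // slides live in ppt/slides/slide1.xml, slide2.xml, ...
  // (a production loader would sort these by slide number)
  const slideFiles = zip.file(/^ppt\/slides\/slide\d+\.xml$/);
  const slideXml = await Promise.all(slideFiles.map((f) => f.async('string')));

  // <a:t>...</a:t> wraps each visible text run
  const rawText = slideXml
    .map((xml) =>
      Array.from(xml.matchAll(/<a:t>([^<]*)<\/a:t>/g))
        .map((m) => m[1])
        .join('\n')
    )
    .join('\n\n');

  return { rawText };
};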