Feat: pptx and xlsx loader (#1118)

* perf: plan tip * perf: upload size controller * feat: add image ttl index * feat: new upload file ux * remove file * feat: support read pptx * feat: support xlsx * fix: rerank docker file
2025-10-17 08:37:59 +00:00 · 2024-04-01 19:01:26 +08:00
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions
--- a/packages/service/common/file/gridfs/controller.ts
+++ b/packages/service/common/file/gridfs/controller.ts
@@ -4,6 +4,18 @@ import fsp from 'fs/promises';
 import fs from 'fs';
 import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
 import { MongoFileSchema } from './schema';
+import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
+import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { readFileRawText } from '../read/rawText';
+import { ReadFileByBufferParams } from '../read/type';
+import { readMarkdown } from '../read/markdown';
+import { readHtmlRawText } from '../read/html';
+import { readPdfFile } from '../read/pdf';
+import { readWordFile } from '../read/word';
+import { readCsvRawText } from '../read/csv';
+import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
+import { readPptxRawText } from '../read/pptx';
+import { readXlsxRawText } from '../read/xlsx';

 export function getGFSCollection(bucket: `${BucketNameEnum}`) {
  MongoFileSchema;
@@ -111,3 +123,139 @@ export async function getDownloadStream({

  return bucket.openDownloadStream(new Types.ObjectId(fileId));
 }
+
+/** Detect a stored file's text encoding from the first bytes of its download stream. */
+export const readFileEncode = async ({
+  bucketName,
+  fileId
+}: {
+  bucketName: `${BucketNameEnum}`;
+  fileId: string;
+}) => {
+  const stream = await getDownloadStream({ bucketName, fileId });
+  let head: Buffer = Buffer.from([]);
+
+  // Stop downloading as soon as more than 10 bytes have been gathered.
+  for await (const piece of stream) {
+    head = Buffer.concat([head, piece]);
+    if (head.length <= 10) continue;
+    stream.abort();
+    break;
+  }
+
+  return detectFileEncoding(head) as BufferEncoding;
+};
+
+/**
+ * Read a stored file and return its extracted text together with its filename.
+ *
+ * A cached entry in MongoRwaTextBuffer keyed by `fileId` is returned when present;
+ * otherwise the file is downloaded, parsed by the reader matching its extension,
+ * and any non-blank result is written back to that cache.
+ *
+ * @param teamId - forwarded unchanged to the reader
+ * @param bucketName - bucket passed to getFileById / getDownloadStream
+ * @param fileId - id of the file to read; also used as the cache key
+ * @param csvFormat - for csv/xlsx, return the reader's `formatText` instead of `rawText`
+ * @returns the extracted text and the stored filename
+ * @throws rejects with CommonErrEnum.fileNotFound when getFileById yields nothing, or with
+ *   a message string when the extension has no reader
+ */
+export const readFileContent = async ({
+  teamId,
+  bucketName,
+  fileId,
+  csvFormat = false
+}: {
+  teamId: string;
+  bucketName: `${BucketNameEnum}`;
+  fileId: string;
+  csvFormat?: boolean;
+}): Promise<{
+  rawText: string;
+  filename: string;
+}> => {
+  // Serve from the raw-text cache when an entry for this file already exists.
+  const cached = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean();
+  if (cached) {
+    return {
+      rawText: cached.rawText,
+      filename: cached.metadata?.filename || ''
+    };
+  }
+
+  // Look the record up before opening any stream, so a missing file is reported as
+  // fileNotFound rather than depending on how the download stream treats an unknown id.
+  const file = await getFileById({ bucketName, fileId });
+  if (!file) {
+    return Promise.reject(CommonErrEnum.fileNotFound);
+  }
+
+  const [encoding, fileStream] = await Promise.all([
+    readFileEncode({ bucketName, fileId }),
+    getDownloadStream({ bucketName, fileId })
+  ]);
+
+  const extension = file.filename?.split('.').pop()?.toLowerCase() || '';
+
+  // Collect the chunks and join them once; concatenating on every chunk re-copies
+  // everything read so far.
+  const chunks: Buffer[] = [];
+  for await (const chunk of fileStream) {
+    chunks.push(chunk);
+  }
+  const fileBuffers = Buffer.concat(chunks);
+
+  const params: ReadFileByBufferParams = {
+    teamId,
+    buffer: fileBuffers,
+    encoding,
+    metadata: {
+      relatedId: fileId
+    }
+  };
+
+  // Pick the reader by file extension.
+  const { rawText } = await (async () => {
+    switch (extension) {
+      case 'txt':
+        return readFileRawText(params);
+      case 'md':
+        return readMarkdown(params);
+      case 'html':
+        return readHtmlRawText(params);
+      case 'pdf':
+        return readPdfFile(params);
+      case 'docx':
+        return readWordFile(params);
+      case 'pptx':
+        return readPptxRawText(params);
+      case 'xlsx': {
+        const xlsxResult = await readXlsxRawText(params);
+        return { rawText: csvFormat ? xlsxResult.formatText || '' : xlsxResult.rawText };
+      }
+      case 'csv': {
+        const csvResult = await readCsvRawText(params);
+        return { rawText: csvFormat ? csvResult.formatText || '' : csvResult.rawText };
+      }
+      default:
+        return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, .pptx, .csv, .xlsx');
+    }
+  })();
+
+  // Blank results are not cached.
+  if (rawText.trim()) {
+    await MongoRwaTextBuffer.create({
+      sourceId: fileId,
+      rawText,
+      metadata: {
+        filename: file.filename
+      }
+    });
+  }
+
+  return {
+    rawText,
+    filename: file.filename
+  };
+};