fix: upload file (#2992)

* fix: upload file

* chore: remove wasm, support html image parse

* chore: adjust

* chore: move base64match function into htmlstr2md
Author: Finley Ge
Committed: 2024-10-28 21:44:50 +08:00 (via GitHub)
Parent: 4e3d817b63
Commit: b712a821f8

8 changed files with 440 additions and 240 deletions


@@ -159,7 +159,6 @@ export const readFileContentFromMongo = async ({
     getFileById({ bucketName, fileId }),
     getDownloadStream({ bucketName, fileId })
   ]);
-  // console.log('get file stream', Date.now() - start);
   if (!file) {
     return Promise.reject(CommonErrEnum.fileNotFound);
   }


@@ -1,7 +1,5 @@
-import { markdownProcess } from '@fastgpt/global/common/string/markdown';
 import { uploadMongoImg } from '../image/controller';
 import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
-import { addHours } from 'date-fns';
 import FormData from 'form-data';
 import { WorkerNameEnum, runWorker } from '../../../worker/utils';
@@ -10,6 +8,7 @@ import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
 import type { ReadFileResponse } from '../../../worker/readFile/type';
 import axios from 'axios';
 import { addLog } from '../../system/log';
+import { batchRun } from '@fastgpt/global/common/fn/utils';

 export type readRawTextByLocalFileParams = {
   teamId: string;
@@ -53,21 +52,6 @@ export const readRawContentByFileBuffer = async ({
   encoding: string;
   metadata?: Record<string, any>;
 }) => {
-  // Upload image in markdown
-  const matchMdImgTextAndUpload = ({ teamId, md }: { md: string; teamId: string }) =>
-    markdownProcess({
-      rawText: md,
-      uploadImgController: (base64Img) =>
-        uploadMongoImg({
-          type: MongoImageTypeEnum.collectionImage,
-          base64Img,
-          teamId,
-          metadata,
-          expiredTime: addHours(new Date(), 1)
-        })
-    });
   /* If */
   const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
   const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
   const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
@@ -111,19 +95,28 @@ export const readRawContentByFileBuffer = async ({
     };
   };

-  let { rawText, formatText } =
+  let { rawText, formatText, imageList } =
     (await readFileFromCustomService()) ||
     (await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
       extension,
       encoding,
-      buffer
+      buffer,
+      teamId
     }));

   // markdown data format
-  if (['md', 'html', 'docx', ...customReadFileExtension.split(',')].includes(extension)) {
-    rawText = await matchMdImgTextAndUpload({
-      teamId: teamId,
-      md: rawText
-    });
+  if (imageList) {
+    await batchRun(imageList, async (item) => {
+      const src = await uploadMongoImg({
+        type: MongoImageTypeEnum.collectionImage,
+        base64Img: `data:${item.mime};base64,${item.base64}`,
+        teamId,
+        metadata: {
+          ...metadata,
+          mime: item.mime
+        }
+      });
+      rawText = rawText.replace(item.uuid, src);
+    });
   }
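
Note on the pattern above: the worker now returns rawText containing one UUID placeholder per extracted image, and this service code uploads each image and swaps the placeholder for the stored URL. A minimal standalone sketch of that substitution, assuming a hypothetical uploadImage helper and a plain sequential loop (the real code uses FastGPT's uploadMongoImg and batchRun):

type ImageType = { uuid: string; base64: string; mime: string };

// Hypothetical uploader for this sketch: stores the image, returns its URL.
async function uploadImage(img: ImageType): Promise<string> {
  return `/api/img/${img.uuid}`; // illustrative return value
}

async function resolveImagePlaceholders(rawText: string, imageList: ImageType[]) {
  for (const item of imageList) {
    const src = await uploadImage(item);
    // Each UUID sits where the image src used to be, so a plain string
    // replace restores the markdown image link.
    rawText = rawText.replace(item.uuid, src);
  }
  return rawText;
}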


@@ -1,7 +1,14 @@
 import TurndownService from 'turndown';
+import { ImageType } from '../readFile/type';
 // @ts-ignore
 const turndownPluginGfm = require('joplin-turndown-plugin-gfm');

-export const html2md = (html: string): string => {
+export const html2md = (
+  html: string
+): {
+  rawText: string;
+  imageList: ImageType[];
+} => {
   const turndownService = new TurndownService({
     headingStyle: 'atx',
     bulletListMarker: '-',
@@ -15,12 +22,32 @@ export const html2md = (html: string): string => {
   try {
     turndownService.remove(['i', 'script', 'iframe', 'style']);
     turndownService.use(turndownPluginGfm.gfm);
-    return turndownService.turndown(html);
+
+    const base64Regex = /"(data:image\/[^;]+;base64[^"]+)"/g;
+    const imageList: ImageType[] = [];
+    const images = Array.from(html.match(base64Regex) || []);
+    for (const image of images) {
+      const uuid = crypto.randomUUID();
+      const mime = image.split(';')[0].split(':')[1];
+      const base64 = image.split(',')[1];
+      html = html.replace(image, uuid);
+      imageList.push({
+        uuid,
+        base64,
+        mime
+      });
+    }
+
+    return {
+      rawText: turndownService.turndown(html),
+      imageList
+    };
   } catch (error) {
     console.log('html 2 markdown error', error);
-    return '';
+    return {
+      rawText: '',
+      imageList: []
+    };
   }
 };
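
A quick usage sketch of the new html2md signature (sample HTML and output comments are illustrative; crypto.randomUUID is assumed to be available as a global, as on modern Node runtimes):

const html = '<p>hello</p><img src="data:image/png;base64,iVBORw0KGgo=">';

const { rawText, imageList } = html2md(html);
// The quoted data URI is matched (quotes included), swapped for a UUID,
// and recorded, so rawText comes out roughly as:
//   hello
//   ![](<generated uuid>)
// while imageList holds [{ uuid, base64: 'iVBORw0KGgo=', mime: 'image/png' }].
console.log(rawText, imageList);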


@@ -1,20 +1,39 @@
-import mammoth from 'mammoth';
-import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
+import mammoth, { images } from 'mammoth';
+import { ReadRawTextByBuffer, ReadFileResponse, ImageType } from '../type';
 import { html2md } from '../../htmlStr2Md/utils';

 /**
  * read docx to markdown
  */
 export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
+  const imageList: ImageType[] = [];
   try {
-    const { value: html } = await mammoth.convertToHtml({
-      buffer
-    });
+    const { value: html } = await mammoth.convertToHtml(
+      {
+        buffer
+      },
+      {
+        convertImage: images.imgElement(async (image) => {
+          const imageBase64 = await image.readAsBase64String();
+          const uuid = crypto.randomUUID();
+          const mime = image.contentType;
+          imageList.push({
+            uuid,
+            base64: imageBase64,
+            mime
+          });
+          return {
+            src: uuid
+          };
+        })
+      }
+    );

-    const rawText = html2md(html);
+    const { rawText } = html2md(html);

     return {
-      rawText
+      rawText,
+      imageList
     };
   } catch (error) {
     console.log('error doc read:', error);
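
For reference, images.imgElement is mammoth's documented hook for embedded images: the callback receives each image, and the { src } it returns is written into the generated img tag, which is what lets a UUID stand in for the binary payload. A hedged driver for the updated reader (the file path is illustrative, and the cast papers over ReadRawTextProps fields not shown in this diff):

import { readFile } from 'fs/promises';

async function demo() {
  const buffer = await readFile('./example.docx'); // illustrative path
  const { rawText, imageList } = await readDocsFile({ buffer } as ReadRawTextByBuffer);
  // rawText is markdown with a UUID wherever an image was embedded;
  // imageList carries the base64 payloads read via readAsBase64String().
  console.log(rawText.slice(0, 200), imageList?.length);
}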


@@ -5,9 +5,10 @@ import { html2md } from '../../htmlStr2Md/utils';
 export const readHtmlRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
   const { rawText: html } = readFileRawText(params);

-  const rawText = html2md(html);
+  const { rawText, imageList } = html2md(html);

   return {
-    rawText
+    rawText,
+    imageList
   };
 };


@@ -8,7 +8,14 @@ export type ReadRawTextProps<T> = {
 export type ReadRawTextByBuffer = ReadRawTextProps<Buffer>;

+export type ImageType = {
+  uuid: string;
+  base64: string;
+  mime: string;
+};
+
 export type ReadFileResponse = {
   rawText: string;
   formatText?: string;
+  imageList?: ImageType[];
 };
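
Taken together these types encode the worker/service contract: the worker never touches the database; it hands back base64 payloads keyed by UUID and leaves uploading to the caller. An illustrative value (UUID and base64 are made up):

const response: ReadFileResponse = {
  rawText: 'Intro ![](3f2b6c2e-8f4a-4d1b-9c0e-1a2b3c4d5e6f) outro',
  imageList: [
    {
      uuid: '3f2b6c2e-8f4a-4d1b-9c0e-1a2b3c4d5e6f',
      base64: 'iVBORw0KGgo=',
      mime: 'image/png'
    }
  ]
};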

pnpm-lock.yaml: 558 lines changed (generated file; diff suppressed because it is too large)


@@ -32,6 +32,7 @@ async function handler(req: ApiRequestProps<FileIdCreateDatasetCollectionParams>
     ...body
   } = req.body;

+  const start = Date.now();
   const { teamId, tmbId, dataset } = await authDataset({
     req,
     authToken: true,
@@ -46,6 +47,7 @@ async function handler(req: ApiRequestProps<FileIdCreateDatasetCollectionParams>
     bucketName: BucketNameEnum.dataset,
     fileId
   });

+  // 2. split chunks
   const chunks = rawText2Chunks({
     rawText,