fix: upload file (#2992)

* fix: upload file

* chore: remove wasm, support html image parse

* chore: adjust

* chore: move base64match function into htmlstr2md
Finley Ge
2024-10-28 21:44:50 +08:00
committed by GitHub
parent 4e3d817b63
commit b712a821f8
8 changed files with 440 additions and 240 deletions


@@ -159,7 +159,6 @@ export const readFileContentFromMongo = async ({
     getFileById({ bucketName, fileId }),
     getDownloadStream({ bucketName, fileId })
   ]);
-  // console.log('get file stream', Date.now() - start);
   if (!file) {
     return Promise.reject(CommonErrEnum.fileNotFound);
   }


@@ -1,7 +1,5 @@
-import { markdownProcess } from '@fastgpt/global/common/string/markdown';
 import { uploadMongoImg } from '../image/controller';
 import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
-import { addHours } from 'date-fns';
 import FormData from 'form-data';
 import { WorkerNameEnum, runWorker } from '../../../worker/utils';
@@ -10,6 +8,7 @@ import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
 import type { ReadFileResponse } from '../../../worker/readFile/type';
 import axios from 'axios';
 import { addLog } from '../../system/log';
+import { batchRun } from '@fastgpt/global/common/fn/utils';

 export type readRawTextByLocalFileParams = {
   teamId: string;
@@ -53,21 +52,6 @@ export const readRawContentByFileBuffer = async ({
   encoding: string;
   metadata?: Record<string, any>;
 }) => {
-  // Upload image in markdown
-  const matchMdImgTextAndUpload = ({ teamId, md }: { md: string; teamId: string }) =>
-    markdownProcess({
-      rawText: md,
-      uploadImgController: (base64Img) =>
-        uploadMongoImg({
-          type: MongoImageTypeEnum.collectionImage,
-          base64Img,
-          teamId,
-          metadata,
-          expiredTime: addHours(new Date(), 1)
-        })
-    });
-  /* If */
   const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
   const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
   const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
@@ -111,19 +95,28 @@ export const readRawContentByFileBuffer = async ({
     };
   };

-  let { rawText, formatText } =
+  let { rawText, formatText, imageList } =
     (await readFileFromCustomService()) ||
     (await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
       extension,
       encoding,
-      buffer
+      buffer,
+      teamId
     }));

   // markdown data format
-  if (['md', 'html', 'docx', ...customReadFileExtension.split(',')].includes(extension)) {
-    rawText = await matchMdImgTextAndUpload({
-      teamId: teamId,
-      md: rawText
+  if (imageList) {
+    await batchRun(imageList, async (item) => {
+      const src = await uploadMongoImg({
+        type: MongoImageTypeEnum.collectionImage,
+        base64Img: `data:${item.mime};base64,${item.base64}`,
+        teamId,
+        metadata: {
+          ...metadata,
+          mime: item.mime
+        }
+      });
+      rawText = rawText.replace(item.uuid, src);
     });
   }
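
Taken together, the server side now expects the worker to return rawText containing uuid placeholders plus an imageList holding the extracted base64 data; it uploads each image and swaps the placeholder for the stored image's src. A minimal sketch of that replacement step, with a hypothetical uploadImage helper standing in for uploadMongoImg and Promise.all standing in for batchRun:

// Sketch only: `uploadImage` fakes the upload and returns a made-up URL;
// each item.uuid was inserted by the worker where the original image sat.
type ImageType = { uuid: string; base64: string; mime: string };

const uploadImage = async (dataUri: string): Promise<string> =>
  `/api/img/${dataUri.length.toString(16)}`; // fake URL, for illustration only

export const replaceImagePlaceholders = async (rawText: string, imageList: ImageType[]) => {
  await Promise.all(
    imageList.map(async (item) => {
      const src = await uploadImage(`data:${item.mime};base64,${item.base64}`);
      // Swap the worker's placeholder for the uploaded image's address.
      rawText = rawText.replace(item.uuid, src);
    })
  );
  return rawText;
};

Replacing by uuid instead of re-parsing markdown for base64 images is what lets the same branch handle md, html, docx and custom extensions alike, which is why the old extension check disappears.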


@@ -1,7 +1,14 @@
 import TurndownService from 'turndown';
+import { ImageType } from '../readFile/type';
+// @ts-ignore
 const turndownPluginGfm = require('joplin-turndown-plugin-gfm');

-export const html2md = (html: string): string => {
+export const html2md = (
+  html: string
+): {
+  rawText: string;
+  imageList: ImageType[];
+} => {
   const turndownService = new TurndownService({
     headingStyle: 'atx',
     bulletListMarker: '-',
@@ -15,12 +22,32 @@ export const html2md = (html: string): string => {
   try {
     turndownService.remove(['i', 'script', 'iframe', 'style']);
     turndownService.use(turndownPluginGfm.gfm);
-    return turndownService.turndown(html);
+
+    const base64Regex = /"(data:image\/[^;]+;base64[^"]+)"/g;
+    const imageList: ImageType[] = [];
+    const images = Array.from(html.match(base64Regex) || []);
+    for (const image of images) {
+      const uuid = crypto.randomUUID();
+      const mime = image.split(';')[0].split(':')[1];
+      const base64 = image.split(',')[1];
+      html = html.replace(image, uuid);
+      imageList.push({
+        uuid,
+        base64,
+        mime
+      });
+    }
+
+    return {
+      rawText: turndownService.turndown(html),
+      imageList
+    };
   } catch (error) {
     console.log('html 2 markdown error', error);
-    return '';
+    return {
+      rawText: '',
+      imageList: []
+    };
   }
 };
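
From the caller's perspective, html2md now returns an object rather than a string: inline base64 images are pulled out of the HTML before turndown runs, leaving uuid placeholders in the markdown. A usage sketch (the import path and sample HTML are illustrative, and the exact markdown output depends on turndown):

import { html2md } from './htmlStr2Md/utils'; // path is illustrative

const html = '<h1>Hello</h1><img src="data:image/png;base64,iVBORw0KGgo=">';
const { rawText, imageList } = html2md(html);

// The image bytes stay in imageList so they can be uploaded out of band,
// while rawText should carry the matching uuid placeholder.
console.log(imageList[0]?.mime); // "image/png"
console.log(rawText.includes(imageList[0]?.uuid ?? '')); // true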


@@ -1,20 +1,39 @@
-import mammoth from 'mammoth';
-import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
+import mammoth, { images } from 'mammoth';
+import { ReadRawTextByBuffer, ReadFileResponse, ImageType } from '../type';
 import { html2md } from '../../htmlStr2Md/utils';

 /**
  * read docx to markdown
  */
 export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
+  const imageList: ImageType[] = [];
   try {
-    const { value: html } = await mammoth.convertToHtml({
-      buffer
-    });
+    const { value: html } = await mammoth.convertToHtml(
+      {
+        buffer
+      },
+      {
+        convertImage: images.imgElement(async (image) => {
+          const imageBase64 = await image.readAsBase64String();
+          const uuid = crypto.randomUUID();
+          const mime = image.contentType;
+          imageList.push({
+            uuid,
+            base64: imageBase64,
+            mime
+          });
+          return {
+            src: uuid
+          };
+        })
+      }
+    );

-    const rawText = html2md(html);
+    const { rawText } = html2md(html);

     return {
-      rawText
+      rawText,
+      imageList
     };
   } catch (error) {
     console.log('error doc read:', error);
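
The docx path leans on mammoth's convertImage hook: images.imgElement is called for each embedded picture and decides what lands in the generated HTML. Emitting only a uuid as src keeps that HTML small, and the base64 payload travels separately. A standalone sketch of the same idea (extractDocxImages and the path argument are illustrative, not part of the repo):

import { randomUUID } from 'node:crypto';
import mammoth, { images } from 'mammoth';

// Illustrative helper: convert a .docx on disk, collecting images as it goes.
export const extractDocxImages = async (path: string) => {
  const imageList: { uuid: string; base64: string; mime: string }[] = [];

  const { value: html } = await mammoth.convertToHtml(
    { path },
    {
      convertImage: images.imgElement(async (image) => {
        const base64 = await image.readAsBase64String();
        const uuid = randomUUID();
        imageList.push({ uuid, base64, mime: image.contentType });
        // The placeholder ends up as the <img> src in the HTML output.
        return { src: uuid };
      })
    }
  );

  return { html, imageList };
};

Returning { src: uuid } mirrors what html2md does for inline base64 images, so downstream code only ever sees uuid placeholders no matter where the image came from.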


@@ -5,9 +5,10 @@ import { html2md } from '../../htmlStr2Md/utils';
 export const readHtmlRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
   const { rawText: html } = readFileRawText(params);

-  const rawText = html2md(html);
+  const { rawText, imageList } = html2md(html);

   return {
-    rawText
+    rawText,
+    imageList
   };
 };


@@ -8,7 +8,14 @@ export type ReadRawTextProps<T> = {
 export type ReadRawTextByBuffer = ReadRawTextProps<Buffer>;

+export type ImageType = {
+  uuid: string;
+  base64: string;
+  mime: string;
+};
+
 export type ReadFileResponse = {
   rawText: string;
   formatText?: string;
+  imageList?: ImageType[];
 };
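
For reference, a ReadFileResponse under the new contract might look roughly like this (all values invented for illustration; the import path assumes a file next to the worker types):

import type { ReadFileResponse } from './type'; // worker-local path, illustrative

const example: ReadFileResponse = {
  rawText: '# Report\n\n![](7d9f2a3e-1b4c-4e5f-9a6b-0c1d2e3f4a5b)',
  imageList: [
    {
      uuid: '7d9f2a3e-1b4c-4e5f-9a6b-0c1d2e3f4a5b',
      base64: 'iVBORw0KGgoAAAANSUhEUg...', // truncated, illustrative only
      mime: 'image/png'
    }
  ]
};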

pnpm-lock.yaml (generated): 558 lines changed; diff suppressed because it is too large.


@@ -32,6 +32,7 @@ async function handler(req: ApiRequestProps<FileIdCreateDatasetCollectionParams>
     ...body
   } = req.body;

+  const start = Date.now();
   const { teamId, tmbId, dataset } = await authDataset({
     req,
     authToken: true,
@@ -46,6 +47,7 @@ async function handler(req: ApiRequestProps<FileIdCreateDatasetCollectionParams>
     bucketName: BucketNameEnum.dataset,
     fileId
   });
   // 2. split chunks
   const chunks = rawText2Chunks({
     rawText,