mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
fix: upload file (#2992)
* fix: upload file
* chore: remove wasm, support html image parse
* chore: adjust
* chore: move base64match function into htmlstr2md
This commit is contained in:
@@ -159,7 +159,6 @@ export const readFileContentFromMongo = async ({
|
|||||||
getFileById({ bucketName, fileId }),
|
getFileById({ bucketName, fileId }),
|
||||||
getDownloadStream({ bucketName, fileId })
|
getDownloadStream({ bucketName, fileId })
|
||||||
]);
|
]);
|
||||||
// console.log('get file stream', Date.now() - start);
|
|
||||||
if (!file) {
|
if (!file) {
|
||||||
return Promise.reject(CommonErrEnum.fileNotFound);
|
return Promise.reject(CommonErrEnum.fileNotFound);
|
||||||
}
|
}
|
||||||
|
@@ -1,7 +1,5 @@
|
|||||||
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
|
|
||||||
import { uploadMongoImg } from '../image/controller';
|
import { uploadMongoImg } from '../image/controller';
|
||||||
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
|
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
|
||||||
import { addHours } from 'date-fns';
|
|
||||||
import FormData from 'form-data';
|
import FormData from 'form-data';
|
||||||
|
|
||||||
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
||||||
@@ -10,6 +8,7 @@ import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
|||||||
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
||||||
import axios from 'axios';
|
import axios from 'axios';
|
||||||
import { addLog } from '../../system/log';
|
import { addLog } from '../../system/log';
|
||||||
|
import { batchRun } from '@fastgpt/global/common/fn/utils';
|
||||||
|
|
||||||
export type readRawTextByLocalFileParams = {
|
export type readRawTextByLocalFileParams = {
|
||||||
teamId: string;
|
teamId: string;
|
||||||
@@ -53,21 +52,6 @@ export const readRawContentByFileBuffer = async ({
|
|||||||
encoding: string;
|
encoding: string;
|
||||||
metadata?: Record<string, any>;
|
metadata?: Record<string, any>;
|
||||||
}) => {
|
}) => {
|
||||||
// Upload image in markdown
|
|
||||||
const matchMdImgTextAndUpload = ({ teamId, md }: { md: string; teamId: string }) =>
|
|
||||||
markdownProcess({
|
|
||||||
rawText: md,
|
|
||||||
uploadImgController: (base64Img) =>
|
|
||||||
uploadMongoImg({
|
|
||||||
type: MongoImageTypeEnum.collectionImage,
|
|
||||||
base64Img,
|
|
||||||
teamId,
|
|
||||||
metadata,
|
|
||||||
expiredTime: addHours(new Date(), 1)
|
|
||||||
})
|
|
||||||
});
|
|
||||||
|
|
||||||
/* If */
|
|
||||||
const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
|
const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
|
||||||
const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
|
const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
|
||||||
const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
|
const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
|
||||||
@@ -111,19 +95,28 @@ export const readRawContentByFileBuffer = async ({
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
let { rawText, formatText } =
|
let { rawText, formatText, imageList } =
|
||||||
(await readFileFromCustomService()) ||
|
(await readFileFromCustomService()) ||
|
||||||
(await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
(await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
||||||
extension,
|
extension,
|
||||||
encoding,
|
encoding,
|
||||||
buffer
|
buffer,
|
||||||
|
teamId
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// markdown data format
|
// markdown data format
|
||||||
if (['md', 'html', 'docx', ...customReadFileExtension.split(',')].includes(extension)) {
|
if (imageList) {
|
||||||
rawText = await matchMdImgTextAndUpload({
|
await batchRun(imageList, async (item) => {
|
||||||
teamId: teamId,
|
const src = await uploadMongoImg({
|
||||||
md: rawText
|
type: MongoImageTypeEnum.collectionImage,
|
||||||
|
base64Img: `data:${item.mime};base64,${item.base64}`,
|
||||||
|
teamId,
|
||||||
|
metadata: {
|
||||||
|
...metadata,
|
||||||
|
mime: item.mime
|
||||||
|
}
|
||||||
|
});
|
||||||
|
rawText = rawText.replace(item.uuid, src);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,7 +1,14 @@
|
|||||||
import TurndownService from 'turndown';
|
import TurndownService from 'turndown';
|
||||||
|
import { ImageType } from '../readFile/type';
|
||||||
|
// @ts-ignore
|
||||||
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
|
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
|
||||||
|
|
||||||
export const html2md = (html: string): string => {
|
export const html2md = (
|
||||||
|
html: string
|
||||||
|
): {
|
||||||
|
rawText: string;
|
||||||
|
imageList: ImageType[];
|
||||||
|
} => {
|
||||||
const turndownService = new TurndownService({
|
const turndownService = new TurndownService({
|
||||||
headingStyle: 'atx',
|
headingStyle: 'atx',
|
||||||
bulletListMarker: '-',
|
bulletListMarker: '-',
|
||||||
@@ -15,12 +22,32 @@ export const html2md = (html: string): string => {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
turndownService.remove(['i', 'script', 'iframe', 'style']);
|
turndownService.remove(['i', 'script', 'iframe', 'style']);
|
||||||
|
|
||||||
turndownService.use(turndownPluginGfm.gfm);
|
turndownService.use(turndownPluginGfm.gfm);
|
||||||
|
|
||||||
return turndownService.turndown(html);
|
const base64Regex = /"(data:image\/[^;]+;base64[^"]+)"/g;
|
||||||
|
const imageList: ImageType[] = [];
|
||||||
|
const images = Array.from(html.match(base64Regex) || []);
|
||||||
|
for (const image of images) {
|
||||||
|
const uuid = crypto.randomUUID();
|
||||||
|
const mime = image.split(';')[0].split(':')[1];
|
||||||
|
const base64 = image.split(',')[1];
|
||||||
|
html = html.replace(image, uuid);
|
||||||
|
imageList.push({
|
||||||
|
uuid,
|
||||||
|
base64,
|
||||||
|
mime
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
rawText: turndownService.turndown(html),
|
||||||
|
imageList
|
||||||
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.log('html 2 markdown error', error);
|
console.log('html 2 markdown error', error);
|
||||||
return '';
|
return {
|
||||||
|
rawText: '',
|
||||||
|
imageList: []
|
||||||
|
};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@@ -1,20 +1,39 @@
|
|||||||
import mammoth from 'mammoth';
|
import mammoth, { images } from 'mammoth';
|
||||||
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
|
import { ReadRawTextByBuffer, ReadFileResponse, ImageType } from '../type';
|
||||||
import { html2md } from '../../htmlStr2Md/utils';
|
import { html2md } from '../../htmlStr2Md/utils';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* read docx to markdown
|
* read docx to markdown
|
||||||
*/
|
*/
|
||||||
export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
||||||
|
const imageList: ImageType[] = [];
|
||||||
try {
|
try {
|
||||||
const { value: html } = await mammoth.convertToHtml({
|
const { value: html } = await mammoth.convertToHtml(
|
||||||
|
{
|
||||||
buffer
|
buffer
|
||||||
|
},
|
||||||
|
{
|
||||||
|
convertImage: images.imgElement(async (image) => {
|
||||||
|
const imageBase64 = await image.readAsBase64String();
|
||||||
|
const uuid = crypto.randomUUID();
|
||||||
|
const mime = image.contentType;
|
||||||
|
imageList.push({
|
||||||
|
uuid,
|
||||||
|
base64: imageBase64,
|
||||||
|
mime
|
||||||
});
|
});
|
||||||
|
return {
|
||||||
|
src: uuid
|
||||||
|
};
|
||||||
|
})
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
const rawText = html2md(html);
|
const { rawText } = html2md(html);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
rawText
|
rawText,
|
||||||
|
imageList
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.log('error doc read:', error);
|
console.log('error doc read:', error);
|
||||||
|
@@ -5,9 +5,10 @@ import { html2md } from '../../htmlStr2Md/utils';
|
|||||||
export const readHtmlRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
export const readHtmlRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
||||||
const { rawText: html } = readFileRawText(params);
|
const { rawText: html } = readFileRawText(params);
|
||||||
|
|
||||||
const rawText = html2md(html);
|
const { rawText, imageList } = html2md(html);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
rawText
|
rawText,
|
||||||
|
imageList
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
7
packages/service/worker/readFile/type.d.ts
vendored
7
packages/service/worker/readFile/type.d.ts
vendored
@@ -8,7 +8,14 @@ export type ReadRawTextProps<T> = {
|
|||||||
|
|
||||||
export type ReadRawTextByBuffer = ReadRawTextProps<Buffer>;
|
export type ReadRawTextByBuffer = ReadRawTextProps<Buffer>;
|
||||||
|
|
||||||
|
export type ImageType = {
|
||||||
|
uuid: string;
|
||||||
|
base64: string;
|
||||||
|
mime: string;
|
||||||
|
};
|
||||||
|
|
||||||
export type ReadFileResponse = {
|
export type ReadFileResponse = {
|
||||||
rawText: string;
|
rawText: string;
|
||||||
formatText?: string;
|
formatText?: string;
|
||||||
|
imageList?: ImageType[];
|
||||||
};
|
};
|
||||||
|
558
pnpm-lock.yaml
generated
558
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
@@ -32,6 +32,7 @@ async function handler(req: ApiRequestProps<FileIdCreateDatasetCollectionParams>
|
|||||||
...body
|
...body
|
||||||
} = req.body;
|
} = req.body;
|
||||||
|
|
||||||
|
const start = Date.now();
|
||||||
const { teamId, tmbId, dataset } = await authDataset({
|
const { teamId, tmbId, dataset } = await authDataset({
|
||||||
req,
|
req,
|
||||||
authToken: true,
|
authToken: true,
|
||||||
@@ -46,6 +47,7 @@ async function handler(req: ApiRequestProps<FileIdCreateDatasetCollectionParams>
|
|||||||
bucketName: BucketNameEnum.dataset,
|
bucketName: BucketNameEnum.dataset,
|
||||||
fileId
|
fileId
|
||||||
});
|
});
|
||||||
|
|
||||||
// 2. split chunks
|
// 2. split chunks
|
||||||
const chunks = rawText2Chunks({
|
const chunks = rawText2Chunks({
|
||||||
rawText,
|
rawText,
|
||||||
|
Reference in New Issue
Block a user