mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-21 11:43:56 +00:00

* i18n * tool call * fix: mcp create permission;Plugin unauth tip * fix: mcp create permission;Plugin unauth tip * fix: Cite modal permission * remove invalide cite * perf: prompt * filter fulltext search * fix: ts * fix: ts * fix: ts
192 lines
4.9 KiB
TypeScript
192 lines
4.9 KiB
TypeScript
import { uploadMongoImg } from '../image/controller';
|
|
import FormData from 'form-data';
|
|
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
|
import fs from 'fs';
|
|
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
|
import axios from 'axios';
|
|
import { addLog } from '../../system/log';
|
|
import { batchRun } from '@fastgpt/global/common/system/utils';
|
|
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
|
|
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
|
|
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
|
|
|
|
/**
 * Arguments accepted by `readRawTextByLocalFile`.
 *
 * Every field except `path` is forwarded unchanged to
 * `readRawContentByFileBuffer`.
 */
export type readRawTextByLocalFileParams = {
  /** Team identifier, forwarded to the buffer parser. */
  teamId: string;
  /** Team-member identifier, forwarded to the buffer parser. */
  tmbId: string;
  /** Location of the file on the local file system; also used to derive the extension. */
  path: string;
  /** Text encoding name, forwarded to the buffer parser. */
  encoding: string;
  /** Opt-in flag for the custom PDF parsing path; forwarded to the buffer parser. */
  customPdfParse?: boolean;
  /** Free-form metadata, forwarded to the buffer parser. */
  metadata?: Record<string, any>;
};
|
|
/**
 * Reads a file from the local file system and parses it into text.
 *
 * The lower-cased extension is derived from the file name and the file is
 * loaded fully into memory; parsing is then delegated to
 * `readRawContentByFileBuffer` with `isQAImport` fixed to `false`.
 *
 * @param params - See `readRawTextByLocalFileParams`.
 * @returns Whatever `readRawContentByFileBuffer` resolves to.
 * @throws Rejects if the file cannot be read or if parsing rejects.
 */
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
  const { path } = params;

  // Only the last path segment may contribute an extension. Splitting the
  // whole path on '.' would turn `/tmp/upload.d/README` into `d/readme`,
  // and a dot-less name such as `README` into the extension `readme`.
  const fileName = path.split(/[\\/]/).pop() ?? '';
  const dotIndex = fileName.lastIndexOf('.');
  const extension = dotIndex === -1 ? '' : fileName.slice(dotIndex + 1).toLowerCase();

  const buffer = await fs.promises.readFile(path);

  return readRawContentByFileBuffer({
    extension,
    isQAImport: false,
    customPdfParse: params.customPdfParse,
    teamId: params.teamId,
    tmbId: params.tmbId,
    encoding: params.encoding,
    buffer,
    metadata: params.metadata
  });
};
|
|
|
|
/**
 * Parses an in-memory file into text and uploads any images the parser
 * extracted, substituting each image placeholder with the upload result.
 *
 * Parser selection:
 * - non-PDF files, or PDFs with `customPdfParse` off, go to the `readFile` worker;
 * - PDFs with `customPdfParse` on use `global.systemEnv.customPdfParse.url`
 *   when set, otherwise Doc2x when `doc2xKey` is set, otherwise the worker.
 *
 * For `csv` / `xlsx` files that are not QA imports, `rawText` is replaced by
 * `formatText` when the latter is non-empty.
 *
 * @param teamId - Passed to the worker, the usage record and the image upload.
 * @param tmbId - Passed to the usage record of the external PDF parsers.
 * @param extension - File extension used to choose the parser (compared against `pdf`, `csv`, `xlsx`).
 * @param buffer - File content.
 * @param encoding - Passed to the `readFile` worker.
 * @param metadata - Spread into the metadata of every uploaded image.
 * @param customPdfParse - Enables the external PDF parsers. Defaults to `false`.
 * @param isQAImport - Keeps `rawText` as-is for `csv` / `xlsx`. Defaults to `false`.
 * @returns The parsed `rawText`, `formatText` and `imageList`.
 * @throws Rejects with the `error` value reported by the custom PDF service,
 *   or with whatever the selected parser rejects with.
 */
export const readRawContentByFileBuffer = async ({
  teamId,
  tmbId,

  extension,
  buffer,
  encoding,
  metadata,
  customPdfParse = false,
  isQAImport = false
}: {
  teamId: string;
  tmbId: string;

  extension: string;
  buffer: Buffer;
  encoding: string;
  metadata?: Record<string, any>;

  customPdfParse?: boolean;
  // Optional because the destructuring above supplies a default.
  isQAImport?: boolean;
}): Promise<ReadFileResponse> => {
  // Literal "replace every occurrence". `String.prototype.replace` with a
  // string pattern only touches the first match and expands `$&`-style
  // sequences in the replacement, which could corrupt an uploaded image URL.
  const replaceEvery = (source: string, search: string, replacement: string) =>
    search ? source.split(search).join(replacement) : source;

  // Built-in parser, executed through the readFile worker.
  const systemParse = () =>
    runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
      extension,
      encoding,
      buffer,
      teamId
    });

  // User-configured HTTP parsing service.
  const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
    const url = global.systemEnv.customPdfParse?.url;
    const token = global.systemEnv.customPdfParse?.key;
    if (!url) return systemParse();

    const start = Date.now();
    addLog.info('Parsing files from an external service');

    const data = new FormData();
    data.append('file', buffer, {
      filename: `file.${extension}`
    });
    const { data: response } = await axios.post<{
      pages: number;
      markdown: string;
      error?: object | string;
    }>(url, data, {
      timeout: 600000,
      headers: {
        ...data.getHeaders(),
        Authorization: token ? `Bearer ${token}` : undefined
      }
    });

    // The service reports failures in the response body; the value is
    // propagated unchanged as the rejection reason.
    if (response.error) {
      return Promise.reject(response.error);
    }

    addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`);

    const rawText = response.markdown;
    const { text, imageList } = matchMdImg(rawText);

    // Not awaited: the usage record does not delay the parse result.
    createPdfParseUsage({
      teamId,
      tmbId,
      pages: response.pages
    });

    return {
      rawText: text,
      formatText: rawText,
      imageList
    };
  };

  // Doc2x api
  const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
    const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
    if (!doc2xKey) return systemParse();

    const { pages, text, imageList } = await useDoc2xServer({ apiKey: doc2xKey }).parsePDF(buffer);

    // Not awaited: the usage record does not delay the parse result.
    createPdfParseUsage({
      teamId,
      tmbId,
      pages
    });

    return {
      rawText: text,
      formatText: text,
      imageList
    };
  };

  // Custom read file service
  const pdfParseFn = async (): Promise<ReadFileResponse> => {
    if (!customPdfParse) return systemParse();
    if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
    if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();

    return systemParse();
  };

  const start = Date.now();
  addLog.debug(`Start parse file`, { extension });

  let { rawText, formatText, imageList } = await (async () => {
    if (extension === 'pdf') {
      return await pdfParseFn();
    }
    return await systemParse();
  })();

  addLog.debug(`Parse file success, time: ${Date.now() - start}ms. `);

  // markdown data format
  if (imageList) {
    await batchRun(imageList, async (item) => {
      const src = await (async () => {
        try {
          return await uploadMongoImg({
            base64Img: `data:${item.mime};base64,${item.base64}`,
            teamId,
            metadata: {
              ...metadata,
              mime: item.mime
            }
          });
        } catch (error) {
          // Best effort: a failed upload leaves a marker text in place of the image.
          addLog.warn('Upload file image error', { error });
          return 'Upload load image error';
        }
      })();

      // Swap the parser's placeholder for the upload result everywhere it appears.
      rawText = replaceEvery(rawText, item.uuid, src);
      if (formatText) {
        formatText = replaceEvery(formatText, item.uuid, src);
      }
    });
  }

  if (['csv', 'xlsx'].includes(extension)) {
    // qa data
    if (isQAImport) {
      rawText = rawText || '';
    } else {
      rawText = formatText || rawText;
    }
  }

  addLog.debug(`Upload file success, time: ${Date.now() - start}ms`);

  return { rawText, formatText, imageList };
};
|