Files
FastGPT/packages/service/common/file/read/utils.ts
Archer 4e83840c14 perf: tool call check (#4818)
* i18n

* tool call

* fix: mcp create permission;Plugin unauth tip

* fix: mcp create permission;Plugin unauth tip

* fix: Cite modal permission

* remove invalide cite

* perf: prompt

* filter fulltext search

* fix: ts

* fix: ts

* fix: ts
2025-05-15 15:51:34 +08:00

192 lines
4.9 KiB
TypeScript

import { uploadMongoImg } from '../image/controller';
import FormData from 'form-data';
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import fs from 'fs';
import type { ReadFileResponse } from '../../../worker/readFile/type';
import axios from 'axios';
import { addLog } from '../../system/log';
import { batchRun } from '@fastgpt/global/common/system/utils';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
export type readRawTextByLocalFileParams = {
teamId: string;
tmbId: string;
path: string;
encoding: string;
customPdfParse?: boolean;
metadata?: Record<string, any>;
};
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
const { path } = params;
const extension = path?.split('.')?.pop()?.toLowerCase() || '';
const buffer = await fs.promises.readFile(path);
return readRawContentByFileBuffer({
extension,
isQAImport: false,
customPdfParse: params.customPdfParse,
teamId: params.teamId,
tmbId: params.tmbId,
encoding: params.encoding,
buffer,
metadata: params.metadata
});
};
export const readRawContentByFileBuffer = async ({
teamId,
tmbId,
extension,
buffer,
encoding,
metadata,
customPdfParse = false,
isQAImport = false
}: {
teamId: string;
tmbId: string;
extension: string;
buffer: Buffer;
encoding: string;
metadata?: Record<string, any>;
customPdfParse?: boolean;
isQAImport: boolean;
}): Promise<ReadFileResponse> => {
const systemParse = () =>
runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
extension,
encoding,
buffer,
teamId
});
const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
const url = global.systemEnv.customPdfParse?.url;
const token = global.systemEnv.customPdfParse?.key;
if (!url) return systemParse();
const start = Date.now();
addLog.info('Parsing files from an external service');
const data = new FormData();
data.append('file', buffer, {
filename: `file.${extension}`
});
const { data: response } = await axios.post<{
pages: number;
markdown: string;
error?: Object | string;
}>(url, data, {
timeout: 600000,
headers: {
...data.getHeaders(),
Authorization: token ? `Bearer ${token}` : undefined
}
});
if (response.error) {
return Promise.reject(response.error);
}
addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`);
const rawText = response.markdown;
const { text, imageList } = matchMdImg(rawText);
createPdfParseUsage({
teamId,
tmbId,
pages: response.pages
});
return {
rawText: text,
formatText: rawText,
imageList
};
};
// Doc2x api
const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
if (!doc2xKey) return systemParse();
const { pages, text, imageList } = await useDoc2xServer({ apiKey: doc2xKey }).parsePDF(buffer);
createPdfParseUsage({
teamId,
tmbId,
pages
});
return {
rawText: text,
formatText: text,
imageList
};
};
// Custom read file service
const pdfParseFn = async (): Promise<ReadFileResponse> => {
if (!customPdfParse) return systemParse();
if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();
return systemParse();
};
const start = Date.now();
addLog.debug(`Start parse file`, { extension });
let { rawText, formatText, imageList } = await (async () => {
if (extension === 'pdf') {
return await pdfParseFn();
}
return await systemParse();
})();
addLog.debug(`Parse file success, time: ${Date.now() - start}ms. `);
// markdown data format
if (imageList) {
await batchRun(imageList, async (item) => {
const src = await (async () => {
try {
return await uploadMongoImg({
base64Img: `data:${item.mime};base64,${item.base64}`,
teamId,
metadata: {
...metadata,
mime: item.mime
}
});
} catch (error) {
addLog.warn('Upload file image error', { error });
return 'Upload load image error';
}
})();
rawText = rawText.replace(item.uuid, src);
if (formatText) {
formatText = formatText.replace(item.uuid, src);
}
});
}
if (['csv', 'xlsx'].includes(extension)) {
// qa data
if (isQAImport) {
rawText = rawText || '';
} else {
rawText = formatText || rawText;
}
}
addLog.debug(`Upload file success, time: ${Date.now() - start}ms`);
return { rawText, formatText, imageList };
};