import { uploadMongoImg } from '../image/controller';
import FormData from 'form-data';
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import fs from 'fs';
import type { ReadFileResponse } from '../../../worker/readFile/type';
import axios from 'axios';
import { addLog } from '../../system/log';
import { batchRun } from '@fastgpt/global/common/system/utils';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { useDoc2xServer } from '../../../thirdProvider/doc2x';

/** Parameters accepted by {@link readRawTextByLocalFile}. */
export type readRawTextByLocalFileParams = {
  teamId: string;
  tmbId: string;
  /** Location of the file on the local file system. */
  path: string;
  encoding: string;
  /** When true, PDF files may be routed to a configured external parser. */
  customPdfParse?: boolean;
  /** Extra fields merged into the metadata of every uploaded image. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept permissive so existing callers keep compiling
  metadata?: Record<string, any>;
};

/**
 * Extracts the lower-cased extension (without the dot) from a file path.
 *
 * Only the final path segment is inspected, so a dot inside a directory name
 * is never mistaken for an extension. Returns '' when the file name has no dot.
 */
const getFileExtension = (filePath: string): string => {
  const fileName = filePath?.split(/[\\/]/)?.pop() ?? '';
  const dotIndex = fileName.lastIndexOf('.');
  return dotIndex === -1 ? '' : fileName.slice(dotIndex + 1).toLowerCase();
};

/**
 * Replaces every occurrence of `search` in `text` with `replacement`.
 *
 * Unlike `String.prototype.replace` with a string pattern, this touches all
 * occurrences and never interprets `$` sequences in the replacement.
 * An empty `search` leaves the text untouched.
 */
const replaceAllLiteral = (text: string, search: string, replacement: string): string =>
  search ? text.split(search).join(replacement) : text;

/**
 * Reads a file from the local file system and parses it into raw text.
 *
 * The extension is derived from the file name and the parsing itself is
 * delegated to {@link readRawContentByFileBuffer} with `isQAImport: false`.
 *
 * @param params - File location plus team / member ids and parse options.
 * @returns The parse result produced by {@link readRawContentByFileBuffer}.
 */
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
  const { path } = params;

  const extension = getFileExtension(path);

  const buffer = await fs.promises.readFile(path);

  return readRawContentByFileBuffer({
    extension,
    isQAImport: false,
    customPdfParse: params.customPdfParse,
    teamId: params.teamId,
    tmbId: params.tmbId,
    encoding: params.encoding,
    buffer,
    metadata: params.metadata
  });
};

/**
 * Parses a file buffer into text and uploads any images found in it.
 *
 * - Non-PDF files are always handled by the `readFile` worker.
 * - PDF files use the worker unless `customPdfParse` is true, in which case the
 *   custom parse URL is preferred, then the Doc2x key, then the worker.
 * - Each entry of the resulting `imageList` is uploaded through `uploadMongoImg`
 *   and its `uuid` placeholder in the text is replaced by the upload result.
 * - For `csv` / `xlsx` files that are not a QA import, `formatText` (when
 *   non-empty) takes the place of `rawText`.
 *
 * @returns The final `rawText`, `formatText` and `imageList`.
 * @throws Rejects with `response.error` when the custom parse service reports one;
 *   errors from the worker, axios or Doc2x propagate unchanged.
 */
export const readRawContentByFileBuffer = async ({
  teamId,
  tmbId,

  extension,
  buffer,
  encoding,
  metadata,
  customPdfParse = false,
  isQAImport = false
}: {
  teamId: string;
  tmbId: string;

  extension: string;
  buffer: Buffer;
  encoding: string;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept permissive so existing callers keep compiling
  metadata?: Record<string, any>;

  customPdfParse?: boolean;
  isQAImport: boolean;
}): Promise<ReadFileResponse> => {
  // Default parser: hand the buffer to the readFile worker.
  const systemParse = (): Promise<ReadFileResponse> =>
    runWorker(WorkerNameEnum.readFile, {
      extension,
      encoding,
      buffer,
      teamId
    });

  const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
    const url = global.systemEnv.customPdfParse?.url;
    const token = global.systemEnv.customPdfParse?.key;
    if (!url) return systemParse();

    const start = Date.now();
    addLog.info('Parsing files from an external service');

    const data = new FormData();
    data.append('file', buffer, {
      filename: `file.${extension}`
    });
    const { data: response } = await axios.post<{
      pages: number;
      markdown: string;
      error?: object | string;
    }>(url, data, {
      // 600000 ms = 10 minutes
      timeout: 600000,
      headers: {
        ...data.getHeaders(),
        // Left undefined when no key is configured.
        Authorization: token ? `Bearer ${token}` : undefined
      }
    });

    if (response.error) {
      return Promise.reject(response.error);
    }

    addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`);

    const rawText = response.markdown;
    const { text, imageList } = matchMdImg(rawText);

    // Usage is recorded without awaiting the call.
    createPdfParseUsage({
      teamId,
      tmbId,
      pages: response.pages
    });

    return {
      rawText: text,
      formatText: rawText,
      imageList
    };
  };

  // Doc2x api
  const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
    const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
    if (!doc2xKey) return systemParse();

    const { pages, text, imageList } = await useDoc2xServer({ apiKey: doc2xKey }).parsePDF(buffer);

    // Usage is recorded without awaiting the call.
    createPdfParseUsage({
      teamId,
      tmbId,
      pages
    });

    return {
      rawText: text,
      formatText: text,
      imageList
    };
  };

  // Custom read file service
  const pdfParseFn = async (): Promise<ReadFileResponse> => {
    if (!customPdfParse) return systemParse();
    if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
    if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();

    return systemParse();
  };

  const start = Date.now();
  addLog.debug(`Start parse file`, { extension });

  let { rawText, formatText, imageList } = await (async () => {
    if (extension === 'pdf') {
      return await pdfParseFn();
    }
    return await systemParse();
  })();

  // The trailing "\n" mirrors the line break contained in the original template literal.
  addLog.debug(`Parse file success, time: ${Date.now() - start}ms. \n`);

  // markdown data format
  if (imageList) {
    await batchRun(imageList, async (item) => {
      const src = await (async () => {
        try {
          return await uploadMongoImg({
            base64Img: `data:${item.mime};base64,${item.base64}`,
            teamId,
            metadata: {
              ...metadata,
              mime: item.mime
            }
          });
        } catch (error) {
          // Best effort: a failed upload must not abort the whole parse.
          addLog.warn('Upload file image error', { error });
          return 'Upload load image error';
        }
      })();

      // Swap every placeholder for the upload result (literal, all occurrences).
      rawText = replaceAllLiteral(rawText, item.uuid, src);
      if (formatText) {
        formatText = replaceAllLiteral(formatText, item.uuid, src);
      }
    });
  }

  if (['csv', 'xlsx'].includes(extension)) {
    // qa data
    if (isQAImport) {
      rawText = rawText || '';
    } else {
      rawText = formatText || rawText;
    }
  }

  addLog.debug(`Upload file success, time: ${Date.now() - start}ms`);

  return { rawText, formatText, imageList };
};