mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00
Add image index and pdf parse (#3956)
* feat: think tag parse * feat: parse think tag test * feat: pdf parse ux * feat: doc2x parse * perf: rewrite training mode setting * feat: image parse queue * perf: image index * feat: image parse process * feat: add init sh * fix: ts
This commit is contained in:
@@ -186,20 +186,25 @@ export async function getDownloadStream({
|
||||
|
||||
export const readFileContentFromMongo = async ({
|
||||
teamId,
|
||||
tmbId,
|
||||
bucketName,
|
||||
fileId,
|
||||
isQAImport = false
|
||||
isQAImport = false,
|
||||
customPdfParse = false
|
||||
}: {
|
||||
teamId: string;
|
||||
tmbId: string;
|
||||
bucketName: `${BucketNameEnum}`;
|
||||
fileId: string;
|
||||
isQAImport?: boolean;
|
||||
customPdfParse?: boolean;
|
||||
}): Promise<{
|
||||
rawText: string;
|
||||
filename: string;
|
||||
}> => {
|
||||
const bufferId = `${fileId}-${customPdfParse}`;
|
||||
// read buffer
|
||||
const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: fileId }, undefined, {
|
||||
const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: bufferId }, undefined, {
|
||||
...readFromSecondary
|
||||
}).lean();
|
||||
if (fileBuffer) {
|
||||
@@ -227,9 +232,11 @@ export const readFileContentFromMongo = async ({
|
||||
|
||||
// Get raw text
|
||||
const { rawText } = await readRawContentByFileBuffer({
|
||||
customPdfParse,
|
||||
extension,
|
||||
isQAImport,
|
||||
teamId,
|
||||
tmbId,
|
||||
buffer: fileBuffers,
|
||||
encoding,
|
||||
metadata: {
|
||||
@@ -240,7 +247,7 @@ export const readFileContentFromMongo = async ({
|
||||
// < 14M
|
||||
if (fileBuffers.length < 14 * 1024 * 1024 && rawText.trim()) {
|
||||
MongoRawTextBuffer.create({
|
||||
sourceId: fileId,
|
||||
sourceId: bufferId,
|
||||
rawText,
|
||||
metadata: {
|
||||
filename: file.filename
|
||||
|
27
packages/service/common/file/image/utils.ts
Normal file
27
packages/service/common/file/image/utils.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
import axios from 'axios';
|
||||
import { addLog } from '../../system/log';
|
||||
import { serverRequestBaseUrl } from '../../api/serverRequest';
|
||||
import { getFileContentTypeFromHeader, guessBase64ImageType } from '../utils';
|
||||
|
||||
/**
 * Download an image and return it as a `data:` URL with a base64 payload.
 *
 * The request is issued through axios with `serverRequestBaseUrl` as the base
 * URL, the proxy disabled, and the body read as an array buffer.
 * The MIME type comes from `getFileContentTypeFromHeader` applied to the
 * response `content-type` header; when that yields a falsy value,
 * `guessBase64ImageType` is called on the encoded payload instead.
 *
 * @param url - Image location passed to axios together with the base URL.
 * @returns A string of the form `data:<mime>;base64,<payload>`.
 * @throws Re-raises whatever error the download or encoding produced, after
 *   logging it.
 */
export const getImageBase64 = async (url: string) => {
  addLog.debug(`Load image to base64: ${url}`);

  try {
    const res = await axios.get(url, {
      baseURL: serverRequestBaseUrl,
      responseType: 'arraybuffer',
      proxy: false
    });

    // Encode the raw bytes once; the same string feeds both the type guess and the result.
    const encoded = Buffer.from(res.data, 'binary').toString('base64');

    // Prefer the type derived from the header; only guess from the payload when it is falsy.
    const headerType = getFileContentTypeFromHeader(res.headers['content-type']);
    const mimeType = headerType || guessBase64ImageType(encoded);

    return `data:${mimeType};base64,${encoded}`;
  } catch (err) {
    addLog.debug(`Load image to base64 failed: ${url}`);
    console.log(err);
    // Propagate the original error unchanged so callers see the same rejection.
    throw err;
  }
};
|
@@ -1,18 +1,23 @@
|
||||
import { uploadMongoImg } from '../image/controller';
|
||||
import FormData from 'form-data';
|
||||
|
||||
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
||||
import fs from 'fs';
|
||||
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
||||
import type { ImageType, ReadFileResponse } from '../../../worker/readFile/type';
|
||||
import axios from 'axios';
|
||||
import { addLog } from '../../system/log';
|
||||
import { batchRun } from '@fastgpt/global/common/fn/utils';
|
||||
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
|
||||
import { batchRun } from '@fastgpt/global/common/system/utils';
|
||||
import { htmlTable2Md, matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
|
||||
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
|
||||
import { getErrText } from '@fastgpt/global/common/error/utils';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { getNanoid } from '@fastgpt/global/common/string/tools';
|
||||
|
||||
export type readRawTextByLocalFileParams = {
|
||||
teamId: string;
|
||||
tmbId: string;
|
||||
path: string;
|
||||
encoding: string;
|
||||
customPdfParse?: boolean;
|
||||
metadata?: Record<string, any>;
|
||||
};
|
||||
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
|
||||
@@ -22,46 +27,51 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam
|
||||
|
||||
const buffer = await fs.promises.readFile(path);
|
||||
|
||||
const { rawText } = await readRawContentByFileBuffer({
|
||||
return readRawContentByFileBuffer({
|
||||
extension,
|
||||
isQAImport: false,
|
||||
customPdfParse: params.customPdfParse,
|
||||
teamId: params.teamId,
|
||||
tmbId: params.tmbId,
|
||||
encoding: params.encoding,
|
||||
buffer,
|
||||
metadata: params.metadata
|
||||
});
|
||||
|
||||
return {
|
||||
rawText
|
||||
};
|
||||
};
|
||||
|
||||
export const readRawContentByFileBuffer = async ({
|
||||
extension,
|
||||
isQAImport,
|
||||
teamId,
|
||||
tmbId,
|
||||
|
||||
extension,
|
||||
buffer,
|
||||
encoding,
|
||||
metadata
|
||||
metadata,
|
||||
customPdfParse = false,
|
||||
isQAImport = false
|
||||
}: {
|
||||
isQAImport?: boolean;
|
||||
extension: string;
|
||||
teamId: string;
|
||||
tmbId: string;
|
||||
|
||||
extension: string;
|
||||
buffer: Buffer;
|
||||
encoding: string;
|
||||
metadata?: Record<string, any>;
|
||||
}) => {
|
||||
// Custom read file service
|
||||
const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
|
||||
const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
|
||||
const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
|
||||
const readFileFromCustomService = async (): Promise<ReadFileResponse | undefined> => {
|
||||
if (
|
||||
!customReadfileUrl ||
|
||||
!customReadFileExtension ||
|
||||
!customReadFileExtension.includes(extension)
|
||||
)
|
||||
return;
|
||||
|
||||
customPdfParse?: boolean;
|
||||
isQAImport: boolean;
|
||||
}): Promise<ReadFileResponse> => {
|
||||
const systemParse = () =>
|
||||
runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
||||
extension,
|
||||
encoding,
|
||||
buffer,
|
||||
teamId
|
||||
});
|
||||
const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
|
||||
const url = global.systemEnv.customPdfParse?.url;
|
||||
const token = global.systemEnv.customPdfParse?.key;
|
||||
if (!url) return systemParse();
|
||||
|
||||
const start = Date.now();
|
||||
addLog.info('Parsing files from an external service');
|
||||
@@ -70,20 +80,18 @@ export const readRawContentByFileBuffer = async ({
|
||||
data.append('file', buffer, {
|
||||
filename: `file.${extension}`
|
||||
});
|
||||
data.append('extension', extension);
|
||||
data.append('ocr', ocrParse);
|
||||
const { data: response } = await axios.post<{
|
||||
success: boolean;
|
||||
message: string;
|
||||
data: {
|
||||
page: number;
|
||||
markdown: string;
|
||||
duration: number;
|
||||
};
|
||||
}>(customReadfileUrl, data, {
|
||||
}>(url, data, {
|
||||
timeout: 600000,
|
||||
headers: {
|
||||
...data.getHeaders()
|
||||
...data.getHeaders(),
|
||||
Authorization: token ? `Bearer ${token}` : undefined
|
||||
}
|
||||
});
|
||||
|
||||
@@ -92,21 +100,208 @@ export const readRawContentByFileBuffer = async ({
|
||||
const rawText = response.data.markdown;
|
||||
const { text, imageList } = matchMdImgTextAndUpload(rawText);
|
||||
|
||||
createPdfParseUsage({
|
||||
teamId,
|
||||
tmbId,
|
||||
pages: response.data.page
|
||||
});
|
||||
|
||||
return {
|
||||
rawText: text,
|
||||
formatText: rawText,
|
||||
imageList
|
||||
};
|
||||
};
|
||||
const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
|
||||
const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
|
||||
if (!doc2xKey) return systemParse();
|
||||
|
||||
let { rawText, formatText, imageList } =
|
||||
(await readFileFromCustomService()) ||
|
||||
(await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
||||
extension,
|
||||
encoding,
|
||||
buffer,
|
||||
teamId
|
||||
}));
|
||||
const parseTextImage = async (text: string) => {
|
||||
// Extract image links and convert to base64
|
||||
const imageList: { id: string; url: string }[] = [];
|
||||
const processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
|
||||
const id = getNanoid();
|
||||
imageList.push({
|
||||
id,
|
||||
url
|
||||
});
|
||||
return ``;
|
||||
});
|
||||
|
||||
let resultImageList: ImageType[] = [];
|
||||
await Promise.all(
|
||||
imageList.map(async (item) => {
|
||||
try {
|
||||
const response = await axios.get(item.url, { responseType: 'arraybuffer' });
|
||||
const mime = response.headers['content-type'] || 'image/jpeg';
|
||||
const base64 = response.data.toString('base64');
|
||||
resultImageList.push({
|
||||
uuid: item.id,
|
||||
mime,
|
||||
base64
|
||||
});
|
||||
} catch (error) {
|
||||
addLog.warn(`Failed to get image from ${item.url}: ${getErrText(error)}`);
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
return {
|
||||
text: processedText,
|
||||
imageList: resultImageList
|
||||
};
|
||||
};
|
||||
|
||||
let startTime = Date.now();
|
||||
|
||||
// 1. Get pre-upload URL first
|
||||
const { data: preupload_data } = await axios
|
||||
.post<{ code: string; data: { uid: string; url: string } }>(
|
||||
'https://v2.doc2x.noedgeai.com/api/v2/parse/preupload',
|
||||
null,
|
||||
{
|
||||
headers: {
|
||||
Authorization: `Bearer ${doc2xKey}`
|
||||
}
|
||||
}
|
||||
)
|
||||
.catch((error) => {
|
||||
return Promise.reject(
|
||||
`[Pre-upload Error] Failed to get pre-upload URL: ${getErrText(error)}`
|
||||
);
|
||||
});
|
||||
if (preupload_data?.code !== 'success') {
|
||||
return Promise.reject(`Failed to get pre-upload URL: ${JSON.stringify(preupload_data)}`);
|
||||
}
|
||||
|
||||
const upload_url = preupload_data.data.url;
|
||||
const uid = preupload_data.data.uid;
|
||||
|
||||
// 2. Upload file to pre-signed URL with binary stream
|
||||
const blob = new Blob([buffer], { type: 'application/pdf' });
|
||||
const response = await axios
|
||||
.put(upload_url, blob, {
|
||||
headers: {
|
||||
'Content-Type': 'application/pdf'
|
||||
}
|
||||
})
|
||||
.catch((error) => {
|
||||
return Promise.reject(`[Upload Error] Failed to upload file: ${getErrText(error)}`);
|
||||
});
|
||||
if (response.status !== 200) {
|
||||
return Promise.reject(`Upload failed with status ${response.status}: ${response.statusText}`);
|
||||
}
|
||||
|
||||
await delay(5000);
|
||||
addLog.debug(`Uploaded file to Doc2x, uid: ${uid}`);
|
||||
// 3. Get the result by uid
|
||||
const checkResult = async (retry = 30) => {
|
||||
if (retry <= 0) {
|
||||
return Promise.reject(
|
||||
`[Parse Timeout Error] Failed to get result (uid: ${uid}): Process timeout`
|
||||
);
|
||||
}
|
||||
|
||||
try {
|
||||
const { data: result_data } = await axios
|
||||
.get<{
|
||||
code: string;
|
||||
data: {
|
||||
progress: number;
|
||||
status: 'processing' | 'failed' | 'success';
|
||||
result: {
|
||||
pages: {
|
||||
md: string;
|
||||
}[];
|
||||
};
|
||||
};
|
||||
}>(`https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`, {
|
||||
headers: {
|
||||
Authorization: `Bearer ${doc2xKey}`
|
||||
}
|
||||
})
|
||||
.catch((error) => {
|
||||
return Promise.reject(
|
||||
`[Parse Status Error] Failed to get parse status: ${getErrText(error)}`
|
||||
);
|
||||
});
|
||||
|
||||
// Error
|
||||
if (!['ok', 'success'].includes(result_data.code)) {
|
||||
return Promise.reject(
|
||||
`Failed to get result (uid: ${uid}): ${JSON.stringify(result_data)}`
|
||||
);
|
||||
}
|
||||
|
||||
// Process
|
||||
if (['ready', 'processing'].includes(result_data.data.status)) {
|
||||
addLog.debug(`Waiting for the result, uid: ${uid}`);
|
||||
await delay(5000);
|
||||
return checkResult(retry - 1);
|
||||
}
|
||||
|
||||
// Finish
|
||||
if (result_data.data.status === 'success') {
|
||||
const result = result_data.data.result.pages
|
||||
.map((page) => page.md)
|
||||
.join('\n')
|
||||
// Do some post-processing
|
||||
.replace(/\\[\(\)]/g, '$')
|
||||
.replace(/\\[\[\]]/g, '$$')
|
||||
.replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '')
|
||||
.replace(/<!-- Media -->/g, '')
|
||||
.replace(/<!-- Footnote -->/g, '')
|
||||
.replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$')
|
||||
.replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}');
|
||||
|
||||
const { text, imageList } = await parseTextImage(htmlTable2Md(result));
|
||||
|
||||
return {
|
||||
pages: result_data.data.result.pages.length,
|
||||
text,
|
||||
imageList
|
||||
};
|
||||
}
|
||||
return checkResult(retry - 1);
|
||||
} catch (error) {
|
||||
if (retry > 1) {
|
||||
await delay(100);
|
||||
return checkResult(retry - 1);
|
||||
}
|
||||
return Promise.reject(error);
|
||||
}
|
||||
};
|
||||
|
||||
const { pages, text, imageList } = await checkResult();
|
||||
|
||||
createPdfParseUsage({
|
||||
teamId,
|
||||
tmbId,
|
||||
pages
|
||||
});
|
||||
|
||||
addLog.info(`Doc2x parse success, time: ${Date.now() - startTime}ms`);
|
||||
return {
|
||||
rawText: text,
|
||||
formatText: text,
|
||||
imageList
|
||||
};
|
||||
};
|
||||
// Custom read file service
|
||||
const pdfParseFn = async (): Promise<ReadFileResponse> => {
|
||||
if (!customPdfParse) return systemParse();
|
||||
if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
|
||||
if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();
|
||||
|
||||
return systemParse();
|
||||
};
|
||||
|
||||
let { rawText, formatText, imageList } = await (async () => {
|
||||
if (extension === 'pdf') {
|
||||
return await pdfParseFn();
|
||||
}
|
||||
return await systemParse();
|
||||
})();
|
||||
|
||||
// markdown data format
|
||||
if (imageList) {
|
||||
@@ -142,5 +337,5 @@ export const readRawContentByFileBuffer = async ({
|
||||
}
|
||||
}
|
||||
|
||||
return { rawText };
|
||||
return { rawText, formatText, imageList };
|
||||
};
|
||||
|
@@ -10,6 +10,11 @@ export const SERVICE_LOCAL_HOST =
|
||||
export const initFastGPTConfig = (config?: FastGPTConfigFileType) => {
|
||||
if (!config) return;
|
||||
|
||||
// Special config computed
|
||||
config.feConfigs.showCustomPdfParse =
|
||||
!!config.systemEnv.customPdfParse?.url || !!config.systemEnv.customPdfParse?.doc2xKey;
|
||||
config.feConfigs.customPdfParsePrice = config.systemEnv.customPdfParse?.price || 0;
|
||||
|
||||
global.feConfigs = config.feConfigs;
|
||||
global.systemEnv = config.systemEnv;
|
||||
global.subPlans = config.subPlans;
|
||||
|
Reference in New Issue
Block a user