diff --git a/docSite/content/zh-cn/docs/development/upgrading/497.md b/docSite/content/zh-cn/docs/development/upgrading/497.md
new file mode 100644
index 000000000..be5c6a332
--- /dev/null
+++ b/docSite/content/zh-cn/docs/development/upgrading/497.md
@@ -0,0 +1,19 @@
+---
+title: 'V4.9.7(进行中)'
+description: 'FastGPT V4.9.7 更新说明'
+icon: 'upgrade'
+draft: false
+toc: true
+weight: 793
+---
+
+## 🚀 新增内容
+
+
+## ⚙️ 优化
+
+1. Doc2x 文档解析,增加报错信息捕获,增加超时时长
+
+## 🐛 修复
+
+
diff --git a/packages/global/common/error/utils.ts b/packages/global/common/error/utils.ts
index 88a9ea9da..69cf364a6 100644
--- a/packages/global/common/error/utils.ts
+++ b/packages/global/common/error/utils.ts
@@ -4,7 +4,13 @@ export const getErrText = (err: any, def = ''): any => {
   const msg: string =
     typeof err === 'string'
       ? err
-      : err?.response?.data?.message || err?.response?.message || err?.message || def;
+      : err?.response?.data?.message ||
+        err?.response?.message ||
+        err?.message ||
+        err?.response?.data?.msg ||
+        err?.response?.msg ||
+        err?.msg ||
+        def;
   // msg && console.log('error =>', msg);
   return replaceSensitiveText(msg);
 };
diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts
index d260c770a..1e83ba351 100644
--- a/packages/service/common/file/read/utils.ts
+++ b/packages/service/common/file/read/utils.ts
@@ -2,16 +2,13 @@ import { uploadMongoImg } from '../image/controller';
 import FormData from 'form-data';
 import { WorkerNameEnum, runWorker } from '../../../worker/utils';
 import fs from 'fs';
-import type { ImageType, ReadFileResponse } from '../../../worker/readFile/type';
+import type { ReadFileResponse } from '../../../worker/readFile/type';
 import axios from 'axios';
 import { addLog } from '../../system/log';
 import { batchRun } from '@fastgpt/global/common/system/utils';
-import { htmlTable2Md, matchMdImg } from '@fastgpt/global/common/string/markdown';
+import { matchMdImg } from '@fastgpt/global/common/string/markdown';
 import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
-import { getErrText } from '@fastgpt/global/common/error/utils';
-import { delay } from '@fastgpt/global/common/system/utils';
-import { getNanoid } from '@fastgpt/global/common/string/tools';
-import { getImageBase64 } from '../image/utils';
+import { useDoc2xServer } from '../../../thirdProvider/doc2x';
 
 export type readRawTextByLocalFileParams = {
   teamId: string;
@@ -114,169 +111,12 @@ export const readRawContentByFileBuffer = async ({
       imageList
     };
   };
+
   // Doc2x api
   const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
     const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
     if (!doc2xKey) return systemParse();
-    const parseTextImage = async (text: string) => {
-      // Extract image links and convert to base64
-      const imageList: { id: string; url: string }[] = [];
-      let processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
-        const id = `IMAGE_${getNanoid()}_IMAGE`;
-        imageList.push({
-          id,
-          url
-        });
-        return `![](${id})`;
-      });
-
-      // Get base64 from image url
-      let resultImageList: ImageType[] = [];
-      await batchRun(
-        imageList,
-        async (item) => {
-          try {
-            const { base64, mime } = await getImageBase64(item.url);
-            resultImageList.push({
-              uuid: item.id,
-              mime,
-              base64
-            });
-          } catch (error) {
-            processedText = processedText.replace(item.id, item.url);
-            addLog.warn(`Failed to get image from ${item.url}: ${getErrText(error)}`);
-          }
-        },
-        5
-      );
-
-      return {
-        text: processedText,
-        imageList: resultImageList
-      };
-    };
-
-    let startTime = Date.now();
-
-    // 1. Get pre-upload URL first
-    const { data: preupload_data } = await axios
-      .post<{ code: string; data: { uid: string; url: string } }>(
-        'https://v2.doc2x.noedgeai.com/api/v2/parse/preupload',
-        null,
-        {
-          headers: {
-            Authorization: `Bearer ${doc2xKey}`
-          }
-        }
-      )
-      .catch((error) => {
-        return Promise.reject(
-          `[Pre-upload Error] Failed to get pre-upload URL: ${getErrText(error)}`
-        );
-      });
-    if (preupload_data?.code !== 'success') {
-      return Promise.reject(`Failed to get pre-upload URL: ${JSON.stringify(preupload_data)}`);
-    }
-
-    const upload_url = preupload_data.data.url;
-    const uid = preupload_data.data.uid;
-
-    // 2. Upload file to pre-signed URL with binary stream
-    const blob = new Blob([buffer], { type: 'application/pdf' });
-    const response = await axios
-      .put(upload_url, blob, {
-        headers: {
-          'Content-Type': 'application/pdf'
-        }
-      })
-      .catch((error) => {
-        return Promise.reject(`[Upload Error] Failed to upload file: ${getErrText(error)}`);
-      });
-    if (response.status !== 200) {
-      return Promise.reject(`Upload failed with status ${response.status}: ${response.statusText}`);
-    }
-
-    await delay(5000);
-    addLog.debug(`Uploaded file to Doc2x, uid: ${uid}`);
-    // 3. Get the result by uid
-    const checkResult = async (retry = 30) => {
-      if (retry <= 0) {
-        return Promise.reject(
-          `[Parse Timeout Error] Failed to get result (uid: ${uid}): Process timeout`
-        );
-      }
-
-      try {
-        const { data: result_data } = await axios
-          .get<{
-            code: string;
-            data: {
-              progress: number;
-              status: 'processing' | 'failed' | 'success';
-              result: {
-                pages: {
-                  md: string;
-                }[];
-              };
-            };
-          }>(`https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`, {
-            headers: {
-              Authorization: `Bearer ${doc2xKey}`
-            }
-          })
-          .catch((error) => {
-            return Promise.reject(
-              `[Parse Status Error] Failed to get parse status: ${getErrText(error)}`
-            );
-          });
-
-        // Error
-        if (!['ok', 'success'].includes(result_data.code)) {
-          return Promise.reject(
-            `Failed to get result (uid: ${uid}): ${JSON.stringify(result_data)}`
-          );
-        }
-
-        // Process
-        if (['ready', 'processing'].includes(result_data.data.status)) {
-          addLog.debug(`Waiting for the result, uid: ${uid}`);
-          await delay(5000);
-          return checkResult(retry - 1);
-        }
-
-        // Finifsh
-        if (result_data.data.status === 'success') {
-          const result = result_data.data.result.pages
-            .map((page) => page.md)
-            .join('')
-            // Do some post-processing
-            .replace(/\\[\(\)]/g, '$')
-            .replace(/\\[\[\]]/g, '$$')
-            .replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '![img]($1)')
-            .replace(/<!-- Media -->/g, '')
-            .replace(/<!-- Footnote -->/g, '')
-            .replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$')
-            .replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}');
-
-          const { text, imageList } = await parseTextImage(htmlTable2Md(result));
-
-          return {
-            pages: result_data.data.result.pages.length,
-            text,
-            imageList
-          };
-        }
-        return checkResult(retry - 1);
-      } catch (error) {
-        if (retry > 1) {
-          await delay(100);
-          return checkResult(retry - 1);
-        }
-        return Promise.reject(error);
-      }
-    };
-
-    const { pages, text, imageList } = await checkResult();
+    const { pages, text, imageList } = await useDoc2xServer({ apiKey: doc2xKey }).parsePDF(buffer);
 
     createPdfParseUsage({
       teamId,
@@ -284,7 +124,6 @@ export const readRawContentByFileBuffer = async ({
       pages
     });
 
-    addLog.info(`Doc2x parse success, time: ${Date.now() - startTime}ms`);
     return {
       rawText: text,
       formatText: text,
diff --git a/packages/service/thirdProvider/doc2x/index.ts b/packages/service/thirdProvider/doc2x/index.ts
new file mode 100644
index 000000000..cda856a98
--- /dev/null
+++ b/packages/service/thirdProvider/doc2x/index.ts
@@ -0,0 +1,247 @@
+import { batchRun, delay } from '@fastgpt/global/common/system/utils';
+import { addLog } from '../../common/system/log';
+import { htmlTable2Md } from '@fastgpt/global/common/string/markdown';
+import axios, { Method } from 'axios';
+import { getNanoid } from '@fastgpt/global/common/string/tools';
+import { getErrText } from '@fastgpt/global/common/error/utils';
+import { ImageType } from '../../worker/readFile/type';
+import { getImageBase64 } from '../../common/file/image/utils';
+
+// Envelope shared by the Doc2x API responses used in this file
+type ApiResponseDataType<T = any> = {
+  code: string;
+  msg?: string;
+  data: T;
+};
+
+/**
+ * Create a Doc2x API client bound to `apiKey`.
+ * The returned `parsePDF` uploads a PDF buffer, polls until the parse finishes and
+ * resolves with `{ pages, text, imageList }`; it rejects when a step fails or times out.
+ */
+export const useDoc2xServer = ({ apiKey }: { apiKey: string }) => {
+  // Init request
+  const instance = axios.create({
+    baseURL: 'https://v2.doc2x.noedgeai.com/api',
+    timeout: 60000,
+    headers: {
+      Authorization: `Bearer ${apiKey}`
+    }
+  });
+  // Response check
+  const checkRes = <T>(data: ApiResponseDataType<T>) => {
+    if (data === undefined) {
+      addLog.info('[Doc2x] Server data is empty');
+      return Promise.reject('服务器异常');
+    }
+    return data;
+  };
+  // Normalize any failure into a rejection shaped like `{ message: '[Doc2x] ...' }`
+  const responseError = (err: any) => {
+    if (!err) {
+      return Promise.reject({ message: '[Doc2x] Unknown error' });
+    }
+    if (typeof err === 'string') {
+      return Promise.reject({ message: `[Doc2x] ${err}` });
+    }
+    // Check the response body before `err.message`: an error that carries both would
+    // otherwise always report its own `message` and never reach the server's text.
+    const serverMsg = err?.response?.data ? getErrText(err.response.data) : '';
+    if (serverMsg) {
+      return Promise.reject({ message: `[Doc2x] ${serverMsg}` });
+    }
+    if (typeof err.message === 'string') {
+      return Promise.reject({ message: `[Doc2x] ${err.message}` });
+    }
+    if (typeof err.data === 'string') {
+      return Promise.reject({ message: `[Doc2x] ${err.data}` });
+    }
+
+    addLog.error('[Doc2x] Unknown error', err);
+    return Promise.reject({ message: `[Doc2x] ${getErrText(err)}` });
+  };
+  const request = <T>(url: string, data: any, method: Method): Promise<ApiResponseDataType<T>> => {
+    // Remove empty data
+    for (const key in data) {
+      if (data[key] === undefined) {
+        delete data[key];
+      }
+    }
+
+    return instance
+      .request({
+        url,
+        method,
+        data: ['POST', 'PUT'].includes(method) ? data : undefined,
+        params: !['POST', 'PUT'].includes(method) ? data : undefined
+      })
+      .then((res) => checkRes<T>(res.data))
+      .catch((err) => responseError(err));
+  };
+
+  // Upload a PDF, wait for Doc2x to parse it, then inline the remote images
+  const parsePDF = async (fileBuffer: Buffer) => {
+    addLog.debug('[Doc2x] PDF parse start');
+    const startTime = Date.now();
+
+    // 1. Get pre-upload URL first
+    const {
+      code,
+      msg,
+      data: preupload_data
+    } = await request<{ uid: string; url: string }>('/v2/parse/preupload', null, 'POST');
+    if (!['ok', 'success'].includes(code)) {
+      return Promise.reject(`[Doc2x] Failed to get pre-upload URL: ${msg}`);
+    }
+    const upload_url = preupload_data.url;
+    const uid = preupload_data.uid;
+
+    // 2. Upload file to pre-signed URL with binary stream
+    const blob = new Blob([fileBuffer], { type: 'application/pdf' });
+    const response = await axios
+      .put(upload_url, blob, {
+        headers: {
+          'Content-Type': 'application/pdf'
+        }
+      })
+      .catch((error) => {
+        return Promise.reject(`[Doc2x] Failed to upload file: ${getErrText(error)}`);
+      });
+    if (response.status !== 200) {
+      return Promise.reject(
+        `[Doc2x] Upload failed with status ${response.status}: ${response.statusText}`
+      );
+    }
+    addLog.debug(`[Doc2x] Uploaded file success, uid: ${uid}`);
+
+    await delay(5000);
+
+    // 3. Get the result by uid
+    const checkResult = async () => {
+      // Up to 120 polls, 5s apart while the task is still running (about 10 minutes)
+      let retry = 120;
+
+      while (retry > 0) {
+        try {
+          const {
+            code,
+            data: result_data,
+            msg
+          } = await request<{
+            progress: number;
+            status: 'processing' | 'failed' | 'success';
+            result: {
+              pages: {
+                md: string;
+              }[];
+            };
+          }>(`/v2/parse/status?uid=${uid}`, null, 'GET');
+
+          // Error
+          if (!['ok', 'success'].includes(code)) {
+            return Promise.reject(`[Doc2x] Failed to get result (uid: ${uid}): ${msg}`);
+          }
+
+          // Failed: stop here, otherwise the loop would re-poll 120 times with no
+          // delay and finally report a misleading "Process timeout".
+          // The un-awaited rejection is returned to the caller, not caught below.
+          if (result_data.status === 'failed') {
+            return Promise.reject(
+              `[Doc2x] Failed to get result (uid: ${uid}): ${msg || 'parse failed'}`
+            );
+          }
+
+          // Process
+          if (['ready', 'processing'].includes(result_data.status)) {
+            addLog.debug(`[Doc2x] Waiting for the result, uid: ${uid}`);
+            await delay(5000);
+          }
+
+          // Finish
+          if (result_data.status === 'success') {
+            return {
+              text: result_data.result.pages
+                .map((page) => page.md)
+                .join('')
+                // Inline math: \( \) => $
+                .replace(/\\[\(\)]/g, '$')
+                // Display math: \[ \] => $$. A function is used because "$$" in a
+                // replacement string is an escape that inserts a single "$".
+                .replace(/\\[\[\]]/g, () => '$$')
+                .replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '![img]($1)')
+                .replace(/<!-- Media -->/g, '')
+                .replace(/<!-- Footnote -->/g, '')
+                .replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$')
+                .replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}'),
+              pages: result_data.result.pages.length
+            };
+          }
+        } catch (error) {
+          // The status request itself failed: log it and retry after a short pause
+          addLog.warn(`[Doc2x] Get result error`, { error });
+          await delay(500);
+        }
+
+        retry--;
+      }
+      return Promise.reject(`[Doc2x] Failed to get result (uid: ${uid}): Process timeout`);
+    };
+
+    const { text, pages } = await checkResult();
+
+    // ![](url) => ![](base64)
+    const parseTextImage = async (text: string) => {
+      // Extract image links and convert to base64
+      const imageList: { id: string; url: string }[] = [];
+      let processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
+        const id = `IMAGE_${getNanoid()}_IMAGE`;
+        imageList.push({
+          id,
+          url
+        });
+        return `![](${id})`;
+      });
+
+      // Get base64 from image url
+      let resultImageList: ImageType[] = [];
+      await batchRun(
+        imageList,
+        async (item) => {
+          try {
+            const { base64, mime } = await getImageBase64(item.url);
+            resultImageList.push({
+              uuid: item.id,
+              mime,
+              base64
+            });
+          } catch (error) {
+            processedText = processedText.replace(item.id, item.url);
+            addLog.warn(`[Doc2x] Failed to get image from ${item.url}: ${getErrText(error)}`);
+          }
+        },
+        5
+      );
+
+      return {
+        text: processedText,
+        imageList: resultImageList
+      };
+    };
+    const { text: formatText, imageList } = await parseTextImage(htmlTable2Md(text));
+
+    addLog.debug(`[Doc2x] PDF parse finished`, {
+      time: `${Math.round((Date.now() - startTime) / 1000)}s`,
+      pages
+    });
+
+    return {
+      pages,
+      text: formatText,
+      imageList
+    };
+  };
+
+  return {
+    parsePDF
+  };
+};