From bc79d46d4b75db5cb1b36729bead2bd159598f5c Mon Sep 17 00:00:00 2001 From: Menghuan Date: Mon, 25 Nov 2024 20:01:50 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0Doc2X=E6=8F=92=E4=BB=B6?= =?UTF-8?q?=EF=BC=9A=E8=BE=93=E5=87=BA=E6=96=87=E6=9C=AC=E9=A2=84=E5=A4=84?= =?UTF-8?q?=E7=90=86=E6=94=B9=E8=BF=9B=20(#3199)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Error text fix * Add post-processing for table * Some table can not convert * Refactor table conversion logic in PDF2text plugin * Fix table separator formatting issue * Refactor table separator formatting in PDF2text plugin * Refactor table conversion logic in PDF2text plugin and add HTMLtable option * 修复本地部署时无法获取文件的问题 * Refactor PDF fetching and parsing logic * Refactor PDF fetching and parsing logic, and fix table separator formatting issue * Bug fix: HTMLtable control not work --- packages/plugins/src/Doc2X/PDF2text/index.ts | 179 ++++++++++++++---- .../plugins/src/Doc2X/PDF2text/template.json | 61 +++++- 2 files changed, 197 insertions(+), 43 deletions(-) diff --git a/packages/plugins/src/Doc2X/PDF2text/index.ts b/packages/plugins/src/Doc2X/PDF2text/index.ts index 272cf6e68..9e875da74 100644 --- a/packages/plugins/src/Doc2X/PDF2text/index.ts +++ b/packages/plugins/src/Doc2X/PDF2text/index.ts @@ -4,6 +4,7 @@ import { getErrText } from '@fastgpt/global/common/error/utils'; type Props = { apikey: string; + HTMLtable: boolean; files: string[]; }; @@ -14,7 +15,88 @@ type Response = Promise<{ error?: Record; }>; -const main = async ({ apikey, files }: Props): Response => { +function processContent(content: string, HTMLtable: boolean): string { + if (HTMLtable) { + return content; + } + return content.replace(/[\s\S]*?<\/table>/g, (htmlTable) => { + try { + // Clean up whitespace and newlines + const cleanHtml = htmlTable.replace(/\n\s*/g, ''); + const rows = cleanHtml.match(/(.*?)<\/tr>/g); + if (!rows) return htmlTable; + + // Parse table data + let tableData: string[][] = []; + let maxColumns = 0; + + // Try to convert to markdown table + try { + rows.forEach((row, rowIndex) => { + if (!tableData[rowIndex]) { + tableData[rowIndex] = []; + } + let colIndex = 0; + const cells = row.match(/(.*?)<\/td>/g) || []; + + cells.forEach((cell) => { + while (tableData[rowIndex][colIndex]) { + colIndex++; + } + const colspan = parseInt(cell.match(/colspan="(\d+)"/)?.[1] || '1'); + const rowspan = parseInt(cell.match(/rowspan="(\d+)"/)?.[1] || '1'); + const content = cell.replace(/|<\/td>/g, '').trim(); + + for (let i = 0; i < rowspan; i++) { + for (let j = 0; j < colspan; j++) { + if (!tableData[rowIndex + i]) { + tableData[rowIndex + i] = []; + } + tableData[rowIndex + i][colIndex + j] = i === 0 && j === 0 ? content : '^^'; + } + } + colIndex += colspan; + maxColumns = Math.max(maxColumns, colIndex); + }); + + for (let i = 0; i < maxColumns; i++) { + if (!tableData[rowIndex][i]) { + tableData[rowIndex][i] = ' '; + } + } + }); + const chunks: string[] = []; + + const headerCells = tableData[0] + .slice(0, maxColumns) + .map((cell) => (cell === '^^' ? ' ' : cell || ' ')); + const headerRow = '| ' + headerCells.join(' | ') + ' |'; + chunks.push(headerRow); + + const separator = '| ' + Array(headerCells.length).fill('---').join(' | ') + ' |'; + chunks.push(separator); + + tableData.slice(1).forEach((row) => { + const paddedRow = row + .slice(0, maxColumns) + .map((cell) => (cell === '^^' ? ' ' : cell || ' ')); + while (paddedRow.length < maxColumns) { + paddedRow.push(' '); + } + chunks.push('| ' + paddedRow.join(' | ') + ' |'); + }); + + return chunks.join('\n'); + } catch (error) { + return htmlTable; + } + } catch (error) { + return htmlTable; + } + }); +} + +const main = async ({ apikey, files, HTMLtable }: Props): Response => { // Check the apikey if (!apikey) { return Promise.reject(`API key is required`); @@ -30,77 +112,88 @@ const main = async ({ apikey, files }: Props): Response => { for await (const url of files) { try { //Fetch the pdf and check its content type - const PDFResponse = await axiosInstance.get(url, { responseType: 'arraybuffer' }); + const PDFResponse = await axios + .get(url, { + responseType: 'arraybuffer', + proxy: false, + timeout: 20000 + }) + .catch((error) => { + throw new Error(`[Fetch PDF Error] Failed to fetch PDF: ${getErrText(error)}`); + }); + if (PDFResponse.status !== 200) { throw new Error( - `File:${url} \n\nFailed to fetch PDF from URL: ${PDFResponse.statusText}\n` + `[Fetch PDF Error] Failed with status ${PDFResponse.status}: ${PDFResponse.data}` ); } const contentType = PDFResponse.headers['content-type']; const file_name = url.match(/read\/([^?]+)/)?.[1] || 'unknown.pdf'; if (!contentType || !contentType.startsWith('application/pdf')) { - throw new Error( - `File:${file_name}\n\nThe provided file does not point to a PDF: ${contentType}\n` - ); + throw new Error(`The provided file does not point to a PDF: ${contentType}`); } const blob = new Blob([PDFResponse.data], { type: 'application/pdf' }); // Get pre-upload URL first - const preupload_response = await axiosInstance.post( - 'https://v2.doc2x.noedgeai.com/api/v2/parse/preupload', - null, - { + const preupload_response = await axiosInstance + .post('https://v2.doc2x.noedgeai.com/api/v2/parse/preupload', null, { headers: { Authorization: `Bearer ${apikey}` } - } - ); + }) + .catch((error) => { + throw new Error(`[Pre-upload Error] Failed to get pre-upload URL: ${getErrText(error)}`); + }); if (preupload_response.status !== 200) { - throw new Error( - `File:${file_name}\n\nFailed to get pre-upload URL: ${preupload_response.statusText}\n` - ); + throw new Error(`Failed to get pre-upload URL: ${preupload_response.data}`); } const preupload_data = preupload_response.data; if (preupload_data.code !== 'success') { - throw new Error( - `File:${file_name}\n\nFailed to get pre-upload URL: ${JSON.stringify(preupload_data)}\n` - ); + throw new Error(`Failed to get pre-upload URL: ${JSON.stringify(preupload_data)}`); } const upload_url = preupload_data.data.url; const uid = preupload_data.data.uid; // Upload file to pre-signed URL with binary stream - const response = await axiosInstance.put(upload_url, blob, { - headers: { - 'Content-Type': 'application/pdf' - } - }); + const response = await axiosInstance + .put(upload_url, blob, { + headers: { + 'Content-Type': 'application/pdf' + } + }) + .catch((error) => { + throw new Error(`[Upload Error] Failed to upload file: ${getErrText(error)}`); + }); + if (response.status !== 200) { throw new Error(`Upload failed with status ${response.status}: ${response.statusText}`); } // Get the result by uid - // Wait for the result, at most 90s - const checkResult = async (retry = 30) => { + // Wait for the result + const checkResult = async (retry = 20) => { if (retry <= 0) return Promise.reject( - `File:${file_name}\n\nFailed to get result (uid: ${uid}): Get result timeout\n` + `File:${file_name}\n\n[Parse Timeout Error] Failed to get result (uid: ${uid}): Process timeout\n` ); try { - const result_response = await axiosInstance.get( - `https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`, - { + const result_response = await axiosInstance + .get(`https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`, { headers: { Authorization: `Bearer ${apikey}` } - } - ); + }) + .catch((error) => { + throw new Error( + `[Parse Status Error] Failed to get parse status: ${getErrText(error)}` + ); + }); const result_data = result_response.data; if (!['ok', 'success'].includes(result_data.code)) { @@ -110,29 +203,35 @@ const main = async ({ apikey, files }: Props): Response => { } if (['ready', 'processing'].includes(result_data.data.status)) { - await delay(3000); + await delay(4000); return checkResult(retry - 1); } if (result_data.data.status === 'success') { - const result = ( + const result = processContent( await Promise.all( result_data.data.result.pages.map((page: { md: any }) => page.md) - ).then((pages) => pages.join('\n')) + ).then((pages) => pages.join('\n')), + HTMLtable ) // Do some post-processing .replace(/\\[\(\)]/g, '$') .replace(/\\[\[\]]/g, '$$') - .replace(/]*)?(?:\s*\/>|>)/g, '![img]($1)'); + .replace(/]*)?(?:\s*\/>|>)/g, '![img]($1)') + .replace(//g, '') + .replace(//g, '') + .replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$') + .replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}'); return `File:${file_name}\n\n${result}\n`; } - - await delay(100); return checkResult(retry - 1); } catch (error) { - await delay(100); - return checkResult(retry - 1); + if (retry > 1) { + await delay(100); + return checkResult(retry - 1); + } + throw error; } }; @@ -140,7 +239,7 @@ const main = async ({ apikey, files }: Props): Response => { successResult.push(result); } catch (error) { failedResult.push( - `File:${url} \n\nFailed to fetch image from URL: ${getErrText(error)}\n` + `File:${url} \n\nFailed to fetch file from URL: ${getErrText(error)}\n` ); } } diff --git a/packages/plugins/src/Doc2X/PDF2text/template.json b/packages/plugins/src/Doc2X/PDF2text/template.json index 653454993..9a84c51a6 100644 --- a/packages/plugins/src/Doc2X/PDF2text/template.json +++ b/packages/plugins/src/Doc2X/PDF2text/template.json @@ -7,10 +7,8 @@ "courseUrl": "https://fael3z0zfze.feishu.cn/wiki/Rkc5witXWiJoi5kORd2cofh6nDg?fromScene=spaceOverview", "showStatus": true, "weight": 10, - "isTool": true, "templateType": "tools", - "workflow": { "nodes": [ { @@ -52,6 +50,26 @@ "canSelectImg": false, "maxFiles": 14, "defaultValue": "" + }, + { + "renderTypeList": ["switch", "reference"], + "selectedTypeIndex": 0, + "valueType": "boolean", + "canEdit": true, + "key": "HTMLtable", + "label": "HTMLtable", + "description": "是否以HTML格式输出表格。如果需要精确地输出表格,请打开此开关以使用HTML格式。关闭后,表格将转换为Markdown形式输出,但这可能会损失一些表格特性,如合并单元格。", + "defaultValue": false, + "list": [ + { + "label": "", + "value": "" + } + ], + "maxFiles": 5, + "canSelectFile": true, + "canSelectImg": true, + "required": true } ], "outputs": [ @@ -68,6 +86,13 @@ "key": "files", "label": "files", "type": "hidden" + }, + { + "id": "htmltable", + "valueType": "boolean", + "key": "HTMLtable", + "label": "HTMLtable", + "type": "hidden" } ] }, @@ -220,7 +245,7 @@ "key": "system_httpJsonBody", "renderTypeList": ["hidden"], "valueType": "any", - "value": "{\n \"apikey\": \"{{apikey}}\",\n \"files\": {{files}}\n}", + "value": "{\n \"apikey\": \"{{apikey}}\",\n \"HTMLtable\": {{HTMLtable}},\n \"files\": {{files}}\n}", "label": "", "required": false, "debugLabel": "", @@ -305,6 +330,36 @@ }, "required": true, "value": [["pluginInput", "url"]] + }, + { + "renderTypeList": ["reference"], + "valueType": "boolean", + "canEdit": true, + "key": "HTMLtable", + "label": "HTMLtable", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "arrayAny", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + }, + "required": true, + "value": ["pluginInput", "htmltable"] } ], "outputs": [