mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-28 17:29:44 +00:00
更新Doc2X插件:输出文本预处理改进 (#3199)
* Error text fix * Add post-processing for tables * Some tables cannot be converted * Refactor table conversion logic in PDF2text plugin * Fix table separator formatting issue * Refactor table separator formatting in PDF2text plugin * Refactor table conversion logic in PDF2text plugin and add HTMLtable option * 修复本地部署时无法获取文件的问题 * Refactor PDF fetching and parsing logic * Refactor PDF fetching and parsing logic, and fix table separator formatting issue * Bug fix: HTMLtable control does not work
This commit is contained in:
@@ -4,6 +4,7 @@ import { getErrText } from '@fastgpt/global/common/error/utils';
|
|||||||
|
|
||||||
type Props = {
|
type Props = {
|
||||||
apikey: string;
|
apikey: string;
|
||||||
|
HTMLtable: boolean;
|
||||||
files: string[];
|
files: string[];
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -14,7 +15,88 @@ type Response = Promise<{
|
|||||||
error?: Record<string, any>;
|
error?: Record<string, any>;
|
||||||
}>;
|
}>;
|
||||||
|
|
||||||
const main = async ({ apikey, files }: Props): Response => {
|
/**
 * Optionally rewrites every HTML `<table>` embedded in `content` as a
 * pipe-delimited Markdown table.
 *
 * Markdown tables cannot express merged cells, so a cell with `colspan` /
 * `rowspan` keeps its text in the top-left slot of the merged area and every
 * other slot it covers is rendered as a blank cell. The first `<tr>` becomes
 * the Markdown header row. Any table that cannot be converted (no rows, no
 * cells, or an unexpected error) is left in the output as its original HTML.
 *
 * @param content   Text that may contain HTML `<table>` elements.
 * @param HTMLtable When `true`, `content` is returned untouched (tables stay
 *                  HTML). When `false`, tables are converted to Markdown.
 * @returns The processed text.
 */
function processContent(content: string, HTMLtable: boolean): string {
  if (HTMLtable) {
    return content;
  }

  // A slot is `undefined` while free, `null` when covered by a neighbouring
  // merged cell, and a string when it holds a cell's own text. Using `null`
  // instead of an in-band marker keeps literal cell text such as "^^" intact.
  type Slot = string | null | undefined;

  // Reads a span attribute from a cell's opening-tag attributes. Missing,
  // malformed or zero values fall back to 1 so the cell is always placed.
  const parseSpan = (attrs: string, name: 'colspan' | 'rowspan'): number => {
    const raw = attrs.match(new RegExp(`\\b${name}\\s*=\\s*["']?(\\d+)`, 'i'))?.[1];
    const span = raw === undefined ? 1 : parseInt(raw, 10);
    return span >= 1 ? span : 1;
  };

  return content.replace(/<table\b[^>]*>[\s\S]*?<\/table>/g, (htmlTable) => {
    try {
      // Collapse line breaks (and the indentation that follows them) so each
      // row/cell sits on one logical line; `\r?` also covers CRLF input.
      const cleanHtml = htmlTable.replace(/\r?\n\s*/g, '');
      const rows = cleanHtml.match(/<tr\b[^>]*>[\s\S]*?<\/tr>/g);
      if (!rows) return htmlTable;

      const grid: Slot[][] = [];
      let maxColumns = 0;

      rows.forEach((row, rowIndex) => {
        const current = grid[rowIndex] ?? (grid[rowIndex] = []);
        let colIndex = 0;

        // Both <td> and <th> cells are accepted; group 1 is the attribute
        // text of the opening tag, group 2 the cell's inner content.
        const cellPattern = /<t[dh]\b([^>]*)>([\s\S]*?)<\/t[dh]>/g;
        let match: RegExpExecArray | null;
        while ((match = cellPattern.exec(row)) !== null) {
          // Skip slots already claimed by a rowspan from an earlier row.
          while (current[colIndex] !== undefined) {
            colIndex++;
          }

          const attrs = match[1] ?? '';
          const text = (match[2] ?? '').trim();
          const colspan = parseSpan(attrs, 'colspan');
          const rowspan = parseSpan(attrs, 'rowspan');

          for (let i = 0; i < rowspan; i++) {
            const target = grid[rowIndex + i] ?? (grid[rowIndex + i] = []);
            for (let j = 0; j < colspan; j++) {
              target[colIndex + j] = i === 0 && j === 0 ? text : null;
            }
          }

          colIndex += colspan;
          maxColumns = Math.max(maxColumns, colIndex);
        }
      });

      // Rows without a single cell cannot form a Markdown table.
      if (maxColumns === 0) return htmlTable;

      // Every row, including the header, is rendered with exactly
      // `maxColumns` cells; Markdown renderers drop body cells that exceed
      // the header width, so a short header would silently lose data.
      const renderRow = (row: Slot[]): string => {
        const cells = Array.from({ length: maxColumns }, (_, index) => {
          const cell = row[index];
          if (cell === undefined || cell === null || cell === '') return ' ';
          // An unescaped pipe would be read as a column delimiter.
          return cell.replace(/(?<!\\)\|/g, '\\|');
        });
        return '| ' + cells.join(' | ') + ' |';
      };

      // A rowspan may reach past the last <tr>; such phantom rows carry no
      // content of their own, so only real rows are rendered.
      const realRows = grid.slice(0, rows.length);
      const separator = '| ' + Array(maxColumns).fill('---').join(' | ') + ' |';

      const chunks: string[] = [renderRow(realRows[0] ?? []), separator];
      realRows.slice(1).forEach((row) => {
        chunks.push(renderRow(row));
      });

      return chunks.join('\n');
    } catch (error) {
      // Best effort: keep the original HTML rather than fail the whole document.
      return htmlTable;
    }
  });
}
|
||||||
|
|
||||||
|
const main = async ({ apikey, files, HTMLtable }: Props): Response => {
|
||||||
// Check the apikey
|
// Check the apikey
|
||||||
if (!apikey) {
|
if (!apikey) {
|
||||||
return Promise.reject(`API key is required`);
|
return Promise.reject(`API key is required`);
|
||||||
@@ -30,77 +112,88 @@ const main = async ({ apikey, files }: Props): Response => {
|
|||||||
for await (const url of files) {
|
for await (const url of files) {
|
||||||
try {
|
try {
|
||||||
//Fetch the pdf and check its content type
|
//Fetch the pdf and check its content type
|
||||||
const PDFResponse = await axiosInstance.get(url, { responseType: 'arraybuffer' });
|
const PDFResponse = await axios
|
||||||
|
.get(url, {
|
||||||
|
responseType: 'arraybuffer',
|
||||||
|
proxy: false,
|
||||||
|
timeout: 20000
|
||||||
|
})
|
||||||
|
.catch((error) => {
|
||||||
|
throw new Error(`[Fetch PDF Error] Failed to fetch PDF: ${getErrText(error)}`);
|
||||||
|
});
|
||||||
|
|
||||||
if (PDFResponse.status !== 200) {
|
if (PDFResponse.status !== 200) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`File:${url} \n<Content>\nFailed to fetch PDF from URL: ${PDFResponse.statusText}\n</Content>`
|
`[Fetch PDF Error] Failed with status ${PDFResponse.status}: ${PDFResponse.data}`
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = PDFResponse.headers['content-type'];
|
const contentType = PDFResponse.headers['content-type'];
|
||||||
const file_name = url.match(/read\/([^?]+)/)?.[1] || 'unknown.pdf';
|
const file_name = url.match(/read\/([^?]+)/)?.[1] || 'unknown.pdf';
|
||||||
if (!contentType || !contentType.startsWith('application/pdf')) {
|
if (!contentType || !contentType.startsWith('application/pdf')) {
|
||||||
throw new Error(
|
throw new Error(`The provided file does not point to a PDF: ${contentType}`);
|
||||||
`File:${file_name}\n<Content>\nThe provided file does not point to a PDF: ${contentType}\n</Content>`
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const blob = new Blob([PDFResponse.data], { type: 'application/pdf' });
|
const blob = new Blob([PDFResponse.data], { type: 'application/pdf' });
|
||||||
// Get pre-upload URL first
|
// Get pre-upload URL first
|
||||||
const preupload_response = await axiosInstance.post(
|
const preupload_response = await axiosInstance
|
||||||
'https://v2.doc2x.noedgeai.com/api/v2/parse/preupload',
|
.post('https://v2.doc2x.noedgeai.com/api/v2/parse/preupload', null, {
|
||||||
null,
|
|
||||||
{
|
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${apikey}`
|
Authorization: `Bearer ${apikey}`
|
||||||
}
|
}
|
||||||
}
|
})
|
||||||
);
|
.catch((error) => {
|
||||||
|
throw new Error(`[Pre-upload Error] Failed to get pre-upload URL: ${getErrText(error)}`);
|
||||||
|
});
|
||||||
|
|
||||||
if (preupload_response.status !== 200) {
|
if (preupload_response.status !== 200) {
|
||||||
throw new Error(
|
throw new Error(`Failed to get pre-upload URL: ${preupload_response.data}`);
|
||||||
`File:${file_name}\n<Content>\nFailed to get pre-upload URL: ${preupload_response.statusText}\n</Content>`
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const preupload_data = preupload_response.data;
|
const preupload_data = preupload_response.data;
|
||||||
if (preupload_data.code !== 'success') {
|
if (preupload_data.code !== 'success') {
|
||||||
throw new Error(
|
throw new Error(`Failed to get pre-upload URL: ${JSON.stringify(preupload_data)}`);
|
||||||
`File:${file_name}\n<Content>\nFailed to get pre-upload URL: ${JSON.stringify(preupload_data)}\n</Content>`
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const upload_url = preupload_data.data.url;
|
const upload_url = preupload_data.data.url;
|
||||||
const uid = preupload_data.data.uid;
|
const uid = preupload_data.data.uid;
|
||||||
// Upload file to pre-signed URL with binary stream
|
// Upload file to pre-signed URL with binary stream
|
||||||
|
|
||||||
const response = await axiosInstance.put(upload_url, blob, {
|
const response = await axiosInstance
|
||||||
|
.put(upload_url, blob, {
|
||||||
headers: {
|
headers: {
|
||||||
'Content-Type': 'application/pdf'
|
'Content-Type': 'application/pdf'
|
||||||
}
|
}
|
||||||
|
})
|
||||||
|
.catch((error) => {
|
||||||
|
throw new Error(`[Upload Error] Failed to upload file: ${getErrText(error)}`);
|
||||||
});
|
});
|
||||||
|
|
||||||
if (response.status !== 200) {
|
if (response.status !== 200) {
|
||||||
throw new Error(`Upload failed with status ${response.status}: ${response.statusText}`);
|
throw new Error(`Upload failed with status ${response.status}: ${response.statusText}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the result by uid
|
// Get the result by uid
|
||||||
|
|
||||||
// Wait for the result, at most 90s
|
// Wait for the result
|
||||||
const checkResult = async (retry = 30) => {
|
const checkResult = async (retry = 20) => {
|
||||||
if (retry <= 0)
|
if (retry <= 0)
|
||||||
return Promise.reject(
|
return Promise.reject(
|
||||||
`File:${file_name}\n<Content>\nFailed to get result (uid: ${uid}): Get result timeout\n</Content>`
|
`File:${file_name}\n<Content>\n[Parse Timeout Error] Failed to get result (uid: ${uid}): Process timeout\n</Content>`
|
||||||
);
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const result_response = await axiosInstance.get(
|
const result_response = await axiosInstance
|
||||||
`https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`,
|
.get(`https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`, {
|
||||||
{
|
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${apikey}`
|
Authorization: `Bearer ${apikey}`
|
||||||
}
|
}
|
||||||
}
|
})
|
||||||
|
.catch((error) => {
|
||||||
|
throw new Error(
|
||||||
|
`[Parse Status Error] Failed to get parse status: ${getErrText(error)}`
|
||||||
);
|
);
|
||||||
|
});
|
||||||
|
|
||||||
const result_data = result_response.data;
|
const result_data = result_response.data;
|
||||||
if (!['ok', 'success'].includes(result_data.code)) {
|
if (!['ok', 'success'].includes(result_data.code)) {
|
||||||
@@ -110,37 +203,43 @@ const main = async ({ apikey, files }: Props): Response => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (['ready', 'processing'].includes(result_data.data.status)) {
|
if (['ready', 'processing'].includes(result_data.data.status)) {
|
||||||
await delay(3000);
|
await delay(4000);
|
||||||
return checkResult(retry - 1);
|
return checkResult(retry - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (result_data.data.status === 'success') {
|
if (result_data.data.status === 'success') {
|
||||||
const result = (
|
const result = processContent(
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
result_data.data.result.pages.map((page: { md: any }) => page.md)
|
result_data.data.result.pages.map((page: { md: any }) => page.md)
|
||||||
).then((pages) => pages.join('\n'))
|
).then((pages) => pages.join('\n')),
|
||||||
|
HTMLtable
|
||||||
)
|
)
|
||||||
// Do some post-processing
|
// Do some post-processing
|
||||||
.replace(/\\[\(\)]/g, '$')
|
.replace(/\\[\(\)]/g, '$')
|
||||||
.replace(/\\[\[\]]/g, '$$')
|
.replace(/\\[\[\]]/g, '$$')
|
||||||
.replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '');
|
.replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '')
|
||||||
|
.replace(/<!-- Media -->/g, '')
|
||||||
|
.replace(/<!-- Footnote -->/g, '')
|
||||||
|
.replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$')
|
||||||
|
.replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}');
|
||||||
|
|
||||||
return `File:${file_name}\n<Content>\n${result}\n</Content>`;
|
return `File:${file_name}\n<Content>\n${result}\n</Content>`;
|
||||||
}
|
}
|
||||||
|
|
||||||
await delay(100);
|
|
||||||
return checkResult(retry - 1);
|
return checkResult(retry - 1);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
if (retry > 1) {
|
||||||
await delay(100);
|
await delay(100);
|
||||||
return checkResult(retry - 1);
|
return checkResult(retry - 1);
|
||||||
}
|
}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const result = await checkResult();
|
const result = await checkResult();
|
||||||
successResult.push(result);
|
successResult.push(result);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
failedResult.push(
|
failedResult.push(
|
||||||
`File:${url} \n<Content>\nFailed to fetch image from URL: ${getErrText(error)}\n</Content>`
|
`File:${url} \n<Content>\nFailed to fetch file from URL: ${getErrText(error)}\n</Content>`
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -7,10 +7,8 @@
|
|||||||
"courseUrl": "https://fael3z0zfze.feishu.cn/wiki/Rkc5witXWiJoi5kORd2cofh6nDg?fromScene=spaceOverview",
|
"courseUrl": "https://fael3z0zfze.feishu.cn/wiki/Rkc5witXWiJoi5kORd2cofh6nDg?fromScene=spaceOverview",
|
||||||
"showStatus": true,
|
"showStatus": true,
|
||||||
"weight": 10,
|
"weight": 10,
|
||||||
|
|
||||||
"isTool": true,
|
"isTool": true,
|
||||||
"templateType": "tools",
|
"templateType": "tools",
|
||||||
|
|
||||||
"workflow": {
|
"workflow": {
|
||||||
"nodes": [
|
"nodes": [
|
||||||
{
|
{
|
||||||
@@ -52,6 +50,26 @@
|
|||||||
"canSelectImg": false,
|
"canSelectImg": false,
|
||||||
"maxFiles": 14,
|
"maxFiles": 14,
|
||||||
"defaultValue": ""
|
"defaultValue": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"renderTypeList": ["switch", "reference"],
|
||||||
|
"selectedTypeIndex": 0,
|
||||||
|
"valueType": "boolean",
|
||||||
|
"canEdit": true,
|
||||||
|
"key": "HTMLtable",
|
||||||
|
"label": "HTMLtable",
|
||||||
|
"description": "是否以HTML格式输出表格。如果需要精确地输出表格,请打开此开关以使用HTML格式。关闭后,表格将转换为Markdown形式输出,但这可能会损失一些表格特性,如合并单元格。",
|
||||||
|
"defaultValue": false,
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"label": "",
|
||||||
|
"value": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"maxFiles": 5,
|
||||||
|
"canSelectFile": true,
|
||||||
|
"canSelectImg": true,
|
||||||
|
"required": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -68,6 +86,13 @@
|
|||||||
"key": "files",
|
"key": "files",
|
||||||
"label": "files",
|
"label": "files",
|
||||||
"type": "hidden"
|
"type": "hidden"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "htmltable",
|
||||||
|
"valueType": "boolean",
|
||||||
|
"key": "HTMLtable",
|
||||||
|
"label": "HTMLtable",
|
||||||
|
"type": "hidden"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -220,7 +245,7 @@
|
|||||||
"key": "system_httpJsonBody",
|
"key": "system_httpJsonBody",
|
||||||
"renderTypeList": ["hidden"],
|
"renderTypeList": ["hidden"],
|
||||||
"valueType": "any",
|
"valueType": "any",
|
||||||
"value": "{\n \"apikey\": \"{{apikey}}\",\n \"files\": {{files}}\n}",
|
"value": "{\n \"apikey\": \"{{apikey}}\",\n \"HTMLtable\": {{HTMLtable}},\n \"files\": {{files}}\n}",
|
||||||
"label": "",
|
"label": "",
|
||||||
"required": false,
|
"required": false,
|
||||||
"debugLabel": "",
|
"debugLabel": "",
|
||||||
@@ -305,6 +330,36 @@
|
|||||||
},
|
},
|
||||||
"required": true,
|
"required": true,
|
||||||
"value": [["pluginInput", "url"]]
|
"value": [["pluginInput", "url"]]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"renderTypeList": ["reference"],
|
||||||
|
"valueType": "boolean",
|
||||||
|
"canEdit": true,
|
||||||
|
"key": "HTMLtable",
|
||||||
|
"label": "HTMLtable",
|
||||||
|
"customInputConfig": {
|
||||||
|
"selectValueTypeList": [
|
||||||
|
"string",
|
||||||
|
"number",
|
||||||
|
"boolean",
|
||||||
|
"object",
|
||||||
|
"arrayString",
|
||||||
|
"arrayNumber",
|
||||||
|
"arrayBoolean",
|
||||||
|
"arrayObject",
|
||||||
|
"arrayAny",
|
||||||
|
"any",
|
||||||
|
"chatHistory",
|
||||||
|
"datasetQuote",
|
||||||
|
"dynamic",
|
||||||
|
"selectApp",
|
||||||
|
"selectDataset"
|
||||||
|
],
|
||||||
|
"showDescription": false,
|
||||||
|
"showDefaultValue": true
|
||||||
|
},
|
||||||
|
"required": true,
|
||||||
|
"value": ["pluginInput", "htmltable"]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
Reference in New Issue
Block a user