Perf: read file woker (#1337)

* perf: read file worker

* fix: Http node url input

* fix: htm2md

* fix: html2md

* fix: ts

* perf: Problem classification increases the matching order

* feat: tool response answer
This commit is contained in:
Archer
2024-04-30 18:12:20 +08:00
committed by GitHub
parent 1529c1e991
commit b5f0ac3e1d
35 changed files with 413 additions and 398 deletions

View File

@@ -0,0 +1,21 @@
import Papa from 'papaparse';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import { readFileRawText } from './rawText';
// 加载源文件内容
export const readCsvRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
const { rawText } = readFileRawText(params);
const csvArr = Papa.parse(rawText).data as string[][];
const header = csvArr[0];
const formatText = header
? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
: '';
return {
rawText,
formatText
};
};

View File

@@ -0,0 +1,23 @@
import mammoth from 'mammoth';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import { html2md } from '../../htmlStr2Md/utils';
/**
* read docx to markdown
*/
export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
try {
const { value: html } = await mammoth.convertToHtml({
buffer
});
const rawText = html2md(html);
return {
rawText
};
} catch (error) {
console.log('error doc read:', error);
return Promise.reject('Can not read doc file, please convert to PDF');
}
};

View File

@@ -0,0 +1,13 @@
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import { readFileRawText } from './rawText';
import { html2md } from '../../htmlStr2Md/utils';
export const readHtmlRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
const { rawText: html } = readFileRawText(params);
const rawText = html2md(html);
return {
rawText
};
};

View File

@@ -0,0 +1,68 @@
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
// @ts-ignore
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
type TokenType = {
str: string;
dir: string;
width: number;
height: number;
transform: number[];
fontName: string;
hasEOL: boolean;
};
export const readPdfFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
const readPDFPage = async (doc: any, pageNo: number) => {
const page = await doc.getPage(pageNo);
const tokenizedText = await page.getTextContent();
const viewport = page.getViewport({ scale: 1 });
const pageHeight = viewport.height;
const headerThreshold = pageHeight * 0.95;
const footerThreshold = pageHeight * 0.05;
const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
return (
!token.transform ||
(token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
);
});
// concat empty string 'hasEOL'
for (let i = 0; i < pageTexts.length; i++) {
const item = pageTexts[i];
if (item.str === '' && pageTexts[i - 1]) {
pageTexts[i - 1].hasEOL = item.hasEOL;
pageTexts.splice(i, 1);
i--;
}
}
page.cleanup();
return pageTexts
.map((token) => {
const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
return paragraphEnd ? `${token.str}\n` : token.str;
})
.join('');
};
const loadingTask = pdfjs.getDocument(buffer.buffer);
const doc = await loadingTask.promise;
const pageTextPromises = [];
for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
pageTextPromises.push(readPDFPage(doc, pageNo));
}
const pageTexts = await Promise.all(pageTextPromises);
loadingTask.destroy();
return {
rawText: pageTexts.join('')
};
};

View File

@@ -0,0 +1,18 @@
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
// import { parseOfficeAsync } from 'officeparser';
import { parseOffice } from '../parseOffice';
export const readPptxRawText = async ({
buffer,
encoding
}: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
const result = await parseOffice({
buffer,
encoding: encoding as BufferEncoding,
extension: 'pptx'
});
return {
rawText: result
};
};

View File

@@ -0,0 +1,28 @@
import iconv from 'iconv-lite';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
const rawEncodingList = [
'ascii',
'utf8',
'utf-8',
'utf16le',
'utf-16le',
'ucs2',
'ucs-2',
'base64',
'base64url',
'latin1',
'binary',
'hex'
];
// 加载源文件内容
export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): ReadFileResponse => {
const content = rawEncodingList.includes(encoding)
? buffer.toString(encoding as BufferEncoding)
: iconv.decode(buffer, 'gbk');
return {
rawText: content
};
};

View File

@@ -0,0 +1,45 @@
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import xlsx from 'node-xlsx';
import Papa from 'papaparse';
export const readXlsxRawText = async ({
buffer
}: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
const result = xlsx.parse(buffer, {
skipHidden: false,
defval: ''
});
const format2Csv = result.map(({ name, data }) => {
return {
title: `#${name}`,
csvText: data.map((item) => item.join(',')).join('\n')
};
});
const rawText = format2Csv.map((item) => item.csvText).join('\n');
const formatText = format2Csv
.map((item) => {
const csvArr = Papa.parse(item.csvText).data as string[][];
const header = csvArr[0];
const formatText = header
? csvArr
.map((item) =>
item
.map((item, i) => (item ? `${header[i]}:${item}` : ''))
.filter(Boolean)
.join('\n')
)
.join('\n')
: '';
return `${item.title}\n${formatText}`;
})
.join('\n');
return {
rawText: rawText,
formatText
};
};

View File

@@ -0,0 +1,119 @@
import { getNanoid } from '@fastgpt/global/common/string/tools';
import fs from 'fs';
import decompress from 'decompress';
import { DOMParser } from '@xmldom/xmldom';
import { clearDirFiles } from '../../common/file/utils';
import { addLog } from '../../common/system/log';
const DEFAULTDECOMPRESSSUBLOCATION = '/tmp';
function getNewFileName(ext: string) {
return `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}.${ext}`;
}
const parseString = (xml: string) => {
let parser = new DOMParser();
return parser.parseFromString(xml, 'text/xml');
};
const parsePowerPoint = async ({
filepath,
decompressPath,
encoding
}: {
filepath: string;
decompressPath: string;
encoding: BufferEncoding;
}) => {
// Files regex that hold our content of interest
const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+.xml/g;
const slidesRegex = /ppt\/slides\/slide\d+.xml/g;
/** The decompress location which contains the filename in it */
const files = await decompress(filepath, decompressPath, {
filter: (x) => !!x.path.match(allFilesRegex)
});
// Verify if atleast the slides xml files exist in the extracted files list.
if (
files.length == 0 ||
!files.map((file) => file.path).some((filename) => filename.match(slidesRegex))
) {
return Promise.reject('解析 PPT 失败');
}
// Returning an array of all the xml contents read using fs.readFileSync
const xmlContentArray = files.map((file) =>
fs.readFileSync(`${decompressPath}/${file.path}`, encoding)
);
let responseArr: string[] = [];
xmlContentArray.forEach((xmlContent) => {
/** Find text nodes with a:p tags */
const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName('a:p');
/** Store all the text content to respond */
responseArr.push(
Array.from(xmlParagraphNodesList)
// Filter paragraph nodes than do not have any text nodes which are identifiable by a:t tag
.filter((paragraphNode) => paragraphNode.getElementsByTagName('a:t').length != 0)
.map((paragraphNode) => {
/** Find text nodes with a:t tags */
const xmlTextNodeList = paragraphNode.getElementsByTagName('a:t');
return Array.from(xmlTextNodeList)
.filter((textNode) => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)
.map((textNode) => textNode.childNodes[0].nodeValue)
.join('');
})
.join('\n')
);
});
return responseArr.join('\n');
};
export const parseOffice = async ({
buffer,
encoding,
extension
}: {
buffer: Buffer;
encoding: BufferEncoding;
extension: string;
}) => {
// Prepare file for processing
// create temp file subdirectory if it does not exist
if (!fs.existsSync(DEFAULTDECOMPRESSSUBLOCATION)) {
fs.mkdirSync(DEFAULTDECOMPRESSSUBLOCATION, { recursive: true });
}
// temp file name
const filepath = getNewFileName(extension);
const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}`;
// const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/test`;
// write new file
fs.writeFileSync(filepath, buffer, {
encoding
});
const text = await (async () => {
try {
switch (extension) {
case 'pptx':
return parsePowerPoint({ filepath, decompressPath, encoding });
default:
return Promise.reject('只能读取 .pptx 文件');
}
} catch (error) {
addLog.error(`Load ppt error`, { error });
}
return '';
})();
fs.unlinkSync(filepath);
clearDirFiles(decompressPath);
return text;
};

View File

@@ -0,0 +1,71 @@
import { parentPort } from 'worker_threads';
import { readFileRawText } from './extension/rawText';
import { ReadRawTextByBuffer, ReadRawTextProps } from './type';
import { readHtmlRawText } from './extension/html';
import { readPdfFile } from './extension/pdf';
import { readDocsFile } from './extension/docx';
import { readPptxRawText } from './extension/pptx';
import { readXlsxRawText } from './extension/xlsx';
import { readCsvRawText } from './extension/csv';
parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
const readFileRawContent = async (params: ReadRawTextByBuffer) => {
switch (params.extension) {
case 'txt':
case 'md':
return readFileRawText(params);
case 'html':
return readHtmlRawText(params);
case 'pdf':
return readPdfFile(params);
case 'docx':
return readDocsFile(params);
case 'pptx':
return readPptxRawText(params);
case 'xlsx':
const xlsxResult = await readXlsxRawText(params);
if (params.csvFormat) {
return {
rawText: xlsxResult.formatText || ''
};
}
return {
rawText: xlsxResult.rawText
};
case 'csv':
const csvResult = await readCsvRawText(params);
if (params.csvFormat) {
return {
rawText: csvResult.formatText || ''
};
}
return {
rawText: csvResult.rawText
};
default:
return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
}
};
// params.buffer: Uint8Array -> buffer
const buffer = Buffer.from(props.buffer);
const newProps: ReadRawTextByBuffer = {
...props,
buffer
};
try {
parentPort?.postMessage({
type: 'success',
data: await readFileRawContent(newProps)
});
} catch (error) {
console.log(error);
parentPort?.postMessage({
type: 'error',
data: error
});
}
global?.close?.();
});

15
packages/service/worker/file/type.d.ts vendored Normal file
View File

@@ -0,0 +1,15 @@
import { ReadFileByBufferParams } from '../../common/file/read/type';
export type ReadRawTextProps<T> = {
csvFormat?: boolean;
extension: string;
buffer: T;
encoding: string;
};
export type ReadRawTextByBuffer = ReadRawTextProps<Buffer>;
export type ReadFileResponse = {
rawText: string;
formatText?: string;
};

View File

@@ -1,60 +0,0 @@
import { parentPort } from 'worker_threads';
import TurndownService from 'turndown';
//@ts-ignore
import domino from 'domino';
//@ts-ignore
import * as turndownPluginGfm from 'joplin-turndown-plugin-gfm';
const turndownService = new TurndownService({
headingStyle: 'atx',
bulletListMarker: '-',
codeBlockStyle: 'fenced',
fence: '```',
emDelimiter: '_',
strongDelimiter: '**',
linkStyle: 'inlined',
linkReferenceStyle: 'full'
});
parentPort?.on('message', (params: { html: string }) => {
const html2md = (html: string): string => {
try {
const window = domino.createWindow(html);
const document = window.document;
turndownService.remove(['i', 'script', 'iframe']);
turndownService.addRule('codeBlock', {
filter: 'pre',
replacement(_, node) {
const content = node.textContent?.trim() || '';
// @ts-ignore
const codeName = node?._attrsByQName?.class?.data?.trim() || '';
return `\n\`\`\`${codeName}\n${content}\n\`\`\`\n`;
}
});
turndownService.use(turndownPluginGfm.gfm);
// @ts-ignore
return turndownService.turndown(document);
} catch (error) {
return '';
}
};
try {
const md = html2md(params?.html || '');
parentPort?.postMessage({
type: 'success',
data: md
});
} catch (error) {
parentPort?.postMessage({
type: 'error',
data: error
});
}
global?.close?.();
});

View File

@@ -0,0 +1,20 @@
import { parentPort } from 'worker_threads';
import { html2md } from './utils';
parentPort?.on('message', (params: { html: string }) => {
try {
const md = html2md(params?.html || '');
parentPort?.postMessage({
type: 'success',
data: md
});
} catch (error) {
parentPort?.postMessage({
type: 'error',
data: error
});
}
global?.close?.();
});

View File

@@ -0,0 +1,40 @@
import TurndownService from 'turndown';
const domino = require('domino-ext');
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
export const html2md = (html: string): string => {
const turndownService = new TurndownService({
headingStyle: 'atx',
bulletListMarker: '-',
codeBlockStyle: 'fenced',
fence: '```',
emDelimiter: '_',
strongDelimiter: '**',
linkStyle: 'inlined',
linkReferenceStyle: 'full'
});
try {
const window = domino.createWindow(html);
const document = window.document;
turndownService.remove(['i', 'script', 'iframe']);
turndownService.addRule('codeBlock', {
filter: 'pre',
replacement(_, node) {
const content = node.textContent?.trim() || '';
// @ts-ignore
const codeName = node?._attrsByQName?.class?.data?.trim() || '';
return `\n\`\`\`${codeName}\n${content}\n\`\`\`\n`;
}
});
turndownService.use(turndownPluginGfm.gfm);
return turndownService.turndown(document);
} catch (error) {
console.log('html 2 markdown error', error);
return '';
}
};

View File

@@ -2,6 +2,7 @@ import { Worker } from 'worker_threads';
import path from 'path';
export enum WorkerNameEnum {
readFile = 'readFile',
htmlStr2Md = 'htmlStr2Md',
countGptMessagesTokens = 'countGptMessagesTokens'
}