Perf system plugin and worker (#2126)

* perf: worker pool

* perf: worker register

* perf: worker controller

* perf: system plugin worker

* perf: system plugin worker

* perf: worker

* perf: worker

* worker timeout

* perf: copy icon
Author: Archer · 2024-07-23 11:23:42 +08:00 · committed by GitHub
Parent: a4787bce5c · Commit: e99c91aaa6
34 changed files with 433 additions and 235 deletions


@@ -0,0 +1,25 @@
import Papa from 'papaparse';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import { readFileRawText } from './rawText';

// Load the source file content
export const readCsvRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
  const { rawText } = readFileRawText(params);

  const csvArr = Papa.parse(rawText).data as string[][];
  const header = csvArr[0];

  // Format to a markdown table (guard against an empty parse result)
  const formatText = header
    ? `| ${header.join(' | ')} |
| ${header.map(() => '---').join(' | ')} |
${csvArr
        .slice(1)
        .map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`)
        .join('\n')}`
    : '';

  return {
    rawText,
    formatText
  };
};
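For reference, a minimal sketch of what this reader produces for a small CSV. The import paths are illustrative (the diff does not show file paths), and the cast hedges that ReadRawTextByBuffer may carry more fields than the buffer and encoding seen in this commit:

import { readCsvRawText } from './csv'; // illustrative path
import type { ReadRawTextByBuffer } from '../type'; // illustrative path

const params = {
  buffer: Buffer.from('name,role\nArcher,maintainer', 'utf-8'),
  encoding: 'utf-8'
} as ReadRawTextByBuffer; // hypothetical: the full type is not shown in this diff

const { rawText, formatText } = await readCsvRawText(params);
// rawText keeps the original CSV text; formatText is a markdown table:
// | name | role |
// | --- | --- |
// | Archer | maintainer |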


@@ -0,0 +1,23 @@
import mammoth from 'mammoth';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import { html2md } from '../../htmlStr2Md/utils';

/**
 * Read a docx file and convert it to markdown
 */
export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
  try {
    // mammoth converts the docx buffer to HTML; html2md then converts the HTML to markdown
    const { value: html } = await mammoth.convertToHtml({
      buffer
    });

    const rawText = html2md(html);

    return {
      rawText
    };
  } catch (error) {
    console.log('docx read error:', error);
    return Promise.reject('Cannot read docx file, please convert it to PDF');
  }
};
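A usage sketch, assuming a Node Buffer read from disk (the file path and import path are illustrative, and the cast hedges the full parameter type):

import { readFileSync } from 'fs';
import { readDocsFile } from './docx'; // illustrative path

const buffer = readFileSync('./sample.docx'); // hypothetical file
try {
  const { rawText } = await readDocsFile({ buffer } as any);
  console.log(rawText.slice(0, 200)); // markdown produced by mammoth -> html2md
} catch (message) {
  // The reader rejects with a plain string when mammoth cannot parse the buffer
  console.error(message);
}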


@@ -0,0 +1,13 @@
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import { readFileRawText } from './rawText';
import { html2md } from '../../htmlStr2Md/utils';

export const readHtmlRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
  const { rawText: html } = readFileRawText(params);

  const rawText = html2md(html);

  return {
    rawText
  };
};
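A sketch of the transformation. html2md is internal to this repo, so the exact markdown below is an inference from its name and usage, not a verified output:

import { readHtmlRawText } from './html'; // illustrative path

const buffer = Buffer.from('<h1>Title</h1><p>Hello <b>world</b></p>', 'utf-8');
const { rawText } = await readHtmlRawText({ buffer, encoding: 'utf-8' } as any);
// rawText is markdown, roughly:
// # Title
//
// Hello **world**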


@@ -0,0 +1,74 @@
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
// @ts-ignore
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';

type TokenType = {
  str: string;
  dir: string;
  width: number;
  height: number;
  transform: number[];
  fontName: string;
  hasEOL: boolean;
};

export const readPdfFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
  const readPDFPage = async (doc: any, pageNo: number) => {
    try {
      const page = await doc.getPage(pageNo);
      const tokenizedText = await page.getTextContent();

      const viewport = page.getViewport({ scale: 1 });
      const pageHeight = viewport.height;
      const headerThreshold = pageHeight * 0.95;
      const footerThreshold = pageHeight * 0.05;

      // Drop tokens in the top and bottom 5% of the page (likely running headers/footers)
      const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
        return (
          !token.transform ||
          (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
        );
      });

      // Merge empty-string tokens into the previous token, carrying over hasEOL
      for (let i = 0; i < pageTexts.length; i++) {
        const item = pageTexts[i];
        if (item.str === '' && pageTexts[i - 1]) {
          pageTexts[i - 1].hasEOL = item.hasEOL;
          pageTexts.splice(i, 1);
          i--;
        }
      }

      page.cleanup();

      return pageTexts
        .map((token) => {
          // Insert a newline only where a line break also ends a sentence
          const paragraphEnd = token.hasEOL && /([。？！.?!\n\r]|(\r\n))$/.test(token.str);

          return paragraphEnd ? `${token.str}\n` : token.str;
        })
        .join('');
    } catch (error) {
      console.log('pdf read error', error);
      return '';
    }
  };

  const loadingTask = pdfjs.getDocument(buffer.buffer);
  const doc = await loadingTask.promise;

  // Read pages sequentially rather than in parallel to avoid OOM on large documents
  let result = '';
  const pageArr = Array.from({ length: doc.numPages }, (_, i) => i + 1);
  for (const pageNo of pageArr) {
    result += await readPDFPage(doc, pageNo);
  }

  loadingTask.destroy();

  return {
    rawText: result
  };
};
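To make the header/footer filter concrete: transform[5] is the y translation in PDF user space, whose origin is at the bottom-left, so high y values sit at the top of the page. For a US-Letter page (792pt tall, an assumed example), the same check in isolation looks like this:

const pageHeight = 792; // US Letter, in points (illustrative)
const headerThreshold = pageHeight * 0.95; // 752.4
const footerThreshold = pageHeight * 0.05; // 39.6

// Keep only tokens that sit in the middle 90% of the page
const keepToken = (y: number) => y < headerThreshold && y > footerThreshold;

keepToken(770); // false -> likely a running header
keepToken(400); // true  -> body text
keepToken(20); // false -> likely a page number / footer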


@@ -0,0 +1,18 @@
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
// import { parseOfficeAsync } from 'officeparser';
import { parseOffice } from '../parseOffice';

export const readPptxRawText = async ({
  buffer,
  encoding
}: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
  const result = await parseOffice({
    buffer,
    encoding: encoding as BufferEncoding,
    extension: 'pptx'
  });

  return {
    rawText: result
  };
};
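Usage mirrors the other readers; the notable detail is that the caller's encoding is forwarded and the extension is pinned to 'pptx' so the internal parseOffice helper knows how to unpack the archive. A sketch with illustrative paths:

import { readFileSync } from 'fs';
import { readPptxRawText } from './pptx'; // illustrative path

const buffer = readFileSync('./slides.pptx'); // hypothetical file
const { rawText } = await readPptxRawText({ buffer, encoding: 'utf-8' } as any);
// rawText holds the slide text extracted by parseOffice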


@@ -0,0 +1,28 @@
import iconv from 'iconv-lite';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';

// Encodings that Buffer.toString can decode natively
const rawEncodingList = [
  'ascii',
  'utf8',
  'utf-8',
  'utf16le',
  'utf-16le',
  'ucs2',
  'ucs-2',
  'base64',
  'base64url',
  'latin1',
  'binary',
  'hex'
];

// Load the source file content
export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): ReadFileResponse => {
  // Fall back to GBK via iconv-lite for any encoding Buffer cannot decode natively
  const content = rawEncodingList.includes(encoding)
    ? buffer.toString(encoding as BufferEncoding)
    : iconv.decode(buffer, 'gbk');

  return {
    rawText: content
  };
};
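The fallback matters for files that are not valid UTF-8: any detected encoding outside Node's native list is decoded as GBK. A sketch of both branches (the import path is illustrative; using 'gbk' as the detected encoding is an assumed example of a non-native case):

import iconv from 'iconv-lite';
import { readFileRawText } from './rawText'; // illustrative path

// Native branch: Buffer.toString handles utf-8 directly
const utf8 = readFileRawText({
  buffer: Buffer.from('hello', 'utf-8'),
  encoding: 'utf-8'
} as any);
// utf8.rawText === 'hello'

// Fallback branch: a GBK-encoded buffer with an encoding Node cannot decode
const gbkBuffer = iconv.encode('你好', 'gbk');
const gbk = readFileRawText({ buffer: gbkBuffer, encoding: 'gbk' } as any);
// 'gbk' is not in rawEncodingList, so iconv.decode(buffer, 'gbk') runs
// gbk.rawText === '你好'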


@@ -0,0 +1,46 @@
import { CUSTOM_SPLIT_SIGN } from '@fastgpt/global/common/string/textSplitter';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import xlsx from 'node-xlsx';
import Papa from 'papaparse';
export const readXlsxRawText = async ({
buffer
}: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
const result = xlsx.parse(buffer, {
skipHidden: false,
defval: ''
});
const format2Csv = result.map(({ name, data }) => {
return {
title: `#${name}`,
csvText: data.map((item) => item.join(',')).join('\n')
};
});
const rawText = format2Csv.map((item) => item.csvText).join('\n');
const formatText = format2Csv
.map((item) => {
const csvArr = Papa.parse(item.csvText).data as string[][];
const header = csvArr[0];
if (!header) return;
const formatText = `| ${header.join(' | ')} |
| ${header.map(() => '---').join(' | ')} |
${csvArr
.slice(1)
.map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`)
.join('\n')}`;
return formatText;
})
.filter(Boolean)
.join(CUSTOM_SPLIT_SIGN);
return {
rawText: rawText,
formatText
};
};
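A sketch of how a multi-sheet workbook flows through this reader. node-xlsx's parse returns one { name, data } entry per sheet; the file path, import path, and concrete cell values below are illustrative:

import { readFileSync } from 'fs';
import { readXlsxRawText } from './xlsx'; // illustrative path

const buffer = readFileSync('./workbook.xlsx'); // hypothetical two-sheet workbook
const { rawText, formatText } = await readXlsxRawText({ buffer } as any);

// rawText: every sheet flattened to comma-joined rows, concatenated with '\n'
// formatText: one markdown table per sheet, joined by CUSTOM_SPLIT_SIGN so
// downstream chunking can split on sheet boundaries, e.g.
// | name | role |
// | --- | --- |
// | Archer | maintainer |
// ...CUSTOM_SPLIT_SIGN...
// | repo | stars |
// | --- | --- |
// | FastGPT | 20000 |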