mirror of
https://github.com/labring/FastGPT.git
synced 2025-10-18 09:24:03 +00:00
Perf system plugin and worker (#2126)
* perf: worker pool * perf: worker register * perf: worker controller * perf: system plugin worker * perf: system plugin worker * perf: worker * perf: worker * worker timeout * perf: copy icon
This commit is contained in:
25
packages/service/worker/readFile/extension/csv.ts
Normal file
25
packages/service/worker/readFile/extension/csv.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
import Papa from 'papaparse';
|
||||
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
|
||||
import { readFileRawText } from './rawText';
|
||||
|
||||
// 加载源文件内容
|
||||
export const readCsvRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
||||
const { rawText } = readFileRawText(params);
|
||||
|
||||
const csvArr = Papa.parse(rawText).data as string[][];
|
||||
|
||||
const header = csvArr[0];
|
||||
|
||||
// format to md table
|
||||
const formatText = `| ${header.join(' | ')} |
|
||||
| ${header.map(() => '---').join(' | ')} |
|
||||
${csvArr
|
||||
.slice(1)
|
||||
.map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`)
|
||||
.join('\n')}`;
|
||||
|
||||
return {
|
||||
rawText,
|
||||
formatText
|
||||
};
|
||||
};
|
23
packages/service/worker/readFile/extension/docx.ts
Normal file
23
packages/service/worker/readFile/extension/docx.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import mammoth from 'mammoth';
|
||||
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
|
||||
import { html2md } from '../../htmlStr2Md/utils';
|
||||
|
||||
/**
|
||||
* read docx to markdown
|
||||
*/
|
||||
export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
||||
try {
|
||||
const { value: html } = await mammoth.convertToHtml({
|
||||
buffer
|
||||
});
|
||||
|
||||
const rawText = html2md(html);
|
||||
|
||||
return {
|
||||
rawText
|
||||
};
|
||||
} catch (error) {
|
||||
console.log('error doc read:', error);
|
||||
return Promise.reject('Can not read doc file, please convert to PDF');
|
||||
}
|
||||
};
|
13
packages/service/worker/readFile/extension/html.ts
Normal file
13
packages/service/worker/readFile/extension/html.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
|
||||
import { readFileRawText } from './rawText';
|
||||
import { html2md } from '../../htmlStr2Md/utils';
|
||||
|
||||
export const readHtmlRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
||||
const { rawText: html } = readFileRawText(params);
|
||||
|
||||
const rawText = html2md(html);
|
||||
|
||||
return {
|
||||
rawText
|
||||
};
|
||||
};
|
74
packages/service/worker/readFile/extension/pdf.ts
Normal file
74
packages/service/worker/readFile/extension/pdf.ts
Normal file
@@ -0,0 +1,74 @@
|
||||
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
// @ts-ignore
|
||||
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
|
||||
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
|
||||
|
||||
type TokenType = {
|
||||
str: string;
|
||||
dir: string;
|
||||
width: number;
|
||||
height: number;
|
||||
transform: number[];
|
||||
fontName: string;
|
||||
hasEOL: boolean;
|
||||
};
|
||||
|
||||
export const readPdfFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
||||
const readPDFPage = async (doc: any, pageNo: number) => {
|
||||
try {
|
||||
const page = await doc.getPage(pageNo);
|
||||
const tokenizedText = await page.getTextContent();
|
||||
|
||||
const viewport = page.getViewport({ scale: 1 });
|
||||
const pageHeight = viewport.height;
|
||||
const headerThreshold = pageHeight * 0.95;
|
||||
const footerThreshold = pageHeight * 0.05;
|
||||
|
||||
const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
|
||||
return (
|
||||
!token.transform ||
|
||||
(token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
|
||||
);
|
||||
});
|
||||
|
||||
// concat empty string 'hasEOL'
|
||||
for (let i = 0; i < pageTexts.length; i++) {
|
||||
const item = pageTexts[i];
|
||||
if (item.str === '' && pageTexts[i - 1]) {
|
||||
pageTexts[i - 1].hasEOL = item.hasEOL;
|
||||
pageTexts.splice(i, 1);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
page.cleanup();
|
||||
|
||||
return pageTexts
|
||||
.map((token) => {
|
||||
const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
|
||||
|
||||
return paragraphEnd ? `${token.str}\n` : token.str;
|
||||
})
|
||||
.join('');
|
||||
} catch (error) {
|
||||
console.log('pdf read error', error);
|
||||
return '';
|
||||
}
|
||||
};
|
||||
|
||||
const loadingTask = pdfjs.getDocument(buffer.buffer);
|
||||
const doc = await loadingTask.promise;
|
||||
|
||||
// Avoid OOM.
|
||||
let result = '';
|
||||
const pageArr = Array.from({ length: doc.numPages }, (_, i) => i + 1);
|
||||
for await (const pageNo of pageArr) {
|
||||
result += await readPDFPage(doc, pageNo);
|
||||
}
|
||||
|
||||
loadingTask.destroy();
|
||||
|
||||
return {
|
||||
rawText: result
|
||||
};
|
||||
};
|
18
packages/service/worker/readFile/extension/pptx.ts
Normal file
18
packages/service/worker/readFile/extension/pptx.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
|
||||
// import { parseOfficeAsync } from 'officeparser';
|
||||
import { parseOffice } from '../parseOffice';
|
||||
|
||||
export const readPptxRawText = async ({
|
||||
buffer,
|
||||
encoding
|
||||
}: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
||||
const result = await parseOffice({
|
||||
buffer,
|
||||
encoding: encoding as BufferEncoding,
|
||||
extension: 'pptx'
|
||||
});
|
||||
|
||||
return {
|
||||
rawText: result
|
||||
};
|
||||
};
|
28
packages/service/worker/readFile/extension/rawText.ts
Normal file
28
packages/service/worker/readFile/extension/rawText.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import iconv from 'iconv-lite';
|
||||
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
|
||||
|
||||
const rawEncodingList = [
|
||||
'ascii',
|
||||
'utf8',
|
||||
'utf-8',
|
||||
'utf16le',
|
||||
'utf-16le',
|
||||
'ucs2',
|
||||
'ucs-2',
|
||||
'base64',
|
||||
'base64url',
|
||||
'latin1',
|
||||
'binary',
|
||||
'hex'
|
||||
];
|
||||
|
||||
// 加载源文件内容
|
||||
export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): ReadFileResponse => {
|
||||
const content = rawEncodingList.includes(encoding)
|
||||
? buffer.toString(encoding as BufferEncoding)
|
||||
: iconv.decode(buffer, 'gbk');
|
||||
|
||||
return {
|
||||
rawText: content
|
||||
};
|
||||
};
|
46
packages/service/worker/readFile/extension/xlsx.ts
Normal file
46
packages/service/worker/readFile/extension/xlsx.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
import { CUSTOM_SPLIT_SIGN } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
|
||||
import xlsx from 'node-xlsx';
|
||||
import Papa from 'papaparse';
|
||||
|
||||
export const readXlsxRawText = async ({
|
||||
buffer
|
||||
}: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
|
||||
const result = xlsx.parse(buffer, {
|
||||
skipHidden: false,
|
||||
defval: ''
|
||||
});
|
||||
|
||||
const format2Csv = result.map(({ name, data }) => {
|
||||
return {
|
||||
title: `#${name}`,
|
||||
csvText: data.map((item) => item.join(',')).join('\n')
|
||||
};
|
||||
});
|
||||
|
||||
const rawText = format2Csv.map((item) => item.csvText).join('\n');
|
||||
|
||||
const formatText = format2Csv
|
||||
.map((item) => {
|
||||
const csvArr = Papa.parse(item.csvText).data as string[][];
|
||||
const header = csvArr[0];
|
||||
|
||||
if (!header) return;
|
||||
|
||||
const formatText = `| ${header.join(' | ')} |
|
||||
| ${header.map(() => '---').join(' | ')} |
|
||||
${csvArr
|
||||
.slice(1)
|
||||
.map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`)
|
||||
.join('\n')}`;
|
||||
|
||||
return formatText;
|
||||
})
|
||||
.filter(Boolean)
|
||||
.join(CUSTOM_SPLIT_SIGN);
|
||||
|
||||
return {
|
||||
rawText: rawText,
|
||||
formatText
|
||||
};
|
||||
};
|
Reference in New Issue
Block a user