perf: memory leak (#5370)

* perf: memory leak

* perf: workflow share buffer; Circle checker; Get file from stream

* doc

* remove report.md
This commit is contained in:
Archer
2025-08-03 22:37:45 +08:00
committed by GitHub
parent baf18b14d4
commit 7bcee82f5f
21 changed files with 525 additions and 349 deletions

View File

@@ -56,16 +56,16 @@ export const readPdfFile = async ({ buffer }: ReadRawTextByBuffer): Promise<Read
}
};
// @ts-ignore
const loadingTask = pdfjs.getDocument(buffer.buffer);
// Create a completely new ArrayBuffer to avoid SharedArrayBuffer transferList issues
const uint8Array = new Uint8Array(buffer.byteLength);
uint8Array.set(new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength));
const loadingTask = pdfjs.getDocument({ data: uint8Array });
const doc = await loadingTask.promise;
// Avoid OOM.
let result = '';
const pageArr = Array.from({ length: doc.numPages }, (_, i) => i + 1);
for (let i = 0; i < pageArr.length; i++) {
result += await readPDFPage(doc, i + 1);
}
const result = (
await Promise.all(pageArr.map(async (page) => await readPDFPage(doc, page)))
).join('');
loadingTask.destroy();

View File

@@ -9,49 +9,60 @@ import { readXlsxRawText } from './extension/xlsx';
import { readCsvRawText } from './extension/csv';
import { workerResponse } from '../controller';
parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
const read = async (params: ReadRawTextByBuffer) => {
switch (params.extension) {
case 'txt':
case 'md':
return readFileRawText(params);
case 'html':
return readHtmlRawText(params);
case 'pdf':
return readPdfFile(params);
case 'docx':
return readDocsFile(params);
case 'pptx':
return readPptxRawText(params);
case 'xlsx':
return readXlsxRawText(params);
case 'csv':
return readCsvRawText(params);
default:
return Promise.reject(
`Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx. "${params.extension}" is not supported.`
);
parentPort?.on(
'message',
async (
props: Omit<ReadRawTextProps<any>, 'buffer'> & {
sharedBuffer: SharedArrayBuffer;
bufferSize: number;
}
};
) => {
const read = async (params: ReadRawTextByBuffer) => {
switch (params.extension) {
case 'txt':
case 'md':
return readFileRawText(params);
case 'html':
return readHtmlRawText(params);
case 'pdf':
return readPdfFile(params);
case 'docx':
return readDocsFile(params);
case 'pptx':
return readPptxRawText(params);
case 'xlsx':
return readXlsxRawText(params);
case 'csv':
return readCsvRawText(params);
default:
return Promise.reject(
`Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx. "${params.extension}" is not supported.`
);
}
};
// params.buffer: Uint8Array -> buffer
const buffer = Buffer.from(props.buffer);
const newProps: ReadRawTextByBuffer = {
...props,
buffer
};
// Use the SharedArrayBuffer as zero-copy shared memory (the Buffer below is a view over it, not a copy)
const sharedArray = new Uint8Array(props.sharedBuffer);
const buffer = Buffer.from(sharedArray.buffer, 0, props.bufferSize);
try {
workerResponse({
parentPort,
status: 'success',
data: await read(newProps)
});
} catch (error) {
workerResponse({
parentPort,
status: 'error',
data: error
});
const newProps: ReadRawTextByBuffer = {
extension: props.extension,
encoding: props.encoding,
buffer
};
try {
workerResponse({
parentPort,
status: 'success',
data: await read(newProps)
});
} catch (error) {
workerResponse({
parentPort,
status: 'error',
data: error
});
}
}
});
);