
* update: Add type
* fix: update import statement for NextApiRequest type
* fix: update imports to use type for LexicalEditor and EditorState
* Refactor imports to use 'import type' for type-only imports across multiple files
  - Updated imports in various components and API files to use 'import type' for better clarity and to optimize TypeScript's type checking.
  - Ensured consistent usage of type imports in files related to chat, dataset, workflow, and user management.
  - Improved code readability and maintainability by distinguishing between value and type imports.
* refactor: remove old ESLint configuration and add new rules
  - Deleted the old ESLint configuration file from the app project.
  - Added a new ESLint configuration file with updated rules and settings.
  - Changed imports to use type-only imports in various files for better clarity and performance.
  - Updated TypeScript configuration to remove unnecessary options.
  - Added an ESLint ignore file to exclude build and dependency directories from linting.
* fix: update imports to use 'import type' for type-only imports in schema files
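
The `import type` forms mentioned above import only type information, which the compiler erases from the emitted JavaScript. A minimal sketch of the two styles the log refers to (the module specifiers are the usual sources of these types, assumed here rather than taken from the diff):

import type { NextApiRequest } from 'next'; // entire import is type-only
import { type LexicalEditor, type EditorState } from 'lexical'; // per-binding type markers
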
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
// @ts-ignore
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
import { type ReadRawTextByBuffer, type ReadFileResponse } from '../type';

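// Shape of a text token from pdf.js getTextContent(); transform is the 2D
// transform matrix, so transform[5] is the token's y-coordinate on the page
// (PDF user space puts the origin at the bottom-left corner).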
type TokenType = {
  str: string;
  dir: string;
  width: number;
  height: number;
  transform: number[];
  fontName: string;
  hasEOL: boolean;
};

export const readPdfFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
  const readPDFPage = async (doc: any, pageNo: number) => {
    try {
      const page = await doc.getPage(pageNo);
      const tokenizedText = await page.getTextContent();

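      // Treat anything in the top or bottom 5% of the page as a running
      // header/footer and drop it; tokens without a transform are kept.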
      const viewport = page.getViewport({ scale: 1 });
      const pageHeight = viewport.height;
      const headerThreshold = pageHeight * 0.95;
      const footerThreshold = pageHeight * 0.05;

      const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
        return (
          !token.transform ||
          (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
        );
      });

      // Merge empty-string tokens into the previous token so their hasEOL
      // (end-of-line) flag is preserved when they are removed.
      for (let i = 0; i < pageTexts.length; i++) {
        const item = pageTexts[i];
        if (item.str === '' && pageTexts[i - 1]) {
          pageTexts[i - 1].hasEOL = item.hasEOL;
          pageTexts.splice(i, 1);
          i--;
        }
      }

      page.cleanup();

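      // A newline is emitted only when pdf.js flags an end of line AND the
      // token ends in sentence punctuation (ASCII or CJK); everything else is
      // joined directly, rejoining lines the PDF layout broke mid-sentence.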
      return pageTexts
        .map((token) => {
          const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);

          return paragraphEnd ? `${token.str}\n` : token.str;
        })
        .join('');
    } catch (error) {
      console.log('pdf read error', error);
      return '';
    }
  };

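  // buffer.buffer hands pdf.js the Node Buffer's underlying ArrayBuffer;
  // the @ts-ignore presumably papers over the Node/DOM buffer typing mismatch.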
  // @ts-ignore
  const loadingTask = pdfjs.getDocument(buffer.buffer);
  const doc = await loadingTask.promise;

  // Avoid OOM: await each page in turn instead of parsing them all in parallel.
  let result = '';
  const pageArr = Array.from({ length: doc.numPages }, (_, i) => i + 1);
  for (let i = 0; i < pageArr.length; i++) {
    result += await readPDFPage(doc, i + 1);
  }

  loadingTask.destroy();

  return {
    rawText: result
  };
};
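
For orientation, a minimal usage sketch. It assumes `ReadRawTextByBuffer` carries a Node `Buffer` (the function above only destructures `{ buffer }`); the relative import path and the cast are illustrative, not from the repository:

import { readFileSync } from 'fs';
import { readPdfFile } from './pdf'; // hypothetical path to this module

const buffer = readFileSync('./example.pdf');
// Cast because ReadRawTextByBuffer may carry extra fields not shown here.
readPdfFile({ buffer } as any).then(({ rawText }) => {
  console.log(rawText.slice(0, 200));
});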