mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
Feat: pptx and xlsx loader (#1118)
* perf: plan tip * perf: upload size controller * feat: add image ttl index * feat: new upload file ux * remove file * feat: support read pptx * feat: support xlsx * fix: rerank docker file
This commit is contained in:
21
packages/service/common/file/read/csv.ts
Normal file
21
packages/service/common/file/read/csv.ts
Normal file
@@ -0,0 +1,21 @@
|
||||
import Papa from 'papaparse';
|
||||
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
|
||||
import { readFileRawText } from './rawText';
|
||||
|
||||
/**
 * Load the source file content of a CSV file.
 *
 * The buffer is decoded with `readFileRawText`, parsed with PapaParse, and every
 * cell is rendered as a `header:value` line (the first parsed row is used as the header).
 *
 * @param params - Buffer, encoding and team information of the uploaded file.
 * @returns `rawText` is the decoded CSV text, untouched; `formatText` is the
 *          `header:value` rendering, or an empty string when nothing was parsed.
 */
export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const { rawText } = readFileRawText(params);

  // `skipEmptyLines` drops completely blank lines. Without it, the blank line produced by
  // a terminal newline is parsed as a `['']` row and shows up as a dangling `header:` entry.
  const rows = Papa.parse(rawText, { skipEmptyLines: true }).data as string[][];

  const header = rows[0];

  const formatText = header
    ? rows.map((row) => row.map((cell, i) => `${header[i]}:${cell}`).join('\n')).join('\n')
    : '';

  return {
    rawText,
    formatText
  };
};
|
23
packages/service/common/file/read/html.ts
Normal file
23
packages/service/common/file/read/html.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
|
||||
import { initMarkdownText } from './utils';
|
||||
import { htmlToMarkdown } from '../../string/markdown';
|
||||
import { readFileRawText } from './rawText';
|
||||
|
||||
/**
 * Read an HTML file and return its content as processed markdown.
 *
 * Steps: decode the buffer (`readFileRawText`), convert the HTML to markdown
 * (`htmlToMarkdown`), then run the markdown through `initMarkdownText`.
 *
 * @param params - Buffer, encoding, team id and optional metadata of the file.
 * @returns An object whose `rawText` is the value resolved by `initMarkdownText`.
 */
export const readHtmlRawText = async (
  params: ReadFileByBufferParams
): Promise<ReadFileResponse> => {
  const html = readFileRawText(params).rawText;
  const md = await htmlToMarkdown(html);

  const rawText = await initMarkdownText({
    teamId: params.teamId,
    md,
    metadata: params.metadata
  });

  return { rawText };
};
|
18
packages/service/common/file/read/markdown.ts
Normal file
18
packages/service/common/file/read/markdown.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
|
||||
import { initMarkdownText } from './utils';
|
||||
import { readFileRawText } from './rawText';
|
||||
|
||||
/**
 * Read a markdown file.
 *
 * The buffer is decoded with `readFileRawText` and the resulting markdown is
 * passed through `initMarkdownText` together with the team id and metadata.
 *
 * @param params - Buffer, encoding, team id and optional metadata of the file.
 * @returns An object whose `rawText` is the value resolved by `initMarkdownText`.
 */
export const readMarkdown = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const md = readFileRawText(params).rawText;

  const rawText = await initMarkdownText({
    teamId: params.teamId,
    md,
    metadata: params.metadata
  });

  return { rawText };
};
|
119
packages/service/common/file/read/parseOffice.ts
Normal file
119
packages/service/common/file/read/parseOffice.ts
Normal file
@@ -0,0 +1,119 @@
|
||||
import { getNanoid } from '@fastgpt/global/common/string/tools';
|
||||
import fs from 'fs';
|
||||
import decompress from 'decompress';
|
||||
import { DOMParser } from '@xmldom/xmldom';
|
||||
import { clearDirFiles } from '../utils';
|
||||
import { addLog } from '../../system/log';
|
||||
|
||||
/** Directory under which temporary office files and their decompressed contents are placed. */
const DEFAULTDECOMPRESSSUBLOCATION = '/tmp';

/**
 * Build a temp file path of the form `<tmp dir>/<nanoid>.<ext>`.
 *
 * @param ext - File extension, without the leading dot.
 */
function getNewFileName(ext: string) {
  const uniqueName = getNanoid();
  return `${DEFAULTDECOMPRESSSUBLOCATION}/${uniqueName}.${ext}`;
}
|
||||
|
||||
/**
 * Parse an XML string into a document using xmldom's `DOMParser`.
 *
 * @param xml - XML source text.
 * @returns The document produced by `parseFromString(xml, 'text/xml')`.
 */
const parseString = (xml: string) => new DOMParser().parseFromString(xml, 'text/xml');
|
||||
|
||||
/**
 * Extract the text of a .pptx file.
 *
 * The archive at `filepath` is decompressed into `decompressPath` (only slide and
 * notes-slide XML entries are extracted), each XML file is parsed, and the text of
 * every `a:t` node is collected: runs of one `a:p` paragraph are concatenated,
 * paragraphs and files are joined with `\n`.
 *
 * @param filepath - Path of the .pptx file on disk.
 * @param decompressPath - Directory the matching archive entries are extracted to.
 * @param encoding - Encoding used to read the extracted XML files.
 * @returns The extracted text.
 * @throws Rejects with the string '解析 PPT 失败' when the archive contains no slide XML.
 */
const parsePowerPoint = async ({
  filepath,
  decompressPath,
  encoding
}: {
  filepath: string;
  decompressPath: string;
  encoding: BufferEncoding;
}) => {
  // Archive entries that hold the content of interest. The dot before `xml` is escaped
  // and the pattern is anchored at the end so only real `.xml` entries match; no `g`
  // flag, which keeps `test()` stateless.
  const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+\.xml$/;
  const slidesRegex = /ppt\/slides\/slide\d+\.xml$/;

  const files = await decompress(filepath, decompressPath, {
    filter: (entry) => allFilesRegex.test(entry.path)
  });

  // At least one slide XML must be present (this also covers an empty result).
  if (!files.some((file) => slidesRegex.test(file.path))) {
    return Promise.reject('解析 PPT 失败');
  }

  const textPerFile = files.map((file) => {
    const xmlContent = fs.readFileSync(`${decompressPath}/${file.path}`, encoding);

    // Paragraphs are `a:p` elements; their text runs are `a:t` elements.
    const paragraphNodes = Array.from(parseString(xmlContent).getElementsByTagName('a:p'));

    return (
      paragraphNodes
        // Skip paragraphs without any text run.
        .filter((paragraphNode) => paragraphNode.getElementsByTagName('a:t').length != 0)
        .map((paragraphNode) =>
          Array.from(paragraphNode.getElementsByTagName('a:t'))
            // Only the first child node of a run is read; runs without a value are dropped.
            .filter((textNode) => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)
            .map((textNode) => textNode.childNodes[0].nodeValue)
            .join('')
        )
        .join('\n')
    );
  });

  return textPerFile.join('\n');
};
|
||||
|
||||
/**
 * Extract the text of an office file held in memory.
 *
 * The buffer is written to a temp file, decompressed into a temp directory and
 * parsed; both temp locations are removed afterwards, whether parsing succeeded
 * or not. Only `pptx` is supported.
 *
 * @param buffer - Raw file content.
 * @param encoding - Encoding used when writing the temp file and reading the extracted XML.
 * @param extension - File extension without the dot; must be `pptx`.
 * @returns The extracted text.
 * @throws Rejects with the string '只能读取 .pptx 文件' for any other extension, and
 *         re-rejects (after logging) with whatever the parser rejected with.
 */
export const parseOffice = async ({
  buffer,
  encoding,
  extension
}: {
  buffer: Buffer;
  encoding: BufferEncoding;
  extension: string;
}) => {
  // Refuse unsupported formats before anything is written to disk.
  if (extension !== 'pptx') {
    return Promise.reject('只能读取 .pptx 文件');
  }

  // Create the temp directory if it does not exist.
  if (!fs.existsSync(DEFAULTDECOMPRESSSUBLOCATION)) {
    fs.mkdirSync(DEFAULTDECOMPRESSSUBLOCATION, { recursive: true });
  }

  const filepath = getNewFileName(extension);
  const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}`;

  try {
    fs.writeFileSync(filepath, buffer, {
      encoding
    });

    // `await` is required here: returning the bare promise would let a rejection
    // bypass both the catch and the finally below.
    return await parsePowerPoint({ filepath, decompressPath, encoding });
  } catch (error) {
    addLog.error(`Load ppt error`, { error });
    // Callers keep seeing the failure as a rejection.
    throw error;
  } finally {
    // Always remove temp artifacts; either may be missing if an earlier step failed.
    if (fs.existsSync(filepath)) {
      fs.unlinkSync(filepath);
    }
    if (fs.existsSync(decompressPath)) {
      clearDirFiles(decompressPath);
    }
  }
};
|
71
packages/service/common/file/read/pdf.ts
Normal file
71
packages/service/common/file/read/pdf.ts
Normal file
@@ -0,0 +1,71 @@
|
||||
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
// @ts-ignore
|
||||
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
|
||||
import { ReadFileByBufferParams, ReadFileResponse } from './type';
|
||||
|
||||
/**
 * Shape of one text item taken from `page.getTextContent().items`, as consumed by
 * the PDF reader in this file.
 */
type TokenType = {
  /** Text of the item. */
  str: string;
  dir: string;
  width: number;
  height: number;
  /** Transform array; index 5 is compared against the page height to drop header/footer items. */
  transform: number[];
  fontName: string;
  /** End-of-line flag; used when deciding where to insert paragraph breaks. */
  hasEOL: boolean;
};
|
||||
|
||||
/**
 * Extract the text of a PDF file with pdf.js.
 *
 * Every page is read in parallel. Text items positioned in the top or bottom 5% of
 * the page height are dropped, and a line break is appended after items that end a
 * line with sentence-ending punctuation.
 *
 * @param buffer - Raw PDF content.
 * @returns `rawText` is the concatenated page texts; `metadata` is an empty object.
 */
export const readPdfFile = async ({
  buffer
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  /** Read the text of one page (1-based page number). */
  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();

    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.95;
    const footerThreshold = pageHeight * 0.05;

    // Keep items without a transform, and items whose transform[5] lies strictly
    // between the footer and header thresholds.
    const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      return (
        !token.transform ||
        (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
      );
    });

    // Merge empty-string items into their predecessor: the predecessor inherits the
    // empty item's `hasEOL`, then the empty item is removed in place.
    for (let i = 0; i < pageTexts.length; i++) {
      const item = pageTexts[i];
      if (item.str === '' && pageTexts[i - 1]) {
        pageTexts[i - 1].hasEOL = item.hasEOL;
        pageTexts.splice(i, 1);
        i--;
      }
    }

    page.cleanup();

    return pageTexts
      .map((token) => {
        const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);

        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  // Hand pdf.js a standalone copy of exactly this file's bytes. `buffer.buffer` is the
  // whole backing ArrayBuffer, which for pooled Buffers is larger than the file and
  // starts at a different offset.
  const loadingTask = pdfjs.getDocument(new Uint8Array(buffer));

  try {
    const doc = await loadingTask.promise;

    const pageTextPromises = Array.from({ length: doc.numPages }, (_, index) =>
      readPDFPage(doc, index + 1)
    );
    const pageTexts = await Promise.all(pageTextPromises);

    return {
      rawText: pageTexts.join(''),
      metadata: {}
    };
  } finally {
    // Release the loading task even when loading or reading a page fails.
    await loadingTask.destroy();
  }
};
|
14
packages/service/common/file/read/pptx.ts
Normal file
14
packages/service/common/file/read/pptx.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
|
||||
// import { parseOfficeAsync } from 'officeparser';
|
||||
import { parseOffice } from './parseOffice';
|
||||
|
||||
/**
 * Read the text of a .pptx file by delegating to `parseOffice` with the `pptx` extension.
 *
 * @param buffer - Raw file content.
 * @param encoding - Encoding forwarded to `parseOffice`.
 * @returns An object whose `rawText` is the text resolved by `parseOffice`.
 */
export const readPptxRawText = async ({
  buffer,
  encoding
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const rawText = await parseOffice({ buffer, encoding, extension: 'pptx' });

  return { rawText };
};
|
10
packages/service/common/file/read/rawText.ts
Normal file
10
packages/service/common/file/read/rawText.ts
Normal file
@@ -0,0 +1,10 @@
|
||||
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
|
||||
|
||||
/**
 * Load the source file content: decode the buffer into a string.
 *
 * @param buffer - Raw file content.
 * @param encoding - Encoding passed to `buffer.toString`.
 * @returns An object whose `rawText` is the decoded string.
 */
export const readFileRawText = ({ buffer, encoding }: ReadFileByBufferParams): ReadFileResponse => ({
  rawText: buffer.toString(encoding)
});
|
12
packages/service/common/file/read/type.d.ts
vendored
Normal file
12
packages/service/common/file/read/type.d.ts
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
/** Input accepted by the buffer-based file readers. */
export type ReadFileByBufferParams = {
  /** Raw content of the file. */
  buffer: Buffer;
  /** Encoding used when the buffer has to be decoded to text. */
  encoding: BufferEncoding;
  /** Id of the team the file belongs to. */
  teamId: string;
  /** Optional free-form metadata accompanying the file. */
  metadata?: Record<string, any>;
};
|
||||
|
||||
/** Result returned by the file readers. */
export type ReadFileResponse = {
  /** Text extracted from the file. */
  rawText: string;
  /** Optional alternative rendering of the content (e.g. `header:value` lines for tables). */
  formatText?: string;
  /** Optional free-form metadata produced while reading. */
  metadata?: Record<string, any>;
};
|
25
packages/service/common/file/read/utils.ts
Normal file
25
packages/service/common/file/read/utils.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
|
||||
import { uploadMongoImg } from '../image/controller';
|
||||
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
|
||||
import { addHours } from 'date-fns';
|
||||
|
||||
/**
 * Run markdown text through `markdownProcess`, supplying an image uploader.
 *
 * The uploader stores each base64 image with `uploadMongoImg` as a collection image
 * for the given team, with an expiry time two hours after the upload call.
 *
 * @param teamId - Team the uploaded images are stored for.
 * @param md - Markdown source text.
 * @param metadata - Optional metadata stored alongside each uploaded image.
 * @returns Whatever `markdownProcess` returns.
 */
export const initMarkdownText = ({
  teamId,
  md,
  metadata
}: {
  md: string;
  teamId: string;
  metadata?: Record<string, any>;
}) => {
  return markdownProcess({
    rawText: md,
    uploadImgController: (base64Img) => {
      // Computed per image, at upload time.
      const expiredTime = addHours(new Date(), 2);

      return uploadMongoImg({
        type: MongoImageTypeEnum.collectionImage,
        base64Img,
        teamId,
        metadata,
        expiredTime
      });
    }
  });
};
|
35
packages/service/common/file/read/word.ts
Normal file
35
packages/service/common/file/read/word.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
import mammoth from 'mammoth';
|
||||
import { htmlToMarkdown } from '../../string/markdown';
|
||||
import { ReadFileByBufferParams, ReadFileResponse } from './type';
|
||||
import { initMarkdownText } from './utils';
|
||||
|
||||
/**
 * Read a docx file as markdown.
 *
 * The buffer is converted to HTML with mammoth, the HTML to markdown with
 * `htmlToMarkdown`, and the markdown is passed through `initMarkdownText`.
 *
 * @param teamId - Team id forwarded to `initMarkdownText`.
 * @param buffer - Raw docx content.
 * @param metadata - Metadata forwarded to `initMarkdownText`; defaults to `{}`.
 * @returns `rawText` is the processed markdown; `metadata` is an empty object.
 * @throws Rejects with the string 'Can not read doc file, please convert to PDF'
 *         when any step fails (the underlying error is printed to the console).
 */
export const readWordFile = async ({
  teamId,
  buffer,
  metadata = {}
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  try {
    const conversion = await mammoth.convertToHtml({ buffer });
    const md = await htmlToMarkdown(conversion.value);
    const rawText = await initMarkdownText({ teamId, md, metadata });

    return { rawText, metadata: {} };
  } catch (error) {
    console.log('error doc read:', error);
    return Promise.reject('Can not read doc file, please convert to PDF');
  }
};
|
45
packages/service/common/file/read/xlsx.ts
Normal file
45
packages/service/common/file/read/xlsx.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
|
||||
import xlsx from 'node-xlsx';
|
||||
import Papa from 'papaparse';
|
||||
|
||||
/**
 * Read an xlsx workbook.
 *
 * Every sheet is serialised to CSV text. `rawText` is the CSV of all sheets joined
 * with `\n`. `formatText` lists, per sheet, a `#<sheet name>` title followed by one
 * `header:value` line for every non-empty cell (the sheet's first row is the header).
 *
 * @param buffer - Raw xlsx content.
 * @returns The CSV text (`rawText`) and the `header:value` rendering (`formatText`).
 */
export const readXlsxRawText = async ({
  buffer
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const sheets = xlsx.parse(buffer, {
    skipHidden: false,
    defval: ''
  });

  const format2Csv = sheets.map(({ name, data }) => ({
    title: `#${name}`,
    // Serialise with Papa so cells containing commas, quotes or line breaks are quoted.
    // A bare `join(',')` would make the re-parse below split such cells and shift every
    // following value under the wrong header.
    csvText: Papa.unparse(data, { newline: '\n' })
  }));

  const rawText = format2Csv.map((sheet) => sheet.csvText).join('\n');

  const formatText = format2Csv
    .map((sheet) => {
      const rows = Papa.parse(sheet.csvText).data as string[][];
      const header = rows[0];

      const sheetText = header
        ? rows
            .map((row) =>
              row
                // Empty cells are left out of the rendering.
                .map((cell, i) => (cell ? `${header[i]}:${cell}` : ''))
                .filter(Boolean)
                .join('\n')
            )
            .join('\n')
        : '';

      return `${sheet.title}\n${sheetText}`;
    })
    .join('\n');

  return {
    rawText,
    formatText
  };
};
|
Reference in New Issue
Block a user