mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
Feat: pptx and xlsx loader (#1118)
* perf: plan tip * perf: upload size controller * feat: add image ttl index * feat: new upload file ux * remove file * feat: support read pptx * feat: support xlsx * fix: rerank docker flie
This commit is contained in:
@@ -4,6 +4,18 @@ import fsp from 'fs/promises';
|
||||
import fs from 'fs';
|
||||
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
|
||||
import { MongoFileSchema } from './schema';
|
||||
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
||||
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
|
||||
import { readFileRawText } from '../read/rawText';
|
||||
import { ReadFileByBufferParams } from '../read/type';
|
||||
import { readMarkdown } from '../read/markdown';
|
||||
import { readHtmlRawText } from '../read/html';
|
||||
import { readPdfFile } from '../read/pdf';
|
||||
import { readWordFile } from '../read/word';
|
||||
import { readCsvRawText } from '../read/csv';
|
||||
import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
|
||||
import { readPptxRawText } from '../read/pptx';
|
||||
import { readXlsxRawText } from '../read/xlsx';
|
||||
|
||||
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
|
||||
MongoFileSchema;
|
||||
@@ -111,3 +123,139 @@ export async function getDownloadStream({
|
||||
|
||||
return bucket.openDownloadStream(new Types.ObjectId(fileId));
|
||||
}
|
||||
|
||||
export const readFileEncode = async ({
|
||||
bucketName,
|
||||
fileId
|
||||
}: {
|
||||
bucketName: `${BucketNameEnum}`;
|
||||
fileId: string;
|
||||
}) => {
|
||||
const encodeStream = await getDownloadStream({ bucketName, fileId });
|
||||
let buffers: Buffer = Buffer.from([]);
|
||||
for await (const chunk of encodeStream) {
|
||||
buffers = Buffer.concat([buffers, chunk]);
|
||||
if (buffers.length > 10) {
|
||||
encodeStream.abort();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const encoding = detectFileEncoding(buffers);
|
||||
|
||||
return encoding as BufferEncoding;
|
||||
};
|
||||
|
||||
export const readFileContent = async ({
|
||||
teamId,
|
||||
bucketName,
|
||||
fileId,
|
||||
csvFormat = false
|
||||
}: {
|
||||
teamId: string;
|
||||
bucketName: `${BucketNameEnum}`;
|
||||
fileId: string;
|
||||
csvFormat?: boolean;
|
||||
}): Promise<{
|
||||
rawText: string;
|
||||
filename: string;
|
||||
}> => {
|
||||
// read buffer
|
||||
const fileBuffer = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean();
|
||||
if (fileBuffer) {
|
||||
return {
|
||||
rawText: fileBuffer.rawText,
|
||||
filename: fileBuffer.metadata?.filename || ''
|
||||
};
|
||||
}
|
||||
|
||||
const [file, encoding, fileStream] = await Promise.all([
|
||||
getFileById({ bucketName, fileId }),
|
||||
readFileEncode({ bucketName, fileId }),
|
||||
getDownloadStream({ bucketName, fileId })
|
||||
]);
|
||||
|
||||
if (!file) {
|
||||
return Promise.reject(CommonErrEnum.fileNotFound);
|
||||
}
|
||||
|
||||
const extension = file?.filename?.split('.')?.pop()?.toLowerCase() || '';
|
||||
|
||||
const fileBuffers = await (() => {
|
||||
return new Promise<Buffer>((resolve, reject) => {
|
||||
let buffers = Buffer.from([]);
|
||||
fileStream.on('data', (chunk) => {
|
||||
buffers = Buffer.concat([buffers, chunk]);
|
||||
});
|
||||
fileStream.on('end', () => {
|
||||
resolve(buffers);
|
||||
});
|
||||
fileStream.on('error', (err) => {
|
||||
reject(err);
|
||||
});
|
||||
});
|
||||
})();
|
||||
|
||||
const params: ReadFileByBufferParams = {
|
||||
teamId,
|
||||
buffer: fileBuffers,
|
||||
encoding,
|
||||
metadata: {
|
||||
relatedId: fileId
|
||||
}
|
||||
};
|
||||
|
||||
const { rawText } = await (async () => {
|
||||
switch (extension) {
|
||||
case 'txt':
|
||||
return readFileRawText(params);
|
||||
case 'md':
|
||||
return readMarkdown(params);
|
||||
case 'html':
|
||||
return readHtmlRawText(params);
|
||||
case 'pdf':
|
||||
return readPdfFile(params);
|
||||
case 'docx':
|
||||
return readWordFile(params);
|
||||
case 'pptx':
|
||||
return readPptxRawText(params);
|
||||
case 'xlsx':
|
||||
const xlsxResult = await readXlsxRawText(params);
|
||||
if (csvFormat) {
|
||||
return {
|
||||
rawText: xlsxResult.formatText || ''
|
||||
};
|
||||
}
|
||||
return {
|
||||
rawText: xlsxResult.rawText
|
||||
};
|
||||
case 'csv':
|
||||
const csvResult = await readCsvRawText(params);
|
||||
if (csvFormat) {
|
||||
return {
|
||||
rawText: csvResult.formatText || ''
|
||||
};
|
||||
}
|
||||
return {
|
||||
rawText: csvResult.rawText
|
||||
};
|
||||
default:
|
||||
return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
|
||||
}
|
||||
})();
|
||||
|
||||
if (rawText.trim()) {
|
||||
await MongoRwaTextBuffer.create({
|
||||
sourceId: fileId,
|
||||
rawText,
|
||||
metadata: {
|
||||
filename: file.filename
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
rawText,
|
||||
filename: file.filename
|
||||
};
|
||||
};
|
||||
|
Reference in New Issue
Block a user