Feat: pptx and xlsx loader (#1118)

* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support read pptx

* feat: support xlsx

* fix: rerank docker flie
This commit is contained in:
Archer
2024-04-01 19:01:26 +08:00
committed by GitHub
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions

View File

@@ -4,6 +4,18 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { readFileRawText } from '../read/rawText';
import { ReadFileByBufferParams } from '../read/type';
import { readMarkdown } from '../read/markdown';
import { readHtmlRawText } from '../read/html';
import { readPdfFile } from '../read/pdf';
import { readWordFile } from '../read/word';
import { readCsvRawText } from '../read/csv';
import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
import { readPptxRawText } from '../read/pptx';
import { readXlsxRawText } from '../read/xlsx';
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
MongoFileSchema;
@@ -111,3 +123,139 @@ export async function getDownloadStream({
return bucket.openDownloadStream(new Types.ObjectId(fileId));
}
export const readFileEncode = async ({
bucketName,
fileId
}: {
bucketName: `${BucketNameEnum}`;
fileId: string;
}) => {
const encodeStream = await getDownloadStream({ bucketName, fileId });
let buffers: Buffer = Buffer.from([]);
for await (const chunk of encodeStream) {
buffers = Buffer.concat([buffers, chunk]);
if (buffers.length > 10) {
encodeStream.abort();
break;
}
}
const encoding = detectFileEncoding(buffers);
return encoding as BufferEncoding;
};
export const readFileContent = async ({
teamId,
bucketName,
fileId,
csvFormat = false
}: {
teamId: string;
bucketName: `${BucketNameEnum}`;
fileId: string;
csvFormat?: boolean;
}): Promise<{
rawText: string;
filename: string;
}> => {
// read buffer
const fileBuffer = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean();
if (fileBuffer) {
return {
rawText: fileBuffer.rawText,
filename: fileBuffer.metadata?.filename || ''
};
}
const [file, encoding, fileStream] = await Promise.all([
getFileById({ bucketName, fileId }),
readFileEncode({ bucketName, fileId }),
getDownloadStream({ bucketName, fileId })
]);
if (!file) {
return Promise.reject(CommonErrEnum.fileNotFound);
}
const extension = file?.filename?.split('.')?.pop()?.toLowerCase() || '';
const fileBuffers = await (() => {
return new Promise<Buffer>((resolve, reject) => {
let buffers = Buffer.from([]);
fileStream.on('data', (chunk) => {
buffers = Buffer.concat([buffers, chunk]);
});
fileStream.on('end', () => {
resolve(buffers);
});
fileStream.on('error', (err) => {
reject(err);
});
});
})();
const params: ReadFileByBufferParams = {
teamId,
buffer: fileBuffers,
encoding,
metadata: {
relatedId: fileId
}
};
const { rawText } = await (async () => {
switch (extension) {
case 'txt':
return readFileRawText(params);
case 'md':
return readMarkdown(params);
case 'html':
return readHtmlRawText(params);
case 'pdf':
return readPdfFile(params);
case 'docx':
return readWordFile(params);
case 'pptx':
return readPptxRawText(params);
case 'xlsx':
const xlsxResult = await readXlsxRawText(params);
if (csvFormat) {
return {
rawText: xlsxResult.formatText || ''
};
}
return {
rawText: xlsxResult.rawText
};
case 'csv':
const csvResult = await readCsvRawText(params);
if (csvFormat) {
return {
rawText: csvResult.formatText || ''
};
}
return {
rawText: csvResult.rawText
};
default:
return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
}
})();
if (rawText.trim()) {
await MongoRwaTextBuffer.create({
sourceId: fileId,
rawText,
metadata: {
filename: file.filename
}
});
}
return {
rawText,
filename: file.filename
};
};