Feat: pptx and xlsx loader (#1118)

* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support read pptx

* feat: support xlsx

* fix: rerank docker file
Archer
2024-04-01 19:01:26 +08:00
committed by GitHub
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions

View File

@@ -0,0 +1,33 @@
import { connectionMongo, type Model } from '../../mongo';
const { Schema, model, models } = connectionMongo;
import { RawTextBufferSchemaType } from './type';
export const collectionName = 'buffer.rawText';
const RawTextBufferSchema = new Schema({
sourceId: {
type: String,
required: true
},
rawText: {
type: String,
default: ''
},
createTime: {
type: Date,
default: () => new Date()
},
metadata: Object
});
try {
RawTextBufferSchema.index({ sourceId: 1 });
// TTL: expire buffered raw text 20 minutes after createTime
RawTextBufferSchema.index({ createTime: 1 }, { expireAfterSeconds: 20 * 60 });
} catch (error) {
console.log(error);
}
export const MongoRwaTextBuffer: Model<RawTextBufferSchemaType> =
models[collectionName] || model(collectionName, RawTextBufferSchema);
MongoRwaTextBuffer.syncIndexes();
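For orientation, a minimal read-through-cache sketch of how this buffer is typically used (the standalone helper and the loadRawText callback are illustrative assumptions, not part of this commit; entries are evicted automatically by the 20-minute TTL index on createTime):

// Sketch only. Uses MongoRwaTextBuffer as defined above; loadRawText is a placeholder callback.
export async function getCachedRawText(
  sourceId: string,
  loadRawText: () => Promise<string>
): Promise<string> {
  // Cache hit: the TTL index drops entries roughly 20 minutes after createTime.
  const cached = await MongoRwaTextBuffer.findOne({ sourceId }).lean();
  if (cached) return cached.rawText;

  // Cache miss: extract once, then store for subsequent requests.
  const rawText = await loadRawText();
  await MongoRwaTextBuffer.create({ sourceId, rawText });
  return rawText;
}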

View File

@@ -0,0 +1,8 @@
export type RawTextBufferSchemaType = {
sourceId: string;
rawText: string;
createTime: Date;
metadata?: {
filename: string;
};
};

View File

@@ -2,7 +2,7 @@ import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { TTSBufferSchemaType } from './type.d';
export const collectionName = 'ttsbuffers';
export const collectionName = 'buffer.tts';
const TTSBufferSchema = new Schema({
bufferId: {

View File

@@ -4,6 +4,18 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { readFileRawText } from '../read/rawText';
import { ReadFileByBufferParams } from '../read/type';
import { readMarkdown } from '../read/markdown';
import { readHtmlRawText } from '../read/html';
import { readPdfFile } from '../read/pdf';
import { readWordFile } from '../read/word';
import { readCsvRawText } from '../read/csv';
import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
import { readPptxRawText } from '../read/pptx';
import { readXlsxRawText } from '../read/xlsx';
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
MongoFileSchema;
@@ -111,3 +123,139 @@ export async function getDownloadStream({
return bucket.openDownloadStream(new Types.ObjectId(fileId));
}
export const readFileEncode = async ({
bucketName,
fileId
}: {
bucketName: `${BucketNameEnum}`;
fileId: string;
}) => {
const encodeStream = await getDownloadStream({ bucketName, fileId });
let buffers: Buffer = Buffer.from([]);
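// Stream only the first few bytes (just over 10) needed to sniff the encoding, then abort the download.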
for await (const chunk of encodeStream) {
buffers = Buffer.concat([buffers, chunk]);
if (buffers.length > 10) {
encodeStream.abort();
break;
}
}
const encoding = detectFileEncoding(buffers);
return encoding as BufferEncoding;
};
export const readFileContent = async ({
teamId,
bucketName,
fileId,
csvFormat = false
}: {
teamId: string;
bucketName: `${BucketNameEnum}`;
fileId: string;
csvFormat?: boolean;
}): Promise<{
rawText: string;
filename: string;
}> => {
// read buffer
const fileBuffer = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean();
if (fileBuffer) {
return {
rawText: fileBuffer.rawText,
filename: fileBuffer.metadata?.filename || ''
};
}
const [file, encoding, fileStream] = await Promise.all([
getFileById({ bucketName, fileId }),
readFileEncode({ bucketName, fileId }),
getDownloadStream({ bucketName, fileId })
]);
if (!file) {
return Promise.reject(CommonErrEnum.fileNotFound);
}
const extension = file?.filename?.split('.')?.pop()?.toLowerCase() || '';
const fileBuffers = await (() => {
return new Promise<Buffer>((resolve, reject) => {
let buffers = Buffer.from([]);
fileStream.on('data', (chunk) => {
buffers = Buffer.concat([buffers, chunk]);
});
fileStream.on('end', () => {
resolve(buffers);
});
fileStream.on('error', (err) => {
reject(err);
});
});
})();
const params: ReadFileByBufferParams = {
teamId,
buffer: fileBuffers,
encoding,
metadata: {
relatedId: fileId
}
};
const { rawText } = await (async () => {
switch (extension) {
case 'txt':
return readFileRawText(params);
case 'md':
return readMarkdown(params);
case 'html':
return readHtmlRawText(params);
case 'pdf':
return readPdfFile(params);
case 'docx':
return readWordFile(params);
case 'pptx':
return readPptxRawText(params);
case 'xlsx':
const xlsxResult = await readXlsxRawText(params);
if (csvFormat) {
return {
rawText: xlsxResult.formatText || ''
};
}
return {
rawText: xlsxResult.rawText
};
case 'csv':
const csvResult = await readCsvRawText(params);
if (csvFormat) {
return {
rawText: csvResult.formatText || ''
};
}
return {
rawText: csvResult.rawText
};
default:
return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, .pptx, .csv, .xlsx');
}
})();
if (rawText.trim()) {
await MongoRwaTextBuffer.create({
sourceId: fileId,
rawText,
metadata: {
filename: file.filename
}
});
}
return {
rawText,
filename: file.filename
};
};
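A hedged call-site sketch of the new reader entry point (the ids and helper are placeholders for illustration; csvFormat switches csv/xlsx output to the header:value formatText used for chunking):

// Sketch only: placeholder ids, readFileContent as defined above.
async function loadDatasetFileText(teamId: string, fileId: string) {
  const { rawText, filename } = await readFileContent({
    teamId,
    bucketName: 'dataset', // BucketNameEnum.dataset
    fileId,
    csvFormat: true // csv/xlsx come back as header:value lines instead of plain CSV
  });
  return { filename, preview: rawText.slice(0, 200) };
}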

View File

@@ -14,7 +14,6 @@ export async function uploadMongoImg({
teamId,
expiredTime,
metadata,
shareId
}: UploadImgProps & {
teamId: string;
@@ -30,9 +29,8 @@ export async function uploadMongoImg({
type,
teamId,
binary,
expiredTime: expiredTime,
expiredTime,
metadata,
shareId
});

View File

@@ -25,13 +25,13 @@ const ImageSchema = new Schema({
enum: Object.keys(mongoImageTypeMap),
required: true
},
metadata: {
type: Object
}
});
try {
// temporary image expiration
ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
ImageSchema.index({ type: 1 });
ImageSchema.index({ createTime: 1 });

View File

@@ -0,0 +1,21 @@
import Papa from 'papaparse';
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { readFileRawText } from './rawText';
// Read the raw source file content
export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
const { rawText } = readFileRawText(params);
const csvArr = Papa.parse(rawText).data as string[][];
const header = csvArr[0];
const formatText = header
? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
: '';
return {
rawText,
formatText
};
};
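To make formatText concrete, an illustration with made-up data (note that the header row itself is also expanded, since the map does not skip row 0):

// Illustration only, using readCsvRawText as defined above.
async function csvExample() {
  const buffer = Buffer.from('name,age\nTom,3', 'utf-8');
  const { rawText, formatText } = await readCsvRawText({
    teamId: 'team_123', // placeholder; not used by this reader
    buffer,
    encoding: 'utf-8'
  });
  // rawText    -> "name,age\nTom,3"
  // formatText -> "name:name\nage:age\nname:Tom\nage:3"
  return { rawText, formatText };
}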

View File

@@ -0,0 +1,23 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { initMarkdownText } from './utils';
import { htmlToMarkdown } from '../../string/markdown';
import { readFileRawText } from './rawText';
export const readHtmlRawText = async (
params: ReadFileByBufferParams
): Promise<ReadFileResponse> => {
const { teamId, metadata } = params;
const { rawText: html } = readFileRawText(params);
const md = await htmlToMarkdown(html);
const rawText = await initMarkdownText({
teamId,
md,
metadata
});
return {
rawText
};
};

View File

@@ -0,0 +1,18 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { initMarkdownText } from './utils';
import { readFileRawText } from './rawText';
export const readMarkdown = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
const { teamId, metadata } = params;
const { rawText: md } = readFileRawText(params);
const rawText = await initMarkdownText({
teamId,
md,
metadata
});
return {
rawText
};
};

View File

@@ -0,0 +1,119 @@
import { getNanoid } from '@fastgpt/global/common/string/tools';
import fs from 'fs';
import decompress from 'decompress';
import { DOMParser } from '@xmldom/xmldom';
import { clearDirFiles } from '../utils';
import { addLog } from '../../system/log';
const DEFAULTDECOMPRESSSUBLOCATION = '/tmp';
function getNewFileName(ext: string) {
return `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}.${ext}`;
}
const parseString = (xml: string) => {
let parser = new DOMParser();
return parser.parseFromString(xml, 'text/xml');
};
const parsePowerPoint = async ({
filepath,
decompressPath,
encoding
}: {
filepath: string;
decompressPath: string;
encoding: BufferEncoding;
}) => {
// Regexes for the xml files that hold the content of interest
const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+.xml/g;
const slidesRegex = /ppt\/slides\/slide\d+.xml/g;
/** Extract only the matching xml files into the decompress directory */
const files = await decompress(filepath, decompressPath, {
filter: (x) => !!x.path.match(allFilesRegex)
});
// Verify that at least the slide xml files exist in the extracted file list.
if (
files.length == 0 ||
!files.map((file) => file.path).some((filename) => filename.match(slidesRegex))
) {
return Promise.reject('Failed to parse the PPT file');
}
// Returning an array of all the xml contents read using fs.readFileSync
const xmlContentArray = files.map((file) =>
fs.readFileSync(`${decompressPath}/${file.path}`, encoding)
);
let responseArr: string[] = [];
xmlContentArray.forEach((xmlContent) => {
/** Find text nodes with a:p tags */
const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName('a:p');
/** Store all the text content to respond */
responseArr.push(
Array.from(xmlParagraphNodesList)
// Filter out paragraph nodes that do not contain any text nodes (identified by the a:t tag)
.filter((paragraphNode) => paragraphNode.getElementsByTagName('a:t').length != 0)
.map((paragraphNode) => {
/** Find text nodes with a:t tags */
const xmlTextNodeList = paragraphNode.getElementsByTagName('a:t');
return Array.from(xmlTextNodeList)
.filter((textNode) => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)
.map((textNode) => textNode.childNodes[0].nodeValue)
.join('');
})
.join('\n')
);
});
return responseArr.join('\n');
};
export const parseOffice = async ({
buffer,
encoding,
extension
}: {
buffer: Buffer;
encoding: BufferEncoding;
extension: string;
}) => {
// Prepare file for processing
// create temp file subdirectory if it does not exist
if (!fs.existsSync(DEFAULTDECOMPRESSSUBLOCATION)) {
fs.mkdirSync(DEFAULTDECOMPRESSSUBLOCATION, { recursive: true });
}
// temp file name
const filepath = getNewFileName(extension);
const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}`;
// const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/test`;
// write new file
fs.writeFileSync(filepath, buffer, {
encoding
});
const text = await (async () => {
try {
switch (extension) {
case 'pptx':
return parsePowerPoint({ filepath, decompressPath, encoding });
default:
return Promise.reject('Only .pptx files can be read');
}
} catch (error) {
addLog.error(`Load ppt error`, { error });
}
return '';
})();
fs.unlinkSync(filepath);
clearDirFiles(decompressPath);
return text;
};
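A minimal sketch of driving parseOffice from a file on disk (the path is a placeholder; within this PR the function is normally reached through readPptxRawText):

// Sketch only: placeholder path, parseOffice as defined above.
import fs from 'fs';

async function pptxFileToText(path: string): Promise<string> {
  const buffer = fs.readFileSync(path);
  // Unzips ppt/slides/slide*.xml (plus notesSlides) and joins every <a:t> text node.
  return parseOffice({ buffer, encoding: 'utf-8', extension: 'pptx' });
}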

View File

@@ -0,0 +1,71 @@
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
// @ts-ignore
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
import { ReadFileByBufferParams, ReadFileResponse } from './type';
type TokenType = {
str: string;
dir: string;
width: number;
height: number;
transform: number[];
fontName: string;
hasEOL: boolean;
};
export const readPdfFile = async ({
buffer
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
const readPDFPage = async (doc: any, pageNo: number) => {
const page = await doc.getPage(pageNo);
const tokenizedText = await page.getTextContent();
const viewport = page.getViewport({ scale: 1 });
const pageHeight = viewport.height;
const headerThreshold = pageHeight * 0.95;
const footerThreshold = pageHeight * 0.05;
const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
return (
!token.transform ||
(token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
);
});
// Merge empty-string tokens: carry their hasEOL flag onto the previous token
for (let i = 0; i < pageTexts.length; i++) {
const item = pageTexts[i];
if (item.str === '' && pageTexts[i - 1]) {
pageTexts[i - 1].hasEOL = item.hasEOL;
pageTexts.splice(i, 1);
i--;
}
}
page.cleanup();
return pageTexts
.map((token) => {
const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
return paragraphEnd ? `${token.str}\n` : token.str;
})
.join('');
};
const loadingTask = pdfjs.getDocument(buffer.buffer);
const doc = await loadingTask.promise;
const pageTextPromises = [];
for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
pageTextPromises.push(readPDFPage(doc, pageNo));
}
const pageTexts = await Promise.all(pageTextPromises);
loadingTask.destroy();
return {
rawText: pageTexts.join(''),
metadata: {}
};
};
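The 0.95 / 0.05 viewport thresholds drop tokens placed in the top or bottom 5% of each page, which is where running headers, footers and page numbers usually sit. A hedged call-site sketch (placeholder path, readPdfFile as defined above):

// Sketch only.
import fs from 'fs';

async function pdfToText(path: string): Promise<string> {
  const { rawText } = await readPdfFile({
    teamId: 'team_123',            // placeholder; not used by this reader
    buffer: fs.readFileSync(path), // placeholder path
    encoding: 'utf-8'              // pdfjs reads the raw bytes; encoding is unused here
  });
  return rawText;
}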

View File

@@ -0,0 +1,14 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
// import { parseOfficeAsync } from 'officeparser';
import { parseOffice } from './parseOffice';
export const readPptxRawText = async ({
buffer,
encoding
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
const result = await parseOffice({ buffer, encoding, extension: 'pptx' });
return {
rawText: result
};
};

View File

@@ -0,0 +1,10 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
// Read the raw source file content
export const readFileRawText = ({ buffer, encoding }: ReadFileByBufferParams): ReadFileResponse => {
const content = buffer.toString(encoding);
return {
rawText: content
};
};

View File

@@ -0,0 +1,12 @@
export type ReadFileByBufferParams = {
teamId: string;
buffer: Buffer;
encoding: BufferEncoding;
metadata?: Record<string, any>;
};
export type ReadFileResponse = {
rawText: string;
formatText?: string;
metadata?: Record<string, any>;
};

View File

@@ -0,0 +1,25 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { uploadMongoImg } from '../image/controller';
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
import { addHours } from 'date-fns';
export const initMarkdownText = ({
teamId,
md,
metadata
}: {
md: string;
teamId: string;
metadata?: Record<string, any>;
}) =>
markdownProcess({
rawText: md,
uploadImgController: (base64Img) =>
uploadMongoImg({
type: MongoImageTypeEnum.collectionImage,
base64Img,
teamId,
metadata,
expiredTime: addHours(new Date(), 2)
})
});

View File

@@ -0,0 +1,35 @@
import mammoth from 'mammoth';
import { htmlToMarkdown } from '../../string/markdown';
import { ReadFileByBufferParams, ReadFileResponse } from './type';
import { initMarkdownText } from './utils';
/**
* read docx to markdown
*/
export const readWordFile = async ({
teamId,
buffer,
metadata = {}
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
try {
const { value: html } = await mammoth.convertToHtml({
buffer
});
const md = await htmlToMarkdown(html);
const rawText = await initMarkdownText({
teamId,
md,
metadata
});
return {
rawText,
metadata: {}
};
} catch (error) {
console.log('error doc read:', error);
return Promise.reject('Can not read doc file, please convert to PDF');
}
};

View File

@@ -0,0 +1,45 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import xlsx from 'node-xlsx';
import Papa from 'papaparse';
export const readXlsxRawText = async ({
buffer
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
const result = xlsx.parse(buffer, {
skipHidden: false,
defval: ''
});
const format2Csv = result.map(({ name, data }) => {
return {
title: `#${name}`,
csvText: data.map((item) => item.join(',')).join('\n')
};
});
const rawText = format2Csv.map((item) => item.csvText).join('\n');
const formatText = format2Csv
.map((item) => {
const csvArr = Papa.parse(item.csvText).data as string[][];
const header = csvArr[0];
const formatText = header
? csvArr
.map((item) =>
item
.map((item, i) => (item ? `${header[i]}:${item}` : ''))
.filter(Boolean)
.join('\n')
)
.join('\n')
: '';
return `${item.title}\n${formatText}`;
})
.join('\n');
return {
rawText: rawText,
formatText
};
};
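To make the two outputs concrete, an illustration with a made-up workbook (one sheet named Sheet1 with header row name,age and data row Tom,3):

// Illustration only, using readXlsxRawText as defined above.
async function xlsxExample(buffer: Buffer) {
  const { rawText, formatText } = await readXlsxRawText({
    teamId: 'team_123', // placeholder; not used by this reader
    buffer,
    encoding: 'utf-8'
  });
  // rawText    -> "name,age\nTom,3"                     (plain CSV, one block per sheet)
  // formatText -> "#Sheet1\nname:name\nage:age\nname:Tom\nage:3"
  return { rawText, formatText };
}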

View File

@@ -35,13 +35,8 @@ export const clearDirFiles = (dirPath: string) => {
return;
}
fs.readdirSync(dirPath).forEach((file) => {
const curPath = `${dirPath}/${file}`;
if (fs.lstatSync(curPath).isDirectory()) {
clearDirFiles(curPath);
} else {
fs.unlinkSync(curPath);
}
fs.rmdirSync(dirPath, {
recursive: true
});
};

View File

@@ -9,7 +9,6 @@ import {
DatasetCollectionSchemaType
} from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '../training/schema';
import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDatasetData } from '../data/schema';
import { delImgByRelatedId } from '../../../common/file/image/controller';
import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';

View File

@@ -0,0 +1,6 @@
export enum ImportDataSourceEnum {
fileLocal = 'fileLocal',
fileLink = 'fileLink',
fileCustom = 'fileCustom',
tableLocal = 'tableLocal'
}

View File

@@ -1,14 +1,16 @@
import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDatasetTraining } from './schema';
import type {
PushDatasetDataChunkProps,
PushDatasetDataProps,
PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { getCollectionWithDataset } from '../controller';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
import { ClientSession } from '../../../common/mongo';
import { getLLMModel, getVectorModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { getCollectionWithDataset } from '../controller';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try {
@@ -23,31 +25,52 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
} catch (error) {}
};
export async function pushDataListToTrainingQueue({
teamId,
tmbId,
export const pushDataListToTrainingQueueByCollectionId = async ({
collectionId,
data,
prompt,
billId,
trainingMode = TrainingModeEnum.chunk
...props
}: {
teamId: string;
tmbId: string;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
const vectorModelList = global.vectorModels;
const datasetModelList = global.llmModels;
session?: ClientSession;
} & PushDatasetDataProps) => {
const {
datasetId: { _id: datasetId, vectorModel, agentModel }
datasetId: { _id: datasetId, agentModel, vectorModel }
} = await getCollectionWithDataset(collectionId);
return pushDataListToTrainingQueue({
...props,
datasetId,
collectionId,
agentModel,
vectorModel
});
};
export async function pushDataListToTrainingQueue({
teamId,
tmbId,
datasetId,
collectionId,
agentModel,
vectorModel,
data,
prompt,
billId,
trainingMode = TrainingModeEnum.chunk,
session
}: {
teamId: string;
tmbId: string;
datasetId: string;
agentModel: string;
vectorModel: string;
session?: ClientSession;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
const checkModelValid = async () => {
const agentModelData = datasetModelList?.find((item) => item.model === agentModel);
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(`File model ${agentModel} is invalid`);
}
const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel);
const vectorModelData = getVectorModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Vector model ${vectorModel} is invalid`);
}
@@ -124,52 +147,43 @@ export async function pushDataListToTrainingQueue({
});
// insert data to db
const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
try {
const results = await MongoDatasetTraining.insertMany(
dataList.map((item, i) => ({
teamId,
tmbId,
datasetId,
collectionId,
billId,
mode: trainingMode,
prompt,
model,
q: item.q,
a: item.a,
chunkIndex: item.chunkIndex ?? 0,
weight: weight ?? 0,
indexes: item.indexes
}))
);
await delay(500);
return results.length;
} catch (error) {
if (retry > 0) {
await delay(500);
return insertData(dataList, retry - 1);
}
return Promise.reject(error);
}
};
const insertLen = filterResult.success.length;
const failedDocuments: PushDatasetDataChunkProps[] = [];
let insertLen = 0;
const chunkSize = 50;
const chunkList = filterResult.success.reduce(
(acc, cur) => {
const lastChunk = acc[acc.length - 1];
if (lastChunk.length < chunkSize) {
lastChunk.push(cur);
} else {
acc.push([cur]);
// Batch insert with insertMany
try {
await MongoDatasetTraining.insertMany(
filterResult.success.map((item) => ({
teamId,
tmbId,
datasetId,
collectionId,
billId,
mode: trainingMode,
prompt,
model,
q: item.q,
a: item.a,
chunkIndex: item.chunkIndex ?? 0,
weight: weight ?? 0,
indexes: item.indexes
})),
{
session
}
return acc;
},
[[]] as PushDatasetDataChunkProps[][]
);
for await (const chunks of chunkList) {
insertLen += await insertData(chunks);
);
} catch (error: any) {
addLog.error(`Insert error`, error);
// On error, collect the documents that failed to insert
error.writeErrors.forEach((writeError: any) => {
failedDocuments.push(data[writeError.index]);
});
console.log('failed', failedDocuments);
}
// Retry the failed documents one at a time
for await (const item of failedDocuments) {
await MongoDatasetTraining.create(item);
}
delete filterResult.success;
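A hedged sketch of the new collection-scoped entry point (ids and chunk contents are placeholders and the field set shown is a minimal assumption; agentModel and vectorModel are resolved from the collection's dataset before the data is enqueued):

// Sketch only: placeholder ids, pushDataListToTrainingQueueByCollectionId as defined above.
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';

async function enqueueChunks() {
  return pushDataListToTrainingQueueByCollectionId({
    teamId: 'team_123',
    tmbId: 'tmb_456',
    collectionId: 'col_789',
    trainingMode: TrainingModeEnum.chunk,
    data: [
      { q: 'Which file types can be imported?', a: 'txt, md, html, pdf, docx, pptx, csv, xlsx' },
      { q: 'How are xlsx sheets flattened?', a: 'Each sheet becomes CSV text prefixed with #sheetName' }
    ]
  });
}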

View File

@@ -2,6 +2,7 @@ import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import { addLog } from '../../../common/system/log';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { MongoDatasetTraining } from './schema';
import Papa from 'papaparse';
export const checkInvalidChunkAndLock = async ({
err,
@@ -39,3 +40,18 @@ export const checkInvalidChunkAndLock = async ({
}
return false;
};
export const parseCsvTable2Chunks = (rawText: string) => {
const csvArr = Papa.parse(rawText).data as string[][];
const chunks = csvArr
.map((item) => ({
q: item[0] || '',
a: item[1] || ''
}))
.filter((item) => item.q || item.a);
return {
chunks
};
};
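An illustration with made-up rows (the header row is not skipped, so callers either pass header-less CSV text or accept the first chunk being the header pair):

// Illustration only, using parseCsvTable2Chunks as defined above.
const { chunks } = parseCsvTable2Chunks(
  'What is FastGPT?,A knowledge-base QA platform\nWhich loaders were added?,pptx and xlsx'
);
// chunks -> [
//   { q: 'What is FastGPT?', a: 'A knowledge-base QA platform' },
//   { q: 'Which loaders were added?', a: 'pptx and xlsx' }
// ]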

View File

@@ -4,27 +4,36 @@
"dependencies": {
"@fastgpt/global": "workspace:*",
"@node-rs/jieba": "1.10.0",
"@xmldom/xmldom": "^0.8.10",
"axios": "^1.5.1",
"cheerio": "1.0.0-rc.12",
"cookie": "^0.5.0",
"date-fns": "2.30.0",
"dayjs": "^1.11.7",
"decompress": "^4.2.1",
"encoding": "^0.1.13",
"file-type": "^19.0.0",
"json5": "^2.2.3",
"jsonwebtoken": "^9.0.2",
"mammoth": "^1.6.0",
"mongoose": "^7.0.2",
"multer": "1.4.5-lts.1",
"next": "13.5.2",
"nextjs-cors": "^2.1.2",
"node-cron": "^3.0.3",
"node-xlsx": "^0.23.0",
"papaparse": "5.4.1",
"pdfjs-dist": "4.0.269",
"pg": "^8.10.0",
"tunnel": "^0.0.6"
},
"devDependencies": {
"@types/cookie": "^0.5.2",
"@types/decompress": "^4.2.7",
"@types/jsonwebtoken": "^9.0.3",
"@types/multer": "^1.4.10",
"@types/node-cron": "^3.0.11",
"@types/papaparse": "5.3.7",
"@types/pg": "^8.6.6",
"@types/tunnel": "^0.0.4"
}

View File

@@ -0,0 +1,42 @@
import { AuthResponseType } from '@fastgpt/global/support/permission/type';
import { AuthModeType } from '../type';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { parseHeaderCert } from '../controller';
import { getFileById } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
export async function authFile({
fileId,
per = 'owner',
...props
}: AuthModeType & {
fileId: string;
}): Promise<
AuthResponseType & {
file: DatasetFileSchema;
}
> {
const authRes = await parseHeaderCert(props);
const { teamId, tmbId } = authRes;
const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId });
if (!file) {
return Promise.reject(CommonErrEnum.fileNotFound);
}
if (file.metadata?.teamId !== teamId) {
return Promise.reject(CommonErrEnum.unAuthFile);
}
if (per === 'owner' && file.metadata?.tmbId !== tmbId) {
return Promise.reject(CommonErrEnum.unAuthFile);
}
return {
...authRes,
isOwner: per === 'owner',
canWrite: per === 'owner',
file
};
}
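A hedged call-site sketch (the req and authToken fields mirror the AuthModeType used elsewhere in the repo and are assumptions here; the fileId is a placeholder):

// Sketch only: assumed AuthModeType fields, authFile as defined above.
import type { NextApiRequest } from 'next';

async function guardFileDownload(req: NextApiRequest, fileId: string) {
  // Rejects with CommonErrEnum.unAuthFile if the requester's team does not own the file,
  // or if per === 'owner' and the requester is not the uploader.
  const { file, teamId } = await authFile({ req, authToken: true, fileId, per: 'owner' });
  return { filename: file.filename, teamId };
}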