Feat: pptx and xlsx loader (#1118)

* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support read pptx

* feat: support xlsx

* fix: rerank docker file
Archer
2024-04-01 19:01:26 +08:00
committed by GitHub
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions
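
The pptx/xlsx support named in the commit messages above does not appear in the hunks quoted below. As a rough sketch only: a .pptx file is a zip of slide XML and a .xlsx file is a zipped workbook, so loaders along the following lines are plausible. The package choices (jszip, SheetJS xlsx) and helper names are assumptions for illustration, not the code this commit ships.

import JSZip from 'jszip';
import * as XLSX from 'xlsx';

// Assumed helper: extract the visible slide text from a .pptx buffer.
// Slide text lives in ppt/slides/slideN.xml inside <a:t> elements.
async function readPptxRawText(buffer: Buffer): Promise<string> {
  const zip = await JSZip.loadAsync(buffer);
  const slidePaths = Object.keys(zip.files)
    .filter((name) => /^ppt\/slides\/slide\d+\.xml$/.test(name))
    .sort();
  const slides = await Promise.all(slidePaths.map((p) => zip.files[p].async('string')));
  return slides
    .map((xml) =>
      (xml.match(/<a:t>([^<]*)<\/a:t>/g) || [])
        .map((tag) => tag.replace(/<\/?a:t>/g, ''))
        .join(' ')
    )
    .join('\n');
}

// Assumed helper: convert every sheet of a .xlsx buffer to CSV text.
function readXlsxRawText(buffer: Buffer): string {
  const workbook = XLSX.read(buffer, { type: 'buffer' });
  return workbook.SheetNames
    .map((name) => XLSX.utils.sheet_to_csv(workbook.Sheets[name]))
    .join('\n');
}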

View File

@@ -2,51 +2,15 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { addLog } from '@fastgpt/service/common/system/log';
import { checkFiles } from '../timerTask/dataset/checkInValidDatasetFiles';
import { addHours } from 'date-fns';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { checkInvalidCollection } from '../timerTask/dataset/checkInvalidMongoCollection';
import { checkInvalidVector } from '../timerTask/dataset/checkInvalidVector';
import { MongoImage } from '@fastgpt/service/common/file/image/schema';
/*
1.
*/
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
let deleteImageAmount = 0;
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
const {
startHour = 72,
endHour = 24,
limit = 10
} = req.body as { startHour?: number; endHour?: number; limit?: number };
await authCert({ req, authRoot: true });
await connectToDatabase();
// start: now - startHour, end: now - endHour
const start = addHours(new Date(), -startHour);
const end = addHours(new Date(), -endHour);
deleteImageAmount = 0;
await checkInvalid(start, end, limit);
jsonRes(res, {
data: deleteImageAmount
});
} catch (error) {
addLog.error(`check Invalid user error`, error);
jsonRes(res, {
code: 500,
error
});
}
}
export async function checkInvalid(start: Date, end: Date, limit = 50) {
async function checkInvalidImg(start: Date, end: Date, limit = 50) {
const images = await MongoImage.find(
{
createTime: {
@@ -86,3 +50,37 @@ export async function checkInvalid(start: Date, end: Date, limit = 50) {
console.log(`Check finished, deleted ${deleteImageAmount} invalid images`);
}
/* Migrate data from pg into mongo dataset.datas and build the mapping */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
await connectToDatabase();
await authCert({ req, authRoot: true });
(async () => {
try {
console.log('Running dirty data cleanup task');
const end = addHours(new Date(), -1);
const start = addHours(new Date(), -360 * 24);
await checkFiles(start, end);
await checkInvalidImg(start, end);
await checkInvalidCollection(start, end);
await checkInvalidVector(start, end);
console.log('Dirty data cleanup task finished');
} catch (error) {
console.log('Dirty data cleanup task failed');
}
})();
jsonRes(res, {
message: 'success'
});
} catch (error) {
console.log(error);
jsonRes(res, {
code: 500,
error
});
}
}
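
The "add image ttl index" commit message, together with the $unset of expiredTime in a later hunk, suggests that newly uploaded images carry an expiredTime which a MongoDB TTL index removes automatically unless a dataset collection claims them. A minimal sketch of that pattern, assuming field and index names; the real MongoImage schema may differ.

import { Schema, model } from 'mongoose';

// Assumed shape: images are created with an expiredTime so orphans clean themselves up.
const imageSchemaSketch = new Schema({
  teamId: { type: Schema.Types.ObjectId, required: true },
  expiredTime: { type: Date },
  metadata: { type: Object, default: {} }
});

// TTL index: MongoDB deletes the document once expiredTime has passed (assumed policy).
imageSchemaSketch.index({ expiredTime: 1 }, { expireAfterSeconds: 0 });

const MongoImageSketch = model('image_ttl_sketch', imageSchemaSketch);

// When a dataset collection is created for the related file, the TTL is cancelled by
// unsetting expiredTime — this mirrors the updateMany + $unset in a later hunk.
async function keepRelatedImagesAlive(teamId: string, relatedId: string) {
  await MongoImageSketch.updateMany(
    { teamId, 'metadata.relatedId': relatedId },
    { $unset: { expiredTime: 1 } }
  );
}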

View File

@@ -6,9 +6,52 @@ import { MongoUsage } from '@fastgpt/service/support/wallet/usage/schema';
import { connectionMongo } from '@fastgpt/service/common/mongo';
import { checkFiles } from '../timerTask/dataset/checkInValidDatasetFiles';
import { addHours } from 'date-fns';
import { checkInvalid as checkInvalidImg } from '../timerTask/dataset/checkInvalidDatasetImage';
import { checkInvalidCollection } from '../timerTask/dataset/checkInvalidMongoCollection';
import { checkInvalidVector } from '../timerTask/dataset/checkInvalidVector';
import { MongoImage } from '@fastgpt/service/common/file/image/schema';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
let deleteImageAmount = 0;
export async function checkInvalidImg(start: Date, end: Date, limit = 50) {
const images = await MongoImage.find(
{
createTime: {
$gte: start,
$lte: end
},
'metadata.relatedId': { $exists: true }
},
'_id teamId metadata'
);
console.log('total images', images.length);
let index = 0;
for await (const image of images) {
try {
// 1. check whether a matching dataset collection exists
const collection = await MongoDatasetCollection.findOne(
{
teamId: image.teamId,
'metadata.relatedImgId': image.metadata?.relatedId
},
'_id'
);
if (!collection) {
await image.deleteOne();
deleteImageAmount++;
}
index++;
index % 100 === 0 && console.log(index);
} catch (error) {
console.log(error);
}
}
console.log(`Check finished, deleted ${deleteImageAmount} invalid images`);
}
/* Migrate data from pg into mongo dataset.datas and build the mapping */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
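
checkInvalidImg above deletes an image when no dataset collection in the same team carries a matching metadata.relatedImgId. A tiny illustration of the metadata contract it relies on, reconstructed from the call sites in this commit; the type names are hypothetical.

// Hypothetical types: images point at a file via metadata.relatedId,
// collections point back via metadata.relatedImgId (see the collection-create hunks below).
type StoredImage = { teamId: string; metadata?: { relatedId?: string } };
type StoredCollection = { teamId: string; metadata?: { relatedImgId?: string } };

function isImageReferenced(image: StoredImage, collections: StoredCollection[]): boolean {
  return collections.some(
    (col) =>
      col.teamId === image.teamId &&
      !!image.metadata?.relatedId &&
      col.metadata?.relatedImgId === image.metadata.relatedId
  );
}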

View File

@@ -2,13 +2,6 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoUsage } from '@fastgpt/service/support/wallet/usage/schema';
import { connectionMongo } from '@fastgpt/service/common/mongo';
import { checkFiles } from '../timerTask/dataset/checkInValidDatasetFiles';
import { addHours } from 'date-fns';
import { checkInvalid as checkInvalidImg } from '../timerTask/dataset/checkInvalidDatasetImage';
import { checkInvalidCollection } from '../timerTask/dataset/checkInvalidMongoCollection';
import { checkInvalidVector } from '../timerTask/dataset/checkInvalidVector';
import { MongoPlugin } from '@fastgpt/service/core/plugin/schema';
import { PluginTypeEnum } from '@fastgpt/global/core/plugin/constants';

View File

@@ -0,0 +1,41 @@
/*
Read db file content and respond with the first 3000 characters
*/
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { readFileContent } from '@fastgpt/service/common/file/gridfs/controller';
import { authFile } from '@fastgpt/service/support/permission/auth/file';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
await connectToDatabase();
const { fileId, csvFormat } = req.body as { fileId: string; csvFormat?: boolean };
if (!fileId) {
throw new Error('fileId is empty');
}
const { teamId } = await authFile({ req, authToken: true, fileId });
const { rawText } = await readFileContent({
teamId,
bucketName: BucketNameEnum.dataset,
fileId,
csvFormat
});
jsonRes(res, {
data: {
previewContent: rawText.slice(0, 3000),
totalLength: rawText.length
}
});
} catch (error) {
jsonRes(res, {
code: 500,
error
});
}
}
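
A hedged example of calling the preview handler above from the web app. Only the request body (fileId, csvFormat) and response shape (previewContent, totalLength) come from the handler; the route path is an assumption.

// Assumed route path for illustration only.
async function fetchFilePreview(fileId: string, csvFormat?: boolean) {
  const response = await fetch('/api/common/file/previewContent', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ fileId, csvFormat })
  });
  const json = (await response.json()) as {
    data: { previewContent: string; totalLength: number };
  };
  return json.data;
}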

View File

@@ -2,9 +2,12 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { authFileToken } from '@fastgpt/service/support/permission/controller';
import { getDownloadStream, getFileById } from '@fastgpt/service/common/file/gridfs/controller';
import {
getDownloadStream,
getFileById,
readFileEncode
} from '@fastgpt/service/common/file/gridfs/controller';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
@@ -18,8 +21,9 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
throw new Error('fileId is empty');
}
const [file, encodeStream] = await Promise.all([
const [file, encoding, fileStream] = await Promise.all([
getFileById({ bucketName, fileId }),
readFileEncode({ bucketName, fileId }),
getDownloadStream({ bucketName, fileId })
]);
@@ -27,24 +31,10 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
return Promise.reject(CommonErrEnum.fileNotFound);
}
// get encoding
let buffers: Buffer = Buffer.from([]);
for await (const chunk of encodeStream) {
buffers = Buffer.concat([buffers, chunk]);
if (buffers.length > 10) {
encodeStream.abort();
break;
}
}
const encoding = detectFileEncoding(buffers);
res.setHeader('Content-Type', `${file.contentType}; charset=${encoding}`);
res.setHeader('Cache-Control', 'public, max-age=3600');
res.setHeader('Content-Disposition', `inline; filename="${encodeURIComponent(file.filename)}"`);
const fileStream = await getDownloadStream({ bucketName, fileId });
fileStream.pipe(res);
fileStream.on('error', () => {
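
The hunk above replaces the inline buffer sampling plus detectFileEncoding with a readFileEncode helper. A sketch of what such a helper plausibly does, reusing the getDownloadStream and detectFileEncoding imports already present in the diff; the actual implementation in @fastgpt/service may differ.

import { getDownloadStream } from '@fastgpt/service/common/file/gridfs/controller';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';

// Sketch: sample only the first chunks of the GridFS stream and guess the charset.
async function readFileEncodeSketch(params: { bucketName: any; fileId: string }) {
  const stream = await getDownloadStream(params);
  let sample = Buffer.from([]);
  for await (const chunk of stream) {
    sample = Buffer.concat([sample, chunk]);
    if (sample.length > 64 * 1024) break; // a small prefix is enough to guess the encoding
  }
  stream.destroy();
  return detectFileEncoding(sample);
}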

View File

@@ -4,24 +4,22 @@ import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { uploadFile } from '@fastgpt/service/common/file/gridfs/controller';
import { getUploadModel } from '@fastgpt/service/common/file/multer';
/**
* Creates the multer uploader
*/
const upload = getUploadModel({
maxSize: 500 * 1024 * 1024
});
import { removeFilesByPaths } from '@fastgpt/service/common/file/utils';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
let filePaths: string[] = [];
/* Creates the multer uploader */
const upload = getUploadModel({
maxSize: (global.feConfigs?.uploadFileMaxSize || 500) * 1024 * 1024
});
const filePaths: string[] = [];
try {
const { teamId, tmbId } = await authCert({ req, authToken: true });
await connectToDatabase();
const { file, bucketName, metadata } = await upload.doUpload(req, res);
filePaths = [file.path];
await connectToDatabase();
filePaths.push(file.path);
const { teamId, tmbId } = await authCert({ req, authToken: true });
if (!bucketName) {
throw new Error('bucketName is empty');
@@ -46,6 +44,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
error
});
}
removeFilesByPaths(filePaths);
}
export const config = {
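
getUploadModel is now built inside the handler so maxSize can track global.feConfigs?.uploadFileMaxSize, and the temp files are always removed via removeFilesByPaths. A minimal sketch of what a multer-based upload model like this typically wraps, using standard multer APIs; the real getUploadModel in @fastgpt/service likely differs in detail.

import multer from 'multer';
import path from 'path';

// Sketch of a multer-backed "upload model" with a configurable size limit (bytes).
// The doUpload shape (file, bucketName, metadata) mirrors how the handler above uses it.
function getUploadModelSketch({ maxSize }: { maxSize: number }) {
  const uploader = multer({
    limits: { fileSize: maxSize },
    storage: multer.diskStorage({
      filename: (_req, file, cb) => cb(null, `${Date.now()}${path.extname(file.originalname)}`)
    })
  }).single('file');

  return {
    doUpload(req: any, res: any) {
      return new Promise<{ file: any; bucketName?: string; metadata?: Record<string, any> }>(
        (resolve, reject) => {
          uploader(req, res, (error: unknown) => {
            if (error) return reject(error);
            resolve({
              file: req.file, // saved to a temp path; the handler cleans it up via removeFilesByPaths
              bucketName: req.body?.bucketName,
              metadata: req.body?.metadata ? JSON.parse(req.body.metadata) : undefined
            });
          });
        }
      );
    }
  };
}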

View File

@@ -12,12 +12,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
const { teamId } = await authChatCert({ req, authToken: true });
const data = await uploadMongoImg({
const imgId = await uploadMongoImg({
teamId,
...body
});
jsonRes(res, { data });
jsonRes(res, { data: imgId });
} catch (error) {
jsonRes(res, {
code: 500,

View File

@@ -0,0 +1,112 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { readFileContent } from '@fastgpt/service/common/file/gridfs/controller';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { createOneCollection } from '@fastgpt/service/core/dataset/collection/controller';
import {
DatasetCollectionTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit';
import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';
import { startTrainingQueue } from '@/service/core/dataset/training/utils';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
const { datasetId, parentId, fileId } = req.body as FileIdCreateDatasetCollectionParams;
const trainingType = TrainingModeEnum.chunk;
try {
await connectToDatabase();
const { teamId, tmbId, dataset } = await authDataset({
req,
authToken: true,
authApiKey: true,
per: 'w',
datasetId: datasetId
});
// 1. read file
const { rawText, filename } = await readFileContent({
teamId,
bucketName: BucketNameEnum.dataset,
fileId
});
// 2. split chunks
const { chunks = [] } = parseCsvTable2Chunks(rawText);
// 3. auth limit
await checkDatasetLimit({
teamId,
insertLen: predictDataLimitLength(trainingType, chunks)
});
await mongoSessionRun(async (session) => {
// 4. create collection
const { _id: collectionId } = await createOneCollection({
teamId,
tmbId,
name: filename,
parentId,
datasetId,
type: DatasetCollectionTypeEnum.file,
fileId,
// special metadata
trainingType,
chunkSize: 0,
session
});
// 5. create training bill
const { billId } = await createTrainingUsage({
teamId,
tmbId,
appName: filename,
billSource: UsageSourceEnum.training,
vectorModel: getVectorModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name,
session
});
// 6. insert to training queue
await pushDataListToTrainingQueue({
teamId,
tmbId,
datasetId: dataset._id,
collectionId,
agentModel: dataset.agentModel,
vectorModel: dataset.vectorModel,
trainingMode: trainingType,
billId,
data: chunks.map((chunk, index) => ({
q: chunk.q,
a: chunk.a,
chunkIndex: index
})),
session
});
return collectionId;
});
startTrainingQueue(true);
jsonRes(res);
} catch (error) {
jsonRes(res, {
code: 500,
error
});
}
}
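
Step 2 above turns the raw CSV text into { q, a } chunks with parseCsvTable2Chunks. A naive sketch of that kind of parser, assuming a two-column layout with a header row; the real utility handles quoting and malformed rows more carefully.

// Sketch: naive "q,a" CSV to chunks. Assumes the first row is a header and
// that fields contain no embedded commas or quotes.
function parseCsvTable2ChunksSketch(rawText: string): { chunks: { q: string; a: string }[] } {
  const lines = rawText.split('\n').map((line) => line.trim()).filter(Boolean);
  const chunks = lines.slice(1).map((line) => {
    const [q = '', a = ''] = line.split(',');
    return { q: q.trim(), a: a.trim() };
  });
  return { chunks };
}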

View File

@@ -1,94 +1,151 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { delFileByFileIdList, uploadFile } from '@fastgpt/service/common/file/gridfs/controller';
import { getUploadModel } from '@fastgpt/service/common/file/multer';
import {
delFileByFileIdList,
readFileContent
} from '@fastgpt/service/common/file/gridfs/controller';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
import { FileCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { removeFilesByPaths } from '@fastgpt/service/common/file/utils';
import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { createOneCollection } from '@fastgpt/service/core/dataset/collection/controller';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import {
DatasetCollectionTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
/**
* Creates the multer uploader
*/
const upload = getUploadModel({
maxSize: 500 * 1024 * 1024
});
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { MongoImage } from '@fastgpt/service/common/file/image/schema';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit';
import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { startTrainingQueue } from '@/service/core/dataset/training/utils';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
let filePaths: string[] = [];
let fileId: string = '';
const { datasetId } = req.query as { datasetId: string };
const {
fileId,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
...body
} = req.body as FileIdCreateDatasetCollectionParams;
try {
await connectToDatabase();
const { teamId, tmbId } = await authDataset({
const { teamId, tmbId, dataset } = await authDataset({
req,
authToken: true,
authApiKey: true,
per: 'w',
datasetId
datasetId: body.datasetId
});
const { file, bucketName, data } = await upload.doUpload<FileCreateDatasetCollectionParams>(
req,
res
);
filePaths = [file.path];
if (!file || !bucketName) {
throw new Error('file is empty');
}
const { fileMetadata, collectionMetadata, ...collectionData } = data;
// upload file and create collection
fileId = await uploadFile({
// 1. read file
const { rawText, filename } = await readFileContent({
teamId,
tmbId,
bucketName,
path: file.path,
filename: file.originalname,
contentType: file.mimetype,
metadata: fileMetadata
});
// create collection
const { _id: collectionId } = await createOneCollection({
...collectionData,
metadata: collectionMetadata,
teamId,
tmbId,
type: DatasetCollectionTypeEnum.file,
bucketName: BucketNameEnum.dataset,
fileId
});
jsonRes(res, {
data: collectionId
// 2. split chunks
const { chunks } = splitText2Chunks({
text: rawText,
chunkLen: chunkSize,
overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : []
});
// 3. auth limit
await checkDatasetLimit({
teamId,
insertLen: predictDataLimitLength(trainingType, chunks)
});
await mongoSessionRun(async (session) => {
// 4. create collection
const { _id: collectionId } = await createOneCollection({
...body,
teamId,
tmbId,
type: DatasetCollectionTypeEnum.file,
name: filename,
fileId,
metadata: {
relatedImgId: fileId
},
// special metadata
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
hashRawText: hashStr(rawText),
rawTextLength: rawText.length,
session
});
// 5. create training bill
const { billId } = await createTrainingUsage({
teamId,
tmbId,
appName: filename,
billSource: UsageSourceEnum.training,
vectorModel: getVectorModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name,
session
});
// 6. insert to training queue
await pushDataListToTrainingQueue({
teamId,
tmbId,
datasetId: dataset._id,
collectionId,
agentModel: dataset.agentModel,
vectorModel: dataset.vectorModel,
trainingMode: trainingType,
prompt: qaPrompt,
billId,
data: chunks.map((text, index) => ({
q: text,
chunkIndex: index
})),
session
});
// 7. remove related image ttl
await MongoImage.updateMany(
{
teamId,
'metadata.relatedId': fileId
},
{
// Remove expiredTime to avoid ttl expiration
$unset: {
expiredTime: 1
}
},
{
session
}
);
return collectionId;
});
startTrainingQueue(true);
jsonRes(res);
} catch (error) {
if (fileId) {
try {
await delFileByFileIdList({
fileIdList: [fileId],
bucketName: BucketNameEnum.dataset
});
} catch (error) {}
}
jsonRes(res, {
code: 500,
error
});
}
removeFilesByPaths(filePaths);
}
export const config = {
api: {
bodyParser: false
}
};
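
Step 2 of the handler above splits the raw text with splitText2Chunks using chunkLen, overlapRatio and customReg. An illustrative wrapper showing how those parameters fit together; only the parameter names and the 0.2 overlap for chunk mode come from the diff, the rest is assumed.

import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Illustrative wrapper: the parameter names mirror the handler above; defaults are assumptions.
function chunksFromRawText(rawText: string, chunkSize = 512, chunkSplitter?: string) {
  const { chunks } = splitText2Chunks({
    text: rawText,
    chunkLen: chunkSize,
    overlapRatio: 0.2, // the handler uses 0.2 for chunk mode and 0 for other training modes
    customReg: chunkSplitter ? [chunkSplitter] : []
  });
  // Each chunk later becomes one training item: { q: chunk, chunkIndex: index }
  return chunks.map((q, chunkIndex) => ({ q, chunkIndex }));
}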

View File

@@ -19,6 +19,7 @@ import { hashStr } from '@fastgpt/global/common/string/tools';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
@@ -55,9 +56,9 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
insertLen: predictDataLimitLength(trainingType, chunks)
});
// 3. create collection and training bill
const [{ _id: collectionId }, { billId }] = await Promise.all([
createOneCollection({
const createResult = await mongoSessionRun(async (session) => {
// 3. create collection
const { _id: collectionId } = await createOneCollection({
...body,
teamId,
tmbId,
@@ -70,34 +71,44 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
qaPrompt,
hashRawText: hashStr(text),
rawTextLength: text.length
}),
createTrainingUsage({
rawTextLength: text.length,
session
});
// 4. create training bill
const { billId } = await createTrainingUsage({
teamId,
tmbId,
appName: name,
billSource: UsageSourceEnum.training,
vectorModel: getVectorModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name
})
]);
agentModel: getLLMModel(dataset.agentModel)?.name,
session
});
// 4. push chunks to training queue
const insertResults = await pushDataListToTrainingQueue({
teamId,
tmbId,
collectionId,
trainingMode: trainingType,
prompt: qaPrompt,
billId,
data: chunks.map((text, index) => ({
q: text,
chunkIndex: index
}))
// 5. push chunks to training queue
const insertResults = await pushDataListToTrainingQueue({
teamId,
tmbId,
datasetId: dataset._id,
collectionId,
agentModel: dataset.agentModel,
vectorModel: dataset.vectorModel,
trainingMode: trainingType,
prompt: qaPrompt,
billId,
data: chunks.map((text, index) => ({
q: text,
chunkIndex: index
})),
session
});
return { collectionId, results: insertResults };
});
jsonRes(res, {
data: { collectionId, results: insertResults }
data: createResult
});
} catch (err) {
jsonRes(res, {
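
The collection, the training usage record and the training-queue push above now run inside a single mongoSessionRun call. A hedged sketch of what such a helper typically wraps in mongoose (startSession + withTransaction); the real helper in @fastgpt/service/common/mongo/sessionRun may differ.

import mongoose, { ClientSession } from 'mongoose';

// Sketch: run a callback inside one MongoDB transaction and always end the session.
async function mongoSessionRunSketch<T>(fn: (session: ClientSession) => Promise<T>): Promise<T> {
  const session = await mongoose.startSession();
  try {
    let result: T | undefined;
    await session.withTransaction(async () => {
      result = await fn(session);
    });
    return result as T;
  } finally {
    await session.endSession();
  }
}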

View File

@@ -15,7 +15,8 @@ import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/train
export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
await connectToDatabase();
const { collectionId, data } = req.body as PushDatasetDataProps;
const body = req.body as PushDatasetDataProps;
const { collectionId, data } = body;
if (!collectionId || !Array.isArray(data)) {
throw new Error('collectionId or data is empty');
@@ -42,9 +43,12 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
jsonRes<PushDatasetDataResponse>(res, {
data: await pushDataListToTrainingQueue({
...req.body,
...body,
teamId,
tmbId
tmbId,
datasetId: collection.datasetId._id,
agentModel: collection.datasetId.agentModel,
vectorModel: collection.datasetId.vectorModel
})
});
} catch (err) {
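
This change starts passing datasetId, agentModel and vectorModel through to pushDataListToTrainingQueue. Reconstructed from the call sites in this commit, the parameter shape looks roughly like the following; it is an inferred sketch, not the declared type from @fastgpt/service.

import type { ClientSession } from 'mongoose';

// Inferred from the fileId / csvTable / text create handlers and the pushData endpoint above.
type PushDataListToTrainingQueueParams = {
  teamId: string;
  tmbId: string;
  datasetId: string;
  collectionId: string;
  agentModel: string; // LLM used when training in QA mode
  vectorModel: string; // embedding model used for vectorization
  trainingMode: string; // TrainingModeEnum in the real code, e.g. chunk or qa
  prompt?: string; // qaPrompt, only meaningful in QA mode
  billId: string;
  data: { q: string; a?: string; chunkIndex: number }[];
  session?: ClientSession; // mongoose session when called inside mongoSessionRun
};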

View File

@@ -0,0 +1,80 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { authFile } from '@fastgpt/service/support/permission/auth/file';
import { PostPreviewFilesChunksProps } from '@/global/core/dataset/api';
import { readFileContent } from '@fastgpt/service/common/file/gridfs/controller';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
await connectToDatabase();
const { type, sourceId, chunkSize, customSplitChar, overlapRatio } =
req.body as PostPreviewFilesChunksProps;
if (!sourceId) {
throw new Error('sourceId is empty');
}
if (chunkSize > 30000) {
throw new Error('chunkSize is too large, should be less than 30000');
}
const { chunks } = await (async () => {
if (type === ImportDataSourceEnum.fileLocal) {
const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
const fileId = String(file._id);
const { rawText } = await readFileContent({
teamId,
bucketName: BucketNameEnum.dataset,
fileId,
csvFormat: true
});
// split chunks (the preview only needs the first few chunks)
const sliceRawText = 10 * chunkSize;
const { chunks } = splitText2Chunks({
text: rawText.slice(0, sliceRawText),
chunkLen: chunkSize,
overlapRatio,
customReg: customSplitChar ? [customSplitChar] : []
});
return {
chunks: chunks.map((item) => ({
q: item,
a: ''
}))
};
}
if (type === ImportDataSourceEnum.csvTable) {
const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
const fileId = String(file._id);
const { rawText } = await readFileContent({
teamId,
bucketName: BucketNameEnum.dataset,
fileId,
csvFormat: false
});
const { chunks } = parseCsvTable2Chunks(rawText);
return {
chunks: chunks || []
};
}
return { chunks: [] };
})();
jsonRes<{ q: string; a: string }[]>(res, {
data: chunks.slice(0, 5)
});
} catch (error) {
jsonRes(res, {
code: 500,
error
});
}
}
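
A hedged example of posting to the preview-chunks handler above from the import UI. The body fields mirror PostPreviewFilesChunksProps as it is used here and the response is capped at five { q, a } pairs; the route path and default values are assumptions.

import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';

// Assumed route path for illustration only.
async function fetchChunkPreview(fileId: string, chunkSize = 512, customSplitChar?: string) {
  const response = await fetch('/api/core/dataset/file/previewChunks', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      type: ImportDataSourceEnum.fileLocal,
      sourceId: fileId,
      chunkSize,
      customSplitChar,
      overlapRatio: 0.2
    })
  });
  const json = (await response.json()) as { data: { q: string; a: string }[] };
  return json.data;
}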