Mirror of https://github.com/labring/FastGPT.git (synced 2025-08-03 13:38:00 +00:00)

Feat: pptx and xlsx loader (#1118)

* perf: plan tip
* perf: upload size controller
* feat: add image ttl index
* feat: new upload file ux
* remove file
* feat: support read pptx
* feat: support xlsx
* fix: rerank docker file
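The commit message mentions adding an image TTL index; a minimal sketch of how such a TTL index could be declared with Mongoose (the schema fields and index options below are assumptions for illustration, not the project's actual definition — only the `expiredTime` field name is echoed from the `$unset` seen later in this diff):

// Hypothetical sketch: a TTL index lets MongoDB auto-delete temporary images.
import { Schema, model } from 'mongoose';

const ImageSchema = new Schema({
  teamId: { type: Schema.Types.ObjectId, required: true },
  binary: Buffer,
  metadata: Object,
  createTime: { type: Date, default: () => new Date() },
  // Documents are removed automatically once `expiredTime` has passed.
  expiredTime: Date
});

// expireAfterSeconds: 0 means "expire at the exact time stored in expiredTime".
ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 0 });

export const MongoImageExample = model('dataset_images_example', ImageSchema);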
@@ -2,51 +2,15 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { addLog } from '@fastgpt/service/common/system/log';
import { checkFiles } from '../timerTask/dataset/checkInValidDatasetFiles';
import { addHours } from 'date-fns';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { checkInvalidCollection } from '../timerTask/dataset/checkInvalidMongoCollection';
import { checkInvalidVector } from '../timerTask/dataset/checkInvalidVector';
import { MongoImage } from '@fastgpt/service/common/file/image/schema';

/*
  Detect invalid dataset images.

  Possible failure case:
  1. An image was uploaded during the file upload, but the dataset was never created in the end.
*/
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';

let deleteImageAmount = 0;

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    const {
      startHour = 72,
      endHour = 24,
      limit = 10
    } = req.body as { startHour?: number; endHour?: number; limit?: number };
    await authCert({ req, authRoot: true });
    await connectToDatabase();

    // start: now - maxDay, end: now - 3 day
    const start = addHours(new Date(), -startHour);
    const end = addHours(new Date(), -endHour);
    deleteImageAmount = 0;

    await checkInvalid(start, end, limit);

    jsonRes(res, {
      data: deleteImageAmount
    });
  } catch (error) {
    addLog.error(`check Invalid user error`, error);

    jsonRes(res, {
      code: 500,
      error
    });
  }
}

export async function checkInvalid(start: Date, end: Date, limit = 50) {
async function checkInvalidImg(start: Date, end: Date, limit = 50) {
  const images = await MongoImage.find(
    {
      createTime: {
@@ -86,3 +50,37 @@ export async function checkInvalid(start: Date, end: Date, limit = 50) {

  console.log(`Check finished. Deleted ${deleteImageAmount} invalid images in total.`);
}

/* Move data from pg into mongo dataset.datas and build the mapping */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    await connectToDatabase();
    await authCert({ req, authRoot: true });

    (async () => {
      try {
        console.log('Running dirty-data cleanup task');
        const end = addHours(new Date(), -1);
        const start = addHours(new Date(), -360 * 24);
        await checkFiles(start, end);
        await checkInvalidImg(start, end);
        await checkInvalidCollection(start, end);
        await checkInvalidVector(start, end);
        console.log('Dirty-data cleanup task finished');
      } catch (error) {
        console.log('Dirty-data cleanup task failed');
      }
    })();

    jsonRes(res, {
      message: 'success'
    });
  } catch (error) {
    console.log(error);

    jsonRes(res, {
      code: 500,
      error
    });
  }
}
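A hedged example of how the image-cleanup handler above might be invoked; only the body fields (startHour, endHour, limit) and the root-auth requirement come from the handler itself, while the route path and header name are assumptions:

// Hypothetical admin call; the URL and auth header are assumptions,
// the body shape matches the `req.body` destructuring in the handler.
async function triggerImageCleanup(baseUrl: string, rootKey: string) {
  const res = await fetch(`${baseUrl}/api/admin/clearInvalidDatasetImage`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      // authCert({ authRoot: true }) implies a root credential; header name is assumed.
      rootkey: rootKey
    },
    body: JSON.stringify({ startHour: 72, endHour: 24, limit: 10 })
  });
  return res.json(); // { data: <number of deleted images> }
}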
@@ -6,9 +6,52 @@ import { MongoUsage } from '@fastgpt/service/support/wallet/usage/schema';
import { connectionMongo } from '@fastgpt/service/common/mongo';
import { checkFiles } from '../timerTask/dataset/checkInValidDatasetFiles';
import { addHours } from 'date-fns';
import { checkInvalid as checkInvalidImg } from '../timerTask/dataset/checkInvalidDatasetImage';
import { checkInvalidCollection } from '../timerTask/dataset/checkInvalidMongoCollection';
import { checkInvalidVector } from '../timerTask/dataset/checkInvalidVector';
import { MongoImage } from '@fastgpt/service/common/file/image/schema';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';

let deleteImageAmount = 0;
export async function checkInvalidImg(start: Date, end: Date, limit = 50) {
  const images = await MongoImage.find(
    {
      createTime: {
        $gte: start,
        $lte: end
      },
      'metadata.relatedId': { $exists: true }
    },
    '_id teamId metadata'
  );
  console.log('total images', images.length);
  let index = 0;

  for await (const image of images) {
    try {
      // 1. Check whether a corresponding collection exists
      const collection = await MongoDatasetCollection.findOne(
        {
          teamId: image.teamId,
          'metadata.relatedImgId': image.metadata?.relatedId
        },
        '_id'
      );

      if (!collection) {
        await image.deleteOne();
        deleteImageAmount++;
      }

      index++;

      index % 100 === 0 && console.log(index);
    } catch (error) {
      console.log(error);
    }
  }

  console.log(`Check finished. Deleted ${deleteImageAmount} invalid images in total.`);
}

/* Move data from pg into mongo dataset.datas and build the mapping */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
@@ -2,13 +2,6 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoUsage } from '@fastgpt/service/support/wallet/usage/schema';
import { connectionMongo } from '@fastgpt/service/common/mongo';
import { checkFiles } from '../timerTask/dataset/checkInValidDatasetFiles';
import { addHours } from 'date-fns';
import { checkInvalid as checkInvalidImg } from '../timerTask/dataset/checkInvalidDatasetImage';
import { checkInvalidCollection } from '../timerTask/dataset/checkInvalidMongoCollection';
import { checkInvalidVector } from '../timerTask/dataset/checkInvalidVector';
import { MongoPlugin } from '@fastgpt/service/core/plugin/schema';
import { PluginTypeEnum } from '@fastgpt/global/core/plugin/constants';
projects/app/src/pages/api/common/file/previewContent.ts (new file, 41 lines)
@@ -0,0 +1,41 @@
/*
  Read db file content and respond with the first 3000 characters
*/
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { readFileContent } from '@fastgpt/service/common/file/gridfs/controller';
import { authFile } from '@fastgpt/service/support/permission/auth/file';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  try {
    await connectToDatabase();
    const { fileId, csvFormat } = req.body as { fileId: string; csvFormat?: boolean };

    if (!fileId) {
      throw new Error('fileId is empty');
    }

    const { teamId } = await authFile({ req, authToken: true, fileId });

    const { rawText } = await readFileContent({
      teamId,
      bucketName: BucketNameEnum.dataset,
      fileId,
      csvFormat
    });

    jsonRes(res, {
      data: {
        previewContent: rawText.slice(0, 3000),
        totalLength: rawText.length
      }
    });
  } catch (error) {
    jsonRes(res, {
      code: 500,
      error
    });
  }
}
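A hedged client-side sketch of calling this preview endpoint; the request and response shapes come from the handler above, the route path is inferred from the file path, and anything beyond that is an assumption:

// Hypothetical client call for the previewContent API above.
type PreviewContentResponse = {
  data: { previewContent: string; totalLength: number };
};

async function fetchFilePreview(fileId: string, csvFormat = false) {
  const res = await fetch('/api/common/file/previewContent', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ fileId, csvFormat })
  });
  const json = (await res.json()) as PreviewContentResponse;
  return json.data; // { previewContent, totalLength }
}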
@@ -2,9 +2,12 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { authFileToken } from '@fastgpt/service/support/permission/controller';
import { getDownloadStream, getFileById } from '@fastgpt/service/common/file/gridfs/controller';
import {
  getDownloadStream,
  getFileById,
  readFileEncode
} from '@fastgpt/service/common/file/gridfs/controller';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  try {
@@ -18,8 +21,9 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
      throw new Error('fileId is empty');
    }

    const [file, encodeStream] = await Promise.all([
    const [file, encoding, fileStream] = await Promise.all([
      getFileById({ bucketName, fileId }),
      readFileEncode({ bucketName, fileId }),
      getDownloadStream({ bucketName, fileId })
    ]);

@@ -27,24 +31,10 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
      return Promise.reject(CommonErrEnum.fileNotFound);
    }

    // get encoding
    let buffers: Buffer = Buffer.from([]);
    for await (const chunk of encodeStream) {
      buffers = Buffer.concat([buffers, chunk]);
      if (buffers.length > 10) {
        encodeStream.abort();
        break;
      }
    }

    const encoding = detectFileEncoding(buffers);

    res.setHeader('Content-Type', `${file.contentType}; charset=${encoding}`);
    res.setHeader('Cache-Control', 'public, max-age=3600');
    res.setHeader('Content-Disposition', `inline; filename="${encodeURIComponent(file.filename)}"`);

    const fileStream = await getDownloadStream({ bucketName, fileId });

    fileStream.pipe(res);

    fileStream.on('error', () => {
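The new readFileEncode call replaces the inline byte-sniffing removed above; as a rough sketch of that pattern under stated assumptions (a generic Node stream and a caller-supplied detector), not the actual readFileEncode implementation:

// Illustrative only: read just the first bytes of a download stream,
// stop early, and hand them to an encoding detector.
import { Readable } from 'stream';

async function sniffEncoding(stream: Readable, detect: (buf: Buffer) => string) {
  let buffers = Buffer.from([]);
  for await (const chunk of stream) {
    buffers = Buffer.concat([buffers, chunk as Buffer]);
    if (buffers.length > 10) {
      stream.destroy(); // stop downloading once enough bytes are buffered
      break;
    }
  }
  return detect(buffers);
}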
@@ -4,24 +4,22 @@ import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { uploadFile } from '@fastgpt/service/common/file/gridfs/controller';
import { getUploadModel } from '@fastgpt/service/common/file/multer';

/**
 * Creates the multer uploader
 */
const upload = getUploadModel({
  maxSize: 500 * 1024 * 1024
});
import { removeFilesByPaths } from '@fastgpt/service/common/file/utils';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  let filePaths: string[] = [];
  /* Creates the multer uploader */
  const upload = getUploadModel({
    maxSize: (global.feConfigs?.uploadFileMaxSize || 500) * 1024 * 1024
  });
  const filePaths: string[] = [];

  try {
    const { teamId, tmbId } = await authCert({ req, authToken: true });

    await connectToDatabase();
    const { file, bucketName, metadata } = await upload.doUpload(req, res);

    filePaths = [file.path];
    await connectToDatabase();
    filePaths.push(file.path);

    const { teamId, tmbId } = await authCert({ req, authToken: true });

    if (!bucketName) {
      throw new Error('bucketName is empty');
@@ -46,6 +44,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
      error
    });
  }

  removeFilesByPaths(filePaths);
}

export const config = {
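The uploader above now derives its limit from global.feConfigs?.uploadFileMaxSize (in MB); a minimal sketch of a matching client-side guard, assuming the same MB-based config value (the helper name is hypothetical):

// Hypothetical pre-upload check mirroring the server-side multer limit.
function assertFileWithinLimit(file: File, uploadFileMaxSize = 500) {
  const maxBytes = uploadFileMaxSize * 1024 * 1024;
  if (file.size > maxBytes) {
    throw new Error(`File exceeds the ${uploadFileMaxSize}MB upload limit`);
  }
}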
@@ -12,12 +12,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)

    const { teamId } = await authChatCert({ req, authToken: true });

    const data = await uploadMongoImg({
    const imgId = await uploadMongoImg({
      teamId,
      ...body
    });

    jsonRes(res, { data });
    jsonRes(res, { data: imgId });
  } catch (error) {
    jsonRes(res, {
      code: 500,
@@ -0,0 +1,112 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { readFileContent } from '@fastgpt/service/common/file/gridfs/controller';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { createOneCollection } from '@fastgpt/service/core/dataset/collection/controller';
import {
  DatasetCollectionTypeEnum,
  TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit';
import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';
import { startTrainingQueue } from '@/service/core/dataset/training/utils';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  const { datasetId, parentId, fileId } = req.body as FileIdCreateDatasetCollectionParams;
  const trainingType = TrainingModeEnum.chunk;

  try {
    await connectToDatabase();

    const { teamId, tmbId, dataset } = await authDataset({
      req,
      authToken: true,
      authApiKey: true,
      per: 'w',
      datasetId: datasetId
    });

    // 1. read file
    const { rawText, filename } = await readFileContent({
      teamId,
      bucketName: BucketNameEnum.dataset,
      fileId
    });
    // 2. split chunks
    const { chunks = [] } = parseCsvTable2Chunks(rawText);

    // 3. auth limit
    await checkDatasetLimit({
      teamId,
      insertLen: predictDataLimitLength(trainingType, chunks)
    });

    await mongoSessionRun(async (session) => {
      // 4. create collection
      const { _id: collectionId } = await createOneCollection({
        teamId,
        tmbId,
        name: filename,
        parentId,
        datasetId,
        type: DatasetCollectionTypeEnum.file,
        fileId,

        // special metadata
        trainingType,
        chunkSize: 0,

        session
      });

      // 5. create training bill
      const { billId } = await createTrainingUsage({
        teamId,
        tmbId,
        appName: filename,
        billSource: UsageSourceEnum.training,
        vectorModel: getVectorModel(dataset.vectorModel)?.name,
        agentModel: getLLMModel(dataset.agentModel)?.name,
        session
      });

      // 6. insert to training queue
      await pushDataListToTrainingQueue({
        teamId,
        tmbId,
        datasetId: dataset._id,
        collectionId,
        agentModel: dataset.agentModel,
        vectorModel: dataset.vectorModel,
        trainingMode: trainingType,
        billId,
        data: chunks.map((chunk, index) => ({
          q: chunk.q,
          a: chunk.a,
          chunkIndex: index
        })),
        session
      });

      return collectionId;
    });

    startTrainingQueue(true);

    jsonRes(res);
  } catch (error) {
    jsonRes(res, {
      code: 500,
      error
    });
  }
}
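The csvTable route above delegates parsing to parseCsvTable2Chunks; as a rough illustration only, a minimal sketch of turning a two-column q/a CSV into chunks — this is not FastGPT's actual implementation, and the naive split ignores quoted commas:

// Illustrative sketch only — assumes a header row plus two columns (q, a).
function parseCsvTable2ChunksSketch(rawText: string) {
  const lines = rawText.split('\n').filter((line) => line.trim() !== '');
  const chunks = lines.slice(1).map((line) => {
    const [q = '', a = ''] = line.split(',');
    return { q: q.trim(), a: a.trim() };
  });
  return { chunks };
}

// Example: parseCsvTable2ChunksSketch('q,a\nWhat is FastGPT?,An LLM app platform')
// -> { chunks: [{ q: 'What is FastGPT?', a: 'An LLM app platform' }] }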
@@ -1,94 +1,151 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { delFileByFileIdList, uploadFile } from '@fastgpt/service/common/file/gridfs/controller';
import { getUploadModel } from '@fastgpt/service/common/file/multer';
import {
  delFileByFileIdList,
  readFileContent
} from '@fastgpt/service/common/file/gridfs/controller';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
import { FileCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { removeFilesByPaths } from '@fastgpt/service/common/file/utils';
import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { createOneCollection } from '@fastgpt/service/core/dataset/collection/controller';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import {
  DatasetCollectionTypeEnum,
  TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';

/**
 * Creates the multer uploader
 */
const upload = getUploadModel({
  maxSize: 500 * 1024 * 1024
});
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { MongoImage } from '@fastgpt/service/common/file/image/schema';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit';
import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { startTrainingQueue } from '@/service/core/dataset/training/utils';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  let filePaths: string[] = [];
  let fileId: string = '';
  const { datasetId } = req.query as { datasetId: string };
  const {
    fileId,
    trainingType = TrainingModeEnum.chunk,
    chunkSize = 512,
    chunkSplitter,
    qaPrompt,
    ...body
  } = req.body as FileIdCreateDatasetCollectionParams;

  try {
    await connectToDatabase();

    const { teamId, tmbId } = await authDataset({
    const { teamId, tmbId, dataset } = await authDataset({
      req,
      authToken: true,
      authApiKey: true,
      per: 'w',
      datasetId
      datasetId: body.datasetId
    });

    const { file, bucketName, data } = await upload.doUpload<FileCreateDatasetCollectionParams>(
      req,
      res
    );
    filePaths = [file.path];

    if (!file || !bucketName) {
      throw new Error('file is empty');
    }

    const { fileMetadata, collectionMetadata, ...collectionData } = data;

    // upload file and create collection
    fileId = await uploadFile({
    // 1. read file
    const { rawText, filename } = await readFileContent({
      teamId,
      tmbId,
      bucketName,
      path: file.path,
      filename: file.originalname,
      contentType: file.mimetype,
      metadata: fileMetadata
    });

    // create collection
    const { _id: collectionId } = await createOneCollection({
      ...collectionData,
      metadata: collectionMetadata,
      teamId,
      tmbId,
      type: DatasetCollectionTypeEnum.file,
      bucketName: BucketNameEnum.dataset,
      fileId
    });

    jsonRes(res, {
      data: collectionId
    // 2. split chunks
    const { chunks } = splitText2Chunks({
      text: rawText,
      chunkLen: chunkSize,
      overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
      customReg: chunkSplitter ? [chunkSplitter] : []
    });

    // 3. auth limit
    await checkDatasetLimit({
      teamId,
      insertLen: predictDataLimitLength(trainingType, chunks)
    });

    await mongoSessionRun(async (session) => {
      // 4. create collection
      const { _id: collectionId } = await createOneCollection({
        ...body,
        teamId,
        tmbId,
        type: DatasetCollectionTypeEnum.file,
        name: filename,
        fileId,
        metadata: {
          relatedImgId: fileId
        },

        // special metadata
        trainingType,
        chunkSize,
        chunkSplitter,
        qaPrompt,

        hashRawText: hashStr(rawText),
        rawTextLength: rawText.length,
        session
      });

      // 5. create training bill
      const { billId } = await createTrainingUsage({
        teamId,
        tmbId,
        appName: filename,
        billSource: UsageSourceEnum.training,
        vectorModel: getVectorModel(dataset.vectorModel)?.name,
        agentModel: getLLMModel(dataset.agentModel)?.name,
        session
      });

      // 6. insert to training queue
      await pushDataListToTrainingQueue({
        teamId,
        tmbId,
        datasetId: dataset._id,
        collectionId,
        agentModel: dataset.agentModel,
        vectorModel: dataset.vectorModel,
        trainingMode: trainingType,
        prompt: qaPrompt,
        billId,
        data: chunks.map((text, index) => ({
          q: text,
          chunkIndex: index
        })),
        session
      });

      // 7. remove related image ttl
      await MongoImage.updateMany(
        {
          teamId,
          'metadata.relatedId': fileId
        },
        {
          // Remove expiredTime to avoid ttl expiration
          $unset: {
            expiredTime: 1
          }
        },
        {
          session
        }
      );

      return collectionId;
    });

    startTrainingQueue(true);

    jsonRes(res);
  } catch (error) {
    if (fileId) {
      try {
        await delFileByFileIdList({
          fileIdList: [fileId],
          bucketName: BucketNameEnum.dataset
        });
      } catch (error) {}
    }
    jsonRes(res, {
      code: 500,
      error
    });
  }

  removeFilesByPaths(filePaths);
}

export const config = {
  api: {
    bodyParser: false
  }
};
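For reference, a hedged example of a request body matching the destructuring in the fileId handler above; the field names come from that destructuring, while the example ids, the enum string value, and the route path are assumptions:

// Hypothetical request body for the collection-from-fileId handler.
// Defaults shown are the handler's own (trainingType = chunk, chunkSize = 512).
const exampleFileIdCreateBody = {
  datasetId: '65f0c0ffee0000000000abcd', // assumed example id
  fileId: '65f0c0ffee0000000000ef01', // GridFS file id returned by the upload API
  trainingType: 'chunk', // assumed string value of TrainingModeEnum.chunk
  chunkSize: 512,
  chunkSplitter: '',
  qaPrompt: ''
};
// e.g. POST /api/core/dataset/collection/create/fileId (route path is an assumption)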
@@ -19,6 +19,7 @@ import { hashStr } from '@fastgpt/global/common/string/tools';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  try {
@@ -55,9 +56,9 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
      insertLen: predictDataLimitLength(trainingType, chunks)
    });

    // 3. create collection and training bill
    const [{ _id: collectionId }, { billId }] = await Promise.all([
      createOneCollection({
    const createResult = await mongoSessionRun(async (session) => {
      // 3. create collection
      const { _id: collectionId } = await createOneCollection({
        ...body,
        teamId,
        tmbId,
@@ -70,34 +71,44 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
        qaPrompt,

        hashRawText: hashStr(text),
        rawTextLength: text.length
      }),
      createTrainingUsage({
        rawTextLength: text.length,
        session
      });

      // 4. create training bill
      const { billId } = await createTrainingUsage({
        teamId,
        tmbId,
        appName: name,
        billSource: UsageSourceEnum.training,
        vectorModel: getVectorModel(dataset.vectorModel)?.name,
        agentModel: getLLMModel(dataset.agentModel)?.name
      })
    ]);
        agentModel: getLLMModel(dataset.agentModel)?.name,
        session
      });

    // 4. push chunks to training queue
    const insertResults = await pushDataListToTrainingQueue({
      teamId,
      tmbId,
      collectionId,
      trainingMode: trainingType,
      prompt: qaPrompt,
      billId,
      data: chunks.map((text, index) => ({
        q: text,
        chunkIndex: index
      }))
      // 5. push chunks to training queue
      const insertResults = await pushDataListToTrainingQueue({
        teamId,
        tmbId,
        datasetId: dataset._id,
        collectionId,
        agentModel: dataset.agentModel,
        vectorModel: dataset.vectorModel,
        trainingMode: trainingType,
        prompt: qaPrompt,
        billId,
        data: chunks.map((text, index) => ({
          q: text,
          chunkIndex: index
        })),
        session
      });

      return { collectionId, results: insertResults };
    });

    jsonRes(res, {
      data: { collectionId, results: insertResults }
      data: createResult
    });
  } catch (err) {
    jsonRes(res, {
@@ -15,7 +15,8 @@ import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/train
export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  try {
    await connectToDatabase();
    const { collectionId, data } = req.body as PushDatasetDataProps;
    const body = req.body as PushDatasetDataProps;
    const { collectionId, data } = body;

    if (!collectionId || !Array.isArray(data)) {
      throw new Error('collectionId or data is empty');
@@ -42,9 +43,12 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex

    jsonRes<PushDatasetDataResponse>(res, {
      data: await pushDataListToTrainingQueue({
        ...req.body,
        ...body,
        teamId,
        tmbId
        tmbId,
        datasetId: collection.datasetId._id,
        agentModel: collection.datasetId.agentModel,
        vectorModel: collection.datasetId.vectorModel
      })
    });
  } catch (err) {
@@ -0,0 +1,80 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { authFile } from '@fastgpt/service/support/permission/auth/file';
import { PostPreviewFilesChunksProps } from '@/global/core/dataset/api';
import { readFileContent } from '@fastgpt/service/common/file/gridfs/controller';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  try {
    await connectToDatabase();

    const { type, sourceId, chunkSize, customSplitChar, overlapRatio } =
      req.body as PostPreviewFilesChunksProps;

    if (!sourceId) {
      throw new Error('fileIdList is empty');
    }
    if (chunkSize > 30000) {
      throw new Error('chunkSize is too large, should be less than 30000');
    }

    const { chunks } = await (async () => {
      if (type === ImportDataSourceEnum.fileLocal) {
        const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
        const fileId = String(file._id);

        const { rawText } = await readFileContent({
          teamId,
          bucketName: BucketNameEnum.dataset,
          fileId,
          csvFormat: true
        });
        // split chunks (5 chunk)
        const sliceRawText = 10 * chunkSize;
        const { chunks } = splitText2Chunks({
          text: rawText.slice(0, sliceRawText),
          chunkLen: chunkSize,
          overlapRatio,
          customReg: customSplitChar ? [customSplitChar] : []
        });

        return {
          chunks: chunks.map((item) => ({
            q: item,
            a: ''
          }))
        };
      }
      if (type === ImportDataSourceEnum.csvTable) {
        const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
        const fileId = String(file._id);
        const { rawText } = await readFileContent({
          teamId,
          bucketName: BucketNameEnum.dataset,
          fileId,
          csvFormat: false
        });
        const { chunks } = parseCsvTable2Chunks(rawText);

        return {
          chunks: chunks || []
        };
      }
      return { chunks: [] };
    })();

    jsonRes<{ q: string; a: string }[]>(res, {
      data: chunks.slice(0, 5)
    });
  } catch (error) {
    jsonRes(res, {
      code: 500,
      error
    });
  }
}
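A hedged example request body for the previewChunks handler above; the field names come from the PostPreviewFilesChunksProps destructuring, while the enum string values, the example id, and the route path are assumptions:

// Hypothetical request body; the handler returns at most the first 5 chunks as [{ q, a }, ...].
const previewChunksBody = {
  type: 'fileLocal', // or 'csvTable'; assumed string values of ImportDataSourceEnum
  sourceId: '65f0c0ffee0000000000ef01', // the uploaded GridFS file id
  chunkSize: 512,
  customSplitChar: '',
  overlapRatio: 0.2
};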