Mirror of https://github.com/labring/FastGPT.git, synced 2025-07-21 03:35:36 +00:00
feat: text collection auto save for a txt file (#4924)
@@ -40,5 +40,6 @@ export function getSourceNameIcon({
 export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
   if (mode === TrainingModeEnum.qa) return data.length * 20;
   if (mode === TrainingModeEnum.auto) return data.length * 5;
+  if (mode === TrainingModeEnum.image) return data.length * 2;
   return data.length;
 };
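The multipliers appear to be rough upper-bound estimates of how many training rows each source chunk can expand into, checked against quota limits before insertion. A minimal sketch of the prediction in practice (assuming `predictDataLimitLength` is imported from the module above):

```ts
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
// predictDataLimitLength as defined in the hunk above

const batch = new Array(100).fill({ q: 'some chunk text' });

console.log(predictDataLimitLength(TrainingModeEnum.qa, batch)); // 2000 (20 per chunk)
console.log(predictDataLimitLength(TrainingModeEnum.auto, batch)); // 500 (5 per chunk)
console.log(predictDataLimitLength(TrainingModeEnum.image, batch)); // 200 (2 per chunk, new in this commit)
```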
@@ -223,7 +223,7 @@ export const readFileContentFromMongo = async ({
   rawText: string;
   filename: string;
 }> => {
-  const bufferId = `${fileId}-${customPdfParse}`;
+  const bufferId = `${String(fileId)}-${customPdfParse}`;
   // read buffer
   const fileBuffer = await getRawTextBuffer(bufferId);
   if (fileBuffer) {
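`fileId` here is typically a Mongo ObjectId rather than a plain string; a template literal already calls its `toString()`, so the explicit `String()` mostly makes the conversion visible to readers and type tooling. A standalone sketch (using mongodb's `ObjectId` directly is our assumption about the caller):

```ts
import { ObjectId } from 'mongodb';

const fileId = new ObjectId();
const customPdfParse = true;

// Template literals stringify the ObjectId either way; the explicit
// String() just makes the conversion obvious at the call site.
const implicit = `${fileId}-${customPdfParse}`;
const explicit = `${String(fileId)}-${customPdfParse}`;
console.log(implicit === explicit); // true
```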
@@ -1,5 +1,57 @@
 import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
 import { PassThrough } from 'stream';
 import { getGridBucket } from './controller';
+import { type BucketNameEnum } from '@fastgpt/global/common/file/constants';
+import { retryFn } from '@fastgpt/global/common/system/utils';
+
+export const createFileFromText = async ({
+  bucket,
+  filename,
+  text,
+  metadata
+}: {
+  bucket: `${BucketNameEnum}`;
+  filename: string;
+  text: string;
+  metadata: Record<string, any>;
+}) => {
+  const gridBucket = getGridBucket(bucket);
+
+  const buffer = Buffer.from(text);
+
+  const fileSize = buffer.length;
+  // Chunk size: as large as possible, but at most 14MB and at least 128KB
+  const chunkSizeBytes = (() => {
+    // Ideal chunk size: file size ÷ target chunk count (10); each chunk must stay under 14MB
+    const idealChunkSize = Math.min(Math.ceil(fileSize / 10), 14 * 1024 * 1024);
+
+    // Ensure the chunk size is at least 128KB
+    const minChunkSize = 128 * 1024; // 128KB
+
+    // Take the larger of the ideal and minimum chunk sizes
+    let chunkSize = Math.max(idealChunkSize, minChunkSize);
+
+    // Round the chunk size up to the nearest multiple of 64KB to keep it tidy
+    chunkSize = Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024);
+
+    return chunkSize;
+  })();
+
+  const uploadStream = gridBucket.openUploadStream(filename, {
+    metadata,
+    chunkSizeBytes
+  });
+
+  return retryFn(async () => {
+    return new Promise<{ fileId: string }>((resolve, reject) => {
+      uploadStream.end(buffer);
+      uploadStream.on('finish', () => {
+        resolve({ fileId: String(uploadStream.id) });
+      });
+      uploadStream.on('error', reject);
+    });
+  });
+};
+
 export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => {
   return new Promise<Buffer>((resolve, reject) => {
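GridFS stores each file as fixed-size chunk documents in a `.chunks` collection, so the chunk size controls how many documents a file occupies. To make the heuristic above concrete, here is a standalone sketch of the same arithmetic (the `pickChunkSize` name is ours, not the commit's):

```ts
// Same arithmetic as the IIFE above, extracted for illustration.
const pickChunkSize = (fileSize: number): number => {
  // ideal: ~10 chunks per file, capped at 14MB per chunk
  const idealChunkSize = Math.min(Math.ceil(fileSize / 10), 14 * 1024 * 1024);
  const minChunkSize = 128 * 1024; // 128KB floor
  const chunkSize = Math.max(idealChunkSize, minChunkSize);
  // round up to a 64KB multiple
  return Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024);
};

console.log(pickChunkSize(50 * 1024)); // 131072: small files hit the 128KB floor
console.log(pickChunkSize(5 * 1024 * 1024)); // 524288: ~10 chunks of 512KB
console.log(pickChunkSize(500 * 1024 * 1024)); // 14680064: large files hit the 14MB cap
```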
@@ -49,7 +49,7 @@ const CustomTextInput = () => {
       createStatus: 'waiting',
       rawText: data.value,
       sourceName: data.name,
-      icon: 'file/fill/manual'
+      icon: 'file/fill/txt'
     }
   ]);
   goToNext();
@@ -6,6 +6,7 @@ import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
 import { NextAPI } from '@/service/middleware/entry';
 import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
 import { type CreateCollectionResponse } from '@/global/core/dataset/api';
+import { createFileFromText } from '@fastgpt/service/common/file/gridfs/utils';
 
 async function handler(req: NextApiRequest): CreateCollectionResponse {
   const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams;
@@ -18,6 +19,18 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
     per: WritePermissionVal
   });
 
+  // 1. Create file from text
+  const filename = `${name}.txt`;
+  const { fileId } = await createFileFromText({
+    bucket: 'dataset',
+    filename,
+    text,
+    metadata: {
+      teamId,
+      uid: tmbId
+    }
+  });
+
   const { collectionId, insertResults } = await createCollectionAndInsertData({
     dataset,
     rawText: text,
@@ -25,9 +38,9 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
       ...body,
       teamId,
       tmbId,
-      type: DatasetCollectionTypeEnum.virtual,
-
-      name
+      type: DatasetCollectionTypeEnum.file,
+      fileId,
+      name: filename
     }
   });
 
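With the collection now file-backed (`type: file` plus a real `fileId`) instead of `virtual`, the pasted text can be read back out of GridFS like any uploaded file. A hedged round-trip sketch; the `getGridBucket` import path is inferred from the relative import in the gridfs utils and the verification helper is ours, not part of the commit:

```ts
import { ObjectId } from 'mongodb';
// Assumed path, based on `./controller` relative to gridfs/utils:
import { getGridBucket } from '@fastgpt/service/common/file/gridfs/controller';

const verifyStoredText = async (fileId: string, expected: string) => {
  const gridBucket = getGridBucket('dataset');
  const downloadStream = gridBucket.openDownloadStream(new ObjectId(fileId));

  // Collect the stream's chunks and compare the decoded text.
  const chunks: Buffer[] = [];
  for await (const chunk of downloadStream) {
    chunks.push(chunk as Buffer);
  }
  return Buffer.concat(chunks).toString('utf-8') === expected;
};

// e.g. after the handler above:
// await verifyStoredText(fileId, text); // expect true
```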
@@ -2,7 +2,7 @@ import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
 import { pushQAUsage } from '@/service/support/wallet/usage/push';
 import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { createChatCompletion } from '@fastgpt/service/core/ai/config';
-import type { ChatCompletionMessageParam, StreamChatType } from '@fastgpt/global/core/ai/type.d';
+import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d';
 import { addLog } from '@fastgpt/service/common/system/log';
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { replaceVariable } from '@fastgpt/global/common/string/tools';
@@ -1,6 +1,6 @@
 import { TeamErrEnum } from '@fastgpt/global/common/error/code/team';
 import { checkTeamAIPoints } from '@fastgpt/service/support/permission/teamLimit';
-import { sendOneInform } from '../support/user/inform/api';
+import { sendOneInform } from '../../../support/user/inform/api';
 import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller';
 import { InformLevelEnum } from '@fastgpt/global/support/user/inform/constants';
 
@@ -18,7 +18,7 @@ export const checkTeamAiPointsAndLock = async (teamId: string) => {
       templateParam: {},
       teamId
     });
-    console.log('Insufficient balance, pausing [vector] generation task');
+    console.log('Insufficient balance, pausing training generation task');
     await lockTrainingDataByTeamId(teamId);
   } catch (error) {}
 }
@@ -1,5 +1,5 @@
-import { generateQA } from '@/service/events/generateQA';
-import { generateVector } from '@/service/events/generateVector';
+import { generateQA } from '@/service/core/dataset/queues/generateQA';
+import { generateVector } from '@/service/core/dataset/queues/generateVector';
 import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { type DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
 import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';