External dataset (#1497)

* perf: read rawText and chunk code

* perf: read raw text

* perf: read rawtext

* perf: token count

* log
Archer
2024-05-16 11:47:53 +08:00
committed by GitHub
parent d5073f98ab
commit c6d9b15897
36 changed files with 531 additions and 267 deletions

View File

@@ -1,41 +1,50 @@
 /*
   Read db file content and response 3000 words
 */
-import type { NextApiRequest, NextApiResponse } from 'next';
+import type { NextApiResponse } from 'next';
 import { jsonRes } from '@fastgpt/service/common/response';
-import { connectToDatabase } from '@/service/mongo';
-import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/controller';
 import { authFile } from '@fastgpt/service/support/permission/auth/file';
-import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
+import { NextAPI } from '@/service/middle/entry';
+import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import { readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
+import { ApiRequestProps } from '@fastgpt/service/type/next';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
-export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
-  try {
-    await connectToDatabase();
-    const { fileId, csvFormat } = req.body as { fileId: string; csvFormat?: boolean };
+export type PreviewContextProps = {
+  type: DatasetSourceReadTypeEnum;
+  sourceId: string;
+  isQAImport?: boolean;
+  selector?: string;
+};
-    if (!fileId) {
-      throw new Error('fileId is empty');
-    }
+async function handler(req: ApiRequestProps<PreviewContextProps>, res: NextApiResponse<any>) {
+  const { type, sourceId, isQAImport, selector } = req.body;
-    const { teamId } = await authFile({ req, authToken: true, fileId });
-    const { rawText } = await readFileContentFromMongo({
-      teamId,
-      bucketName: BucketNameEnum.dataset,
-      fileId,
-      csvFormat
-    });
-    jsonRes(res, {
-      data: {
-        previewContent: rawText.slice(0, 3000),
-        totalLength: rawText.length
-      }
-    });
-  } catch (error) {
-    jsonRes(res, {
-      code: 500,
-      error
-    });
-  }
-}
+  if (!sourceId) {
+    throw new Error('fileId is empty');
+  }
+  const { teamId } = await (async () => {
+    if (type === DatasetSourceReadTypeEnum.fileLocal) {
+      return authFile({ req, authToken: true, authApiKey: true, fileId: sourceId });
+    }
+    return authCert({ req, authApiKey: true, authToken: true });
+  })();
+  const rawText = await readDatasetSourceRawText({
+    teamId,
+    type,
+    sourceId: sourceId,
+    isQAImport,
+    selector
+  });
+  jsonRes(res, {
+    data: {
+      previewContent: rawText.slice(0, 3000),
+      totalLength: rawText.length
+    }
+  });
+}
+export default NextAPI(handler);
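
For orientation, a minimal sketch of how a client might call the rewritten preview endpoint. The body shape mirrors PreviewContextProps and the previewContent/totalLength payload comes from the handler above; the route path, the enum's string values, and the data envelope added by jsonRes are assumptions, and auth headers are omitted.

// Hypothetical client call: the URL is a placeholder (file paths are not shown in this
// diff view) and the response is assumed to be wrapped in a `data` field by jsonRes.
type PreviewContentBody = {
  type: string; // a DatasetSourceReadTypeEnum value, e.g. 'fileLocal' (string value assumed)
  sourceId: string;
  isQAImport?: boolean;
  selector?: string;
};

async function fetchPreviewContent(body: PreviewContentBody) {
  const res = await fetch('/api/core/dataset/read/previewContent', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body)
  });
  const json = await res.json();
  return json.data as { previewContent: string; totalLength: number };
}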

View File

@@ -0,0 +1,41 @@
+import type { ApiRequestProps, ApiResponseType } from '@fastgpt/service/type/next';
+import { NextAPI } from '@/service/middle/entry';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
+import { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type';
+import { countGptMessagesTokens } from '@fastgpt/service/common/string/tiktoken';
+export type tokenQuery = {};
+export type tokenBody = {
+  messages: ChatCompletionMessageParam[];
+};
+export type tokenResponse = {};
+async function handler(
+  req: ApiRequestProps<tokenBody, tokenQuery>,
+  res: ApiResponseType<any>
+): Promise<tokenResponse> {
+  await authCert({ req, authRoot: true });
+  const start = Date.now();
+  const tokens = await countGptMessagesTokens(req.body.messages);
+  return {
+    tokens,
+    time: Date.now() - start,
+    memory: process.memoryUsage()
+  };
+}
+export default NextAPI(handler);
+export const config = {
+  api: {
+    bodyParser: {
+      sizeLimit: '20mb'
+    },
+    responseLimit: '20mb'
+  }
+};
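
Since tokenResponse is declared as an empty object, the shapes below spell out what the handler above actually accepts and returns. The message literals are only an example, and the result is presumably wrapped by the NextAPI response helper.

import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type';

// Request/response shapes implied by the handler above (not exported by the source).
type TokenCountRequest = { messages: ChatCompletionMessageParam[] };
type TokenCountResult = {
  tokens: number;             // countGptMessagesTokens result
  time: number;               // milliseconds spent counting
  memory: NodeJS.MemoryUsage; // process.memoryUsage() snapshot
};

// Example body; the 20mb bodyParser limit above exists so large message histories fit.
const exampleBody: TokenCountRequest = {
  messages: [
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'How many tokens is this conversation?' }
  ]
};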

View File

@@ -19,6 +19,7 @@ import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants'
 import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
 import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';
 import { startTrainingQueue } from '@/service/core/dataset/training/utils';
+import { rawText2Chunks } from '@fastgpt/service/core/dataset/read';
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   const { datasetId, parentId, fileId } = req.body as FileIdCreateDatasetCollectionParams;
@@ -39,10 +40,15 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
   const { rawText, filename } = await readFileContentFromMongo({
     teamId,
     bucketName: BucketNameEnum.dataset,
-    fileId
+    fileId,
+    isQAImport: true
   });
+  console.log(rawText);
   // 2. split chunks
-  const { chunks = [] } = parseCsvTable2Chunks(rawText);
+  const chunks = rawText2Chunks({
+    rawText,
+    isQAImport: true
+  });
 
   // 3. auth limit
   await checkDatasetLimit({
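
To make the isQAImport path above concrete: the CSV import appears to be parsed into question/answer pairs, so a file roughly like the sketch below would yield { q, a } chunk objects. The exact CSV template is an assumption; the q/a shape comes from the PreviewChunksResponse type added later in this commit.

// Assumed CSV layout for a QA import and the chunk shape it appears to produce.
const sampleCsv = [
  'q,a',
  '"What is FastGPT?","An open-source LLM knowledge base platform."',
  '"Where are chunks stored?","In the dataset training queue as q/a pairs."'
].join('\n');

type QAChunk = { q: string; a: string };
// What rawText2Chunks({ rawText: sampleCsv, isQAImport: true }) is expected to return:
const expectedChunks: QAChunk[] = [
  { q: 'What is FastGPT?', a: 'An open-source LLM knowledge base platform.' },
  { q: 'Where are chunks stored?', a: 'In the dataset training queue as q/a pairs.' }
];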

View File

@@ -22,6 +22,7 @@ import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { startTrainingQueue } from '@/service/core/dataset/training/utils';
 import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schema';
+import { rawText2Chunks } from '@fastgpt/service/core/dataset/read';
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   const {
@@ -51,8 +52,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     fileId
   });
   // 2. split chunks
-  const { chunks } = splitText2Chunks({
-    text: rawText,
+  const chunks = rawText2Chunks({
+    rawText,
     chunkLen: chunkSize,
     overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
     customReg: chunkSplitter ? [chunkSplitter] : []
@@ -110,8 +111,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
       trainingMode: trainingType,
       prompt: qaPrompt,
       billId,
-      data: chunks.map((text, index) => ({
-        q: text,
+      data: chunks.map((item, index) => ({
+        ...item,
         chunkIndex: index
       })),
       session
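
These hunks switch the text import from plain string chunks (splitText2Chunks) to the object chunks returned by rawText2Chunks, so each training row is now built by spreading the chunk and attaching its index. A small sketch, assuming the chunk objects carry q and a fields as the preview types in this commit suggest:

// Sketch of the new mapping; the { q, a } shape is an assumption drawn from
// PreviewChunksResponse elsewhere in this commit (a is presumably '' in plain chunk mode).
type Chunk = { q: string; a: string };

const buildTrainingRows = (chunks: Chunk[]) =>
  chunks.map((item, index) => ({
    ...item,           // keep q/a as produced by rawText2Chunks
    chunkIndex: index  // position of the chunk within the source text
  }));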

View File

@@ -1,79 +1,60 @@
-import type { NextApiRequest, NextApiResponse } from 'next';
-import { jsonRes } from '@fastgpt/service/common/response';
-import { connectToDatabase } from '@/service/mongo';
-import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
+import type { NextApiResponse } from 'next';
 import { authFile } from '@fastgpt/service/support/permission/auth/file';
-import { PostPreviewFilesChunksProps } from '@/global/core/dataset/api';
-import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/controller';
-import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
-import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
-import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';
+import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
+import { NextAPI } from '@/service/middle/entry';
+import { ApiRequestProps } from '@fastgpt/service/type/next';
-export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
-  try {
-    await connectToDatabase();
+export type PostPreviewFilesChunksProps = {
+  type: DatasetSourceReadTypeEnum;
+  sourceId: string;
+  chunkSize: number;
+  overlapRatio: number;
+  customSplitChar?: string;
+  selector?: string;
+  isQAImport?: boolean;
+};
+export type PreviewChunksResponse = {
+  q: string;
+  a: string;
+}[];
-    const { type, sourceId, chunkSize, customSplitChar, overlapRatio } =
-      req.body as PostPreviewFilesChunksProps;
+async function handler(
+  req: ApiRequestProps<PostPreviewFilesChunksProps>,
+  res: NextApiResponse<any>
+): Promise<PreviewChunksResponse> {
+  const { type, sourceId, chunkSize, customSplitChar, overlapRatio, selector, isQAImport } =
+    req.body;
-    if (!sourceId) {
-      throw new Error('fileIdList is empty');
-    }
-    if (chunkSize > 30000) {
-      throw new Error('chunkSize is too large, should be less than 30000');
-    }
-    const { chunks } = await (async () => {
-      if (type === ImportDataSourceEnum.fileLocal) {
-        const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
-        const fileId = String(file._id);
-        const { rawText } = await readFileContentFromMongo({
-          teamId,
-          bucketName: BucketNameEnum.dataset,
-          fileId,
-          csvFormat: true
-        });
-        // split chunks (5 chunk)
-        const { chunks } = splitText2Chunks({
-          text: rawText,
-          chunkLen: chunkSize,
-          overlapRatio,
-          customReg: customSplitChar ? [customSplitChar] : []
-        });
-        return {
-          chunks: chunks.map((item) => ({
-            q: item,
-            a: ''
-          }))
-        };
-      }
-      if (type === ImportDataSourceEnum.csvTable) {
-        const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
-        const fileId = String(file._id);
-        const { rawText } = await readFileContentFromMongo({
-          teamId,
-          bucketName: BucketNameEnum.dataset,
-          fileId,
-          csvFormat: false
-        });
-        const { chunks } = parseCsvTable2Chunks(rawText);
-        return {
-          chunks: chunks || []
-        };
-      }
-      return { chunks: [] };
-    })();
-    jsonRes<{ q: string; a: string }[]>(res, {
-      data: chunks.slice(0, 5)
-    });
-  } catch (error) {
-    jsonRes(res, {
-      code: 500,
-      error
-    });
-  }
-}
+  if (!sourceId) {
+    throw new Error('sourceId is empty');
+  }
+  if (chunkSize > 30000) {
+    throw new Error('chunkSize is too large, should be less than 30000');
+  }
+  const { teamId } = await (async () => {
+    if (type === DatasetSourceReadTypeEnum.fileLocal) {
+      return authFile({ req, authToken: true, authApiKey: true, fileId: sourceId });
+    }
+    return authCert({ req, authApiKey: true, authToken: true });
+  })();
+  const rawText = await readDatasetSourceRawText({
+    teamId,
+    type,
+    sourceId: sourceId,
+    selector,
+    isQAImport
+  });
+  return rawText2Chunks({
+    rawText,
+    chunkLen: chunkSize,
+    overlapRatio,
+    customReg: customSplitChar ? [customSplitChar] : [],
+    isQAImport: isQAImport
+  }).slice(0, 5);
+}
+export default NextAPI(handler);
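
Finally, a client-side sketch of the rewritten preview-chunks handler: the body mirrors PostPreviewFilesChunksProps and the response is at most five { q, a } chunks because of the .slice(0, 5). The URL, the enum's string value, and the data envelope are assumptions; auth headers are omitted.

// Hypothetical request; only the body fields and the response element shape come from
// the handler above, everything else (path, envelope, enum string) is assumed.
async function previewChunks(sourceId: string) {
  const res = await fetch('/api/core/dataset/collection/create/previewChunks', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      type: 'fileLocal', // assumed string value of DatasetSourceReadTypeEnum.fileLocal
      sourceId,
      chunkSize: 512,    // must stay below the 30000 guard in the handler
      overlapRatio: 0.2,
      isQAImport: false
    })
  });
  const json = await res.json();
  return json.data as { q: string; a: string }[];
}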