Mirror of https://github.com/labring/FastGPT.git
External dataset (#1497)
* perf: read rawText and chunk code
* perf: read raw text
* perf: read rawtext
* perf: token count
* log
@@ -1,41 +1,50 @@
 /*
   Read db file content and response 3000 words
 */
-import type { NextApiRequest, NextApiResponse } from 'next';
+import type { NextApiResponse } from 'next';
 import { jsonRes } from '@fastgpt/service/common/response';
-import { connectToDatabase } from '@/service/mongo';
-import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/controller';
 import { authFile } from '@fastgpt/service/support/permission/auth/file';
-import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
+import { NextAPI } from '@/service/middle/entry';
+import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import { readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
+import { ApiRequestProps } from '@fastgpt/service/type/next';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
 
-export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
-  try {
-    await connectToDatabase();
-    const { fileId, csvFormat } = req.body as { fileId: string; csvFormat?: boolean };
+export type PreviewContextProps = {
+  type: DatasetSourceReadTypeEnum;
+  sourceId: string;
+  isQAImport?: boolean;
+  selector?: string;
+};
 
-    if (!fileId) {
-      throw new Error('fileId is empty');
-    }
+async function handler(req: ApiRequestProps<PreviewContextProps>, res: NextApiResponse<any>) {
+  const { type, sourceId, isQAImport, selector } = req.body;
 
-    const { teamId } = await authFile({ req, authToken: true, fileId });
-
-    const { rawText } = await readFileContentFromMongo({
-      teamId,
-      bucketName: BucketNameEnum.dataset,
-      fileId,
-      csvFormat
-    });
-
-    jsonRes(res, {
-      data: {
-        previewContent: rawText.slice(0, 3000),
-        totalLength: rawText.length
-      }
-    });
-  } catch (error) {
-    jsonRes(res, {
-      code: 500,
-      error
-    });
-  }
-}
+  if (!sourceId) {
+    throw new Error('fileId is empty');
+  }
+
+  const { teamId } = await (async () => {
+    if (type === DatasetSourceReadTypeEnum.fileLocal) {
+      return authFile({ req, authToken: true, authApiKey: true, fileId: sourceId });
+    }
+    return authCert({ req, authApiKey: true, authToken: true });
+  })();
+
+  const rawText = await readDatasetSourceRawText({
+    teamId,
+    type,
+    sourceId: sourceId,
+    isQAImport,
+    selector
+  });
+
+  jsonRes(res, {
+    data: {
+      previewContent: rawText.slice(0, 3000),
+      totalLength: rawText.length
+    }
+  });
+}
+
+export default NextAPI(handler);
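For reference, a minimal sketch of how a client might call this rewritten preview endpoint. The route path, the 'fileLocal' enum spelling, and the role of selector are assumptions; the request body mirrors PreviewContextProps from the hunk above.

// Hypothetical client call; the route path is assumed, not taken from the diff.
async function previewRawText(sourceId: string) {
  const res = await fetch('/api/core/dataset/file/read', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      type: 'fileLocal', // one DatasetSourceReadTypeEnum value (spelling assumed)
      sourceId,
      isQAImport: false,
      selector: undefined // optional; presumably used for link/external sources
    })
  });
  const { data } = await res.json();
  // data.previewContent: first 3000 characters; data.totalLength: full raw-text length
  return data as { previewContent: string; totalLength: number };
}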
projects/app/src/pages/api/core/ai/token.ts (new file, 41 lines)
@@ -0,0 +1,41 @@
+import type { ApiRequestProps, ApiResponseType } from '@fastgpt/service/type/next';
+import { NextAPI } from '@/service/middle/entry';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
+import { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type';
+import { countGptMessagesTokens } from '@fastgpt/service/common/string/tiktoken';
+
+export type tokenQuery = {};
+
+export type tokenBody = {
+  messages: ChatCompletionMessageParam[];
+};
+
+export type tokenResponse = {};
+
+async function handler(
+  req: ApiRequestProps<tokenBody, tokenQuery>,
+  res: ApiResponseType<any>
+): Promise<tokenResponse> {
+  await authCert({ req, authRoot: true });
+  const start = Date.now();
+
+  const tokens = await countGptMessagesTokens(req.body.messages);
+
+  return {
+    tokens,
+    time: Date.now() - start,
+
+    memory: process.memoryUsage()
+  };
+}
+
+export default NextAPI(handler);
+
+export const config = {
+  api: {
+    bodyParser: {
+      sizeLimit: '20mb'
+    },
+    responseLimit: '20mb'
+  }
+};
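A short usage sketch for the new token-counting endpoint. The route follows from the file path above; the rootkey header used to satisfy authRoot is an assumption and may differ per deployment.

// Count tokens for a chat message list via POST /api/core/ai/token (sketch).
import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type';

async function countTokens(messages: ChatCompletionMessageParam[]) {
  const res = await fetch('http://localhost:3000/api/core/ai/token', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      rootkey: process.env.ROOT_KEY ?? '' // assumed header for authRoot; adjust to your setup
    },
    body: JSON.stringify({ messages })
  });
  // The payload mirrors the handler's return value: tokens, time (ms) and memory usage.
  return res.json();
}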
@@ -19,6 +19,7 @@ import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants'
 import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
 import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';
 import { startTrainingQueue } from '@/service/core/dataset/training/utils';
+import { rawText2Chunks } from '@fastgpt/service/core/dataset/read';
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   const { datasetId, parentId, fileId } = req.body as FileIdCreateDatasetCollectionParams;
@@ -39,10 +40,15 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
   const { rawText, filename } = await readFileContentFromMongo({
     teamId,
     bucketName: BucketNameEnum.dataset,
-    fileId
+    fileId,
+    isQAImport: true
   });
+  console.log(rawText);
   // 2. split chunks
-  const { chunks = [] } = parseCsvTable2Chunks(rawText);
+  const chunks = rawText2Chunks({
+    rawText,
+    isQAImport: true
+  });
 
   // 3. auth limit
   await checkDatasetLimit({
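This hunk and the next file both replace direct splitText2Chunks / parseCsvTable2Chunks calls with rawText2Chunks. A minimal sketch of what such a wrapper might look like, assuming it only dispatches on isQAImport; the real implementation in @fastgpt/service/core/dataset/read is not part of this diff, and the default values below are illustrative.

import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';

// Hedged sketch: unify plain-text chunking and QA (CSV) imports behind one call.
export function rawText2Chunks({
  rawText,
  isQAImport,
  chunkLen = 512, // illustrative default
  overlapRatio = 0.2, // illustrative default
  customReg = []
}: {
  rawText: string;
  isQAImport?: boolean;
  chunkLen?: number;
  overlapRatio?: number;
  customReg?: string[];
}): { q: string; a: string }[] {
  if (isQAImport) {
    // QA import: the raw text is a csv-style q/a table.
    const { chunks } = parseCsvTable2Chunks(rawText);
    return chunks || [];
  }
  // Plain chunking: split the text and leave the answer field empty.
  const { chunks } = splitText2Chunks({
    text: rawText,
    chunkLen,
    overlapRatio,
    customReg
  });
  return chunks.map((item) => ({ q: item, a: '' }));
}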
@@ -22,6 +22,7 @@ import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { startTrainingQueue } from '@/service/core/dataset/training/utils';
 import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schema';
+import { rawText2Chunks } from '@fastgpt/service/core/dataset/read';
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   const {
@@ -51,8 +52,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     fileId
   });
   // 2. split chunks
-  const { chunks } = splitText2Chunks({
-    text: rawText,
+  const chunks = rawText2Chunks({
+    rawText,
     chunkLen: chunkSize,
     overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
     customReg: chunkSplitter ? [chunkSplitter] : []
@@ -110,8 +111,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
       trainingMode: trainingType,
       prompt: qaPrompt,
       billId,
-      data: chunks.map((text, index) => ({
-        q: text,
+      data: chunks.map((item, index) => ({
+        ...item,
         chunkIndex: index
       })),
       session
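Since rawText2Chunks yields { q, a } objects, each training record pushed in the last hunk ends up roughly in this shape; the field names come from the diff, the sample values are illustrative.

// Illustrative shape of one training item after the `...item` spread (values invented).
type TrainingChunkItem = {
  q: string; // chunk text, or the question column for QA imports
  a: string; // empty for plain chunking, the answer column for QA imports
  chunkIndex: number;
};

const exampleItem: TrainingChunkItem = {
  q: 'FastGPT can now read external dataset sources.',
  a: '',
  chunkIndex: 0
};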
@@ -1,79 +1,60 @@
-import type { NextApiRequest, NextApiResponse } from 'next';
-import { jsonRes } from '@fastgpt/service/common/response';
-import { connectToDatabase } from '@/service/mongo';
-import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
+import type { NextApiResponse } from 'next';
 import { authFile } from '@fastgpt/service/support/permission/auth/file';
-import { PostPreviewFilesChunksProps } from '@/global/core/dataset/api';
-import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/controller';
-import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
-import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
-import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';
+import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
+import { NextAPI } from '@/service/middle/entry';
+import { ApiRequestProps } from '@fastgpt/service/type/next';
 
-export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
-  try {
-    await connectToDatabase();
+export type PostPreviewFilesChunksProps = {
+  type: DatasetSourceReadTypeEnum;
+  sourceId: string;
+  chunkSize: number;
+  overlapRatio: number;
+  customSplitChar?: string;
+  selector?: string;
+  isQAImport?: boolean;
+};
+export type PreviewChunksResponse = {
+  q: string;
+  a: string;
+}[];
 
-    const { type, sourceId, chunkSize, customSplitChar, overlapRatio } =
-      req.body as PostPreviewFilesChunksProps;
+async function handler(
+  req: ApiRequestProps<PostPreviewFilesChunksProps>,
+  res: NextApiResponse<any>
+): Promise<PreviewChunksResponse> {
+  const { type, sourceId, chunkSize, customSplitChar, overlapRatio, selector, isQAImport } =
+    req.body;
 
-    if (!sourceId) {
-      throw new Error('fileIdList is empty');
-    }
-    if (chunkSize > 30000) {
-      throw new Error('chunkSize is too large, should be less than 30000');
-    }
-
-    const { chunks } = await (async () => {
-      if (type === ImportDataSourceEnum.fileLocal) {
-        const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
-        const fileId = String(file._id);
-
-        const { rawText } = await readFileContentFromMongo({
-          teamId,
-          bucketName: BucketNameEnum.dataset,
-          fileId,
-          csvFormat: true
-        });
-        // split chunks (5 chunk)
-        const { chunks } = splitText2Chunks({
-          text: rawText,
-          chunkLen: chunkSize,
-          overlapRatio,
-          customReg: customSplitChar ? [customSplitChar] : []
-        });
-
-        return {
-          chunks: chunks.map((item) => ({
-            q: item,
-            a: ''
-          }))
-        };
-      }
-      if (type === ImportDataSourceEnum.csvTable) {
-        const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
-        const fileId = String(file._id);
-        const { rawText } = await readFileContentFromMongo({
-          teamId,
-          bucketName: BucketNameEnum.dataset,
-          fileId,
-          csvFormat: false
-        });
-        const { chunks } = parseCsvTable2Chunks(rawText);
-
-        return {
-          chunks: chunks || []
-        };
-      }
-      return { chunks: [] };
-    })();
-
-    jsonRes<{ q: string; a: string }[]>(res, {
-      data: chunks.slice(0, 5)
-    });
-  } catch (error) {
-    jsonRes(res, {
-      code: 500,
-      error
-    });
-  }
-}
+  if (!sourceId) {
+    throw new Error('sourceId is empty');
+  }
+  if (chunkSize > 30000) {
+    throw new Error('chunkSize is too large, should be less than 30000');
+  }
+
+  const { teamId } = await (async () => {
+    if (type === DatasetSourceReadTypeEnum.fileLocal) {
+      return authFile({ req, authToken: true, authApiKey: true, fileId: sourceId });
+    }
+    return authCert({ req, authApiKey: true, authToken: true });
+  })();
+
+  const rawText = await readDatasetSourceRawText({
+    teamId,
+    type,
+    sourceId: sourceId,
+    selector,
+    isQAImport
+  });
+
+  return rawText2Chunks({
+    rawText,
+    chunkLen: chunkSize,
+    overlapRatio,
+    customReg: customSplitChar ? [customSplitChar] : [],
+    isQAImport: isQAImport
+  }).slice(0, 5);
+}
+export default NextAPI(handler);
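A client-side sketch for the reworked chunk-preview endpoint. The route path and example values are assumptions, and it is assumed NextAPI wraps the handler's return in a { data } envelope; the request body and the { q, a }[] result follow PostPreviewFilesChunksProps and PreviewChunksResponse above.

// Hypothetical call; route path assumed, request/response types taken from the diff.
async function previewChunks(sourceId: string) {
  const res = await fetch('/api/core/dataset/collection/create/previewChunks', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      type: 'fileLocal', // a DatasetSourceReadTypeEnum value (spelling assumed)
      sourceId,
      chunkSize: 512,
      overlapRatio: 0.2,
      isQAImport: false
    })
  });
  const { data } = await res.json(); // assuming a { data } envelope from NextAPI
  return data as { q: string; a: string }[]; // at most 5 preview chunks
}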