mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-21 03:35:36 +00:00
perf: password special chars;feat: llm paragraph;perf: chunk setting params;perf: text splitter worker (#4984)
* perf: password special chars * feat: llm paragraph;perf: chunk setting params * perf: text splitter worker * perf: get rawtext buffer * fix: test * fix: test * doc * min chunk size
This commit is contained in:
22
docSite/content/zh-cn/docs/development/upgrading/4912.md
Normal file
22
docSite/content/zh-cn/docs/development/upgrading/4912.md
Normal file
@@ -0,0 +1,22 @@
|
||||
---
|
||||
title: 'V4.9.11(进行中)'
|
||||
description: 'FastGPT V4.9.12 更新说明'
|
||||
icon: 'upgrade'
|
||||
draft: false
|
||||
toc: true
|
||||
weight: 789
|
||||
---
|
||||
|
||||
## 🚀 新增内容
|
||||
|
||||
1. 商业版支持知识库分块时,LLM 进行自动分段识别。
|
||||
|
||||
## ⚙️ 优化
|
||||
|
||||
1. 密码校验时,增加更多的特殊字符
|
||||
2. 后端全量计算知识库 chunk 参数,避免自动模式下部分参数未正确使用默认值。
|
||||
3. 将文本分块移至 worker 线程,避免阻塞。
|
||||
|
||||
## 🐛 修复
|
||||
|
||||
1. 自定义问答提取提示词被覆盖。
|
@@ -3,9 +3,9 @@ export const checkPasswordRule = (password: string) => {
|
||||
/\d/, // Contains digits
|
||||
/[a-z]/, // Contains lowercase letters
|
||||
/[A-Z]/, // Contains uppercase letters
|
||||
/[!@#$%^&*()_+=-]/ // Contains special characters
|
||||
/[!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]/ // Contains special characters
|
||||
];
|
||||
const validChars = /^[\dA-Za-z!@#$%^&*()_+=-]{8,100}$/;
|
||||
const validChars = /^[\dA-Za-z!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]{8,100}$/;
|
||||
|
||||
// Check length and valid characters
|
||||
if (!validChars.test(password)) return false;
|
||||
|
@@ -1,10 +1,11 @@
|
||||
import { defaultMaxChunkSize } from '../../core/dataset/training/utils';
|
||||
import { getErrText } from '../error/utils';
|
||||
import { simpleText } from './tools';
|
||||
import { getTextValidLength } from './utils';
|
||||
|
||||
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
|
||||
|
||||
type SplitProps = {
|
||||
export type SplitProps = {
|
||||
text: string;
|
||||
chunkSize: number;
|
||||
|
||||
@@ -19,7 +20,7 @@ export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkSize'> & {
|
||||
chunkSize?: number;
|
||||
};
|
||||
|
||||
type SplitResponse = {
|
||||
export type SplitResponse = {
|
||||
chunks: string[];
|
||||
chars: number;
|
||||
};
|
||||
@@ -474,7 +475,10 @@ export const splitText2Chunks = (props: SplitProps): SplitResponse => {
|
||||
});
|
||||
|
||||
return {
|
||||
chunks: splitResult.map((item) => item.chunks).flat(),
|
||||
chunks: splitResult
|
||||
.map((item) => item.chunks)
|
||||
.flat()
|
||||
.map((chunk) => simpleText(chunk)),
|
||||
chars: splitResult.reduce((sum, item) => sum + item.chars, 0)
|
||||
};
|
||||
};
|
||||
|
@@ -7,3 +7,4 @@ export const DEFAULT_ORG_AVATAR = '/imgs/avatar/defaultOrgAvatar.svg';
|
||||
export const DEFAULT_USER_AVATAR = '/imgs/avatar/BlueAvatar.svg';
|
||||
|
||||
export const isProduction = process.env.NODE_ENV === 'production';
|
||||
export const isTestEnv = process.env.NODE_ENV === 'test';
|
||||
|
@@ -211,7 +211,8 @@ export enum DataChunkSplitModeEnum {
|
||||
}
|
||||
export enum ParagraphChunkAIModeEnum {
|
||||
auto = 'auto',
|
||||
force = 'force'
|
||||
force = 'force',
|
||||
forbid = 'forbid'
|
||||
}
|
||||
|
||||
/* ------------ data -------------- */
|
||||
|
@@ -3,8 +3,11 @@ import { type EmbeddingModelItemType, type LLMModelItemType } from '../../../cor
|
||||
import {
|
||||
ChunkSettingModeEnum,
|
||||
DataChunkSplitModeEnum,
|
||||
DatasetCollectionDataProcessModeEnum
|
||||
DatasetCollectionDataProcessModeEnum,
|
||||
ParagraphChunkAIModeEnum
|
||||
} from '../constants';
|
||||
import type { ChunkSettingsType } from '../type';
|
||||
import { cloneDeep } from 'lodash';
|
||||
|
||||
export const minChunkSize = 64; // min index and chunk size
|
||||
|
||||
@@ -103,53 +106,78 @@ export const getIndexSizeSelectList = (max = 512) => {
|
||||
};
|
||||
|
||||
// Compute
|
||||
export const computeChunkSize = (params: {
|
||||
trainingType: DatasetCollectionDataProcessModeEnum;
|
||||
chunkSettingMode?: ChunkSettingModeEnum;
|
||||
chunkSplitMode?: DataChunkSplitModeEnum;
|
||||
export const computedCollectionChunkSettings = <T extends ChunkSettingsType>({
|
||||
llmModel,
|
||||
vectorModel,
|
||||
...data
|
||||
}: {
|
||||
llmModel?: LLMModelItemType;
|
||||
chunkSize?: number;
|
||||
}) => {
|
||||
if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
|
||||
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
|
||||
return getLLMDefaultChunkSize(params.llmModel);
|
||||
vectorModel?: EmbeddingModelItemType;
|
||||
} & T) => {
|
||||
const {
|
||||
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
|
||||
chunkSettingMode = ChunkSettingModeEnum.auto,
|
||||
chunkSplitMode,
|
||||
chunkSize,
|
||||
paragraphChunkDeep = 5,
|
||||
indexSize,
|
||||
autoIndexes
|
||||
} = data;
|
||||
const cloneChunkSettings = cloneDeep(data);
|
||||
|
||||
if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
|
||||
delete cloneChunkSettings.qaPrompt;
|
||||
}
|
||||
|
||||
// Format training type indexSize/chunkSize
|
||||
const trainingModeSize: {
|
||||
autoChunkSize: number;
|
||||
autoIndexSize: number;
|
||||
chunkSize?: number;
|
||||
indexSize?: number;
|
||||
} = (() => {
|
||||
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
|
||||
return {
|
||||
autoChunkSize: getLLMDefaultChunkSize(llmModel),
|
||||
autoIndexSize: getMaxIndexSize(vectorModel),
|
||||
chunkSize,
|
||||
indexSize: getMaxIndexSize(vectorModel)
|
||||
};
|
||||
} else if (autoIndexes) {
|
||||
return {
|
||||
autoChunkSize: chunkAutoChunkSize,
|
||||
autoIndexSize: getAutoIndexSize(vectorModel),
|
||||
chunkSize,
|
||||
indexSize
|
||||
};
|
||||
} else {
|
||||
return {
|
||||
autoChunkSize: chunkAutoChunkSize,
|
||||
autoIndexSize: getAutoIndexSize(vectorModel),
|
||||
chunkSize,
|
||||
indexSize
|
||||
};
|
||||
}
|
||||
})();
|
||||
|
||||
if (chunkSettingMode === ChunkSettingModeEnum.auto) {
|
||||
cloneChunkSettings.chunkSplitMode = DataChunkSplitModeEnum.paragraph;
|
||||
cloneChunkSettings.paragraphChunkAIMode = ParagraphChunkAIModeEnum.forbid;
|
||||
cloneChunkSettings.paragraphChunkDeep = 5;
|
||||
cloneChunkSettings.paragraphChunkMinSize = 100;
|
||||
cloneChunkSettings.chunkSize = trainingModeSize.autoChunkSize;
|
||||
cloneChunkSettings.indexSize = trainingModeSize.autoIndexSize;
|
||||
|
||||
cloneChunkSettings.chunkSplitter = undefined;
|
||||
} else {
|
||||
// chunk
|
||||
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
|
||||
return chunkAutoChunkSize;
|
||||
}
|
||||
cloneChunkSettings.paragraphChunkDeep =
|
||||
chunkSplitMode === DataChunkSplitModeEnum.paragraph ? paragraphChunkDeep : 0;
|
||||
|
||||
cloneChunkSettings.chunkSize = trainingModeSize.chunkSize
|
||||
? Math.min(trainingModeSize.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(llmModel))
|
||||
: undefined;
|
||||
cloneChunkSettings.indexSize = trainingModeSize.indexSize;
|
||||
}
|
||||
|
||||
if (params.chunkSplitMode === DataChunkSplitModeEnum.char) {
|
||||
return getLLMMaxChunkSize(params.llmModel);
|
||||
}
|
||||
|
||||
return Math.min(params.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
|
||||
};
|
||||
export const computeChunkSplitter = (params: {
|
||||
chunkSettingMode?: ChunkSettingModeEnum;
|
||||
chunkSplitMode?: DataChunkSplitModeEnum;
|
||||
chunkSplitter?: string;
|
||||
}) => {
|
||||
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
|
||||
return undefined;
|
||||
}
|
||||
if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) {
|
||||
return undefined;
|
||||
}
|
||||
return params.chunkSplitter;
|
||||
};
|
||||
export const computeParagraphChunkDeep = (params: {
|
||||
chunkSettingMode?: ChunkSettingModeEnum;
|
||||
chunkSplitMode?: DataChunkSplitModeEnum;
|
||||
paragraphChunkDeep?: number;
|
||||
}) => {
|
||||
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
|
||||
return 5;
|
||||
}
|
||||
if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) {
|
||||
return params.paragraphChunkDeep;
|
||||
}
|
||||
return 0;
|
||||
return cloneChunkSettings;
|
||||
};
|
||||
|
@@ -15,9 +15,11 @@
|
||||
"next": "14.2.28",
|
||||
"openai": "4.61.0",
|
||||
"openapi-types": "^12.1.3",
|
||||
"timezones-list": "^3.0.2"
|
||||
"timezones-list": "^3.0.2",
|
||||
"lodash": "^4.17.21"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/lodash": "^4.14.191",
|
||||
"@types/js-yaml": "^4.0.9",
|
||||
"@types/node": "20.14.0"
|
||||
}
|
||||
|
@@ -5,6 +5,8 @@ import { addLog } from '../../system/log';
|
||||
import { setCron } from '../../system/cron';
|
||||
import { checkTimerLock } from '../../system/timerLock/utils';
|
||||
import { TimerIdEnum } from '../../system/timerLock/constants';
|
||||
import { gridFsStream2Buffer } from '../../file/gridfs/utils';
|
||||
import { readRawContentFromBuffer } from '../../../worker/function';
|
||||
|
||||
const getGridBucket = () => {
|
||||
return new connectionMongo.mongo.GridFSBucket(connectionMongo.connection.db!, {
|
||||
@@ -85,30 +87,27 @@ export const getRawTextBuffer = async (sourceId: string) => {
|
||||
|
||||
// Read file content
|
||||
const downloadStream = gridBucket.openDownloadStream(bufferData._id);
|
||||
const chunks: Buffer[] = [];
|
||||
|
||||
return new Promise<{
|
||||
text: string;
|
||||
sourceName: string;
|
||||
} | null>((resolve, reject) => {
|
||||
downloadStream.on('data', (chunk) => {
|
||||
chunks.push(chunk);
|
||||
});
|
||||
const fileBuffers = await gridFsStream2Buffer(downloadStream);
|
||||
|
||||
downloadStream.on('end', () => {
|
||||
const buffer = Buffer.concat(chunks);
|
||||
const text = buffer.toString('utf8');
|
||||
resolve({
|
||||
text,
|
||||
sourceName: bufferData.metadata?.sourceName || ''
|
||||
});
|
||||
});
|
||||
const rawText = await (async () => {
|
||||
if (fileBuffers.length < 10000000) {
|
||||
return fileBuffers.toString('utf8');
|
||||
} else {
|
||||
return (
|
||||
await readRawContentFromBuffer({
|
||||
extension: 'txt',
|
||||
encoding: 'utf8',
|
||||
buffer: fileBuffers
|
||||
})
|
||||
).rawText;
|
||||
}
|
||||
})();
|
||||
|
||||
downloadStream.on('error', (error) => {
|
||||
addLog.error('getRawTextBuffer error', error);
|
||||
resolve(null);
|
||||
});
|
||||
});
|
||||
return {
|
||||
text: rawText,
|
||||
sourceName: bufferData.metadata?.sourceName || ''
|
||||
};
|
||||
});
|
||||
};
|
||||
|
||||
|
@@ -55,13 +55,17 @@ export const createFileFromText = async ({
|
||||
|
||||
export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => {
|
||||
return new Promise<Buffer>((resolve, reject) => {
|
||||
if (!stream.readable) {
|
||||
return resolve(Buffer.from([]));
|
||||
}
|
||||
|
||||
const chunks: Uint8Array[] = [];
|
||||
|
||||
stream.on('data', (chunk) => {
|
||||
chunks.push(chunk);
|
||||
});
|
||||
stream.on('end', () => {
|
||||
const resultBuffer = Buffer.concat(chunks); // 一次性拼接
|
||||
const resultBuffer = Buffer.concat(chunks); // One-time splicing
|
||||
resolve(resultBuffer);
|
||||
});
|
||||
stream.on('error', (err) => {
|
||||
|
@@ -1,6 +1,5 @@
|
||||
import { uploadMongoImg } from '../image/controller';
|
||||
import FormData from 'form-data';
|
||||
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
||||
import fs from 'fs';
|
||||
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
||||
import axios from 'axios';
|
||||
@@ -9,6 +8,7 @@ import { batchRun } from '@fastgpt/global/common/system/utils';
|
||||
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
|
||||
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
|
||||
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
|
||||
import { readRawContentFromBuffer } from '../../../worker/function';
|
||||
|
||||
export type readRawTextByLocalFileParams = {
|
||||
teamId: string;
|
||||
@@ -63,11 +63,10 @@ export const readRawContentByFileBuffer = async ({
|
||||
rawText: string;
|
||||
}> => {
|
||||
const systemParse = () =>
|
||||
runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
||||
readRawContentFromBuffer({
|
||||
extension,
|
||||
encoding,
|
||||
buffer,
|
||||
teamId
|
||||
buffer
|
||||
});
|
||||
const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
|
||||
const url = global.systemEnv.customPdfParse?.url;
|
||||
|
@@ -1,3 +1,4 @@
|
||||
import { isTestEnv } from '@fastgpt/global/common/system/constants';
|
||||
import { addLog } from '../../common/system/log';
|
||||
import type { Model } from 'mongoose';
|
||||
import mongoose, { Mongoose } from 'mongoose';
|
||||
@@ -70,7 +71,7 @@ const addCommonMiddleware = (schema: mongoose.Schema) => {
|
||||
|
||||
export const getMongoModel = <T>(name: string, schema: mongoose.Schema) => {
|
||||
if (connectionMongo.models[name]) return connectionMongo.models[name] as Model<T>;
|
||||
if (process.env.NODE_ENV !== 'test') console.log('Load model======', name);
|
||||
if (!isTestEnv) console.log('Load model======', name);
|
||||
addCommonMiddleware(schema);
|
||||
|
||||
const model = connectionMongo.model<T>(name, schema);
|
||||
|
@@ -32,10 +32,7 @@ import { MongoDatasetDataText } from '../data/dataTextSchema';
|
||||
import { retryFn } from '@fastgpt/global/common/system/utils';
|
||||
import { getTrainingModeByCollection } from './utils';
|
||||
import {
|
||||
computeChunkSize,
|
||||
computeChunkSplitter,
|
||||
computeParagraphChunkDeep,
|
||||
getAutoIndexSize,
|
||||
computedCollectionChunkSettings,
|
||||
getLLMMaxChunkSize
|
||||
} from '@fastgpt/global/core/dataset/training/utils';
|
||||
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
|
||||
@@ -68,31 +65,50 @@ export const createCollectionAndInsertData = async ({
|
||||
createCollectionParams.autoIndexes = true;
|
||||
}
|
||||
|
||||
const teamId = createCollectionParams.teamId;
|
||||
const tmbId = createCollectionParams.tmbId;
|
||||
const formatCreateCollectionParams = computedCollectionChunkSettings({
|
||||
...createCollectionParams,
|
||||
llmModel: getLLMModel(dataset.agentModel),
|
||||
vectorModel: getEmbeddingModel(dataset.vectorModel)
|
||||
});
|
||||
|
||||
const teamId = formatCreateCollectionParams.teamId;
|
||||
const tmbId = formatCreateCollectionParams.tmbId;
|
||||
|
||||
// Set default params
|
||||
const trainingType =
|
||||
createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
|
||||
const chunkSplitter = computeChunkSplitter(createCollectionParams);
|
||||
const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);
|
||||
formatCreateCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
|
||||
const trainingMode = getTrainingModeByCollection({
|
||||
trainingType: trainingType,
|
||||
autoIndexes: createCollectionParams.autoIndexes,
|
||||
imageIndex: createCollectionParams.imageIndex
|
||||
autoIndexes: formatCreateCollectionParams.autoIndexes,
|
||||
imageIndex: formatCreateCollectionParams.imageIndex
|
||||
});
|
||||
|
||||
if (
|
||||
trainingType === DatasetCollectionDataProcessModeEnum.qa ||
|
||||
trainingType === DatasetCollectionDataProcessModeEnum.backup
|
||||
trainingType === DatasetCollectionDataProcessModeEnum.backup ||
|
||||
trainingType === DatasetCollectionDataProcessModeEnum.template
|
||||
) {
|
||||
delete createCollectionParams.chunkTriggerType;
|
||||
delete createCollectionParams.chunkTriggerMinSize;
|
||||
delete createCollectionParams.dataEnhanceCollectionName;
|
||||
delete createCollectionParams.imageIndex;
|
||||
delete createCollectionParams.autoIndexes;
|
||||
delete createCollectionParams.indexSize;
|
||||
delete createCollectionParams.qaPrompt;
|
||||
delete formatCreateCollectionParams.chunkTriggerType;
|
||||
delete formatCreateCollectionParams.chunkTriggerMinSize;
|
||||
delete formatCreateCollectionParams.dataEnhanceCollectionName;
|
||||
delete formatCreateCollectionParams.imageIndex;
|
||||
delete formatCreateCollectionParams.autoIndexes;
|
||||
|
||||
if (
|
||||
trainingType === DatasetCollectionDataProcessModeEnum.backup ||
|
||||
trainingType === DatasetCollectionDataProcessModeEnum.template
|
||||
) {
|
||||
delete formatCreateCollectionParams.paragraphChunkAIMode;
|
||||
delete formatCreateCollectionParams.paragraphChunkDeep;
|
||||
delete formatCreateCollectionParams.paragraphChunkMinSize;
|
||||
delete formatCreateCollectionParams.chunkSplitMode;
|
||||
delete formatCreateCollectionParams.chunkSize;
|
||||
delete formatCreateCollectionParams.chunkSplitter;
|
||||
delete formatCreateCollectionParams.indexSize;
|
||||
}
|
||||
}
|
||||
if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
|
||||
delete formatCreateCollectionParams.qaPrompt;
|
||||
}
|
||||
|
||||
// 1. split chunks or create image chunks
|
||||
@@ -109,30 +125,27 @@ export const createCollectionAndInsertData = async ({
|
||||
}>;
|
||||
chunkSize?: number;
|
||||
indexSize?: number;
|
||||
} = (() => {
|
||||
} = await (async () => {
|
||||
if (rawText) {
|
||||
const chunkSize = computeChunkSize({
|
||||
...createCollectionParams,
|
||||
trainingType,
|
||||
llmModel: getLLMModel(dataset.agentModel)
|
||||
});
|
||||
// Process text chunks
|
||||
const chunks = rawText2Chunks({
|
||||
const chunks = await rawText2Chunks({
|
||||
rawText,
|
||||
chunkTriggerType: createCollectionParams.chunkTriggerType,
|
||||
chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize,
|
||||
chunkSize,
|
||||
paragraphChunkDeep,
|
||||
paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize,
|
||||
chunkTriggerType: formatCreateCollectionParams.chunkTriggerType,
|
||||
chunkTriggerMinSize: formatCreateCollectionParams.chunkTriggerMinSize,
|
||||
chunkSize: formatCreateCollectionParams.chunkSize,
|
||||
paragraphChunkDeep: formatCreateCollectionParams.paragraphChunkDeep,
|
||||
paragraphChunkMinSize: formatCreateCollectionParams.paragraphChunkMinSize,
|
||||
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
|
||||
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
|
||||
customReg: chunkSplitter ? [chunkSplitter] : [],
|
||||
customReg: formatCreateCollectionParams.chunkSplitter
|
||||
? [formatCreateCollectionParams.chunkSplitter]
|
||||
: [],
|
||||
backupParse
|
||||
});
|
||||
return {
|
||||
chunks,
|
||||
chunkSize,
|
||||
indexSize: createCollectionParams.indexSize ?? getAutoIndexSize(dataset.vectorModel)
|
||||
chunkSize: formatCreateCollectionParams.chunkSize,
|
||||
indexSize: formatCreateCollectionParams.indexSize
|
||||
};
|
||||
}
|
||||
|
||||
@@ -147,12 +160,8 @@ export const createCollectionAndInsertData = async ({
|
||||
|
||||
return {
|
||||
chunks: [],
|
||||
chunkSize: computeChunkSize({
|
||||
...createCollectionParams,
|
||||
trainingType,
|
||||
llmModel: getLLMModel(dataset.agentModel)
|
||||
}),
|
||||
indexSize: createCollectionParams.indexSize ?? getAutoIndexSize(dataset.vectorModel)
|
||||
chunkSize: formatCreateCollectionParams.chunkSize,
|
||||
indexSize: formatCreateCollectionParams.indexSize
|
||||
};
|
||||
})();
|
||||
|
||||
@@ -165,11 +174,9 @@ export const createCollectionAndInsertData = async ({
|
||||
const fn = async (session: ClientSession) => {
|
||||
// 3. Create collection
|
||||
const { _id: collectionId } = await createOneCollection({
|
||||
...createCollectionParams,
|
||||
...formatCreateCollectionParams,
|
||||
trainingType,
|
||||
paragraphChunkDeep,
|
||||
chunkSize,
|
||||
chunkSplitter,
|
||||
indexSize,
|
||||
|
||||
hashRawText: rawText ? hashStr(rawText) : undefined,
|
||||
@@ -179,7 +186,7 @@ export const createCollectionAndInsertData = async ({
|
||||
if (!dataset.autoSync && dataset.type === DatasetTypeEnum.websiteDataset) return undefined;
|
||||
if (
|
||||
[DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes(
|
||||
createCollectionParams.type
|
||||
formatCreateCollectionParams.type
|
||||
)
|
||||
) {
|
||||
return addDays(new Date(), 1);
|
||||
@@ -195,7 +202,7 @@ export const createCollectionAndInsertData = async ({
|
||||
const { billId: newBillId } = await createTrainingUsage({
|
||||
teamId,
|
||||
tmbId,
|
||||
appName: createCollectionParams.name,
|
||||
appName: formatCreateCollectionParams.name,
|
||||
billSource: UsageSourceEnum.training,
|
||||
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
|
||||
agentModel: getLLMModel(dataset.agentModel)?.name,
|
||||
@@ -218,7 +225,7 @@ export const createCollectionAndInsertData = async ({
|
||||
vlmModel: dataset.vlmModel,
|
||||
indexSize,
|
||||
mode: trainingMode,
|
||||
prompt: createCollectionParams.qaPrompt,
|
||||
prompt: formatCreateCollectionParams.qaPrompt,
|
||||
billId: traingBillId,
|
||||
data: chunks.map((item, index) => ({
|
||||
...item,
|
||||
|
@@ -5,13 +5,14 @@ import {
|
||||
} from '@fastgpt/global/core/dataset/constants';
|
||||
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
|
||||
import { urlsFetch } from '../../common/string/cheerio';
|
||||
import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { type TextSplitProps } from '@fastgpt/global/common/string/textSplitter';
|
||||
import axios from 'axios';
|
||||
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
|
||||
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
|
||||
import { getApiDatasetRequest } from './apiDataset';
|
||||
import Papa from 'papaparse';
|
||||
import type { ApiDatasetServerType } from '@fastgpt/global/core/dataset/apiDataset/type';
|
||||
import { text2Chunks } from '../../worker/function';
|
||||
|
||||
export const readFileRawTextByUrl = async ({
|
||||
teamId,
|
||||
@@ -165,7 +166,7 @@ export const readApiServerFileContent = async ({
|
||||
});
|
||||
};
|
||||
|
||||
export const rawText2Chunks = ({
|
||||
export const rawText2Chunks = async ({
|
||||
rawText,
|
||||
chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
|
||||
chunkTriggerMinSize = 1000,
|
||||
@@ -182,12 +183,14 @@ export const rawText2Chunks = ({
|
||||
|
||||
backupParse?: boolean;
|
||||
tableParse?: boolean;
|
||||
} & TextSplitProps): {
|
||||
q: string;
|
||||
a: string;
|
||||
indexes?: string[];
|
||||
imageIdList?: string[];
|
||||
}[] => {
|
||||
} & TextSplitProps): Promise<
|
||||
{
|
||||
q: string;
|
||||
a: string;
|
||||
indexes?: string[];
|
||||
imageIdList?: string[];
|
||||
}[]
|
||||
> => {
|
||||
const parseDatasetBackup2Chunks = (rawText: string) => {
|
||||
const csvArr = Papa.parse(rawText).data as string[][];
|
||||
|
||||
@@ -233,7 +236,7 @@ export const rawText2Chunks = ({
|
||||
}
|
||||
}
|
||||
|
||||
const { chunks } = splitText2Chunks({
|
||||
const { chunks } = await text2Chunks({
|
||||
text: rawText,
|
||||
chunkSize,
|
||||
...splitProps
|
||||
|
@@ -112,24 +112,15 @@ export async function pushDataListToTrainingQueue({
|
||||
|
||||
// format q and a, remove empty char
|
||||
data = data.filter((item) => {
|
||||
item.q = simpleText(item.q);
|
||||
item.a = simpleText(item.a);
|
||||
|
||||
item.indexes = item.indexes
|
||||
?.map((index) => {
|
||||
return {
|
||||
...index,
|
||||
text: simpleText(index.text)
|
||||
};
|
||||
})
|
||||
.filter(Boolean);
|
||||
const q = item.q || '';
|
||||
const a = item.a || '';
|
||||
|
||||
// filter repeat content
|
||||
if (!item.imageId && !item.q) {
|
||||
if (!item.imageId && !q) {
|
||||
return;
|
||||
}
|
||||
|
||||
const text = item.q + item.a;
|
||||
const text = q + a;
|
||||
|
||||
// Oversize llm tokens
|
||||
if (text.length > maxToken) {
|
||||
|
@@ -8,6 +8,8 @@ import {
|
||||
type CreateUsageProps
|
||||
} from '@fastgpt/global/support/wallet/usage/api';
|
||||
import { i18nT } from '../../../../web/i18n/utils';
|
||||
import { formatModelChars2Points } from './utils';
|
||||
import { ModelTypeEnum } from '@fastgpt/global/core/ai/model';
|
||||
|
||||
export async function createUsage(data: CreateUsageProps) {
|
||||
try {
|
||||
@@ -67,6 +69,14 @@ export const createChatUsage = ({
|
||||
return { totalPoints };
|
||||
};
|
||||
|
||||
export type DatasetTrainingMode = 'paragraph' | 'qa' | 'autoIndex' | 'imageIndex' | 'imageParse';
|
||||
export const datasetTrainingUsageIndexMap: Record<DatasetTrainingMode, number> = {
|
||||
paragraph: 1,
|
||||
qa: 2,
|
||||
autoIndex: 3,
|
||||
imageIndex: 4,
|
||||
imageParse: 5
|
||||
};
|
||||
export const createTrainingUsage = async ({
|
||||
teamId,
|
||||
tmbId,
|
||||
@@ -108,6 +118,13 @@ export const createTrainingUsage = async ({
|
||||
: []),
|
||||
...(agentModel
|
||||
? [
|
||||
{
|
||||
moduleName: i18nT('account_usage:llm_paragraph'),
|
||||
model: agentModel,
|
||||
amount: 0,
|
||||
inputTokens: 0,
|
||||
outputTokens: 0
|
||||
},
|
||||
{
|
||||
moduleName: i18nT('account_usage:qa'),
|
||||
model: agentModel,
|
||||
@@ -126,6 +143,13 @@ export const createTrainingUsage = async ({
|
||||
: []),
|
||||
...(vllmModel
|
||||
? [
|
||||
{
|
||||
moduleName: i18nT('account_usage:image_index'),
|
||||
model: vllmModel,
|
||||
amount: 0,
|
||||
inputTokens: 0,
|
||||
outputTokens: 0
|
||||
},
|
||||
{
|
||||
moduleName: i18nT('account_usage:image_parse'),
|
||||
model: vllmModel,
|
||||
@@ -171,3 +195,43 @@ export const createPdfParseUsage = async ({
|
||||
]
|
||||
});
|
||||
};
|
||||
|
||||
export const pushLLMTrainingUsage = async ({
|
||||
teamId,
|
||||
tmbId,
|
||||
model,
|
||||
inputTokens,
|
||||
outputTokens,
|
||||
billId,
|
||||
mode
|
||||
}: {
|
||||
teamId: string;
|
||||
tmbId: string;
|
||||
model: string;
|
||||
inputTokens: number;
|
||||
outputTokens: number;
|
||||
billId: string;
|
||||
mode: DatasetTrainingMode;
|
||||
}) => {
|
||||
const index = datasetTrainingUsageIndexMap[mode];
|
||||
|
||||
// Compute points
|
||||
const { totalPoints } = formatModelChars2Points({
|
||||
model,
|
||||
modelType: ModelTypeEnum.llm,
|
||||
inputTokens,
|
||||
outputTokens
|
||||
});
|
||||
|
||||
concatUsage({
|
||||
billId,
|
||||
teamId,
|
||||
tmbId,
|
||||
totalPoints,
|
||||
inputTokens,
|
||||
outputTokens,
|
||||
listIndex: index
|
||||
});
|
||||
|
||||
return { totalPoints };
|
||||
};
|
||||
|
18
packages/service/worker/controller.ts
Normal file
18
packages/service/worker/controller.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
import type { MessagePort } from 'worker_threads';
|
||||
|
||||
export const workerResponse = ({
|
||||
parentPort,
|
||||
status,
|
||||
data
|
||||
}: {
|
||||
parentPort: MessagePort | null;
|
||||
status: 'success' | 'error';
|
||||
data: any;
|
||||
}) => {
|
||||
parentPort?.postMessage({
|
||||
type: status,
|
||||
data: data
|
||||
});
|
||||
|
||||
process.exit();
|
||||
};
|
24
packages/service/worker/function.ts
Normal file
24
packages/service/worker/function.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
import {
|
||||
splitText2Chunks,
|
||||
type SplitProps,
|
||||
type SplitResponse
|
||||
} from '@fastgpt/global/common/string/textSplitter';
|
||||
import { runWorker, WorkerNameEnum } from './utils';
|
||||
import type { ReadFileResponse } from './readFile/type';
|
||||
import { isTestEnv } from '@fastgpt/global/common/system/constants';
|
||||
|
||||
export const text2Chunks = (props: SplitProps) => {
|
||||
// Test env, not run worker
|
||||
if (isTestEnv) {
|
||||
return splitText2Chunks(props);
|
||||
}
|
||||
return runWorker<SplitResponse>(WorkerNameEnum.text2Chunks, props);
|
||||
};
|
||||
|
||||
export const readRawContentFromBuffer = (props: {
|
||||
extension: string;
|
||||
encoding: string;
|
||||
buffer: Buffer;
|
||||
}) => {
|
||||
return runWorker<ReadFileResponse>(WorkerNameEnum.readFile, props);
|
||||
};
|
@@ -1,19 +1,21 @@
|
||||
import { parentPort } from 'worker_threads';
|
||||
import { html2md } from './utils';
|
||||
import { workerResponse } from '../controller';
|
||||
|
||||
parentPort?.on('message', (params: { html: string }) => {
|
||||
try {
|
||||
const md = html2md(params?.html || '');
|
||||
|
||||
parentPort?.postMessage({
|
||||
type: 'success',
|
||||
workerResponse({
|
||||
parentPort,
|
||||
status: 'success',
|
||||
data: md
|
||||
});
|
||||
} catch (error) {
|
||||
parentPort?.postMessage({
|
||||
type: 'error',
|
||||
workerResponse({
|
||||
parentPort,
|
||||
status: 'error',
|
||||
data: error
|
||||
});
|
||||
}
|
||||
process.exit();
|
||||
});
|
||||
|
@@ -7,6 +7,7 @@ import { readDocsFile } from './extension/docx';
|
||||
import { readPptxRawText } from './extension/pptx';
|
||||
import { readXlsxRawText } from './extension/xlsx';
|
||||
import { readCsvRawText } from './extension/csv';
|
||||
import { workerResponse } from '../controller';
|
||||
|
||||
parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
|
||||
const read = async (params: ReadRawTextByBuffer) => {
|
||||
@@ -41,17 +42,16 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
|
||||
};
|
||||
|
||||
try {
|
||||
parentPort?.postMessage({
|
||||
type: 'success',
|
||||
workerResponse({
|
||||
parentPort,
|
||||
status: 'success',
|
||||
data: await read(newProps)
|
||||
});
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
parentPort?.postMessage({
|
||||
type: 'error',
|
||||
workerResponse({
|
||||
parentPort,
|
||||
status: 'error',
|
||||
data: error
|
||||
});
|
||||
}
|
||||
|
||||
process.exit();
|
||||
});
|
||||
|
14
packages/service/worker/text2Chunks/index.ts
Normal file
14
packages/service/worker/text2Chunks/index.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
import { parentPort } from 'worker_threads';
|
||||
import type { SplitProps } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { workerResponse } from '../controller';
|
||||
|
||||
parentPort?.on('message', async (props: SplitProps) => {
|
||||
const result = splitText2Chunks(props);
|
||||
|
||||
workerResponse({
|
||||
parentPort,
|
||||
status: 'success',
|
||||
data: result
|
||||
});
|
||||
});
|
@@ -6,7 +6,8 @@ export enum WorkerNameEnum {
|
||||
readFile = 'readFile',
|
||||
htmlStr2Md = 'htmlStr2Md',
|
||||
countGptMessagesTokens = 'countGptMessagesTokens',
|
||||
systemPluginRun = 'systemPluginRun'
|
||||
systemPluginRun = 'systemPluginRun',
|
||||
text2Chunks = 'text2Chunks'
|
||||
}
|
||||
|
||||
export const getSafeEnv = () => {
|
||||
|
@@ -151,8 +151,7 @@ const MySelect = <T = any,>(
|
||||
? {
|
||||
ref: SelectedItemRef,
|
||||
color: 'primary.700',
|
||||
bg: 'myGray.100',
|
||||
fontWeight: '600'
|
||||
bg: 'myGray.100'
|
||||
}
|
||||
: {
|
||||
color: 'myGray.900'
|
||||
@@ -167,7 +166,7 @@ const MySelect = <T = any,>(
|
||||
display={'block'}
|
||||
mb={0.5}
|
||||
>
|
||||
<Flex alignItems={'center'}>
|
||||
<Flex alignItems={'center'} fontWeight={value === item.value ? '600' : 'normal'}>
|
||||
{item.icon && (
|
||||
<Avatar mr={2} src={item.icon as any} w={item.iconSize ?? '1rem'} />
|
||||
)}
|
||||
|
@@ -20,8 +20,10 @@
|
||||
"export_title": "Time,Members,Type,Project name,AI points",
|
||||
"feishu": "Feishu",
|
||||
"generation_time": "Generation time",
|
||||
"image_index": "Image index",
|
||||
"image_parse": "Image tagging",
|
||||
"input_token_length": "input tokens",
|
||||
"llm_paragraph": "LLM segmentation",
|
||||
"mcp": "MCP call",
|
||||
"member": "member",
|
||||
"member_name": "Member name",
|
||||
|
@@ -45,6 +45,7 @@
|
||||
"core.dataset.import.Adjust parameters": "Adjust parameters",
|
||||
"custom_data_process_params": "Custom",
|
||||
"custom_data_process_params_desc": "Customize data processing rules",
|
||||
"custom_split_char": "Char",
|
||||
"custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
|
||||
"data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
|
||||
"data_error_amount": "{{errorAmount}} Group training exception",
|
||||
@@ -117,6 +118,11 @@
|
||||
"insert_images_success": "The new picture is successfully added, and you need to wait for the training to be completed before it will be displayed.",
|
||||
"is_open_schedule": "Enable scheduled synchronization",
|
||||
"keep_image": "Keep the picture",
|
||||
"llm_paragraph_mode": "LLM recognition paragraph(Beta)",
|
||||
"llm_paragraph_mode_auto": "automatic",
|
||||
"llm_paragraph_mode_auto_desc": "Enable the model to automatically recognize the title when the file content does not contain a Markdown title.",
|
||||
"llm_paragraph_mode_forbid": "Disabled",
|
||||
"llm_paragraph_mode_forbid_desc": "Force the disabling of the model's automatic paragraph recognition",
|
||||
"loading": "Loading...",
|
||||
"max_chunk_size": "Maximum chunk size",
|
||||
"move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
|
||||
|
@@ -20,8 +20,10 @@
|
||||
"export_title": "时间,成员,类型,项目名,AI 积分消耗",
|
||||
"feishu": "飞书",
|
||||
"generation_time": "生成时间",
|
||||
"image_index": "图片索引",
|
||||
"image_parse": "图片标注",
|
||||
"input_token_length": "输入 tokens",
|
||||
"llm_paragraph": "模型分段",
|
||||
"mcp": "MCP 调用",
|
||||
"member": "成员",
|
||||
"member_name": "成员名",
|
||||
|
@@ -45,6 +45,7 @@
|
||||
"core.dataset.import.Adjust parameters": "调整参数",
|
||||
"custom_data_process_params": "自定义",
|
||||
"custom_data_process_params_desc": "自定义设置数据处理规则",
|
||||
"custom_split_char": "分隔符",
|
||||
"custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。",
|
||||
"data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
|
||||
"data_error_amount": "{{errorAmount}} 组训练异常",
|
||||
@@ -117,6 +118,11 @@
|
||||
"insert_images_success": "新增图片成功,需等待训练完成才会展示",
|
||||
"is_open_schedule": "启用定时同步",
|
||||
"keep_image": "保留图片",
|
||||
"llm_paragraph_mode": "模型识别段落(Beta)",
|
||||
"llm_paragraph_mode_auto": "自动",
|
||||
"llm_paragraph_mode_auto_desc": "当文件内容不包含 Markdown 标题时,启用模型自动识别标题。",
|
||||
"llm_paragraph_mode_forbid": "禁用",
|
||||
"llm_paragraph_mode_forbid_desc": "强制禁用模型自动识别段落",
|
||||
"loading": "加载中...",
|
||||
"max_chunk_size": "最大分块大小",
|
||||
"move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",
|
||||
|
@@ -20,8 +20,10 @@
|
||||
"export_title": "時間,成員,類型,項目名,AI 積分消耗",
|
||||
"feishu": "飛書",
|
||||
"generation_time": "生成時間",
|
||||
"image_index": "圖片索引",
|
||||
"image_parse": "圖片標註",
|
||||
"input_token_length": "輸入 tokens",
|
||||
"llm_paragraph": "模型分段",
|
||||
"mcp": "MCP 調用",
|
||||
"member": "成員",
|
||||
"member_name": "成員名",
|
||||
|
@@ -44,6 +44,7 @@
|
||||
"core.dataset.import.Adjust parameters": "調整參數",
|
||||
"custom_data_process_params": "自訂",
|
||||
"custom_data_process_params_desc": "自訂資料處理規則",
|
||||
"custom_split_char": "分隔符",
|
||||
"custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的資料,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.”表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如:* () [] {} 等。",
|
||||
"data_amount": "{{dataAmount}} 組資料,{{indexAmount}} 組索引",
|
||||
"data_error_amount": "{{errorAmount}} 組訓練異常",
|
||||
@@ -116,6 +117,11 @@
|
||||
"insert_images_success": "新增圖片成功,需等待訓練完成才會展示",
|
||||
"is_open_schedule": "啟用定時同步",
|
||||
"keep_image": "保留圖片",
|
||||
"llm_paragraph_mode": "模型識別段落(Beta)",
|
||||
"llm_paragraph_mode_auto": "自動",
|
||||
"llm_paragraph_mode_auto_desc": "當文件內容不包含 Markdown 標題時,啟用模型自動識別標題。",
|
||||
"llm_paragraph_mode_forbid": "禁用",
|
||||
"llm_paragraph_mode_forbid_desc": "強制禁用模型自動識別段落",
|
||||
"loading": "加載中...",
|
||||
"max_chunk_size": "最大分塊大小",
|
||||
"move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",
|
||||
|
6
pnpm-lock.yaml
generated
6
pnpm-lock.yaml
generated
@@ -89,6 +89,9 @@ importers:
|
||||
json5:
|
||||
specifier: ^2.2.3
|
||||
version: 2.2.3
|
||||
lodash:
|
||||
specifier: ^4.17.21
|
||||
version: 4.17.21
|
||||
nanoid:
|
||||
specifier: ^5.1.3
|
||||
version: 5.1.3
|
||||
@@ -108,6 +111,9 @@ importers:
|
||||
'@types/js-yaml':
|
||||
specifier: ^4.0.9
|
||||
version: 4.0.9
|
||||
'@types/lodash':
|
||||
specifier: ^4.14.191
|
||||
version: 4.17.16
|
||||
'@types/node':
|
||||
specifier: 20.14.0
|
||||
version: 20.14.0
|
||||
|
@@ -9,25 +9,14 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep';
|
||||
import MyDivider from '@fastgpt/web/components/common/MyDivider';
|
||||
import React from 'react';
|
||||
import { Box, Link, Input, Button, ModalBody, ModalFooter, Stack } from '@chakra-ui/react';
|
||||
import {
|
||||
DataChunkSplitModeEnum,
|
||||
DatasetCollectionDataProcessModeEnum
|
||||
} from '@fastgpt/global/core/dataset/constants';
|
||||
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
|
||||
import { useContextSelector } from 'use-context-selector';
|
||||
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
|
||||
import CollectionChunkForm, {
|
||||
collectionChunkForm2StoreChunkData,
|
||||
type CollectionChunkFormType
|
||||
} from '../Form/CollectionChunkForm';
|
||||
import {
|
||||
getAutoIndexSize,
|
||||
getLLMDefaultChunkSize
|
||||
} from '@fastgpt/global/core/dataset/training/utils';
|
||||
import CollectionChunkForm, { type CollectionChunkFormType } from '../Form/CollectionChunkForm';
|
||||
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
|
||||
import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm';
|
||||
import { defaultFormData } from '../Import/Context';
|
||||
import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';
|
||||
|
||||
export type WebsiteConfigFormType = {
|
||||
websiteConfig: {
|
||||
@@ -80,7 +69,7 @@ const WebsiteConfigModal = ({
|
||||
|
||||
const form = useForm<CollectionChunkFormType>({
|
||||
defaultValues: {
|
||||
trainingType: chunkSettings?.trainingType,
|
||||
trainingType: chunkSettings?.trainingType || defaultFormData.trainingType,
|
||||
|
||||
chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType,
|
||||
chunkTriggerMinSize:
|
||||
@@ -204,9 +193,9 @@ const WebsiteConfigModal = ({
|
||||
form.handleSubmit((data) =>
|
||||
onSuccess({
|
||||
websiteConfig: websiteInfoGetValues(),
|
||||
chunkSettings: collectionChunkForm2StoreChunkData({
|
||||
chunkSettings: computedCollectionChunkSettings({
|
||||
...data,
|
||||
agentModel: datasetDetail.agentModel,
|
||||
llmModel: datasetDetail.agentModel,
|
||||
vectorModel: datasetDetail.vectorModel
|
||||
})
|
||||
})
|
||||
|
@@ -17,7 +17,7 @@ import {
|
||||
} from '@chakra-ui/react';
|
||||
import MyIcon from '@fastgpt/web/components/common/Icon';
|
||||
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
|
||||
import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import {
|
||||
DataChunkSplitModeEnum,
|
||||
@@ -133,6 +133,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
const autoIndexes = watch('autoIndexes');
|
||||
const indexSize = watch('indexSize');
|
||||
const imageIndex = watch('imageIndex');
|
||||
const paragraphChunkAIMode = watch('paragraphChunkAIMode');
|
||||
|
||||
const trainingModeList = useMemo(() => {
|
||||
const list = {
|
||||
@@ -362,11 +363,35 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
onChange={(e) => {
|
||||
setValue('chunkSplitMode', e);
|
||||
}}
|
||||
fontSize={'md'}
|
||||
/>
|
||||
|
||||
{chunkSplitMode === DataChunkSplitModeEnum.paragraph && (
|
||||
<>
|
||||
<Box mt={1.5}>
|
||||
<Box mt={3}>
|
||||
<Box fontSize={'sm'}>{t('dataset:llm_paragraph_mode')}</Box>
|
||||
<MySelect<ParagraphChunkAIModeEnum>
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
value={paragraphChunkAIMode}
|
||||
onChange={(e) => {
|
||||
setValue('paragraphChunkAIMode', e);
|
||||
}}
|
||||
list={[
|
||||
{
|
||||
label: t('dataset:llm_paragraph_mode_forbid'),
|
||||
value: ParagraphChunkAIModeEnum.forbid,
|
||||
description: t('dataset:llm_paragraph_mode_forbid_desc')
|
||||
},
|
||||
{
|
||||
label: t('dataset:llm_paragraph_mode_auto'),
|
||||
value: ParagraphChunkAIModeEnum.auto,
|
||||
description: t('dataset:llm_paragraph_mode_auto_desc')
|
||||
}
|
||||
]}
|
||||
/>
|
||||
</Box>
|
||||
<Box mt={2} fontSize={'sm'}>
|
||||
<Box>{t('dataset:paragraph_max_deep')}</Box>
|
||||
<MyNumberInput
|
||||
size={'sm'}
|
||||
@@ -379,7 +404,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
h={'32px'}
|
||||
/>
|
||||
</Box>
|
||||
<Box mt={1.5}>
|
||||
<Box mt={2} fontSize={'sm'}>
|
||||
<Box>{t('dataset:max_chunk_size')}</Box>
|
||||
<Box
|
||||
css={{
|
||||
@@ -409,7 +434,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
)}
|
||||
|
||||
{chunkSplitMode === DataChunkSplitModeEnum.size && (
|
||||
<Box mt={1.5}>
|
||||
<Box mt={3} fontSize={'sm'}>
|
||||
<Box>{t('dataset:chunk_size')}</Box>
|
||||
<Box
|
||||
css={{
|
||||
@@ -438,45 +463,48 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
)}
|
||||
|
||||
{chunkSplitMode === DataChunkSplitModeEnum.char && (
|
||||
<HStack mt={1.5}>
|
||||
<Box flex={'1 0 0'}>
|
||||
<MySelect<string>
|
||||
list={customSplitList}
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
value={customListSelectValue}
|
||||
h={'32px'}
|
||||
onChange={(val) => {
|
||||
if (val === 'Other') {
|
||||
setValue('chunkSplitter', '');
|
||||
} else {
|
||||
setValue('chunkSplitter', val);
|
||||
}
|
||||
setCustomListSelectValue(val);
|
||||
}}
|
||||
/>
|
||||
</Box>
|
||||
{customListSelectValue === 'Other' && (
|
||||
<Input
|
||||
flex={'1 0 0'}
|
||||
h={'32px'}
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
placeholder="\n;======;==SPLIT=="
|
||||
{...register('chunkSplitter')}
|
||||
/>
|
||||
)}
|
||||
</HStack>
|
||||
<Box mt={3} fontSize={'sm'}>
|
||||
<Box>{t('dataset:custom_split_char')}</Box>
|
||||
<HStack>
|
||||
<Box flex={'1 0 0'}>
|
||||
<MySelect<string>
|
||||
list={customSplitList}
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
value={customListSelectValue}
|
||||
h={'32px'}
|
||||
onChange={(val) => {
|
||||
if (val === 'Other') {
|
||||
setValue('chunkSplitter', '');
|
||||
} else {
|
||||
setValue('chunkSplitter', val);
|
||||
}
|
||||
setCustomListSelectValue(val);
|
||||
}}
|
||||
/>
|
||||
</Box>
|
||||
{customListSelectValue === 'Other' && (
|
||||
<Input
|
||||
flex={'1 0 0'}
|
||||
h={'32px'}
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
placeholder="\n;======;==SPLIT=="
|
||||
{...register('chunkSplitter')}
|
||||
/>
|
||||
)}
|
||||
</HStack>
|
||||
</Box>
|
||||
)}
|
||||
</Box>
|
||||
|
||||
{trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
|
||||
<Box>
|
||||
<Flex alignItems={'center'} mt={3}>
|
||||
<Box fontSize={'sm'} mt={2}>
|
||||
<Flex alignItems={'center'}>
|
||||
<Box>{t('dataset:index_size')}</Box>
|
||||
<QuestionTip label={t('dataset:index_size_tips')} />
|
||||
</Flex>
|
||||
<Box mt={1}>
|
||||
<Box>
|
||||
<MySelect<number>
|
||||
bg={'myGray.50'}
|
||||
list={indexSizeSeletorList}
|
||||
@@ -490,7 +518,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
)}
|
||||
|
||||
{showQAPromptInput && (
|
||||
<Box mt={3}>
|
||||
<Box mt={2}>
|
||||
<Box>{t('common:core.dataset.collection.QA Prompt')}</Box>
|
||||
<Box
|
||||
position={'relative'}
|
||||
@@ -570,83 +598,3 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
};
|
||||
|
||||
export default CollectionChunkForm;
|
||||
|
||||
// Get chunk settings from form
|
||||
export const collectionChunkForm2StoreChunkData = ({
|
||||
agentModel,
|
||||
vectorModel,
|
||||
...data
|
||||
}: CollectionChunkFormType & {
|
||||
agentModel: LLMModelItemType;
|
||||
vectorModel: EmbeddingModelItemType;
|
||||
}): CollectionChunkFormType => {
|
||||
const {
|
||||
trainingType,
|
||||
autoIndexes,
|
||||
chunkSettingMode,
|
||||
chunkSize,
|
||||
chunkSplitter,
|
||||
indexSize,
|
||||
qaPrompt
|
||||
} = data;
|
||||
|
||||
// 根据处理方式,获取 auto 和 custom 的参数。
|
||||
const trainingModeSize: {
|
||||
autoChunkSize: number;
|
||||
autoIndexSize: number;
|
||||
chunkSize: number;
|
||||
indexSize: number;
|
||||
} = (() => {
|
||||
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
|
||||
return {
|
||||
autoChunkSize: getLLMDefaultChunkSize(agentModel),
|
||||
autoIndexSize: getMaxIndexSize(vectorModel),
|
||||
chunkSize,
|
||||
indexSize: getMaxIndexSize(vectorModel)
|
||||
};
|
||||
} else if (autoIndexes) {
|
||||
return {
|
||||
autoChunkSize: chunkAutoChunkSize,
|
||||
autoIndexSize: getAutoIndexSize(vectorModel),
|
||||
chunkSize,
|
||||
indexSize
|
||||
};
|
||||
} else {
|
||||
return {
|
||||
autoChunkSize: chunkAutoChunkSize,
|
||||
autoIndexSize: getAutoIndexSize(vectorModel),
|
||||
chunkSize,
|
||||
indexSize
|
||||
};
|
||||
}
|
||||
})();
|
||||
|
||||
// 获取真实参数
|
||||
const {
|
||||
chunkSize: formatChunkIndex,
|
||||
indexSize: formatIndexSize,
|
||||
chunkSplitter: formatChunkSplitter
|
||||
} = (() => {
|
||||
if (chunkSettingMode === ChunkSettingModeEnum.auto) {
|
||||
return {
|
||||
chunkSize: trainingModeSize.autoChunkSize,
|
||||
indexSize: trainingModeSize.autoIndexSize,
|
||||
chunkSplitter: ''
|
||||
};
|
||||
} else {
|
||||
return {
|
||||
chunkSize: trainingModeSize.chunkSize,
|
||||
indexSize: trainingModeSize.indexSize,
|
||||
chunkSplitter
|
||||
};
|
||||
}
|
||||
})();
|
||||
|
||||
return {
|
||||
...data,
|
||||
chunkSize: formatChunkIndex,
|
||||
indexSize: formatIndexSize,
|
||||
chunkSplitter: formatChunkSplitter,
|
||||
qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
|
||||
};
|
||||
};
|
||||
|
@@ -52,7 +52,7 @@ export const defaultFormData: ImportFormType = {
|
||||
|
||||
chunkSettingMode: ChunkSettingModeEnum.auto,
|
||||
chunkSplitMode: DataChunkSplitModeEnum.paragraph,
|
||||
paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
|
||||
paragraphChunkAIMode: ParagraphChunkAIModeEnum.forbid,
|
||||
paragraphChunkDeep: 5,
|
||||
paragraphChunkMinSize: 100,
|
||||
|
||||
@@ -198,10 +198,10 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
|
||||
const vectorModel = datasetDetail.vectorModel;
|
||||
|
||||
const processParamsForm = useForm<ImportFormType>({
|
||||
defaultValues: {
|
||||
defaultValues: (() => ({
|
||||
...defaultFormData,
|
||||
indexSize: getAutoIndexSize(vectorModel)
|
||||
}
|
||||
}))()
|
||||
});
|
||||
|
||||
const [sources, setSources] = useState<ImportSourceItemType[]>([]);
|
||||
|
@@ -17,7 +17,6 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
|
||||
import Markdown from '@/components/Markdown';
|
||||
import { useToast } from '@fastgpt/web/hooks/useToast';
|
||||
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
|
||||
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
|
||||
|
||||
const PreviewData = () => {
|
||||
const { t } = useTranslation();
|
||||
@@ -37,11 +36,7 @@ const PreviewData = () => {
|
||||
async () => {
|
||||
if (!previewFile) return { chunks: [], total: 0 };
|
||||
|
||||
const chunkData = collectionChunkForm2StoreChunkData({
|
||||
...processParamsForm.getValues(),
|
||||
vectorModel: datasetDetail.vectorModel,
|
||||
agentModel: datasetDetail.agentModel
|
||||
});
|
||||
const chunkData = processParamsForm.getValues();
|
||||
|
||||
if (importSource === ImportDataSourceEnum.fileCustom) {
|
||||
const chunkSplitter = processParamsForm.getValues('chunkSplitter');
|
||||
|
@@ -37,7 +37,6 @@ import { useContextSelector } from 'use-context-selector';
|
||||
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
|
||||
import { DatasetImportContext, type ImportFormType } from '../Context';
|
||||
import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
|
||||
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
|
||||
|
||||
const Upload = () => {
|
||||
const { t } = useTranslation();
|
||||
@@ -82,12 +81,6 @@ const Upload = () => {
|
||||
|
||||
const { runAsync: startUpload, loading: isLoading } = useRequest2(
|
||||
async ({ customPdfParse, webSelector, ...data }: ImportFormType) => {
|
||||
const chunkData = collectionChunkForm2StoreChunkData({
|
||||
...data,
|
||||
vectorModel: datasetDetail.vectorModel,
|
||||
agentModel: datasetDetail.agentModel
|
||||
});
|
||||
|
||||
if (sources.length === 0) return;
|
||||
const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
|
||||
|
||||
@@ -108,7 +101,7 @@ const Upload = () => {
|
||||
const commonParams: ApiCreateDatasetCollectionParams & {
|
||||
name: string;
|
||||
} = {
|
||||
...chunkData,
|
||||
...data,
|
||||
parentId,
|
||||
datasetId: datasetDetail._id,
|
||||
name: item.sourceName,
|
||||
|
@@ -1,7 +1,3 @@
|
||||
import {
|
||||
ChunkSettingModeEnum,
|
||||
DatasetCollectionDataProcessModeEnum
|
||||
} from '@fastgpt/global/core/dataset/constants';
|
||||
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
|
||||
import { NextAPI } from '@/service/middleware/entry';
|
||||
@@ -13,13 +9,11 @@ import {
|
||||
import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
|
||||
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
|
||||
import {
|
||||
computeChunkSize,
|
||||
computeChunkSplitter,
|
||||
computeParagraphChunkDeep,
|
||||
computedCollectionChunkSettings,
|
||||
getLLMMaxChunkSize
|
||||
} from '@fastgpt/global/core/dataset/training/utils';
|
||||
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
|
||||
import { getLLMModel } from '@fastgpt/service/core/ai/model';
|
||||
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
|
||||
import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
|
||||
|
||||
export type PostPreviewFilesChunksProps = ChunkSettingsType & {
|
||||
@@ -52,22 +46,12 @@ async function handler(
|
||||
sourceId,
|
||||
customPdfParse = false,
|
||||
|
||||
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
|
||||
|
||||
chunkTriggerType,
|
||||
chunkTriggerMinSize,
|
||||
|
||||
chunkSettingMode = ChunkSettingModeEnum.auto,
|
||||
chunkSplitMode,
|
||||
paragraphChunkDeep,
|
||||
paragraphChunkMinSize,
|
||||
chunkSize,
|
||||
chunkSplitter,
|
||||
|
||||
overlapRatio,
|
||||
selector,
|
||||
datasetId,
|
||||
externalFileId
|
||||
externalFileId,
|
||||
|
||||
...chunkSettings
|
||||
} = req.body;
|
||||
|
||||
if (!sourceId) {
|
||||
@@ -97,22 +81,10 @@ async function handler(
|
||||
return Promise.reject(CommonErrEnum.unAuthFile);
|
||||
}
|
||||
|
||||
chunkSize = computeChunkSize({
|
||||
trainingType,
|
||||
chunkSettingMode,
|
||||
chunkSplitMode,
|
||||
chunkSize,
|
||||
llmModel: getLLMModel(dataset.agentModel)
|
||||
});
|
||||
chunkSplitter = computeChunkSplitter({
|
||||
chunkSettingMode,
|
||||
chunkSplitMode,
|
||||
chunkSplitter
|
||||
});
|
||||
paragraphChunkDeep = computeParagraphChunkDeep({
|
||||
chunkSettingMode,
|
||||
chunkSplitMode,
|
||||
paragraphChunkDeep
|
||||
const formatChunkSettings = computedCollectionChunkSettings({
|
||||
...chunkSettings,
|
||||
llmModel: getLLMModel(dataset.agentModel),
|
||||
vectorModel: getEmbeddingModel(dataset.vectorModel)
|
||||
});
|
||||
|
||||
const { rawText } = await readDatasetSourceRawText({
|
||||
@@ -126,16 +98,16 @@ async function handler(
|
||||
apiDatasetServer: dataset.apiDatasetServer
|
||||
});
|
||||
|
||||
const chunks = rawText2Chunks({
|
||||
const chunks = await rawText2Chunks({
|
||||
rawText,
|
||||
chunkTriggerType,
|
||||
chunkTriggerMinSize,
|
||||
chunkSize,
|
||||
paragraphChunkDeep,
|
||||
paragraphChunkMinSize,
|
||||
chunkTriggerType: formatChunkSettings.chunkTriggerType,
|
||||
chunkTriggerMinSize: formatChunkSettings.chunkTriggerMinSize,
|
||||
chunkSize: formatChunkSettings.chunkSize,
|
||||
paragraphChunkDeep: formatChunkSettings.paragraphChunkDeep,
|
||||
paragraphChunkMinSize: formatChunkSettings.paragraphChunkMinSize,
|
||||
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
|
||||
overlapRatio,
|
||||
customReg: chunkSplitter ? [chunkSplitter] : []
|
||||
customReg: formatChunkSettings.chunkSplitter ? [formatChunkSettings.chunkSplitter] : []
|
||||
});
|
||||
|
||||
return {
|
||||
|
@@ -40,6 +40,8 @@ import { isEqual } from 'lodash';
|
||||
import { addOperationLog } from '@fastgpt/service/support/operationLog/addOperationLog';
|
||||
import { OperationLogEventEnum } from '@fastgpt/global/support/operationLog/constants';
|
||||
import { getI18nDatasetType } from '@fastgpt/service/support/operationLog/util';
|
||||
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
|
||||
import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';
|
||||
|
||||
export type DatasetUpdateQuery = {};
|
||||
export type DatasetUpdateResponse = any;
|
||||
@@ -59,7 +61,7 @@ async function handler(
|
||||
req: ApiRequestProps<DatasetUpdateBody, DatasetUpdateQuery>,
|
||||
_res: ApiResponseType<any>
|
||||
): Promise<DatasetUpdateResponse> {
|
||||
const {
|
||||
let {
|
||||
id,
|
||||
parentId,
|
||||
name,
|
||||
@@ -89,6 +91,14 @@ async function handler(
|
||||
|
||||
let targetName = '';
|
||||
|
||||
chunkSettings = chunkSettings
|
||||
? computedCollectionChunkSettings({
|
||||
...chunkSettings,
|
||||
llmModel: getLLMModel(dataset.agentModel),
|
||||
vectorModel: getEmbeddingModel(dataset.vectorModel)
|
||||
})
|
||||
: undefined;
|
||||
|
||||
if (isMove) {
|
||||
if (parentId) {
|
||||
// move to a folder, check the target folder's permission
|
||||
|
@@ -16,9 +16,9 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { type ClientSession } from '@fastgpt/service/common/mongo';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { deleteDatasetImage } from '@fastgpt/service/core/dataset/image/controller';
import { text2Chunks } from '@fastgpt/service/worker/function';

const formatIndexes = async ({
  indexes = [],
@@ -40,7 +40,7 @@ const formatIndexes = async ({
  }[]
> => {
  /* get dataset data default index */
  const getDefaultIndex = ({
  const getDefaultIndex = async ({
    q = '',
    a,
    indexSize
@@ -49,13 +49,15 @@ const formatIndexes = async ({
    a?: string;
    indexSize: number;
  }) => {
    const qChunks = splitText2Chunks({
      text: q,
      chunkSize: indexSize,
      maxSize: maxIndexSize
    }).chunks;
    const qChunks = (
      await text2Chunks({
        text: q,
        chunkSize: indexSize,
        maxSize: maxIndexSize
      })
    ).chunks;
    const aChunks = a
      ? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
      ? (await text2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize })).chunks
      : [];

    return [
@@ -80,7 +82,7 @@ const formatIndexes = async ({
    .filter((item) => !!item.text.trim());

  // Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds
  const defaultIndexes = getDefaultIndex({ q, a, indexSize });
  const defaultIndexes = await getDefaultIndex({ q, a, indexSize });

  const concatDefaultIndexes = defaultIndexes.map((item) => {
    const oldIndex = indexes!.find((index) => index.text === item.text);
@@ -114,11 +116,13 @@ const formatIndexes = async ({
      // If oversize tokens, split it
      const tokens = await countPromptTokens(item.text);
      if (tokens > maxIndexSize) {
        const splitText = splitText2Chunks({
          text: item.text,
          chunkSize: indexSize,
          maxSize: maxIndexSize
        }).chunks;
        const splitText = (
          await text2Chunks({
            text: item.text,
            chunkSize: indexSize,
            maxSize: maxIndexSize
          })
        ).chunks;
        return splitText.map((text) => ({
          text,
          type: item.type
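text2Chunks from @fastgpt/service/worker/function replaces the direct splitText2Chunks calls above, which is why getDefaultIndex and its callers become async. The worker function itself is not shown in this commit; a minimal sketch of the assumed behaviour, based on the commit message "perf: text splitter worker" (the worker file name and the test-environment fallback are assumptions):

import { Worker } from 'node:worker_threads';
import {
  splitText2Chunks,
  type SplitProps,
  type SplitResponse
} from '@fastgpt/global/common/string/textSplitter';

// Sketch: push the CPU-heavy split into a worker thread so long documents do not block the
// Node event loop; fall back to the in-process splitter where workers are unavailable (tests).
export const text2ChunksSketch = (props: SplitProps): Promise<SplitResponse> => {
  if (process.env.NODE_ENV === 'test') return Promise.resolve(splitText2Chunks(props));

  return new Promise<SplitResponse>((resolve, reject) => {
    // './textSplitter.worker.js' is an assumed bundle path, not necessarily the project's real one.
    const worker = new Worker(new URL('./textSplitter.worker.js', import.meta.url), {
      workerData: props
    });
    worker.once('message', (res: SplitResponse) => {
      resolve(res);
      void worker.terminate();
    });
    worker.once('error', (err) => {
      void worker.terminate();
      reject(err);
    });
  });
};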
@@ -1,6 +1,6 @@
/* Dataset collection source parse, not max size. */

import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import {
  DatasetCollectionDataProcessModeEnum,
  DatasetCollectionTypeEnum,
@@ -29,7 +29,7 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { POST } from '@fastgpt/service/common/api/plusRequest';
import { deleteRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/controller';
import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';

const requestLLMPargraph = async ({
  rawText,
@@ -42,13 +42,11 @@ const requestLLMPargraph = async ({
  billId: string;
  paragraphChunkAIMode: ParagraphChunkAIModeEnum;
}) => {
  return {
    resultText: rawText,
    totalInputTokens: 0,
    totalOutputTokens: 0
  };

  if (!global.feConfigs?.isPlus || !paragraphChunkAIMode) {
  if (
    !global.feConfigs?.isPlus ||
    !paragraphChunkAIMode ||
    paragraphChunkAIMode === ParagraphChunkAIModeEnum.forbid
  ) {
    return {
      resultText: rawText,
      totalInputTokens: 0,
@@ -57,16 +55,16 @@ const requestLLMPargraph = async ({
  }

  // Check is markdown text(Include 1 group of title)
  // if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
  //   const isMarkdown = /^(#+)\s/.test(rawText);
  //   if (isMarkdown) {
  //     return {
  //       resultText: rawText,
  //       totalInputTokens: 0,
  //       totalOutputTokens: 0
  //     };
  //   }
  // }
  if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
    const isMarkdown = /^(#+)\s/.test(rawText);
    if (isMarkdown) {
      return {
        resultText: rawText,
        totalInputTokens: 0,
        totalOutputTokens: 0
      };
    }
  }

  const data = await POST<{
    resultText: string;
@@ -226,15 +224,25 @@ export const datasetParseQueue = async (): Promise<any> => {
    });

    // 3. LLM Pargraph
    const { resultText } = await requestLLMPargraph({
    const { resultText, totalInputTokens, totalOutputTokens } = await requestLLMPargraph({
      rawText,
      model: dataset.agentModel,
      billId: data.billId,
      paragraphChunkAIMode: collection.paragraphChunkAIMode
    });
    // Push usage
    pushLLMTrainingUsage({
      teamId: data.teamId,
      tmbId: data.tmbId,
      model: dataset.agentModel,
      inputTokens: totalInputTokens,
      outputTokens: totalOutputTokens,
      billId: data.billId,
      mode: 'paragraph'
    });

    // 4. Chunk split
    const chunks = rawText2Chunks({
    const chunks = await rawText2Chunks({
      rawText: resultText,
      chunkTriggerType: collection.chunkTriggerType,
      chunkTriggerMinSize: collection.chunkTriggerMinSize,
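Taken together, the three ParagraphChunkAIModeEnum values now behave as follows inside requestLLMPargraph, summarized here as an illustrative helper (the real logic stays inlined above and additionally requires the commercial isPlus build):

import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';

// Illustrative only: whether the parse queue should ask the LLM to re-paragraph the raw text.
const shouldRequestLLMParagraph = (mode: ParagraphChunkAIModeEnum, rawText: string): boolean => {
  if (mode === ParagraphChunkAIModeEnum.forbid) return false; // never call the LLM
  if (mode === ParagraphChunkAIModeEnum.auto) {
    // auto: skip the LLM when the text already carries markdown headings
    return !/^(#+)\s/.test(rawText);
  }
  return true; // force: always call the LLM
};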
@@ -1,10 +1,9 @@
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { pushQAUsage } from '@/service/support/wallet/usage/push';
import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { createChatCompletion } from '@fastgpt/service/core/ai/config';
import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d';
import { addLog } from '@fastgpt/service/common/system/log';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { replaceVariable } from '@fastgpt/global/common/string/tools';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';
@@ -24,6 +23,7 @@ import {
  getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { text2Chunks } from '@fastgpt/service/worker/function';

const reduceQueue = () => {
  global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -144,7 +144,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
      const inputTokens = usage?.prompt_tokens || (await countGptMessagesTokens(messages));
      const outputTokens = usage?.completion_tokens || (await countPromptTokens(answer));

      const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // 格式化后的QA对
      const qaArr = await formatSplitText({ answer, rawText: text, llmModel: modelData }); // 格式化后的QA对

      // get vector and insert
      await pushDataListToTrainingQueueByCollectionId({
@@ -163,13 +163,14 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
      await MongoDatasetTraining.findByIdAndDelete(data._id);

      // add bill
      pushQAUsage({
      pushLLMTrainingUsage({
        teamId: data.teamId,
        tmbId: data.tmbId,
        inputTokens,
        outputTokens,
        billId: data.billId,
        model: modelData.model
        model: modelData.model,
        mode: 'qa'
      });
      addLog.info(`[QA Queue] Finish`, {
        time: Date.now() - startTime,
@@ -196,7 +197,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
    }

// Format qa answer
function formatSplitText({
async function formatSplitText({
  answer,
  rawText,
  llmModel
@@ -223,7 +224,7 @@ function formatSplitText({

  // empty result. direct split chunk
  if (result.length === 0) {
    const { chunks } = splitText2Chunks({
    const { chunks } = await text2Chunks({
      text: rawText,
      chunkSize: chunkAutoChunkSize,
      maxSize: getLLMMaxChunkSize(llmModel)
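When the model's answer yields no parseable Q/A pairs, formatSplitText now falls back to the worker-based splitter, which is why the function becomes async. Presumably the resulting chunks are kept as question-only pairs, roughly as sketched below (the surrounding QA parsing code is unchanged by this commit; the q/a mapping is an assumption):

import { text2Chunks } from '@fastgpt/service/worker/function';

// Sketch of the empty-result fallback: chunk the raw text directly and keep each chunk as a
// question with an empty answer. chunkSize/maxSize come from the caller's context.
const fallbackQAPairs = async (rawText: string, chunkSize: number, maxSize: number) => {
  const { chunks } = await text2Chunks({ text: rawText, chunkSize, maxSize });
  return chunks.map((q) => ({ q, a: '' }));
};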
@@ -5,42 +5,6 @@ import { i18nT } from '@fastgpt/web/i18n/utils';
import { ModelTypeEnum } from '@fastgpt/global/core/ai/model';
import { getDefaultTTSModel } from '@fastgpt/service/core/ai/model';

export const pushQAUsage = async ({
  teamId,
  tmbId,
  model,
  inputTokens,
  outputTokens,
  billId
}: {
  teamId: string;
  tmbId: string;
  model: string;
  inputTokens: number;
  outputTokens: number;
  billId: string;
}) => {
  // 计算价格
  const { totalPoints } = formatModelChars2Points({
    model,
    modelType: ModelTypeEnum.llm,
    inputTokens,
    outputTokens
  });

  concatUsage({
    billId,
    teamId,
    tmbId,
    totalPoints,
    inputTokens,
    outputTokens,
    listIndex: 1
  });

  return { totalPoints };
};

export const pushGenerateVectorUsage = ({
  billId,
  teamId,
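pushQAUsage is deleted here because both the QA extraction queue and the new LLM paragraph step now bill through pushLLMTrainingUsage in @fastgpt/service/support/wallet/usage/controller. The shared helper is not part of this diff; its call sites above imply roughly this shape (the return type and the exact set of mode values are assumptions):

// Assumed signature of the consolidated training-usage helper, inferred from its call sites.
type PushLLMTrainingUsageProps = {
  teamId: string;
  tmbId: string;
  model: string;
  inputTokens: number;
  outputTokens: number;
  billId: string;
  mode: 'qa' | 'paragraph'; // which training step consumed the tokens
};

declare function pushLLMTrainingUsage(props: PushLLMTrainingUsageProps): Promise<{ totalPoints: number }>;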
@@ -16,7 +16,7 @@ const formatResult = (result: string[]) => {
};

// 最大值分块测试-小于最大值,不分块
it(`Test splitText2Chunks 1`, () => {
it(`Test splitText2Chunks 1`, async () => {
  const mock = {
    text: `# A

@@ -61,7 +61,7 @@ dsgsgfsgs22sddddddd`
    ]
  };

  const data = rawText2Chunks({
  const data = await rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
    chunkTriggerMinSize: 1000,
@@ -72,7 +72,7 @@ dsgsgfsgs22sddddddd`
  expect(formatChunks(data)).toEqual(formatResult(mock.result));
});
// 最大值分块测试-大于最大值,分块
it(`Test splitText2Chunks 2`, () => {
it(`Test splitText2Chunks 2`, async () => {
  const mock = {
    text: `# A

@@ -122,7 +122,7 @@ dsgsgfsgs22sddddddd`
    ]
  };

  const data = rawText2Chunks({
  const data = await rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
    chunkTriggerMinSize: 10,
@@ -135,7 +135,7 @@ dsgsgfsgs22sddddddd`
});

// 最小值分块测试-大于最小值,不分块
it(`Test splitText2Chunks 3`, () => {
it(`Test splitText2Chunks 3`, async () => {
  const mock = {
    text: `# A

@@ -179,7 +179,7 @@ it(`Test splitText2Chunks 3`, () => {
    ]
  };

  const data = rawText2Chunks({
  const data = await rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
    chunkTriggerMinSize: 1000,
@@ -191,7 +191,7 @@ it(`Test splitText2Chunks 3`, () => {
  expect(formatChunks(data)).toEqual(formatResult(mock.result));
});
// 最小值分块测试-小于最小值,分块
it(`Test splitText2Chunks 4`, () => {
it(`Test splitText2Chunks 4`, async () => {
  const mock = {
    text: `# A

@@ -241,7 +241,7 @@ dsgsgfsgs22sddddddd`,
    ]
  };

  const data = rawText2Chunks({
  const data = await rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
    chunkTriggerMinSize: 10,
@@ -254,7 +254,7 @@ dsgsgfsgs22sddddddd`,
});

// 强制分块测试-小于最小值和最大值
it(`Test splitText2Chunks 5`, () => {
it(`Test splitText2Chunks 5`, async () => {
  const mock = {
    text: `# A

@@ -304,7 +304,7 @@ dsgsgfsgs22sddddddd`,
    ]
  };

  const data = rawText2Chunks({
  const data = await rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
    chunkTriggerMinSize: 1000,
@@ -317,7 +317,7 @@ dsgsgfsgs22sddddddd`,
});

// 强制分块测试-大于最小值
it(`Test splitText2Chunks 6`, () => {
it(`Test splitText2Chunks 6`, async () => {
  const mock = {
    text: `# A

@@ -367,7 +367,7 @@ dsgsgfsgs22sddddddd`,
    ]
  };

  const data = rawText2Chunks({
  const data = await rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
    chunkTriggerMinSize: 10,
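All six test cases change in the same mechanical way because rawText2Chunks now returns a promise; vitest (which this test file appears to use) awaits an async it callback, so no other changes are needed. The pattern in isolation, with a stand-in splitter since the real import path is not visible in this diff:

import { it, expect } from 'vitest';

// Minimal illustration of the sync-to-async test conversion; rawText2ChunksLike stands in for
// the real worker-backed splitter.
const rawText2ChunksLike = async (rawText: string) => rawText.split('\n\n');

it('awaits the worker-backed splitter', async () => {
  const chunks = await rawText2ChunksLike('# A\n\ncontent');
  expect(chunks.length).toBeGreaterThan(0);
});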