perf: password special chars; feat: llm paragraph; perf: chunk setting params; perf: text splitter worker (#4984)

* perf: password special chars

* feat: llm paragraph;perf: chunk setting params

* perf: text splitter worker

* perf: get rawtext buffer

* fix: test

* fix: test

* doc

* min chunk size
Archer
2025-06-10 00:05:54 +08:00
committed by GitHub
parent 068918a9ee
commit 01ff56b42b
41 changed files with 546 additions and 448 deletions

View File

@@ -0,0 +1,22 @@
---
title: 'V4.9.12 (in progress)'
description: 'FastGPT V4.9.12 release notes'
icon: 'upgrade'
draft: false
toc: true
weight: 789
---

## 🚀 New features

1. Commercial edition: the LLM can automatically detect paragraphs when chunking knowledge base documents.

## ⚙️ Optimizations

1. Password validation accepts more special characters.
2. All knowledge base chunk parameters are now computed on the backend, so auto mode no longer misses default values for some parameters.
3. Text chunking is moved to a worker thread to avoid blocking the main thread (a minimal sketch follows these notes).

## 🐛 Fixes

1. Custom QA extraction prompt being overwritten.
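Item 3 above is the main performance change in this PR. A minimal illustrative sketch of the idea, using Node's `worker_threads` directly rather than the project's own `runWorker` helper; the worker file path and message shape are assumptions:

```ts
// Illustrative only: hand the text and chunk options to a worker and await its
// single reply, so splitting a large document no longer blocks the event loop.
import { Worker } from 'worker_threads';
import path from 'path';

type SplitProps = { text: string; chunkSize: number };
type SplitResponse = { chunks: string[]; chars: number };

export const splitInWorker = (props: SplitProps) =>
  new Promise<SplitResponse>((resolve, reject) => {
    // Assumed compiled worker entry next to this file.
    const worker = new Worker(path.join(__dirname, 'text2Chunks.js'));

    worker.once('message', (msg: { type: 'success' | 'error'; data: any }) => {
      if (msg.type === 'success') resolve(msg.data);
      else reject(msg.data);
      void worker.terminate();
    });
    worker.once('error', reject);

    worker.postMessage(props);
  });
```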

View File

@@ -3,9 +3,9 @@ export const checkPasswordRule = (password: string) => {
    /\d/, // Contains digits
    /[a-z]/, // Contains lowercase letters
    /[A-Z]/, // Contains uppercase letters
-   /[!@#$%^&*()_+=-]/ // Contains special characters
+   /[!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]/ // Contains special characters
  ];
- const validChars = /^[\dA-Za-z!@#$%^&*()_+=-]{8,100}$/;
+ const validChars = /^[\dA-Za-z!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]{8,100}$/;
  // Check length and valid characters
  if (!validChars.test(password)) return false;
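A minimal sketch of the widened rule in use. The two regexes come from the diff above; the "at least two character classes" threshold is an assumption about the surrounding function:

```ts
// Sketch, not the project's exact implementation.
const checkPasswordRule = (password: string) => {
  const patterns = [
    /\d/, // digits
    /[a-z]/, // lowercase letters
    /[A-Z]/, // uppercase letters
    /[!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]/ // expanded special-character set
  ];
  const validChars = /^[\dA-Za-z!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]{8,100}$/;

  // Length and allow-list check first, then require at least two classes (assumed).
  if (!validChars.test(password)) return false;
  return patterns.filter((reg) => reg.test(password)).length >= 2;
};

console.log(checkPasswordRule('Abc123?!')); // true: 8 chars, several character classes
console.log(checkPasswordRule('abc 123')); // false: space is not in the allow-list
```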

View File

@@ -1,10 +1,11 @@
 import { defaultMaxChunkSize } from '../../core/dataset/training/utils';
 import { getErrText } from '../error/utils';
+import { simpleText } from './tools';
 import { getTextValidLength } from './utils';

 export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';

-type SplitProps = {
+export type SplitProps = {
   text: string;
   chunkSize: number;
@@ -19,7 +20,7 @@ export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkSize'> & {
   chunkSize?: number;
 };

-type SplitResponse = {
+export type SplitResponse = {
   chunks: string[];
   chars: number;
 };
@@ -474,7 +475,10 @@ export const splitText2Chunks = (props: SplitProps): SplitResponse => {
   });

   return {
-    chunks: splitResult.map((item) => item.chunks).flat(),
+    chunks: splitResult
+      .map((item) => item.chunks)
+      .flat()
+      .map((chunk) => simpleText(chunk)),
     chars: splitResult.reduce((sum, item) => sum + item.chars, 0)
   };
 };
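A hedged usage sketch of the exported splitter after this change; real callers also pass options such as maxSize and overlapRatio, and every returned chunk is now normalized through `simpleText`:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Sketch: split a small markdown string into chunks of at most ~512 chars.
const { chunks, chars } = splitText2Chunks({
  text: '# Title\n\nFirst paragraph of the document.\n\nSecond paragraph of the document.',
  chunkSize: 512
});

console.log(chunks.length, chars); // chunks have been passed through simpleText()
```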

View File

@@ -7,3 +7,4 @@ export const DEFAULT_ORG_AVATAR = '/imgs/avatar/defaultOrgAvatar.svg';
 export const DEFAULT_USER_AVATAR = '/imgs/avatar/BlueAvatar.svg';
 export const isProduction = process.env.NODE_ENV === 'production';
+export const isTestEnv = process.env.NODE_ENV === 'test';

View File

@@ -211,7 +211,8 @@ export enum DataChunkSplitModeEnum {
 }

 export enum ParagraphChunkAIModeEnum {
   auto = 'auto',
-  force = 'force'
+  force = 'force',
+  forbid = 'forbid'
 }

 /* ------------ data -------------- */

View File

@@ -3,8 +3,11 @@ import { type EmbeddingModelItemType, type LLMModelItemType } from '../../../cor
 import {
   ChunkSettingModeEnum,
   DataChunkSplitModeEnum,
-  DatasetCollectionDataProcessModeEnum
+  DatasetCollectionDataProcessModeEnum,
+  ParagraphChunkAIModeEnum
 } from '../constants';
+import type { ChunkSettingsType } from '../type';
+import { cloneDeep } from 'lodash';

 export const minChunkSize = 64; // min index and chunk size
@@ -103,53 +106,78 @@ export const getIndexSizeSelectList = (max = 512) => {
}; };
// Compute // Compute
export const computeChunkSize = (params: { export const computedCollectionChunkSettings = <T extends ChunkSettingsType>({
trainingType: DatasetCollectionDataProcessModeEnum; llmModel,
chunkSettingMode?: ChunkSettingModeEnum; vectorModel,
chunkSplitMode?: DataChunkSplitModeEnum; ...data
}: {
llmModel?: LLMModelItemType; llmModel?: LLMModelItemType;
chunkSize?: number; vectorModel?: EmbeddingModelItemType;
}) => { } & T) => {
if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) { const {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { trainingType = DatasetCollectionDataProcessModeEnum.chunk,
return getLLMDefaultChunkSize(params.llmModel); chunkSettingMode = ChunkSettingModeEnum.auto,
chunkSplitMode,
chunkSize,
paragraphChunkDeep = 5,
indexSize,
autoIndexes
} = data;
const cloneChunkSettings = cloneDeep(data);
if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
delete cloneChunkSettings.qaPrompt;
}
// Format training type indexSize/chunkSize
const trainingModeSize: {
autoChunkSize: number;
autoIndexSize: number;
chunkSize?: number;
indexSize?: number;
} = (() => {
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return {
autoChunkSize: getLLMDefaultChunkSize(llmModel),
autoIndexSize: getMaxIndexSize(vectorModel),
chunkSize,
indexSize: getMaxIndexSize(vectorModel)
};
} else if (autoIndexes) {
return {
autoChunkSize: chunkAutoChunkSize,
autoIndexSize: getAutoIndexSize(vectorModel),
chunkSize,
indexSize
};
} else {
return {
autoChunkSize: chunkAutoChunkSize,
autoIndexSize: getAutoIndexSize(vectorModel),
chunkSize,
indexSize
};
} }
})();
if (chunkSettingMode === ChunkSettingModeEnum.auto) {
cloneChunkSettings.chunkSplitMode = DataChunkSplitModeEnum.paragraph;
cloneChunkSettings.paragraphChunkAIMode = ParagraphChunkAIModeEnum.forbid;
cloneChunkSettings.paragraphChunkDeep = 5;
cloneChunkSettings.paragraphChunkMinSize = 100;
cloneChunkSettings.chunkSize = trainingModeSize.autoChunkSize;
cloneChunkSettings.indexSize = trainingModeSize.autoIndexSize;
cloneChunkSettings.chunkSplitter = undefined;
} else { } else {
// chunk cloneChunkSettings.paragraphChunkDeep =
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { chunkSplitMode === DataChunkSplitModeEnum.paragraph ? paragraphChunkDeep : 0;
return chunkAutoChunkSize;
} cloneChunkSettings.chunkSize = trainingModeSize.chunkSize
? Math.min(trainingModeSize.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(llmModel))
: undefined;
cloneChunkSettings.indexSize = trainingModeSize.indexSize;
} }
if (params.chunkSplitMode === DataChunkSplitModeEnum.char) { return cloneChunkSettings;
return getLLMMaxChunkSize(params.llmModel);
}
return Math.min(params.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
};
export const computeChunkSplitter = (params: {
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSplitter?: string;
}) => {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return undefined;
}
if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) {
return undefined;
}
return params.chunkSplitter;
};
export const computeParagraphChunkDeep = (params: {
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
paragraphChunkDeep?: number;
}) => {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return 5;
}
if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) {
return params.paragraphChunkDeep;
}
return 0;
}; };
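The hunk above collapses `computeChunkSize`, `computeChunkSplitter` and `computeParagraphChunkDeep` into a single `computedCollectionChunkSettings` helper. A hedged sketch of how a caller uses it, mirroring the preview and create-collection routes later in this diff; the `declare`d dataset shape is a stand-in:

```ts
import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';
import {
  ChunkSettingModeEnum,
  DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';

// Stand-in for a loaded dataset document; only the two model ids are needed here.
declare const dataset: { agentModel: string; vectorModel: string };

const formatted = computedCollectionChunkSettings({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  chunkSettingMode: ChunkSettingModeEnum.auto,
  llmModel: getLLMModel(dataset.agentModel),
  vectorModel: getEmbeddingModel(dataset.vectorModel)
});

// In auto mode the helper fills chunkSplitMode, paragraph settings, chunkSize and
// indexSize with model-aware defaults, so API routes no longer need to combine
// the three old compute* helpers themselves.
console.log(formatted.chunkSize, formatted.indexSize);
```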

View File

@@ -15,9 +15,11 @@
     "next": "14.2.28",
     "openai": "4.61.0",
     "openapi-types": "^12.1.3",
-    "timezones-list": "^3.0.2"
+    "timezones-list": "^3.0.2",
+    "lodash": "^4.17.21"
   },
   "devDependencies": {
+    "@types/lodash": "^4.14.191",
     "@types/js-yaml": "^4.0.9",
     "@types/node": "20.14.0"
   }

View File

@@ -5,6 +5,8 @@ import { addLog } from '../../system/log';
 import { setCron } from '../../system/cron';
 import { checkTimerLock } from '../../system/timerLock/utils';
 import { TimerIdEnum } from '../../system/timerLock/constants';
+import { gridFsStream2Buffer } from '../../file/gridfs/utils';
+import { readRawContentFromBuffer } from '../../../worker/function';

 const getGridBucket = () => {
   return new connectionMongo.mongo.GridFSBucket(connectionMongo.connection.db!, {
@@ -85,30 +87,27 @@ export const getRawTextBuffer = async (sourceId: string) => {
     // Read file content
     const downloadStream = gridBucket.openDownloadStream(bufferData._id);
-    const chunks: Buffer[] = [];

-    return new Promise<{
-      text: string;
-      sourceName: string;
-    } | null>((resolve, reject) => {
-      downloadStream.on('data', (chunk) => {
-        chunks.push(chunk);
-      });
+    const fileBuffers = await gridFsStream2Buffer(downloadStream);

-      downloadStream.on('end', () => {
-        const buffer = Buffer.concat(chunks);
-        const text = buffer.toString('utf8');
-        resolve({
-          text,
-          sourceName: bufferData.metadata?.sourceName || ''
-        });
-      });
+    const rawText = await (async () => {
+      if (fileBuffers.length < 10000000) {
+        return fileBuffers.toString('utf8');
+      } else {
+        return (
+          await readRawContentFromBuffer({
+            extension: 'txt',
+            encoding: 'utf8',
+            buffer: fileBuffers
+          })
+        ).rawText;
+      }
+    })();

-      downloadStream.on('error', (error) => {
-        addLog.error('getRawTextBuffer error', error);
-        resolve(null);
-      });
-    });
+    return {
+      text: rawText,
+      sourceName: bufferData.metadata?.sourceName || ''
+    };
   });
 };

View File

@@ -55,13 +55,17 @@ export const createFileFromText = async ({
 export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => {
   return new Promise<Buffer>((resolve, reject) => {
+    if (!stream.readable) {
+      return resolve(Buffer.from([]));
+    }
     const chunks: Uint8Array[] = [];

     stream.on('data', (chunk) => {
       chunks.push(chunk);
     });
     stream.on('end', () => {
-      const resultBuffer = Buffer.concat(chunks); // 一次性拼接
+      const resultBuffer = Buffer.concat(chunks); // One-time splicing
       resolve(resultBuffer);
     });
     stream.on('error', (err) => {

View File

@@ -1,6 +1,5 @@
 import { uploadMongoImg } from '../image/controller';
 import FormData from 'form-data';
-import { WorkerNameEnum, runWorker } from '../../../worker/utils';
 import fs from 'fs';
 import type { ReadFileResponse } from '../../../worker/readFile/type';
 import axios from 'axios';
@@ -9,6 +8,7 @@ import { batchRun } from '@fastgpt/global/common/system/utils';
 import { matchMdImg } from '@fastgpt/global/common/string/markdown';
 import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
 import { useDoc2xServer } from '../../../thirdProvider/doc2x';
+import { readRawContentFromBuffer } from '../../../worker/function';

 export type readRawTextByLocalFileParams = {
   teamId: string;
@@ -63,11 +63,10 @@ export const readRawContentByFileBuffer = async ({
   rawText: string;
 }> => {
   const systemParse = () =>
-    runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
+    readRawContentFromBuffer({
       extension,
       encoding,
-      buffer,
-      teamId
+      buffer
     });

   const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
     const url = global.systemEnv.customPdfParse?.url;

View File

@@ -1,3 +1,4 @@
+import { isTestEnv } from '@fastgpt/global/common/system/constants';
 import { addLog } from '../../common/system/log';
 import type { Model } from 'mongoose';
 import mongoose, { Mongoose } from 'mongoose';
@@ -70,7 +71,7 @@ const addCommonMiddleware = (schema: mongoose.Schema) => {
 export const getMongoModel = <T>(name: string, schema: mongoose.Schema) => {
   if (connectionMongo.models[name]) return connectionMongo.models[name] as Model<T>;
-  if (process.env.NODE_ENV !== 'test') console.log('Load model======', name);
+  if (!isTestEnv) console.log('Load model======', name);

   addCommonMiddleware(schema);
   const model = connectionMongo.model<T>(name, schema);

View File

@@ -32,10 +32,7 @@ import { MongoDatasetDataText } from '../data/dataTextSchema';
import { retryFn } from '@fastgpt/global/common/system/utils'; import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils'; import { getTrainingModeByCollection } from './utils';
import { import {
computeChunkSize, computedCollectionChunkSettings,
computeChunkSplitter,
computeParagraphChunkDeep,
getAutoIndexSize,
getLLMMaxChunkSize getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils'; } from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
@@ -68,31 +65,50 @@ export const createCollectionAndInsertData = async ({
createCollectionParams.autoIndexes = true; createCollectionParams.autoIndexes = true;
} }
const teamId = createCollectionParams.teamId; const formatCreateCollectionParams = computedCollectionChunkSettings({
const tmbId = createCollectionParams.tmbId; ...createCollectionParams,
llmModel: getLLMModel(dataset.agentModel),
vectorModel: getEmbeddingModel(dataset.vectorModel)
});
const teamId = formatCreateCollectionParams.teamId;
const tmbId = formatCreateCollectionParams.tmbId;
// Set default params // Set default params
const trainingType = const trainingType =
createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk; formatCreateCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
const chunkSplitter = computeChunkSplitter(createCollectionParams);
const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);
const trainingMode = getTrainingModeByCollection({ const trainingMode = getTrainingModeByCollection({
trainingType: trainingType, trainingType: trainingType,
autoIndexes: createCollectionParams.autoIndexes, autoIndexes: formatCreateCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex imageIndex: formatCreateCollectionParams.imageIndex
}); });
if ( if (
trainingType === DatasetCollectionDataProcessModeEnum.qa || trainingType === DatasetCollectionDataProcessModeEnum.qa ||
trainingType === DatasetCollectionDataProcessModeEnum.backup trainingType === DatasetCollectionDataProcessModeEnum.backup ||
trainingType === DatasetCollectionDataProcessModeEnum.template
) { ) {
delete createCollectionParams.chunkTriggerType; delete formatCreateCollectionParams.chunkTriggerType;
delete createCollectionParams.chunkTriggerMinSize; delete formatCreateCollectionParams.chunkTriggerMinSize;
delete createCollectionParams.dataEnhanceCollectionName; delete formatCreateCollectionParams.dataEnhanceCollectionName;
delete createCollectionParams.imageIndex; delete formatCreateCollectionParams.imageIndex;
delete createCollectionParams.autoIndexes; delete formatCreateCollectionParams.autoIndexes;
delete createCollectionParams.indexSize;
delete createCollectionParams.qaPrompt; if (
trainingType === DatasetCollectionDataProcessModeEnum.backup ||
trainingType === DatasetCollectionDataProcessModeEnum.template
) {
delete formatCreateCollectionParams.paragraphChunkAIMode;
delete formatCreateCollectionParams.paragraphChunkDeep;
delete formatCreateCollectionParams.paragraphChunkMinSize;
delete formatCreateCollectionParams.chunkSplitMode;
delete formatCreateCollectionParams.chunkSize;
delete formatCreateCollectionParams.chunkSplitter;
delete formatCreateCollectionParams.indexSize;
}
}
if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
delete formatCreateCollectionParams.qaPrompt;
} }
// 1. split chunks or create image chunks // 1. split chunks or create image chunks
@@ -109,30 +125,27 @@ export const createCollectionAndInsertData = async ({
}>; }>;
chunkSize?: number; chunkSize?: number;
indexSize?: number; indexSize?: number;
} = (() => { } = await (async () => {
if (rawText) { if (rawText) {
const chunkSize = computeChunkSize({
...createCollectionParams,
trainingType,
llmModel: getLLMModel(dataset.agentModel)
});
// Process text chunks // Process text chunks
const chunks = rawText2Chunks({ const chunks = await rawText2Chunks({
rawText, rawText,
chunkTriggerType: createCollectionParams.chunkTriggerType, chunkTriggerType: formatCreateCollectionParams.chunkTriggerType,
chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize, chunkTriggerMinSize: formatCreateCollectionParams.chunkTriggerMinSize,
chunkSize, chunkSize: formatCreateCollectionParams.chunkSize,
paragraphChunkDeep, paragraphChunkDeep: formatCreateCollectionParams.paragraphChunkDeep,
paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize, paragraphChunkMinSize: formatCreateCollectionParams.paragraphChunkMinSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [], customReg: formatCreateCollectionParams.chunkSplitter
? [formatCreateCollectionParams.chunkSplitter]
: [],
backupParse backupParse
}); });
return { return {
chunks, chunks,
chunkSize, chunkSize: formatCreateCollectionParams.chunkSize,
indexSize: createCollectionParams.indexSize ?? getAutoIndexSize(dataset.vectorModel) indexSize: formatCreateCollectionParams.indexSize
}; };
} }
@@ -147,12 +160,8 @@ export const createCollectionAndInsertData = async ({
return { return {
chunks: [], chunks: [],
chunkSize: computeChunkSize({ chunkSize: formatCreateCollectionParams.chunkSize,
...createCollectionParams, indexSize: formatCreateCollectionParams.indexSize
trainingType,
llmModel: getLLMModel(dataset.agentModel)
}),
indexSize: createCollectionParams.indexSize ?? getAutoIndexSize(dataset.vectorModel)
}; };
})(); })();
@@ -165,11 +174,9 @@ export const createCollectionAndInsertData = async ({
const fn = async (session: ClientSession) => { const fn = async (session: ClientSession) => {
// 3. Create collection // 3. Create collection
const { _id: collectionId } = await createOneCollection({ const { _id: collectionId } = await createOneCollection({
...createCollectionParams, ...formatCreateCollectionParams,
trainingType, trainingType,
paragraphChunkDeep,
chunkSize, chunkSize,
chunkSplitter,
indexSize, indexSize,
hashRawText: rawText ? hashStr(rawText) : undefined, hashRawText: rawText ? hashStr(rawText) : undefined,
@@ -179,7 +186,7 @@ export const createCollectionAndInsertData = async ({
if (!dataset.autoSync && dataset.type === DatasetTypeEnum.websiteDataset) return undefined; if (!dataset.autoSync && dataset.type === DatasetTypeEnum.websiteDataset) return undefined;
if ( if (
[DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes( [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes(
createCollectionParams.type formatCreateCollectionParams.type
) )
) { ) {
return addDays(new Date(), 1); return addDays(new Date(), 1);
@@ -195,7 +202,7 @@ export const createCollectionAndInsertData = async ({
const { billId: newBillId } = await createTrainingUsage({ const { billId: newBillId } = await createTrainingUsage({
teamId, teamId,
tmbId, tmbId,
appName: createCollectionParams.name, appName: formatCreateCollectionParams.name,
billSource: UsageSourceEnum.training, billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name, vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name, agentModel: getLLMModel(dataset.agentModel)?.name,
@@ -218,7 +225,7 @@ export const createCollectionAndInsertData = async ({
vlmModel: dataset.vlmModel, vlmModel: dataset.vlmModel,
indexSize, indexSize,
mode: trainingMode, mode: trainingMode,
prompt: createCollectionParams.qaPrompt, prompt: formatCreateCollectionParams.qaPrompt,
billId: traingBillId, billId: traingBillId,
data: chunks.map((item, index) => ({ data: chunks.map((item, index) => ({
...item, ...item,

View File

@@ -5,13 +5,14 @@ import {
 } from '@fastgpt/global/core/dataset/constants';
 import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
 import { urlsFetch } from '../../common/string/cheerio';
-import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
+import { type TextSplitProps } from '@fastgpt/global/common/string/textSplitter';
 import axios from 'axios';
 import { readRawContentByFileBuffer } from '../../common/file/read/utils';
 import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
 import { getApiDatasetRequest } from './apiDataset';
 import Papa from 'papaparse';
 import type { ApiDatasetServerType } from '@fastgpt/global/core/dataset/apiDataset/type';
+import { text2Chunks } from '../../worker/function';

 export const readFileRawTextByUrl = async ({
   teamId,
@@ -165,7 +166,7 @@ export const readApiServerFileContent = async ({
   });
 };

-export const rawText2Chunks = ({
+export const rawText2Chunks = async ({
   rawText,
   chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
   chunkTriggerMinSize = 1000,
@@ -182,12 +183,14 @@
   backupParse?: boolean;
   tableParse?: boolean;
-} & TextSplitProps): {
-  q: string;
-  a: string;
-  indexes?: string[];
-  imageIdList?: string[];
-}[] => {
+} & TextSplitProps): Promise<
+  {
+    q: string;
+    a: string;
+    indexes?: string[];
+    imageIdList?: string[];
+  }[]
+> => {
   const parseDatasetBackup2Chunks = (rawText: string) => {
     const csvArr = Papa.parse(rawText).data as string[][];
@@ -233,7 +236,7 @@ export const rawText2Chunks = ({
     }
   }

-  const { chunks } = splitText2Chunks({
+  const { chunks } = await text2Chunks({
     text: rawText,
     chunkSize,
     ...splitProps

View File

@@ -112,24 +112,15 @@ export async function pushDataListToTrainingQueue({
   // format q and a, remove empty char
   data = data.filter((item) => {
-    item.q = simpleText(item.q);
-    item.a = simpleText(item.a);
-    item.indexes = item.indexes
-      ?.map((index) => {
-        return {
-          ...index,
-          text: simpleText(index.text)
-        };
-      })
-      .filter(Boolean);
+    const q = item.q || '';
+    const a = item.a || '';

     // filter repeat content
-    if (!item.imageId && !item.q) {
+    if (!item.imageId && !q) {
       return;
     }

-    const text = item.q + item.a;
+    const text = q + a;

     // Oversize llm tokens
     if (text.length > maxToken) {

View File

@@ -8,6 +8,8 @@ import {
   type CreateUsageProps
 } from '@fastgpt/global/support/wallet/usage/api';
 import { i18nT } from '../../../../web/i18n/utils';
+import { formatModelChars2Points } from './utils';
+import { ModelTypeEnum } from '@fastgpt/global/core/ai/model';

 export async function createUsage(data: CreateUsageProps) {
   try {
@@ -67,6 +69,14 @@ export const createChatUsage = ({
   return { totalPoints };
 };

+export type DatasetTrainingMode = 'paragraph' | 'qa' | 'autoIndex' | 'imageIndex' | 'imageParse';
+export const datasetTrainingUsageIndexMap: Record<DatasetTrainingMode, number> = {
+  paragraph: 1,
+  qa: 2,
+  autoIndex: 3,
+  imageIndex: 4,
+  imageParse: 5
+};
 export const createTrainingUsage = async ({
   teamId,
   tmbId,
@@ -108,6 +118,13 @@ export const createTrainingUsage = async ({
             : []),
           ...(agentModel
             ? [
+                {
+                  moduleName: i18nT('account_usage:llm_paragraph'),
+                  model: agentModel,
+                  amount: 0,
+                  inputTokens: 0,
+                  outputTokens: 0
+                },
                 {
                   moduleName: i18nT('account_usage:qa'),
                   model: agentModel,
@@ -126,6 +143,13 @@ export const createTrainingUsage = async ({
             : []),
           ...(vllmModel
             ? [
+                {
+                  moduleName: i18nT('account_usage:image_index'),
+                  model: vllmModel,
+                  amount: 0,
+                  inputTokens: 0,
+                  outputTokens: 0
+                },
                 {
                   moduleName: i18nT('account_usage:image_parse'),
                   model: vllmModel,
@@ -171,3 +195,43 @@ export const createPdfParseUsage = async ({
     ]
   });
 };
export const pushLLMTrainingUsage = async ({
teamId,
tmbId,
model,
inputTokens,
outputTokens,
billId,
mode
}: {
teamId: string;
tmbId: string;
model: string;
inputTokens: number;
outputTokens: number;
billId: string;
mode: DatasetTrainingMode;
}) => {
const index = datasetTrainingUsageIndexMap[mode];
// Compute points
const { totalPoints } = formatModelChars2Points({
model,
modelType: ModelTypeEnum.llm,
inputTokens,
outputTokens
});
concatUsage({
billId,
teamId,
tmbId,
totalPoints,
inputTokens,
outputTokens,
listIndex: index
});
return { totalPoints };
};
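A hedged usage sketch of the new `pushLLMTrainingUsage` helper added above; the ids and token counts are placeholders and the import specifier is assumed from the package layout:

```ts
import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';

// Record LLM paragraph-split spend against an existing training bill.
await pushLLMTrainingUsage({
  teamId: 'team_id_placeholder',
  tmbId: 'tmb_id_placeholder',
  billId: 'bill_id_placeholder',
  model: 'gpt-4o-mini', // any configured LLM model id
  inputTokens: 1200,
  outputTokens: 300,
  mode: 'paragraph' // maps to listIndex 1 via datasetTrainingUsageIndexMap
});
```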

View File

@@ -0,0 +1,18 @@
import type { MessagePort } from 'worker_threads';
export const workerResponse = ({
parentPort,
status,
data
}: {
parentPort: MessagePort | null;
status: 'success' | 'error';
data: any;
}) => {
parentPort?.postMessage({
type: status,
data: data
});
process.exit();
};

View File

@@ -0,0 +1,24 @@
import {
splitText2Chunks,
type SplitProps,
type SplitResponse
} from '@fastgpt/global/common/string/textSplitter';
import { runWorker, WorkerNameEnum } from './utils';
import type { ReadFileResponse } from './readFile/type';
import { isTestEnv } from '@fastgpt/global/common/system/constants';
export const text2Chunks = (props: SplitProps) => {
// Test env, not run worker
if (isTestEnv) {
return splitText2Chunks(props);
}
return runWorker<SplitResponse>(WorkerNameEnum.text2Chunks, props);
};
export const readRawContentFromBuffer = (props: {
extension: string;
encoding: string;
buffer: Buffer;
}) => {
return runWorker<ReadFileResponse>(WorkerNameEnum.readFile, props);
};
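A hedged usage sketch of the two wrappers defined above; outside the test environment both run in a worker thread. The local file path and the import specifier are assumptions:

```ts
import fs from 'fs';
import { readRawContentFromBuffer, text2Chunks } from '@fastgpt/service/worker/function';

// Parse a local file off the main thread, then chunk the extracted text.
const buffer = fs.readFileSync('./example.docx'); // any supported extension
const { rawText } = await readRawContentFromBuffer({
  extension: 'docx',
  encoding: 'utf8',
  buffer
});

const { chunks, chars } = await text2Chunks({ text: rawText, chunkSize: 512 });
console.log(chunks.length, chars);
```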

View File

@@ -1,19 +1,21 @@
 import { parentPort } from 'worker_threads';
 import { html2md } from './utils';
+import { workerResponse } from '../controller';

 parentPort?.on('message', (params: { html: string }) => {
   try {
     const md = html2md(params?.html || '');

-    parentPort?.postMessage({
-      type: 'success',
+    workerResponse({
+      parentPort,
+      status: 'success',
       data: md
     });
   } catch (error) {
-    parentPort?.postMessage({
-      type: 'error',
+    workerResponse({
+      parentPort,
+      status: 'error',
       data: error
     });
   }
-
-  process.exit();
 });

View File

@@ -7,6 +7,7 @@ import { readDocsFile } from './extension/docx';
 import { readPptxRawText } from './extension/pptx';
 import { readXlsxRawText } from './extension/xlsx';
 import { readCsvRawText } from './extension/csv';
+import { workerResponse } from '../controller';

 parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
   const read = async (params: ReadRawTextByBuffer) => {
@@ -41,17 +42,16 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
   };

   try {
-    parentPort?.postMessage({
-      type: 'success',
+    workerResponse({
+      parentPort,
+      status: 'success',
       data: await read(newProps)
     });
   } catch (error) {
-    console.log(error);
-    parentPort?.postMessage({
-      type: 'error',
+    workerResponse({
+      parentPort,
+      status: 'error',
       data: error
     });
   }
-
-  process.exit();
 });

View File

@@ -0,0 +1,14 @@
import { parentPort } from 'worker_threads';
import type { SplitProps } from '@fastgpt/global/common/string/textSplitter';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { workerResponse } from '../controller';
parentPort?.on('message', async (props: SplitProps) => {
const result = splitText2Chunks(props);
workerResponse({
parentPort,
status: 'success',
data: result
});
});

View File

@@ -6,7 +6,8 @@ export enum WorkerNameEnum {
   readFile = 'readFile',
   htmlStr2Md = 'htmlStr2Md',
   countGptMessagesTokens = 'countGptMessagesTokens',
-  systemPluginRun = 'systemPluginRun'
+  systemPluginRun = 'systemPluginRun',
+  text2Chunks = 'text2Chunks'
 }

 export const getSafeEnv = () => {

View File

@@ -151,8 +151,7 @@ const MySelect = <T = any,>(
             ? {
                 ref: SelectedItemRef,
                 color: 'primary.700',
-                bg: 'myGray.100',
-                fontWeight: '600'
+                bg: 'myGray.100'
               }
             : {
                 color: 'myGray.900'
@@ -167,7 +166,7 @@
               display={'block'}
               mb={0.5}
             >
-              <Flex alignItems={'center'}>
+              <Flex alignItems={'center'} fontWeight={value === item.value ? '600' : 'normal'}>
                 {item.icon && (
                   <Avatar mr={2} src={item.icon as any} w={item.iconSize ?? '1rem'} />
                 )}

View File

@@ -20,8 +20,10 @@
   "export_title": "Time,Members,Type,Project name,AI points",
   "feishu": "Feishu",
   "generation_time": "Generation time",
+  "image_index": "Image index",
   "image_parse": "Image tagging",
   "input_token_length": "input tokens",
+  "llm_paragraph": "LLM segmentation",
   "mcp": "MCP call",
   "member": "member",
   "member_name": "Member name",

View File

@@ -45,6 +45,7 @@
   "core.dataset.import.Adjust parameters": "Adjust parameters",
   "custom_data_process_params": "Custom",
   "custom_data_process_params_desc": "Customize data processing rules",
+  "custom_split_char": "Char",
   "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
   "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
   "data_error_amount": "{{errorAmount}} Group training exception",
@@ -117,6 +118,11 @@
   "insert_images_success": "The new picture is successfully added, and you need to wait for the training to be completed before it will be displayed.",
   "is_open_schedule": "Enable scheduled synchronization",
   "keep_image": "Keep the picture",
+  "llm_paragraph_mode": "LLM recognition paragraph(Beta)",
+  "llm_paragraph_mode_auto": "automatic",
+  "llm_paragraph_mode_auto_desc": "Enable the model to automatically recognize the title when the file content does not contain a Markdown title.",
+  "llm_paragraph_mode_forbid": "Disabled",
+  "llm_paragraph_mode_forbid_desc": "Force the disabling of the model's automatic paragraph recognition",
   "loading": "Loading...",
   "max_chunk_size": "Maximum chunk size",
   "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",

View File

@@ -20,8 +20,10 @@
   "export_title": "时间,成员,类型,项目名,AI 积分消耗",
   "feishu": "飞书",
   "generation_time": "生成时间",
+  "image_index": "图片索引",
   "image_parse": "图片标注",
   "input_token_length": "输入 tokens",
+  "llm_paragraph": "模型分段",
   "mcp": "MCP 调用",
   "member": "成员",
   "member_name": "成员名",

View File

@@ -45,6 +45,7 @@
   "core.dataset.import.Adjust parameters": "调整参数",
   "custom_data_process_params": "自定义",
   "custom_data_process_params_desc": "自定义设置数据处理规则",
+  "custom_split_char": "分隔符",
   "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号例如: * () [] {} 等。",
   "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
   "data_error_amount": "{{errorAmount}} 组训练异常",
@@ -117,6 +118,11 @@
   "insert_images_success": "新增图片成功,需等待训练完成才会展示",
   "is_open_schedule": "启用定时同步",
   "keep_image": "保留图片",
+  "llm_paragraph_mode": "模型识别段落(Beta)",
+  "llm_paragraph_mode_auto": "自动",
+  "llm_paragraph_mode_auto_desc": "当文件内容不包含 Markdown 标题时,启用模型自动识别标题。",
+  "llm_paragraph_mode_forbid": "禁用",
+  "llm_paragraph_mode_forbid_desc": "强制禁用模型自动识别段落",
   "loading": "加载中...",
   "max_chunk_size": "最大分块大小",
   "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",

View File

@@ -20,8 +20,10 @@
   "export_title": "時間,成員,類型,項目名,AI 積分消耗",
   "feishu": "飛書",
   "generation_time": "生成時間",
+  "image_index": "圖片索引",
   "image_parse": "圖片標註",
   "input_token_length": "輸入 tokens",
+  "llm_paragraph": "模型分段",
   "mcp": "MCP 調用",
   "member": "成員",
   "member_name": "成員名",

View File

@@ -44,6 +44,7 @@
   "core.dataset.import.Adjust parameters": "調整參數",
   "custom_data_process_params": "自訂",
   "custom_data_process_params_desc": "自訂資料處理規則",
+  "custom_split_char": "分隔符",
   "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的資料使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.”表示中英文句號。\n\n盡量避免使用正則相關特殊符號例如* () [] {} 等。",
   "data_amount": "{{dataAmount}} 組資料,{{indexAmount}} 組索引",
   "data_error_amount": "{{errorAmount}} 組訓練異常",
@@ -116,6 +117,11 @@
   "insert_images_success": "新增圖片成功,需等待訓練完成才會展示",
   "is_open_schedule": "啟用定時同步",
   "keep_image": "保留圖片",
+  "llm_paragraph_mode": "模型識別段落(Beta)",
+  "llm_paragraph_mode_auto": "自動",
+  "llm_paragraph_mode_auto_desc": "當文件內容不包含 Markdown 標題時,啟用模型自動識別標題。",
+  "llm_paragraph_mode_forbid": "禁用",
+  "llm_paragraph_mode_forbid_desc": "強制禁用模型自動識別段落",
   "loading": "加載中...",
   "max_chunk_size": "最大分塊大小",
   "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",

pnpm-lock.yaml (generated, 6 lines changed)
View File

@@ -89,6 +89,9 @@ importers:
       json5:
         specifier: ^2.2.3
         version: 2.2.3
+      lodash:
+        specifier: ^4.17.21
+        version: 4.17.21
       nanoid:
         specifier: ^5.1.3
         version: 5.1.3
@@ -108,6 +111,9 @@ importers:
       '@types/js-yaml':
         specifier: ^4.0.9
         version: 4.0.9
+      '@types/lodash':
+        specifier: ^4.14.191
+        version: 4.17.16
       '@types/node':
         specifier: 20.14.0
         version: 20.14.0

View File

@@ -9,25 +9,14 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep';
import MyDivider from '@fastgpt/web/components/common/MyDivider'; import MyDivider from '@fastgpt/web/components/common/MyDivider';
import React from 'react'; import React from 'react';
import { Box, Link, Input, Button, ModalBody, ModalFooter, Stack } from '@chakra-ui/react'; import { Box, Link, Input, Button, ModalBody, ModalFooter, Stack } from '@chakra-ui/react';
import {
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import { useContextSelector } from 'use-context-selector'; import { useContextSelector } from 'use-context-selector';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import CollectionChunkForm, { import CollectionChunkForm, { type CollectionChunkFormType } from '../Form/CollectionChunkForm';
collectionChunkForm2StoreChunkData,
type CollectionChunkFormType
} from '../Form/CollectionChunkForm';
import {
getAutoIndexSize,
getLLMDefaultChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type'; import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm'; import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm';
import { defaultFormData } from '../Import/Context'; import { defaultFormData } from '../Import/Context';
import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';
export type WebsiteConfigFormType = { export type WebsiteConfigFormType = {
websiteConfig: { websiteConfig: {
@@ -80,7 +69,7 @@ const WebsiteConfigModal = ({
const form = useForm<CollectionChunkFormType>({ const form = useForm<CollectionChunkFormType>({
defaultValues: { defaultValues: {
trainingType: chunkSettings?.trainingType, trainingType: chunkSettings?.trainingType || defaultFormData.trainingType,
chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType, chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType,
chunkTriggerMinSize: chunkTriggerMinSize:
@@ -204,9 +193,9 @@ const WebsiteConfigModal = ({
form.handleSubmit((data) => form.handleSubmit((data) =>
onSuccess({ onSuccess({
websiteConfig: websiteInfoGetValues(), websiteConfig: websiteInfoGetValues(),
chunkSettings: collectionChunkForm2StoreChunkData({ chunkSettings: computedCollectionChunkSettings({
...data, ...data,
agentModel: datasetDetail.agentModel, llmModel: datasetDetail.agentModel,
vectorModel: datasetDetail.vectorModel vectorModel: datasetDetail.vectorModel
}) })
}) })

View File

@@ -17,7 +17,7 @@ import {
} from '@chakra-ui/react'; } from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon'; import MyIcon from '@fastgpt/web/components/common/Icon';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio'; import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { import {
DataChunkSplitModeEnum, DataChunkSplitModeEnum,
@@ -133,6 +133,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
const autoIndexes = watch('autoIndexes'); const autoIndexes = watch('autoIndexes');
const indexSize = watch('indexSize'); const indexSize = watch('indexSize');
const imageIndex = watch('imageIndex'); const imageIndex = watch('imageIndex');
const paragraphChunkAIMode = watch('paragraphChunkAIMode');
const trainingModeList = useMemo(() => { const trainingModeList = useMemo(() => {
const list = { const list = {
@@ -362,11 +363,35 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
onChange={(e) => { onChange={(e) => {
setValue('chunkSplitMode', e); setValue('chunkSplitMode', e);
}} }}
fontSize={'md'}
/> />
{chunkSplitMode === DataChunkSplitModeEnum.paragraph && ( {chunkSplitMode === DataChunkSplitModeEnum.paragraph && (
<> <>
<Box mt={1.5}> <Box mt={3}>
<Box fontSize={'sm'}>{t('dataset:llm_paragraph_mode')}</Box>
<MySelect<ParagraphChunkAIModeEnum>
size={'sm'}
bg={'myGray.50'}
value={paragraphChunkAIMode}
onChange={(e) => {
setValue('paragraphChunkAIMode', e);
}}
list={[
{
label: t('dataset:llm_paragraph_mode_forbid'),
value: ParagraphChunkAIModeEnum.forbid,
description: t('dataset:llm_paragraph_mode_forbid_desc')
},
{
label: t('dataset:llm_paragraph_mode_auto'),
value: ParagraphChunkAIModeEnum.auto,
description: t('dataset:llm_paragraph_mode_auto_desc')
}
]}
/>
</Box>
<Box mt={2} fontSize={'sm'}>
<Box>{t('dataset:paragraph_max_deep')}</Box> <Box>{t('dataset:paragraph_max_deep')}</Box>
<MyNumberInput <MyNumberInput
size={'sm'} size={'sm'}
@@ -379,7 +404,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
h={'32px'} h={'32px'}
/> />
</Box> </Box>
<Box mt={1.5}> <Box mt={2} fontSize={'sm'}>
<Box>{t('dataset:max_chunk_size')}</Box> <Box>{t('dataset:max_chunk_size')}</Box>
<Box <Box
css={{ css={{
@@ -409,7 +434,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
)} )}
{chunkSplitMode === DataChunkSplitModeEnum.size && ( {chunkSplitMode === DataChunkSplitModeEnum.size && (
<Box mt={1.5}> <Box mt={3} fontSize={'sm'}>
<Box>{t('dataset:chunk_size')}</Box> <Box>{t('dataset:chunk_size')}</Box>
<Box <Box
css={{ css={{
@@ -438,45 +463,48 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
)} )}
{chunkSplitMode === DataChunkSplitModeEnum.char && ( {chunkSplitMode === DataChunkSplitModeEnum.char && (
<HStack mt={1.5}> <Box mt={3} fontSize={'sm'}>
<Box flex={'1 0 0'}> <Box>{t('dataset:custom_split_char')}</Box>
<MySelect<string> <HStack>
list={customSplitList} <Box flex={'1 0 0'}>
size={'sm'} <MySelect<string>
bg={'myGray.50'} list={customSplitList}
value={customListSelectValue} size={'sm'}
h={'32px'} bg={'myGray.50'}
onChange={(val) => { value={customListSelectValue}
if (val === 'Other') { h={'32px'}
setValue('chunkSplitter', ''); onChange={(val) => {
} else { if (val === 'Other') {
setValue('chunkSplitter', val); setValue('chunkSplitter', '');
} } else {
setCustomListSelectValue(val); setValue('chunkSplitter', val);
}} }
/> setCustomListSelectValue(val);
</Box> }}
{customListSelectValue === 'Other' && ( />
<Input </Box>
flex={'1 0 0'} {customListSelectValue === 'Other' && (
h={'32px'} <Input
size={'sm'} flex={'1 0 0'}
bg={'myGray.50'} h={'32px'}
placeholder="\n;======;==SPLIT==" size={'sm'}
{...register('chunkSplitter')} bg={'myGray.50'}
/> placeholder="\n;======;==SPLIT=="
)} {...register('chunkSplitter')}
</HStack> />
)}
</HStack>
</Box>
)} )}
</Box> </Box>
{trainingType === DatasetCollectionDataProcessModeEnum.chunk && ( {trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
<Box> <Box fontSize={'sm'} mt={2}>
<Flex alignItems={'center'} mt={3}> <Flex alignItems={'center'}>
<Box>{t('dataset:index_size')}</Box> <Box>{t('dataset:index_size')}</Box>
<QuestionTip label={t('dataset:index_size_tips')} /> <QuestionTip label={t('dataset:index_size_tips')} />
</Flex> </Flex>
<Box mt={1}> <Box>
<MySelect<number> <MySelect<number>
bg={'myGray.50'} bg={'myGray.50'}
list={indexSizeSeletorList} list={indexSizeSeletorList}
@@ -490,7 +518,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
)} )}
{showQAPromptInput && ( {showQAPromptInput && (
<Box mt={3}> <Box mt={2}>
<Box>{t('common:core.dataset.collection.QA Prompt')}</Box> <Box>{t('common:core.dataset.collection.QA Prompt')}</Box>
<Box <Box
position={'relative'} position={'relative'}
@@ -570,83 +598,3 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
}; };
export default CollectionChunkForm; export default CollectionChunkForm;
// Get chunk settings from form
export const collectionChunkForm2StoreChunkData = ({
agentModel,
vectorModel,
...data
}: CollectionChunkFormType & {
agentModel: LLMModelItemType;
vectorModel: EmbeddingModelItemType;
}): CollectionChunkFormType => {
const {
trainingType,
autoIndexes,
chunkSettingMode,
chunkSize,
chunkSplitter,
indexSize,
qaPrompt
} = data;
// 根据处理方式,获取 auto 和 custom 的参数。
const trainingModeSize: {
autoChunkSize: number;
autoIndexSize: number;
chunkSize: number;
indexSize: number;
} = (() => {
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return {
autoChunkSize: getLLMDefaultChunkSize(agentModel),
autoIndexSize: getMaxIndexSize(vectorModel),
chunkSize,
indexSize: getMaxIndexSize(vectorModel)
};
} else if (autoIndexes) {
return {
autoChunkSize: chunkAutoChunkSize,
autoIndexSize: getAutoIndexSize(vectorModel),
chunkSize,
indexSize
};
} else {
return {
autoChunkSize: chunkAutoChunkSize,
autoIndexSize: getAutoIndexSize(vectorModel),
chunkSize,
indexSize
};
}
})();
// 获取真实参数
const {
chunkSize: formatChunkIndex,
indexSize: formatIndexSize,
chunkSplitter: formatChunkSplitter
} = (() => {
if (chunkSettingMode === ChunkSettingModeEnum.auto) {
return {
chunkSize: trainingModeSize.autoChunkSize,
indexSize: trainingModeSize.autoIndexSize,
chunkSplitter: ''
};
} else {
return {
chunkSize: trainingModeSize.chunkSize,
indexSize: trainingModeSize.indexSize,
chunkSplitter
};
}
})();
return {
...data,
chunkSize: formatChunkIndex,
indexSize: formatIndexSize,
chunkSplitter: formatChunkSplitter,
qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
};
};

View File

@@ -52,7 +52,7 @@ export const defaultFormData: ImportFormType = {
   chunkSettingMode: ChunkSettingModeEnum.auto,
   chunkSplitMode: DataChunkSplitModeEnum.paragraph,
-  paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
+  paragraphChunkAIMode: ParagraphChunkAIModeEnum.forbid,
   paragraphChunkDeep: 5,
   paragraphChunkMinSize: 100,
@@ -198,10 +198,10 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
   const vectorModel = datasetDetail.vectorModel;

   const processParamsForm = useForm<ImportFormType>({
-    defaultValues: {
+    defaultValues: (() => ({
       ...defaultFormData,
       indexSize: getAutoIndexSize(vectorModel)
-    }
+    }))()
   });

   const [sources, setSources] = useState<ImportSourceItemType[]>([]);

View File

@@ -17,7 +17,6 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
 import Markdown from '@/components/Markdown';
 import { useToast } from '@fastgpt/web/hooks/useToast';
 import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
-import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';

 const PreviewData = () => {
   const { t } = useTranslation();
@@ -37,11 +36,7 @@ const PreviewData = () => {
     async () => {
       if (!previewFile) return { chunks: [], total: 0 };

-      const chunkData = collectionChunkForm2StoreChunkData({
-        ...processParamsForm.getValues(),
-        vectorModel: datasetDetail.vectorModel,
-        agentModel: datasetDetail.agentModel
-      });
+      const chunkData = processParamsForm.getValues();

       if (importSource === ImportDataSourceEnum.fileCustom) {
         const chunkSplitter = processParamsForm.getValues('chunkSplitter');
View File

@@ -37,7 +37,6 @@ import { useContextSelector } from 'use-context-selector';
 import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
 import { DatasetImportContext, type ImportFormType } from '../Context';
 import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
-import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';

 const Upload = () => {
   const { t } = useTranslation();
@@ -82,12 +81,6 @@ const Upload = () => {
   const { runAsync: startUpload, loading: isLoading } = useRequest2(
     async ({ customPdfParse, webSelector, ...data }: ImportFormType) => {
-      const chunkData = collectionChunkForm2StoreChunkData({
-        ...data,
-        vectorModel: datasetDetail.vectorModel,
-        agentModel: datasetDetail.agentModel
-      });
-
       if (sources.length === 0) return;
       const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@@ -108,7 +101,7 @@ const Upload = () => {
       const commonParams: ApiCreateDatasetCollectionParams & {
         name: string;
       } = {
-        ...chunkData,
+        ...data,
         parentId,
         datasetId: datasetDetail._id,
         name: item.sourceName,

View File

@@ -1,7 +1,3 @@
import {
ChunkSettingModeEnum,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read'; import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
import { NextAPI } from '@/service/middleware/entry'; import { NextAPI } from '@/service/middleware/entry';
@@ -13,13 +9,11 @@ import {
import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file'; import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { import {
computeChunkSize, computedCollectionChunkSettings,
computeChunkSplitter,
computeParagraphChunkDeep,
getLLMMaxChunkSize getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils'; } from '@fastgpt/global/core/dataset/training/utils';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { getLLMModel } from '@fastgpt/service/core/ai/model'; import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type'; import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
export type PostPreviewFilesChunksProps = ChunkSettingsType & { export type PostPreviewFilesChunksProps = ChunkSettingsType & {
@@ -52,22 +46,12 @@ async function handler(
sourceId, sourceId,
customPdfParse = false, customPdfParse = false,
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
chunkTriggerType,
chunkTriggerMinSize,
chunkSettingMode = ChunkSettingModeEnum.auto,
chunkSplitMode,
paragraphChunkDeep,
paragraphChunkMinSize,
chunkSize,
chunkSplitter,
overlapRatio, overlapRatio,
selector, selector,
datasetId, datasetId,
externalFileId externalFileId,
...chunkSettings
} = req.body; } = req.body;
if (!sourceId) { if (!sourceId) {
@@ -97,22 +81,10 @@ async function handler(
return Promise.reject(CommonErrEnum.unAuthFile); return Promise.reject(CommonErrEnum.unAuthFile);
} }
chunkSize = computeChunkSize({ const formatChunkSettings = computedCollectionChunkSettings({
trainingType, ...chunkSettings,
chunkSettingMode, llmModel: getLLMModel(dataset.agentModel),
chunkSplitMode, vectorModel: getEmbeddingModel(dataset.vectorModel)
chunkSize,
llmModel: getLLMModel(dataset.agentModel)
});
chunkSplitter = computeChunkSplitter({
chunkSettingMode,
chunkSplitMode,
chunkSplitter
});
paragraphChunkDeep = computeParagraphChunkDeep({
chunkSettingMode,
chunkSplitMode,
paragraphChunkDeep
}); });
const { rawText } = await readDatasetSourceRawText({ const { rawText } = await readDatasetSourceRawText({
@@ -126,16 +98,16 @@ async function handler(
apiDatasetServer: dataset.apiDatasetServer apiDatasetServer: dataset.apiDatasetServer
}); });
const chunks = rawText2Chunks({ const chunks = await rawText2Chunks({
rawText, rawText,
chunkTriggerType, chunkTriggerType: formatChunkSettings.chunkTriggerType,
chunkTriggerMinSize, chunkTriggerMinSize: formatChunkSettings.chunkTriggerMinSize,
chunkSize, chunkSize: formatChunkSettings.chunkSize,
paragraphChunkDeep, paragraphChunkDeep: formatChunkSettings.paragraphChunkDeep,
paragraphChunkMinSize, paragraphChunkMinSize: formatChunkSettings.paragraphChunkMinSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio, overlapRatio,
customReg: chunkSplitter ? [chunkSplitter] : [] customReg: formatChunkSettings.chunkSplitter ? [formatChunkSettings.chunkSplitter] : []
}); });
return { return {
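Editor's note: `computedCollectionChunkSettings` replaces the separate `computeChunkSize` / `computeChunkSplitter` / `computeParagraphChunkDeep` calls, so the preview handler receives one fully resolved settings object before handing it to the now-async `rawText2Chunks`. A minimal sketch of what such a consolidation could look like, with made-up defaults (the real helper also takes the vector model into account):

```ts
// Illustrative defaults only; the real computedCollectionChunkSettings lives in
// @fastgpt/global/core/dataset/training/utils and may compute these differently.
interface ChunkSettingsSketch {
  chunkSettingMode?: 'auto' | 'custom';
  chunkSize?: number;
  chunkSplitter?: string;
  paragraphChunkDeep?: number;
  paragraphChunkMinSize?: number;
}

function computeCollectionChunkSettingsSketch(
  props: ChunkSettingsSketch & { llmMaxChunkSize: number }
): ChunkSettingsSketch {
  const { llmMaxChunkSize, ...settings } = props;
  const auto = settings.chunkSettingMode !== 'custom';
  return {
    ...settings,
    // In auto mode, every derived value falls back to a server-side default
    // instead of trusting whatever the client happened to send.
    chunkSize: auto ? Math.min(1000, llmMaxChunkSize) : (settings.chunkSize ?? 1000),
    chunkSplitter: auto ? undefined : settings.chunkSplitter,
    paragraphChunkDeep: auto ? 5 : (settings.paragraphChunkDeep ?? 5),
    paragraphChunkMinSize: settings.paragraphChunkMinSize ?? 100
  };
}
```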

View File

@@ -40,6 +40,8 @@ import { isEqual } from 'lodash';
import { addOperationLog } from '@fastgpt/service/support/operationLog/addOperationLog'; import { addOperationLog } from '@fastgpt/service/support/operationLog/addOperationLog';
import { OperationLogEventEnum } from '@fastgpt/global/support/operationLog/constants'; import { OperationLogEventEnum } from '@fastgpt/global/support/operationLog/constants';
import { getI18nDatasetType } from '@fastgpt/service/support/operationLog/util'; import { getI18nDatasetType } from '@fastgpt/service/support/operationLog/util';
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';
export type DatasetUpdateQuery = {}; export type DatasetUpdateQuery = {};
export type DatasetUpdateResponse = any; export type DatasetUpdateResponse = any;
@@ -59,7 +61,7 @@ async function handler(
req: ApiRequestProps<DatasetUpdateBody, DatasetUpdateQuery>, req: ApiRequestProps<DatasetUpdateBody, DatasetUpdateQuery>,
_res: ApiResponseType<any> _res: ApiResponseType<any>
): Promise<DatasetUpdateResponse> { ): Promise<DatasetUpdateResponse> {
const { let {
id, id,
parentId, parentId,
name, name,
@@ -89,6 +91,14 @@ async function handler(
let targetName = ''; let targetName = '';
chunkSettings = chunkSettings
? computedCollectionChunkSettings({
...chunkSettings,
llmModel: getLLMModel(dataset.agentModel),
vectorModel: getEmbeddingModel(dataset.vectorModel)
})
: undefined;
if (isMove) { if (isMove) {
if (parentId) { if (parentId) {
// move to a folder, check the target folder's permission // move to a folder, check the target folder's permission

View File

@@ -16,9 +16,9 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { type ClientSession } from '@fastgpt/service/common/mongo'; import { type ClientSession } from '@fastgpt/service/common/mongo';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema'; import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken'; import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { deleteDatasetImage } from '@fastgpt/service/core/dataset/image/controller'; import { deleteDatasetImage } from '@fastgpt/service/core/dataset/image/controller';
import { text2Chunks } from '@fastgpt/service/worker/function';
const formatIndexes = async ({ const formatIndexes = async ({
indexes = [], indexes = [],
@@ -40,7 +40,7 @@ const formatIndexes = async ({
}[] }[]
> => { > => {
/* get dataset data default index */ /* get dataset data default index */
const getDefaultIndex = ({ const getDefaultIndex = async ({
q = '', q = '',
a, a,
indexSize indexSize
@@ -49,13 +49,15 @@ const formatIndexes = async ({
a?: string; a?: string;
indexSize: number; indexSize: number;
}) => { }) => {
const qChunks = splitText2Chunks({ const qChunks = (
text: q, await text2Chunks({
chunkSize: indexSize, text: q,
maxSize: maxIndexSize chunkSize: indexSize,
}).chunks; maxSize: maxIndexSize
})
).chunks;
const aChunks = a const aChunks = a
? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks ? (await text2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize })).chunks
: []; : [];
return [ return [
@@ -80,7 +82,7 @@ const formatIndexes = async ({
.filter((item) => !!item.text.trim()); .filter((item) => !!item.text.trim());
// Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds // Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds
const defaultIndexes = getDefaultIndex({ q, a, indexSize }); const defaultIndexes = await getDefaultIndex({ q, a, indexSize });
const concatDefaultIndexes = defaultIndexes.map((item) => { const concatDefaultIndexes = defaultIndexes.map((item) => {
const oldIndex = indexes!.find((index) => index.text === item.text); const oldIndex = indexes!.find((index) => index.text === item.text);
@@ -114,11 +116,13 @@ const formatIndexes = async ({
// If oversize tokens, split it // If oversize tokens, split it
const tokens = await countPromptTokens(item.text); const tokens = await countPromptTokens(item.text);
if (tokens > maxIndexSize) { if (tokens > maxIndexSize) {
const splitText = splitText2Chunks({ const splitText = (
text: item.text, await text2Chunks({
chunkSize: indexSize, text: item.text,
maxSize: maxIndexSize chunkSize: indexSize,
}).chunks; maxSize: maxIndexSize
})
).chunks;
return splitText.map((text) => ({ return splitText.map((text) => ({
text, text,
type: item.type type: item.type
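Editor's note: `formatIndexes` now awaits `text2Chunks` from `@fastgpt/service/worker/function` instead of calling `splitText2Chunks` inline, so oversized indexes are split off the main event loop. The worker wiring itself is not part of this diff; the following is a generic `worker_threads` sketch of the same idea, with hypothetical file names and message protocol.

```ts
// Hypothetical wiring: file names and the message shape below are stand-ins,
// not the actual @fastgpt/service/worker implementation.
import { Worker } from 'node:worker_threads';
import { join } from 'node:path';

type SplitResult = { chunks: string[]; chars: number };

export function text2ChunksInWorker(props: {
  text: string;
  chunkSize: number;
  maxSize: number;
}): Promise<SplitResult> {
  return new Promise((resolve, reject) => {
    // One worker per call keeps the sketch short; a real implementation would
    // normally reuse a pooled worker instead of paying the spawn cost each time.
    const worker = new Worker(join(__dirname, 'textSplitter.worker.js'), { workerData: props });
    worker.once('message', (result: SplitResult) => {
      resolve(result);
      void worker.terminate();
    });
    worker.once('error', reject);
  });
}

// textSplitter.worker.ts (separate file):
// import { parentPort, workerData } from 'node:worker_threads';
// import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
// parentPort?.postMessage(splitText2Chunks(workerData));
```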

View File

@@ -1,6 +1,6 @@
/* Dataset collection source parse, not max size. */ /* Dataset collection source parse, not max size. */
import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import { import {
DatasetCollectionDataProcessModeEnum, DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum, DatasetCollectionTypeEnum,
@@ -29,7 +29,7 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { hashStr } from '@fastgpt/global/common/string/tools'; import { hashStr } from '@fastgpt/global/common/string/tools';
import { POST } from '@fastgpt/service/common/api/plusRequest'; import { POST } from '@fastgpt/service/common/api/plusRequest';
import { deleteRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/controller'; import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
const requestLLMPargraph = async ({ const requestLLMPargraph = async ({
rawText, rawText,
@@ -42,13 +42,11 @@ const requestLLMPargraph = async ({
billId: string; billId: string;
paragraphChunkAIMode: ParagraphChunkAIModeEnum; paragraphChunkAIMode: ParagraphChunkAIModeEnum;
}) => { }) => {
return { if (
resultText: rawText, !global.feConfigs?.isPlus ||
totalInputTokens: 0, !paragraphChunkAIMode ||
totalOutputTokens: 0 paragraphChunkAIMode === ParagraphChunkAIModeEnum.forbid
}; ) {
if (!global.feConfigs?.isPlus || !paragraphChunkAIMode) {
return { return {
resultText: rawText, resultText: rawText,
totalInputTokens: 0, totalInputTokens: 0,
@@ -57,16 +55,16 @@ const requestLLMPargraph = async ({
} }
// Check is markdown text(Include 1 group of title) // Check is markdown text(Include 1 group of title)
// if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) { if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
// const isMarkdown = /^(#+)\s/.test(rawText); const isMarkdown = /^(#+)\s/.test(rawText);
// if (isMarkdown) { if (isMarkdown) {
// return { return {
// resultText: rawText, resultText: rawText,
// totalInputTokens: 0, totalInputTokens: 0,
// totalOutputTokens: 0 totalOutputTokens: 0
// }; };
// } }
// } }
const data = await POST<{ const data = await POST<{
resultText: string; resultText: string;
@@ -226,15 +224,25 @@ export const datasetParseQueue = async (): Promise<any> => {
}); });
// 3. LLM Pargraph // 3. LLM Pargraph
const { resultText } = await requestLLMPargraph({ const { resultText, totalInputTokens, totalOutputTokens } = await requestLLMPargraph({
rawText, rawText,
model: dataset.agentModel, model: dataset.agentModel,
billId: data.billId, billId: data.billId,
paragraphChunkAIMode: collection.paragraphChunkAIMode paragraphChunkAIMode: collection.paragraphChunkAIMode
}); });
// Push usage
pushLLMTrainingUsage({
teamId: data.teamId,
tmbId: data.tmbId,
model: dataset.agentModel,
inputTokens: totalInputTokens,
outputTokens: totalOutputTokens,
billId: data.billId,
mode: 'paragraph'
});
// 4. Chunk split // 4. Chunk split
const chunks = rawText2Chunks({ const chunks = await rawText2Chunks({
rawText: resultText, rawText: resultText,
chunkTriggerType: collection.chunkTriggerType, chunkTriggerType: collection.chunkTriggerType,
chunkTriggerMinSize: collection.chunkTriggerMinSize, chunkTriggerMinSize: collection.chunkTriggerMinSize,

View File

@@ -1,10 +1,9 @@
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { pushQAUsage } from '@/service/support/wallet/usage/push'; import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { createChatCompletion } from '@fastgpt/service/core/ai/config'; import { createChatCompletion } from '@fastgpt/service/core/ai/config';
import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d'; import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d';
import { addLog } from '@fastgpt/service/common/system/log'; import { addLog } from '@fastgpt/service/common/system/log';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { replaceVariable } from '@fastgpt/global/common/string/tools'; import { replaceVariable } from '@fastgpt/global/common/string/tools';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d'; import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';
@@ -24,6 +23,7 @@ import {
getLLMMaxChunkSize getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils'; } from '@fastgpt/global/core/dataset/training/utils';
import { getErrText } from '@fastgpt/global/common/error/utils'; import { getErrText } from '@fastgpt/global/common/error/utils';
import { text2Chunks } from '@fastgpt/service/worker/function';
const reduceQueue = () => { const reduceQueue = () => {
global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0; global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -144,7 +144,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
const inputTokens = usage?.prompt_tokens || (await countGptMessagesTokens(messages)); const inputTokens = usage?.prompt_tokens || (await countGptMessagesTokens(messages));
const outputTokens = usage?.completion_tokens || (await countPromptTokens(answer)); const outputTokens = usage?.completion_tokens || (await countPromptTokens(answer));
const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // Formatted QA pairs const qaArr = await formatSplitText({ answer, rawText: text, llmModel: modelData }); // Formatted QA pairs
// get vector and insert // get vector and insert
await pushDataListToTrainingQueueByCollectionId({ await pushDataListToTrainingQueueByCollectionId({
@@ -163,13 +163,14 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
await MongoDatasetTraining.findByIdAndDelete(data._id); await MongoDatasetTraining.findByIdAndDelete(data._id);
// add bill // add bill
pushQAUsage({ pushLLMTrainingUsage({
teamId: data.teamId, teamId: data.teamId,
tmbId: data.tmbId, tmbId: data.tmbId,
inputTokens, inputTokens,
outputTokens, outputTokens,
billId: data.billId, billId: data.billId,
model: modelData.model model: modelData.model,
mode: 'qa'
}); });
addLog.info(`[QA Queue] Finish`, { addLog.info(`[QA Queue] Finish`, {
time: Date.now() - startTime, time: Date.now() - startTime,
@@ -196,7 +197,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
} }
// Format qa answer // Format qa answer
function formatSplitText({ async function formatSplitText({
answer, answer,
rawText, rawText,
llmModel llmModel
@@ -223,7 +224,7 @@ function formatSplitText({
// empty result. direct split chunk // empty result. direct split chunk
if (result.length === 0) { if (result.length === 0) {
const { chunks } = splitText2Chunks({ const { chunks } = await text2Chunks({
text: rawText, text: rawText,
chunkSize: chunkAutoChunkSize, chunkSize: chunkAutoChunkSize,
maxSize: getLLMMaxChunkSize(llmModel) maxSize: getLLMMaxChunkSize(llmModel)
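Editor's note: `formatSplitText` becomes async because its fallback path, taken when no Q/A pairs can be pulled out of the model answer, now uses the worker-backed `text2Chunks` rather than the in-process splitter. A simplified sketch of that fallback; the extraction regex and the splitter stub here are illustrative, not the real implementations.

```ts
type QAItem = { q: string; a: string };

// Simplified stand-in for the worker-backed splitter used in the real code.
async function text2ChunksSketch(props: { text: string; chunkSize: number }): Promise<{ chunks: string[] }> {
  return { chunks: [props.text] };
}

async function formatSplitTextSketch({
  answer,
  rawText,
  chunkAutoChunkSize
}: {
  answer: string;
  rawText: string;
  chunkAutoChunkSize: number;
}): Promise<QAItem[]> {
  // Pull Q/A pairs out of the model answer; the pattern is illustrative only.
  const result: QAItem[] = [
    ...answer.matchAll(/Q\d*[::]\s*([\s\S]*?)A\d*[::]\s*([\s\S]*?)(?=Q\d*[::]|$)/g)
  ].map((m) => ({ q: (m[1] ?? '').trim(), a: (m[2] ?? '').trim() }));

  // Empty result: fall back to plain chunking of the raw text so the data
  // still enters the training queue as question-only chunks.
  if (result.length === 0) {
    const { chunks } = await text2ChunksSketch({ text: rawText, chunkSize: chunkAutoChunkSize });
    return chunks.map((q) => ({ q, a: '' }));
  }

  return result;
}
```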

View File

@@ -5,42 +5,6 @@ import { i18nT } from '@fastgpt/web/i18n/utils';
import { ModelTypeEnum } from '@fastgpt/global/core/ai/model'; import { ModelTypeEnum } from '@fastgpt/global/core/ai/model';
import { getDefaultTTSModel } from '@fastgpt/service/core/ai/model'; import { getDefaultTTSModel } from '@fastgpt/service/core/ai/model';
export const pushQAUsage = async ({
teamId,
tmbId,
model,
inputTokens,
outputTokens,
billId
}: {
teamId: string;
tmbId: string;
model: string;
inputTokens: number;
outputTokens: number;
billId: string;
}) => {
// Calculate price
const { totalPoints } = formatModelChars2Points({
model,
modelType: ModelTypeEnum.llm,
inputTokens,
outputTokens
});
concatUsage({
billId,
teamId,
tmbId,
totalPoints,
inputTokens,
outputTokens,
listIndex: 1
});
return { totalPoints };
};
export const pushGenerateVectorUsage = ({ export const pushGenerateVectorUsage = ({
billId, billId,
teamId, teamId,
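Editor's note: on the billing side, the per-feature `pushQAUsage` helper is removed; both the QA queue and the LLM paragraph step now report tokens through `pushLLMTrainingUsage`, distinguished only by a `mode` field. The sketch below shows one way such a consolidated helper could be shaped; the point calculation and the `listIndex` mapping are placeholders, not FastGPT's actual formula.

```ts
type TrainingUsageMode = 'qa' | 'paragraph';

interface PushLLMTrainingUsageProps {
  teamId: string;
  tmbId: string;
  model: string;
  inputTokens: number;
  outputTokens: number;
  billId: string;
  mode: TrainingUsageMode;
}

// Placeholder billing: a flat per-1k-token rate stands in for the real
// formatModelChars2Points lookup, just to show where `mode` fits in.
function pushLLMTrainingUsageSketch(props: PushLLMTrainingUsageProps) {
  const totalPoints = (props.inputTokens + props.outputTokens) / 1000;
  return {
    billId: props.billId,
    teamId: props.teamId,
    tmbId: props.tmbId,
    totalPoints,
    inputTokens: props.inputTokens,
    outputTokens: props.outputTokens,
    // Hypothetical: each mode writes to its own list index so QA extraction and
    // paragraph segmentation show up as separate items on the usage record.
    listIndex: props.mode === 'qa' ? 1 : 2
  };
}
```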

View File

@@ -16,7 +16,7 @@ const formatResult = (result: string[]) => {
}; };
// Max-size trigger test: below the max size, no split // Max-size trigger test: below the max size, no split
it(`Test splitText2Chunks 1`, () => { it(`Test splitText2Chunks 1`, async () => {
const mock = { const mock = {
text: `# A text: `# A
@@ -61,7 +61,7 @@ dsgsgfsgs22sddddddd`
] ]
}; };
const data = rawText2Chunks({ const data = await rawText2Chunks({
rawText: mock.text, rawText: mock.text,
chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize, chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
chunkTriggerMinSize: 1000, chunkTriggerMinSize: 1000,
@@ -72,7 +72,7 @@ dsgsgfsgs22sddddddd`
expect(formatChunks(data)).toEqual(formatResult(mock.result)); expect(formatChunks(data)).toEqual(formatResult(mock.result));
}); });
// Max-size trigger test: above the max size, split // Max-size trigger test: above the max size, split
it(`Test splitText2Chunks 2`, () => { it(`Test splitText2Chunks 2`, async () => {
const mock = { const mock = {
text: `# A text: `# A
@@ -122,7 +122,7 @@ dsgsgfsgs22sddddddd`
] ]
}; };
const data = rawText2Chunks({ const data = await rawText2Chunks({
rawText: mock.text, rawText: mock.text,
chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize, chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
chunkTriggerMinSize: 10, chunkTriggerMinSize: 10,
@@ -135,7 +135,7 @@ dsgsgfsgs22sddddddd`
}); });
// Min-size trigger test: above the min size, no split // Min-size trigger test: above the min size, no split
it(`Test splitText2Chunks 3`, () => { it(`Test splitText2Chunks 3`, async () => {
const mock = { const mock = {
text: `# A text: `# A
@@ -179,7 +179,7 @@ it(`Test splitText2Chunks 3`, () => {
] ]
}; };
const data = rawText2Chunks({ const data = await rawText2Chunks({
rawText: mock.text, rawText: mock.text,
chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize, chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
chunkTriggerMinSize: 1000, chunkTriggerMinSize: 1000,
@@ -191,7 +191,7 @@ it(`Test splitText2Chunks 3`, () => {
expect(formatChunks(data)).toEqual(formatResult(mock.result)); expect(formatChunks(data)).toEqual(formatResult(mock.result));
}); });
// Min-size trigger test: below the min size, split // Min-size trigger test: below the min size, split
it(`Test splitText2Chunks 4`, () => { it(`Test splitText2Chunks 4`, async () => {
const mock = { const mock = {
text: `# A text: `# A
@@ -241,7 +241,7 @@ dsgsgfsgs22sddddddd`,
] ]
}; };
const data = rawText2Chunks({ const data = await rawText2Chunks({
rawText: mock.text, rawText: mock.text,
chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize, chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
chunkTriggerMinSize: 10, chunkTriggerMinSize: 10,
@@ -254,7 +254,7 @@ dsgsgfsgs22sddddddd`,
}); });
// Force-chunk test: below both the min size and the max size // Force-chunk test: below both the min size and the max size
it(`Test splitText2Chunks 5`, () => { it(`Test splitText2Chunks 5`, async () => {
const mock = { const mock = {
text: `# A text: `# A
@@ -304,7 +304,7 @@ dsgsgfsgs22sddddddd`,
] ]
}; };
const data = rawText2Chunks({ const data = await rawText2Chunks({
rawText: mock.text, rawText: mock.text,
chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk, chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
chunkTriggerMinSize: 1000, chunkTriggerMinSize: 1000,
@@ -317,7 +317,7 @@ dsgsgfsgs22sddddddd`,
}); });
// Force-chunk test: above the min size // Force-chunk test: above the min size
it(`Test splitText2Chunks 6`, () => { it(`Test splitText2Chunks 6`, async () => {
const mock = { const mock = {
text: `# A text: `# A
@@ -367,7 +367,7 @@ dsgsgfsgs22sddddddd`,
] ]
}; };
const data = rawText2Chunks({ const data = await rawText2Chunks({
rawText: mock.text, rawText: mock.text,
chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk, chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
chunkTriggerMinSize: 10, chunkTriggerMinSize: 10,