feat: chunk index independent config (#4271)

* sync collection

* remove lock

* feat: chunk index independent config

* feat: add max chunksize to split chunk function

* remove log

* update doc

* remove

* remove log
Author: Archer
Date: 2025-03-21 16:44:25 +08:00
Committed by: archer
Parent: 222ff0d49a
Commit: e812ad6e84
47 changed files with 784 additions and 443 deletions

View File

@@ -27,6 +27,11 @@ import { addDays } from 'date-fns';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils';
import {
computeChunkSize,
computeChunkSplitter,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
export const createCollectionAndInsertData = async ({
dataset,
@@ -54,18 +59,22 @@ export const createCollectionAndInsertData = async ({
const teamId = createCollectionParams.teamId;
const tmbId = createCollectionParams.tmbId;
// Chunk split params
// Set default params
const trainingType =
createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
const chunkSize = createCollectionParams.chunkSize || 512;
const chunkSplitter = createCollectionParams.chunkSplitter;
const qaPrompt = createCollectionParams.qaPrompt;
const usageName = createCollectionParams.name;
const chunkSize = computeChunkSize({
...createCollectionParams,
trainingType,
llmModel: getLLMModel(dataset.agentModel)
});
const chunkSplitter = computeChunkSplitter(createCollectionParams);
// 1. split chunks
const chunks = rawText2Chunks({
rawText,
chunkLen: chunkSize,
chunkSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [],
isQAImport
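
For context on the helpers imported above, here is a minimal sketch of what computeChunkSize, computeChunkSplitter and getLLMMaxChunkSize could look like. Only the names and call sites come from this diff; the bodies, fallbacks and numeric limits below are illustrative assumptions, not the actual implementation in @fastgpt/global/core/dataset/training/utils.

```ts
// Illustrative sketch only; numbers and fallbacks are assumptions.
type LLMModelLike = { maxContext: number; maxResponse?: number };

// Assumption: the hard chunk-size cap is derived from the LLM context window.
export const getLLMMaxChunkSize = (model?: LLMModelLike): number =>
  model ? Math.max(Math.floor(model.maxContext * 0.6), 2000) : 8000;

// Assumption: collections without an explicit size fall back to a default,
// clamped to what the selected LLM can actually handle.
export const computeChunkSize = (params: {
  trainingType: string;
  chunkSize?: number;
  llmModel?: LLMModelLike;
}): number => Math.min(params.chunkSize ?? 512, getLLMMaxChunkSize(params.llmModel));

// Assumption: an empty splitter means "use the built-in split rules".
export const computeChunkSplitter = (params: { chunkSplitter?: string }) =>
  params.chunkSplitter || undefined;
```
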
@@ -76,7 +85,7 @@ export const createCollectionAndInsertData = async ({
teamId,
insertLen: predictDataLimitLength(
getTrainingModeByCollection({
trainingType,
trainingType: trainingType,
autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex
}),
@@ -88,6 +97,9 @@ export const createCollectionAndInsertData = async ({
// 3. create collection
const { _id: collectionId } = await createOneCollection({
...createCollectionParams,
trainingType,
chunkSize,
chunkSplitter,
hashRawText: hashStr(rawText),
rawTextLength: rawText.length,
@@ -111,7 +123,7 @@ export const createCollectionAndInsertData = async ({
const { billId: newBillId } = await createTrainingUsage({
teamId,
tmbId,
appName: usageName,
appName: createCollectionParams.name,
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name,
@@ -130,12 +142,13 @@ export const createCollectionAndInsertData = async ({
agentModel: dataset.agentModel,
vectorModel: dataset.vectorModel,
vlmModel: dataset.vlmModel,
indexSize: createCollectionParams.indexSize,
mode: getTrainingModeByCollection({
trainingType,
trainingType: trainingType,
autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex
}),
prompt: qaPrompt,
prompt: createCollectionParams.qaPrompt,
billId: traingBillId,
data: chunks.map((item, index) => ({
...item,
@@ -207,11 +220,14 @@ export async function createOneCollection({
// Parse settings
customPdfParse,
imageIndex,
autoIndexes,
// Chunk settings
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
autoIndexes,
chunkSize = 512,
trainingType,
chunkSettingMode,
chunkSplitMode,
chunkSize,
indexSize,
chunkSplitter,
qaPrompt,
@@ -249,11 +265,14 @@ export async function createOneCollection({
// Parse settings
customPdfParse,
imageIndex,
autoIndexes,
// Chunk settings
trainingType,
autoIndexes,
chunkSettingMode,
chunkSplitMode,
chunkSize,
indexSize,
chunkSplitter,
qaPrompt
}
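
A hedged usage sketch of createOneCollection under the new parameter list; ids and values are placeholders, and only fields visible in this diff are shown.

```ts
import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';

// Placeholder ids and example values; chunkSettingMode / chunkSplitMode are
// omitted because their enum members are not visible in this diff.
const { _id: collectionId } = await createOneCollection({
  teamId: 'team-id',
  tmbId: 'tmb-id',
  datasetId: 'dataset-id',
  name: 'handbook.md',
  // ...other collection fields (type, parentId, etc.) omitted
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  chunkSize: 800,
  indexSize: 512,
  chunkSplitter: '\n## ',
  qaPrompt: undefined
});
```
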

View File

@@ -3,7 +3,9 @@ const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import {
DatasetCollectionTypeMap,
DatasetCollectionDataProcessModeEnum
DatasetCollectionDataProcessModeEnum,
ChunkSettingModeEnum,
DataChunkSplitModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionName } from '../schema';
import {
@@ -94,11 +96,18 @@ const DatasetCollectionSchema = new Schema({
type: String,
enum: Object.values(DatasetCollectionDataProcessModeEnum)
},
chunkSize: {
type: Number,
required: true
chunkSettingMode: {
type: String,
enum: Object.values(ChunkSettingModeEnum)
},
chunkSplitMode: {
type: String,
enum: Object.values(DataChunkSplitModeEnum)
},
chunkSize: Number,
chunkSplitter: String,
indexSize: Number,
qaPrompt: String
});
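
At the type level, the new optional fields might be grouped as in the sketch below. CollectionChunkSettings is a hypothetical name, and the sketch assumes the enums are plain TypeScript enums; the field names mirror the schema above, while the real collection type lives in @fastgpt/global/core/dataset/type.d.

```ts
import {
  DatasetCollectionDataProcessModeEnum,
  ChunkSettingModeEnum,
  DataChunkSplitModeEnum
} from '@fastgpt/global/core/dataset/constants';

// Hypothetical grouping of the new optional fields: chunkSize is no longer
// required, and splitting is described by chunkSettingMode / chunkSplitMode
// plus an indexSize that is independent of the chunk size.
type CollectionChunkSettings = {
  trainingType?: DatasetCollectionDataProcessModeEnum;
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  chunkSize?: number;
  chunkSplitter?: string;
  indexSize?: number;
  qaPrompt?: string;
};
```
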

View File

@@ -185,7 +185,7 @@ export const readApiServerFileContent = async ({
export const rawText2Chunks = ({
rawText,
isQAImport,
chunkLen = 512,
chunkSize = 512,
...splitProps
}: {
rawText: string;
@@ -198,7 +198,7 @@ export const rawText2Chunks = ({
const { chunks } = splitText2Chunks({
text: rawText,
chunkLen,
chunkSize,
...splitProps
});
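
The only behavioural change here is the rename from chunkLen to chunkSize. A call-site sketch under the new name, with example values and the maxSize cap used by createCollectionAndInsertData above:

```ts
// Example values; `rawText` and `dataset` come from the surrounding context.
const chunks = rawText2Chunks({
  rawText,
  chunkSize: 800, // was `chunkLen` before this change
  maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), // hard LLM cap
  overlapRatio: 0.2,
  customReg: ['\n## '],
  isQAImport: false
});
```
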

View File

@@ -12,6 +12,10 @@ import { getCollectionWithDataset } from '../controller';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
import { i18nT } from '../../../../web/i18n/utils';
import {
getLLMDefaultChunkSize,
getLLMMaxChunkSize
} from '../../../../global/core/dataset/training/utils';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try {
@@ -55,6 +59,7 @@ export async function pushDataListToTrainingQueue({
prompt,
billId,
mode = TrainingModeEnum.chunk,
indexSize,
session
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
@@ -68,38 +73,41 @@ export async function pushDataListToTrainingQueue({
}
return mode;
};
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(i18nT('common:error_embedding_not_config'));
}
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(i18nT('common:error_llm_not_config'));
}
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
prompt = undefined;
}
const { model, maxToken, weight } = await (async () => {
if (mode === TrainingModeEnum.chunk) {
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(i18nT('common:error_embedding_not_config'));
}
return {
maxToken: vectorModelData.maxToken * 1.5,
maxToken: getLLMMaxChunkSize(agentModelData),
model: vectorModelData.model,
weight: vectorModelData.weight
};
}
if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(i18nT('common:error_llm_not_config'));
}
return {
maxToken: agentModelData.maxContext * 0.8,
maxToken: getLLMMaxChunkSize(agentModelData),
model: agentModelData.model,
weight: 0
};
}
if (mode === TrainingModeEnum.image) {
const vllmModelData = getVlmModel(vlmModel);
if (!vllmModelData) {
return Promise.reject(i18nT('common:error_vlm_not_config'));
}
return {
maxToken: vllmModelData.maxContext * 0.8,
maxToken: getLLMMaxChunkSize(vllmModelData),
model: vllmModelData.model,
weight: 0
};
@@ -107,10 +115,6 @@ export async function pushDataListToTrainingQueue({
return Promise.reject(`Training mode "${mode}" is inValid`);
})();
// Filter redundant params
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
prompt = undefined;
}
// filter repeat or equal content
const set = new Set();
@@ -143,13 +147,13 @@ export async function pushDataListToTrainingQueue({
const text = item.q + item.a;
// Oversize llm tokens
if (text.length > maxToken) {
filterResult.overToken.push(item);
return;
}
if (set.has(text)) {
console.log('repeat', item);
filterResult.repeat.push(item);
} else {
filterResult.success.push(item);
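
Pulled out of context, the oversize/duplicate filtering above amounts to the small standalone sketch below; the types and the maxToken source are simplified assumptions.

```ts
type TrainingItem = { q: string; a?: string; chunkIndex?: number };

// Split incoming items into success / overToken / repeat buckets.
const filterTrainingItems = (data: TrainingItem[], maxToken: number) => {
  const seen = new Set<string>();
  const result = {
    success: [] as TrainingItem[],
    overToken: [] as TrainingItem[],
    repeat: [] as TrainingItem[]
  };

  data.forEach((item) => {
    const text = item.q + (item.a ?? '');
    // Drop chunks whose raw length already exceeds the model budget.
    if (text.length > maxToken) {
      result.overToken.push(item);
      return;
    }
    // Keep only the first occurrence of identical content.
    if (seen.has(text)) {
      result.repeat.push(item);
    } else {
      seen.add(text);
      result.success.push(item);
    }
  });

  return result;
};
```
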
@@ -182,6 +186,7 @@ export async function pushDataListToTrainingQueue({
q: item.q,
a: item.a,
chunkIndex: item.chunkIndex ?? 0,
indexSize,
weight: weight ?? 0,
indexes: item.indexes,
retryCount: 5

View File

@@ -76,6 +76,7 @@ const TrainingDataSchema = new Schema({
type: Number,
default: 0
},
indexSize: Number,
weight: {
type: Number,
default: 0
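
With indexSize persisted on each training record, the index-generation worker can size vector indexes per collection instead of using one global default. A hypothetical consumer-side helper (the worker is not part of this commit; the fallback value is an assumption):

```ts
// Hypothetical; not part of this commit.
type TrainingDataLike = { indexSize?: number };

const resolveIndexSize = (data: TrainingDataLike, vectorModelDefault = 512): number =>
  data.indexSize ?? vectorModelDefault;
```
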