Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-22 12:20:34 +00:00)
feat: chunk index independent config (#4271)
* sync collection
* remove lock
* feat: chunk index independent config
* feat: add max chunksize to split chunk function
* remove log
* update doc
* remove
* remove log
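For orientation before the diff: the commit makes the index size configurable independently of the chunk size and persists the chunk settings on each collection. Below is a rough TypeScript sketch of the configuration surface the hunks introduce; the field names come from the diff, while the enum values are assumptions and are not taken from the commit.

```ts
// Illustrative only: the field names below appear in the diff; the enum values
// ('auto' / 'custom', 'size' / 'char') are assumptions, not taken from the commit.
enum ChunkSettingModeEnum {
  auto = 'auto',
  custom = 'custom'
}

enum DataChunkSplitModeEnum {
  size = 'size',
  char = 'char'
}

type CollectionChunkConfig = {
  trainingType?: string; // DatasetCollectionDataProcessModeEnum in the diff
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  chunkSize?: number; // split size, now configured per collection
  indexSize?: number; // index size, now independent of chunkSize
  chunkSplitter?: string; // custom split character / regex
  qaPrompt?: string; // only used for QA training
};
```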
@@ -27,6 +27,11 @@ import { addDays } from 'date-fns';
 import { MongoDatasetDataText } from '../data/dataTextSchema';
 import { retryFn } from '@fastgpt/global/common/system/utils';
 import { getTrainingModeByCollection } from './utils';
+import {
+  computeChunkSize,
+  computeChunkSplitter,
+  getLLMMaxChunkSize
+} from '@fastgpt/global/core/dataset/training/utils';
 
 export const createCollectionAndInsertData = async ({
   dataset,
@@ -54,18 +59,22 @@ export const createCollectionAndInsertData = async ({
 
   const teamId = createCollectionParams.teamId;
   const tmbId = createCollectionParams.tmbId;
-  // Chunk split params
+
+  // Set default params
   const trainingType =
     createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
-  const chunkSize = createCollectionParams.chunkSize || 512;
-  const chunkSplitter = createCollectionParams.chunkSplitter;
-  const qaPrompt = createCollectionParams.qaPrompt;
-  const usageName = createCollectionParams.name;
+  const chunkSize = computeChunkSize({
+    ...createCollectionParams,
+    trainingType,
+    llmModel: getLLMModel(dataset.agentModel)
+  });
+  const chunkSplitter = computeChunkSplitter(createCollectionParams);
 
   // 1. split chunks
   const chunks = rawText2Chunks({
     rawText,
-    chunkLen: chunkSize,
+    chunkSize,
+    maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
     overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
     customReg: chunkSplitter ? [chunkSplitter] : [],
     isQAImport
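The split parameters are no longer hard-coded (`chunkSize || 512`) but resolved through `computeChunkSize` / `computeChunkSplitter` and capped by `getLLMMaxChunkSize`. Those helper names are real imports in the diff; the bodies below are only a sketch of the apparent intent (take the requested value or a default, then clamp to what the LLM can handle), not the actual implementations.

```ts
// Sketch only: names mirror the imports in the diff, bodies are assumptions.
type LLMModelLike = { maxContext?: number };

const getLLMMaxChunkSizeSketch = (model?: LLMModelLike) =>
  Math.min(model?.maxContext ?? 8000, 8000);

const computeChunkSizeSketch = (params: {
  chunkSize?: number;
  trainingType: string;
  llmModel?: LLMModelLike;
}) => {
  const wanted = params.chunkSize ?? 512; // previous hard-coded default in this file
  return Math.min(wanted, getLLMMaxChunkSizeSketch(params.llmModel));
};

const computeChunkSplitterSketch = (params: {
  chunkSettingMode?: string;
  chunkSplitter?: string;
}) => (params.chunkSettingMode === 'auto' ? undefined : params.chunkSplitter);
```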
@@ -76,7 +85,7 @@ export const createCollectionAndInsertData = async ({
       teamId,
       insertLen: predictDataLimitLength(
         getTrainingModeByCollection({
-          trainingType,
+          trainingType: trainingType,
           autoIndexes: createCollectionParams.autoIndexes,
           imageIndex: createCollectionParams.imageIndex
         }),
@@ -88,6 +97,9 @@ export const createCollectionAndInsertData = async ({
     // 3. create collection
     const { _id: collectionId } = await createOneCollection({
       ...createCollectionParams,
+      trainingType,
+      chunkSize,
+      chunkSplitter,
 
       hashRawText: hashStr(rawText),
       rawTextLength: rawText.length,
@@ -111,7 +123,7 @@ export const createCollectionAndInsertData = async ({
       const { billId: newBillId } = await createTrainingUsage({
         teamId,
         tmbId,
-        appName: usageName,
+        appName: createCollectionParams.name,
         billSource: UsageSourceEnum.training,
         vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
         agentModel: getLLMModel(dataset.agentModel)?.name,
@@ -130,12 +142,13 @@ export const createCollectionAndInsertData = async ({
       agentModel: dataset.agentModel,
       vectorModel: dataset.vectorModel,
       vlmModel: dataset.vlmModel,
+      indexSize: createCollectionParams.indexSize,
       mode: getTrainingModeByCollection({
-        trainingType,
+        trainingType: trainingType,
        autoIndexes: createCollectionParams.autoIndexes,
        imageIndex: createCollectionParams.imageIndex
       }),
-      prompt: qaPrompt,
+      prompt: createCollectionParams.qaPrompt,
       billId: traingBillId,
       data: chunks.map((item, index) => ({
         ...item,
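Both call sites above now derive the queue mode via `getTrainingModeByCollection({ trainingType, autoIndexes, imageIndex })`. The following is a hedged guess at the mapping this implies; the real function lives in `./utils` and may differ.

```ts
// Assumption-based sketch, not the actual implementation from ./utils.
enum TrainingModeEnumSketch {
  chunk = 'chunk',
  qa = 'qa',
  auto = 'auto',
  image = 'image'
}

const getTrainingModeByCollectionSketch = (params: {
  trainingType: string;
  autoIndexes?: boolean;
  imageIndex?: boolean;
}): TrainingModeEnumSketch => {
  if (params.imageIndex) return TrainingModeEnumSketch.image;
  if (params.autoIndexes) return TrainingModeEnumSketch.auto;
  return params.trainingType === 'qa'
    ? TrainingModeEnumSketch.qa
    : TrainingModeEnumSketch.chunk;
};
```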
@@ -207,11 +220,14 @@ export async function createOneCollection({
   // Parse settings
   customPdfParse,
   imageIndex,
+  autoIndexes,
 
   // Chunk settings
-  trainingType = DatasetCollectionDataProcessModeEnum.chunk,
-  autoIndexes,
-  chunkSize = 512,
+  trainingType,
+  chunkSettingMode,
+  chunkSplitMode,
+  chunkSize,
+  indexSize,
   chunkSplitter,
   qaPrompt,
 
@@ -249,11 +265,14 @@ export async function createOneCollection({
       // Parse settings
       customPdfParse,
       imageIndex,
+      autoIndexes,
 
       // Chunk settings
       trainingType,
-      autoIndexes,
+      chunkSettingMode,
+      chunkSplitMode,
       chunkSize,
+      indexSize,
       chunkSplitter,
       qaPrompt
     }
@@ -3,7 +3,9 @@ const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
 import {
   DatasetCollectionTypeMap,
-  DatasetCollectionDataProcessModeEnum
+  DatasetCollectionDataProcessModeEnum,
+  ChunkSettingModeEnum,
+  DataChunkSplitModeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import { DatasetCollectionName } from '../schema';
 import {
@@ -94,11 +96,18 @@ const DatasetCollectionSchema = new Schema({
     type: String,
     enum: Object.values(DatasetCollectionDataProcessModeEnum)
   },
-  chunkSize: {
-    type: Number,
-    required: true
+  chunkSettingMode: {
+    type: String,
+    enum: Object.values(ChunkSettingModeEnum)
   },
+  chunkSplitMode: {
+    type: String,
+    enum: Object.values(DataChunkSplitModeEnum)
+  },
+  chunkSize: Number,
   chunkSplitter: String,
+
+  indexSize: Number,
   qaPrompt: String
 });
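In the schema, `chunkSize` loses `required: true`, and `chunkSettingMode`, `chunkSplitMode` and `indexSize` are added as optional fields, so older collection documents stay valid. Below is a sketch (not from the commit) of the kind of fallback a reader of these documents would likely need.

```ts
// Sketch only: field names from the schema hunk, fallback values are assumptions.
type CollectionDocSketch = {
  trainingType?: string;
  chunkSettingMode?: string;
  chunkSplitMode?: string;
  chunkSize?: number;
  indexSize?: number;
  chunkSplitter?: string;
  qaPrompt?: string;
};

const resolveChunkParamsSketch = (doc: CollectionDocSketch) => ({
  chunkSize: doc.chunkSize ?? 512, // old default from the removed code path
  indexSize: doc.indexSize ?? doc.chunkSize ?? 512, // fall back if the new field is absent
  chunkSplitter: doc.chunkSplitter
});
```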
@@ -185,7 +185,7 @@ export const readApiServerFileContent = async ({
 export const rawText2Chunks = ({
   rawText,
   isQAImport,
-  chunkLen = 512,
+  chunkSize = 512,
   ...splitProps
 }: {
   rawText: string;
@@ -198,7 +198,7 @@ export const rawText2Chunks = ({
 
   const { chunks } = splitText2Chunks({
     text: rawText,
-    chunkLen,
+    chunkSize,
     ...splitProps
   });
 
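`rawText2Chunks` now takes `chunkSize` (renamed from `chunkLen`) plus a `maxSize` cap and forwards both to `splitText2Chunks`. The stand-in below only illustrates the renamed options and the cap; it is not the real splitter.

```ts
// Naive stand-in for the real splitter: cap the effective size, then slice.
const rawText2ChunksSketch = ({
  rawText,
  chunkSize = 512,
  maxSize = 8000
}: {
  rawText: string;
  chunkSize?: number;
  maxSize?: number;
}) => {
  const size = Math.min(chunkSize, maxSize); // effective split size is capped by the model
  return Array.from({ length: Math.ceil(rawText.length / size) }, (_, i) => ({
    q: rawText.slice(i * size, (i + 1) * size),
    a: ''
  }));
};
```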
@@ -12,6 +12,10 @@ import { getCollectionWithDataset } from '../controller';
 import { mongoSessionRun } from '../../../common/mongo/sessionRun';
 import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
 import { i18nT } from '../../../../web/i18n/utils';
+import {
+  getLLMDefaultChunkSize,
+  getLLMMaxChunkSize
+} from '../../../../global/core/dataset/training/utils';
 
 export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
   try {
@@ -55,6 +59,7 @@ export async function pushDataListToTrainingQueue({
   prompt,
   billId,
   mode = TrainingModeEnum.chunk,
+  indexSize,
   session
 }: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
   const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
@@ -68,38 +73,41 @@ export async function pushDataListToTrainingQueue({
     }
     return mode;
   };
 
+  const vectorModelData = getEmbeddingModel(vectorModel);
+  if (!vectorModelData) {
+    return Promise.reject(i18nT('common:error_embedding_not_config'));
+  }
+  const agentModelData = getLLMModel(agentModel);
+  if (!agentModelData) {
+    return Promise.reject(i18nT('common:error_llm_not_config'));
+  }
+  if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
+    prompt = undefined;
+  }
+
   const { model, maxToken, weight } = await (async () => {
     if (mode === TrainingModeEnum.chunk) {
-      const vectorModelData = getEmbeddingModel(vectorModel);
-      if (!vectorModelData) {
-        return Promise.reject(i18nT('common:error_embedding_not_config'));
-      }
       return {
-        maxToken: vectorModelData.maxToken * 1.5,
+        maxToken: getLLMMaxChunkSize(agentModelData),
         model: vectorModelData.model,
         weight: vectorModelData.weight
       };
     }
 
     if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
-      const agentModelData = getLLMModel(agentModel);
-      if (!agentModelData) {
-        return Promise.reject(i18nT('common:error_llm_not_config'));
-      }
       return {
-        maxToken: agentModelData.maxContext * 0.8,
+        maxToken: getLLMMaxChunkSize(agentModelData),
         model: agentModelData.model,
         weight: 0
       };
     }
 
     if (mode === TrainingModeEnum.image) {
       const vllmModelData = getVlmModel(vlmModel);
       if (!vllmModelData) {
         return Promise.reject(i18nT('common:error_vlm_not_config'));
       }
       return {
-        maxToken: vllmModelData.maxContext * 0.8,
+        maxToken: getLLMMaxChunkSize(vllmModelData),
         model: vllmModelData.model,
         weight: 0
       };
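The over-size budget used to depend on the mode's model (`vectorModelData.maxToken * 1.5` for chunk, `maxContext * 0.8` for QA/auto/image); after this hunk every mode is capped by `getLLMMaxChunkSize(...)`. A small sketch of the before/after; the new helper's body is assumed, only its name is taken from the diff.

```ts
// The multipliers below are the old values removed by the diff; the new cap is an
// assumed stand-in for getLLMMaxChunkSize, which this commit starts using everywhere.
type ModelSketch = { maxToken?: number; maxContext?: number };

const oldMaxTokenSketch = (mode: 'chunk' | 'qa' | 'auto' | 'image', m: ModelSketch) =>
  mode === 'chunk' ? (m.maxToken ?? 0) * 1.5 : (m.maxContext ?? 0) * 0.8;

const newMaxTokenSketch = (llm: ModelSketch) => Math.min(llm.maxContext ?? 8000, 8000);
```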
@@ -107,10 +115,6 @@ export async function pushDataListToTrainingQueue({
 
     return Promise.reject(`Training mode "${mode}" is inValid`);
   })();
-  // Filter redundant params
-  if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
-    prompt = undefined;
-  }
 
   // filter repeat or equal content
   const set = new Set();
@@ -143,13 +147,13 @@ export async function pushDataListToTrainingQueue({
 
       const text = item.q + item.a;
 
+      // Oversize llm tokens
       if (text.length > maxToken) {
         filterResult.overToken.push(item);
         return;
       }
 
       if (set.has(text)) {
-        console.log('repeat', item);
         filterResult.repeat.push(item);
       } else {
         filterResult.success.push(item);
@@ -182,6 +186,7 @@ export async function pushDataListToTrainingQueue({
         q: item.q,
         a: item.a,
         chunkIndex: item.chunkIndex ?? 0,
+        indexSize,
         weight: weight ?? 0,
         indexes: item.indexes,
         retryCount: 5
@@ -76,6 +76,7 @@ const TrainingDataSchema = new Schema({
     type: Number,
     default: 0
   },
+  indexSize: Number,
   weight: {
     type: Number,
     default: 0
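With `indexSize` stored on both the queue payload and the `TrainingData` schema, each queued item carries its own target index size. Below is a shape sketch assembled from the diff; optionality and exact types are partly assumed.

```ts
// Sketch of the queued training item implied by the diff; not the real schema type.
type TrainingQueueItemSketch = {
  q: string;
  a?: string;
  chunkIndex: number; // defaults to 0 in the diff
  indexSize?: number; // new per-item field added by this commit
  weight: number; // defaults to 0
  indexes?: { text: string }[];
  retryCount: number; // 5 in the diff
};

const itemSketch: TrainingQueueItemSketch = {
  q: 'What does indexSize control?',
  a: 'The target size of generated indexes, independent of chunkSize.',
  chunkIndex: 0,
  indexSize: 512,
  weight: 0,
  retryCount: 5
};
```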