feat: chunk index independent config (#4271)

* sync collection

* remove lock

* feat: chunk index independent config

* feat: add max chunksize to split chunk function

* remove log

* update doc

* remove

* remove log
This commit is contained in:
Archer
2025-03-21 16:44:25 +08:00
committed by archer
parent 222ff0d49a
commit e812ad6e84
47 changed files with 784 additions and 443 deletions

View File

@@ -1,5 +1,10 @@
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
import {
DatasetCollectionTypeEnum,
DatasetCollectionDataProcessModeEnum,
ChunkSettingModeEnum,
DataChunkSplitModeEnum
} from './constants';
import type { LLMModelItemType } from '../ai/model.d';
import { ParentIdType } from 'common/parentFolder/type';
@@ -33,7 +38,13 @@ export type DatasetCollectionChunkMetadataType = {
trainingType?: DatasetCollectionDataProcessModeEnum;
imageIndex?: boolean;
autoIndexes?: boolean;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSize?: number;
indexSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
metadata?: Record<string, any>;

View File

@@ -129,6 +129,16 @@ export const DatasetCollectionDataProcessModeMap = {
}
};
export enum ChunkSettingModeEnum {
auto = 'auto',
custom = 'custom'
}
export enum DataChunkSplitModeEnum {
size = 'size',
char = 'char'
}
/* ------------ data -------------- */
/* ------------ training -------------- */

View File

@@ -13,6 +13,7 @@ export type CreateDatasetDataProps = {
export type UpdateDatasetDataProps = {
dataId: string;
q?: string;
a?: string;
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & {

View File

@@ -15,6 +15,8 @@ export type PushDataToTrainingQueueProps = {
vectorModel: string;
vlmModel?: string;
indexSize?: number;
billId?: string;
session?: ClientSession;
};

View File

@@ -0,0 +1,136 @@
import { EmbeddingModelItemType, LLMModelItemType } from '../../../core/ai/model.d';
import {
ChunkSettingModeEnum,
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum
} from '../constants';
export const minChunkSize = 64; // min index and chunk size
// Chunk size
export const chunkAutoChunkSize = 1500;
export const getMaxChunkSize = (model: LLMModelItemType) => {
return Math.max(model.maxContext - model.maxResponse, 2000);
};
// QA
export const defaultMaxChunkSize = 8000;
export const getLLMDefaultChunkSize = (model?: LLMModelItemType) => {
if (!model) return defaultMaxChunkSize;
return Math.max(Math.min(model.maxContext - model.maxResponse, defaultMaxChunkSize), 2000);
};
export const getLLMMaxChunkSize = (model?: LLMModelItemType) => {
if (!model) return 8000;
return Math.max(model.maxContext - model.maxResponse, 2000);
};
// Index size
export const getMaxIndexSize = (model?: EmbeddingModelItemType) => {
return model?.maxToken || 512;
};
export const getAutoIndexSize = (model?: EmbeddingModelItemType) => {
return model?.defaultToken || 512;
};
const indexSizeSelectList = [
{
label: '64',
value: 64
},
{
label: '128',
value: 128
},
{
label: '256',
value: 256
},
{
label: '512',
value: 512
},
{
label: '768',
value: 768
},
{
label: '1024',
value: 1024
},
{
label: '1536',
value: 1536
},
{
label: '2048',
value: 2048
},
{
label: '3072',
value: 3072
},
{
label: '4096',
value: 4096
},
{
label: '5120',
value: 5120
},
{
label: '6144',
value: 6144
},
{
label: '7168',
value: 7168
},
{
label: '8192',
value: 8192
}
];
export const getIndexSizeSelectList = (max = 512) => {
return indexSizeSelectList.filter((item) => item.value <= max);
};
// Compute
export const computeChunkSize = (params: {
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
llmModel?: LLMModelItemType;
chunkSize?: number;
}) => {
if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return getLLMDefaultChunkSize(params.llmModel);
}
} else {
// chunk
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return chunkAutoChunkSize;
}
}
if (params.chunkSplitMode === DataChunkSplitModeEnum.char) {
return getLLMMaxChunkSize(params.llmModel);
}
return Math.min(params.chunkSize || chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
};
export const computeChunkSplitter = (params: {
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSplitter?: string;
}) => {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return undefined;
}
if (params.chunkSplitMode === DataChunkSplitModeEnum.size) {
return undefined;
}
return params.chunkSplitter;
};

View File

@@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
import { PermissionTypeEnum } from '../../support/permission/constant';
import { PushDatasetDataChunkProps } from './api';
import {
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum,
DatasetStatusEnum,
@@ -14,6 +15,7 @@ import { Permission } from '../../support/permission/controller';
import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
import { SourceMemberType } from 'support/user/type';
import { DatasetDataIndexTypeEnum } from './data/constants';
import { ChunkSettingModeEnum } from './constants';
export type DatasetSchemaType = {
_id: string;
@@ -88,7 +90,12 @@ export type DatasetCollectionSchemaType = {
autoIndexes?: boolean;
imageIndex?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSize: number;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSize?: number;
indexSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
};

View File

@@ -1,7 +1,6 @@
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { getFileIcon } from '../../common/file/icon';
import { strIsLink } from '../../common/string/tools';
import { DatasetDataIndexTypeEnum } from './data/constants';
export function getCollectionIcon(
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
@@ -38,26 +37,6 @@ export function getSourceNameIcon({
return 'file/fill/file';
}
/* get dataset data default index */
export function getDefaultIndex(props?: { q?: string; a?: string }) {
const { q = '', a } = props || {};
return [
{
text: q,
type: DatasetDataIndexTypeEnum.default
},
...(a
? [
{
text: a,
type: DatasetDataIndexTypeEnum.default
}
]
: [])
];
}
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
if (mode === TrainingModeEnum.qa) return data.length * 20;
if (mode === TrainingModeEnum.auto) return data.length * 5;