Files
FastGPT/packages/global/core/dataset/training/utils.ts
Archer fdd4e9edbd Test parse cite and add tool call parallel (#4737)
* add quote response filter (#4727)

* chatting

* add quote response filter

* add test

* remove comment

* perf: cite hidden

* perf: format llm response

* feat: comment

* update default chunk size

* update default chunk size

---------

Co-authored-by: heheer <heheer@sealos.io>
2025-04-30 17:43:50 +08:00

137 lines
3.0 KiB
TypeScript

import { EmbeddingModelItemType, LLMModelItemType } from '../../../core/ai/model.d';
import {
ChunkSettingModeEnum,
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum
} from '../constants';
export const minChunkSize = 64; // min index and chunk size
// Chunk size
export const chunkAutoChunkSize = 1000;
export const getMaxChunkSize = (model: LLMModelItemType) => {
return Math.max(model.maxContext - model.maxResponse, 2000);
};
// QA
export const defaultMaxChunkSize = 8000;
export const getLLMDefaultChunkSize = (model?: LLMModelItemType) => {
if (!model) return defaultMaxChunkSize;
return Math.max(Math.min(model.maxContext - model.maxResponse, defaultMaxChunkSize), 2000);
};
export const getLLMMaxChunkSize = (model?: LLMModelItemType) => {
if (!model) return 8000;
return Math.max(model.maxContext - model.maxResponse, 2000);
};
// Index size
export const getMaxIndexSize = (model?: EmbeddingModelItemType) => {
return model?.maxToken || 512;
};
export const getAutoIndexSize = (model?: EmbeddingModelItemType) => {
return model?.defaultToken || 512;
};
const indexSizeSelectList = [
{
label: '64',
value: 64
},
{
label: '128',
value: 128
},
{
label: '256',
value: 256
},
{
label: '512',
value: 512
},
{
label: '768',
value: 768
},
{
label: '1024',
value: 1024
},
{
label: '1536',
value: 1536
},
{
label: '2048',
value: 2048
},
{
label: '3072',
value: 3072
},
{
label: '4096',
value: 4096
},
{
label: '5120',
value: 5120
},
{
label: '6144',
value: 6144
},
{
label: '7168',
value: 7168
},
{
label: '8192',
value: 8192
}
];
export const getIndexSizeSelectList = (max = 512) => {
return indexSizeSelectList.filter((item) => item.value <= max);
};
// Compute
export const computeChunkSize = (params: {
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
llmModel?: LLMModelItemType;
chunkSize?: number;
}) => {
if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return getLLMDefaultChunkSize(params.llmModel);
}
} else {
// chunk
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return chunkAutoChunkSize;
}
}
if (params.chunkSplitMode === DataChunkSplitModeEnum.char) {
return getLLMMaxChunkSize(params.llmModel);
}
return Math.min(params.chunkSize || chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
};
export const computeChunkSplitter = (params: {
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSplitter?: string;
}) => {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return undefined;
}
if (params.chunkSplitMode === DataChunkSplitModeEnum.size) {
return undefined;
}
return params.chunkSplitter;
};