Files
FastGPT/packages/global/core/dataset/training/utils.ts
Archer c51395b2c8 V4.12.0 features (#5435)
* add logs chart (#5352)

* charts

* chart data

* log chart

* delete

* rename api

* fix

* move api

* fix

* fix

* pro config

* fix

* feat: Repository interaction (#5356)

* feat: 1好像功能没问题了,明天再测

* feat: 2 解决了昨天遗留的bug,但全选按钮又bug了

* feat: 3 第三版,解决了全选功能bug

* feat: 4 第四版,下面改小细节

* feat: 5 我勒个痘

* feat: 6

* feat: 6 pr

* feat: 7

* feat: 8

* feat: 9

* feat: 10

* feat: 11

* feat: 12

* perf: checkbox ui

* refactor: tweak login layout (#5357)

Co-authored-by: Archer <545436317@qq.com>

* login ui

* app chat log chart pro display (#5392)

* app chat log chart pro display

* add canopen props

* perf: pro tag tip

* perf: pro tag tip

* feat: openrouter provider (#5406)

* perf: login ui

* feat: openrouter provider

* provider

* perf: custom error throw

* perf: emb batch (#5407)

* perf: emb batch

* perf: vector retry

* doc

* doc (#5411)

* doc

* fix: team folder will add to workflow

* fix: generateToc shell

* Tool price (#5376)

* resolve conflicts for cherry-pick

* fix i18n

* Enhance system plugin template data structure and update ToolSelectModal to include CostTooltip component

* refactor: update systemKeyCost type to support array of objects in plugin and workflow types

* refactor: simplify systemKeyCost type across plugin and workflow types to a single number

* refactor: streamline systemKeyCost handling in plugin and workflow components

* fix

* fix

* perf: toolset price config;fix: workflow array selector ui (#5419)

* fix: workflow array selector ui

* update default model tip

* perf: toolset price config

* doc

* fix: test

* Refactor/chat (#5418)

* refactor: add homepage configuration; add home chat page; add side bar animated collapse and layout

* fix: fix lint rules

* chore: improve logics and code

* chore: more clearer logics

* chore: adjust api

---------

Co-authored-by: Archer <545436317@qq.com>

* perf: chat setting code

* del history

* logo image

* perf: home chat ui

* feat: enhance chat response handling with external links and user info (#5427)

* feat: enhance chat response handling with external links and user info

* fix

* cite code

* perf: toolset add in workflow

* fix: test

* fix: search parentId

* Fix/chat (#5434)

* wip: rebase了upstream

* wip: adapt mobile UI

* fix: fix chat page logic and UI

* fix: fix UI and improve some logics

* fix: model selector missing logo; vision model to retrieve file

* perf: role selector

* fix: chat ui

* optimize export app chat log (#5436)

* doc

* chore: move components to proper directory; fix the api to get app list (#5437)

* chore: improve team app panel display form (#5438)

* feat: add home chat log tab

* chore: improve team app panel display form

* chore: improve log panel

* fix: spec

* doc

* fix: log permission

* fix: dataset schema required

* add loading status

* remove ui weight

* manage log

* fix: log detail per

* doc

* fix: log menu

* rename permission

* bg color

* fix: app log per

* fix: log key selector

* fix: log

* doc

---------

Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: colnii <1286949794@qq.com>
Co-authored-by: 伍闲犬 <76519998+xqvvu@users.noreply.github.com>
Co-authored-by: Ctrlz <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: 伍闲犬 <whoeverimf5@gmail.com>
Co-authored-by: heheer <heheer@sealos.io>
2025-08-12 22:22:18 +08:00

184 lines
4.5 KiB
TypeScript

import { getEmbeddingModel } from '../../../../service/core/ai/model';
import { type EmbeddingModelItemType, type LLMModelItemType } from '../../../core/ai/model.d';
import {
ChunkSettingModeEnum,
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum,
ParagraphChunkAIModeEnum
} from '../constants';
import type { ChunkSettingsType } from '../type';
import { cloneDeep } from 'lodash';
// Smallest size accepted for both an index and a chunk.
export const minChunkSize = 64; // min index and chunk size
// Chunk size
// Chunk size applied by auto mode for non-QA training (see computedCollectionChunkSettings).
export const chunkAutoChunkSize = 1000;
// Largest chunk size a given LLM can take: the context window minus the
// response budget, floored at 2000.
export const getMaxChunkSize = (model: LLMModelItemType) => {
  const available = model.maxContext - model.maxResponse;
  return available > 2000 ? available : 2000;
};
// QA
// Fallback/ceiling chunk size for QA training when clamping to the LLM's context.
export const defaultMaxChunkSize = 8000;
// Default QA chunk size: the model's usable context (maxContext - maxResponse)
// clamped into [2000, defaultMaxChunkSize]; defaultMaxChunkSize when no model
// is given.
export const getLLMDefaultChunkSize = (model?: LLMModelItemType) => {
  if (!model) return defaultMaxChunkSize;
  const usable = model.maxContext - model.maxResponse;
  const capped = usable < defaultMaxChunkSize ? usable : defaultMaxChunkSize;
  return capped < 2000 ? 2000 : capped;
};
// Hard ceiling on chunk size: the LLM's full context window, floored at 4000;
// 8000 when no model is provided.
export const getLLMMaxChunkSize = (model?: LLMModelItemType) => {
  if (!model) return 8000;
  return model.maxContext > 4000 ? model.maxContext : 4000;
};
// Index size
// Maximum index token size of an embedding model (looked up by id when a
// string is passed); falls back to 512 when unknown.
export const getMaxIndexSize = (model?: EmbeddingModelItemType | string) => {
  if (!model) return 512;
  const resolved = typeof model === 'string' ? getEmbeddingModel(model) : model;
  return resolved?.maxToken || 512;
};
// Default (auto) index token size of an embedding model (looked up by id when
// a string is passed); falls back to 512 when unknown.
export const getAutoIndexSize = (model?: EmbeddingModelItemType | string) => {
  if (!model) return 512;
  const resolved = typeof model === 'string' ? getEmbeddingModel(model) : model;
  return resolved?.defaultToken || 512;
};
// Options for the index-size selector; labels are the decimal string of each
// value.
const indexSizeSelectList = [
  64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096, 5120, 6144, 7168, 8192
].map((value) => ({ label: String(value), value }));
// Selector options whose value does not exceed `max` (default 512).
export const getIndexSizeSelectList = (max = 512) =>
  indexSizeSelectList.filter(({ value }) => value <= max);
// Compute
/**
 * Normalize a collection's chunk settings against the configured LLM and
 * embedding models.
 *
 * - Strips `qaPrompt` unless the training type is QA.
 * - In auto mode, overwrites the split settings with fixed paragraph defaults
 *   and model-derived chunk/index sizes.
 * - In custom mode, clamps a user-provided `chunkSize` to the LLM's maximum
 *   and zeroes `paragraphChunkDeep` when not splitting by paragraph.
 *
 * Returns a deep clone of the incoming settings (minus `llmModel` /
 * `vectorModel`); the input object is never mutated.
 */
export const computedCollectionChunkSettings = <T extends ChunkSettingsType>({
  llmModel,
  vectorModel,
  ...data
}: {
  llmModel?: LLMModelItemType;
  vectorModel?: EmbeddingModelItemType;
} & T) => {
  const {
    trainingType = DatasetCollectionDataProcessModeEnum.chunk,
    chunkSettingMode = ChunkSettingModeEnum.auto,
    chunkSplitMode,
    chunkSize,
    paragraphChunkDeep = 5,
    indexSize
  } = data;
  const cloneChunkSettings = cloneDeep(data);

  // qaPrompt is only meaningful for QA training.
  if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
    delete cloneChunkSettings.qaPrompt;
  }

  // Format training type indexSize/chunkSize.
  // NOTE(review): the original `else if (autoIndexes)` branch returned an
  // object identical to the final `else` branch, so the two are merged here
  // and the unused `autoIndexes` flag is no longer read.
  const trainingModeSize: {
    autoChunkSize: number;
    autoIndexSize: number;
    chunkSize?: number;
    indexSize?: number;
  } =
    trainingType === DatasetCollectionDataProcessModeEnum.qa
      ? {
          autoChunkSize: getLLMDefaultChunkSize(llmModel),
          autoIndexSize: getMaxIndexSize(vectorModel),
          chunkSize,
          indexSize: getMaxIndexSize(vectorModel)
        }
      : {
          autoChunkSize: chunkAutoChunkSize,
          autoIndexSize: getAutoIndexSize(vectorModel),
          chunkSize,
          indexSize
        };

  if (chunkSettingMode === ChunkSettingModeEnum.auto) {
    // Auto mode: fixed paragraph splitting with model-derived sizes.
    cloneChunkSettings.chunkSplitMode = DataChunkSplitModeEnum.paragraph;
    cloneChunkSettings.paragraphChunkAIMode = ParagraphChunkAIModeEnum.forbid;
    cloneChunkSettings.paragraphChunkDeep = 5;
    cloneChunkSettings.paragraphChunkMinSize = 100;
    cloneChunkSettings.chunkSize = trainingModeSize.autoChunkSize;
    cloneChunkSettings.indexSize = trainingModeSize.autoIndexSize;
    cloneChunkSettings.chunkSplitter = undefined;
  } else {
    // Custom mode: paragraph depth only applies to paragraph splitting.
    cloneChunkSettings.paragraphChunkDeep =
      chunkSplitMode === DataChunkSplitModeEnum.paragraph ? paragraphChunkDeep : 0;
    // Clamp the user-provided chunk size to the LLM's maximum; a falsy size
    // (unset or 0) stays undefined. (The original `?? chunkAutoChunkSize`
    // inside Math.min was dead code behind this truthiness guard.)
    cloneChunkSettings.chunkSize = trainingModeSize.chunkSize
      ? Math.min(trainingModeSize.chunkSize, getLLMMaxChunkSize(llmModel))
      : undefined;
    cloneChunkSettings.indexSize = trainingModeSize.indexSize;
  }

  return cloneChunkSettings;
};