perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split

* update computed max size

* perf: i18n

* remove table
Archer
2025-05-26 18:57:22 +08:00
committed by GitHub
parent 874300a56a
commit c25cd48e72
23 changed files with 859 additions and 164 deletions

View File

@@ -100,8 +100,6 @@ const WebsiteConfigModal = ({
paragraphChunkDeep: chunkSettings?.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
paragraphChunkMinSize:
chunkSettings?.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
paragraphChunkMaxSize:
chunkSettings?.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,
chunkSize: chunkSettings?.chunkSize || defaultFormData.chunkSize,

View File

@@ -17,10 +17,8 @@ import {
} from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import type {
ChunkTriggerConfigTypeEnum,
ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants';
import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';
import {
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum,
@@ -108,7 +106,6 @@ export type CollectionChunkFormType = {
paragraphChunkAIMode: ParagraphChunkAIModeEnum;
paragraphChunkDeep: number; // Paragraph deep
paragraphChunkMinSize: number; // Paragraph min size, if too small, it will merge
paragraphChunkMaxSize: number; // Paragraph max size, if too large, it will split
// Size split
chunkSize: number;
// Char split
@@ -130,6 +127,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
const { setValue, register, watch, getValues } = form;
const trainingType = watch('trainingType');
const chunkTriggerType = watch('chunkTriggerType');
const chunkSettingMode = watch('chunkSettingMode');
const chunkSplitMode = watch('chunkSplitMode');
const autoIndexes = watch('autoIndexes');
@@ -151,6 +149,14 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
}));
}, [t]);
// Chunk trigger
const chunkTriggerSelectList = [
{ label: t('dataset:chunk_trigger_min_size'), value: ChunkTriggerConfigTypeEnum.minSize },
{ label: t('dataset:chunk_trigger_max_size'), value: ChunkTriggerConfigTypeEnum.maxSize },
{ label: t('dataset:chunk_trigger_force_chunk'), value: ChunkTriggerConfigTypeEnum.forceChunk }
];
// Form max or min value
const {
maxChunkSize,
minChunkSize: minChunkSizeValue,
@@ -189,14 +195,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
{ label: '=====', value: '=====' },
{ label: t('dataset:split_sign_custom'), value: 'Other' }
];
const [customListSelectValue, setCustomListSelectValue] = useState(getValues('chunkSplitter'));
useEffect(() => {
if (customListSelectValue === 'Other') {
setValue('chunkSplitter', '');
} else {
setValue('chunkSplitter', customListSelectValue);
}
}, [customListSelectValue, setValue]);
const [customListSelectValue, setCustomListSelectValue] = useState(
customSplitList.some((item) => item.value === getValues('chunkSplitter'))
? getValues('chunkSplitter')
: 'Other'
);
// Index size
const indexSizeSeletorList = useMemo(() => getIndexSizeSelectList(maxIndexSize), [maxIndexSize]);
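
The splitter selector change above replaces a useEffect that echoed every select-state change back into `chunkSplitter` with a one-time initializer: the select now starts from whatever splitter is stored, falling back to 'Other' when the stored value is not one of the presets, and the form value is only written inside the onChange handler shown later in this file's diff. A minimal sketch of that resolution step, with `resolveSelectValue` as a hypothetical helper (the real code inlines this in `useState`):

```ts
// Preset values offered by the select; the real customSplitList has more
// entries, only '=====' and 'Other' are visible in this hunk.
const presetSplitters = ['=====', 'Other'];

// Hypothetical helper mirroring the new useState initializer: keep the stored
// splitter when it matches a preset, otherwise select 'Other' so the user's
// custom separator is not overwritten on mount.
function resolveSelectValue(storedSplitter: string): string {
  return presetSplitters.includes(storedSplitter) ? storedSplitter : 'Other';
}

resolveSelectValue('====='); // '====='
resolveSelectValue('---');   // 'Other' — chunkSplitter keeps '---'
```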
@@ -243,6 +246,41 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
gridTemplateColumns={'repeat(2, 1fr)'}
/>
</Box>
{trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
<Box mt={6}>
<HStack fontSize={'sm'} mb={2} color={'myGray.600'} spacing={1}>
<Box>{t('dataset:chunk_trigger')}</Box>
<QuestionTip label={t('dataset:chunk_trigger_tips')} />
</HStack>
<HStack>
<Box flex={'1 0 0'} h={'34px'}>
<MySelect
borderRadius={'md'}
list={chunkTriggerSelectList}
value={chunkTriggerType}
onChange={(e) => {
setValue('chunkTriggerType', e);
}}
/>
</Box>
{chunkTriggerType === ChunkTriggerConfigTypeEnum.minSize && (
<Box flex={'1 0 0'}>
<MyNumberInput
h={'34px'}
bg={'white'}
min={100}
max={100000}
register={register}
name={'chunkTriggerMinSize'}
step={100}
/>
</Box>
)}
</HStack>
</Box>
)}
{trainingType === DatasetCollectionDataProcessModeEnum.chunk &&
feConfigs?.show_dataset_enhance !== false && (
<Box mt={6}>
@@ -287,7 +325,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
)}
<Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:params_setting')}
{t('dataset:chunk_process_params')}
</Box>
<LeftRadio<ChunkSettingModeEnum>
list={[
@@ -305,6 +343,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
<Box>
<RadioGroup<DataChunkSplitModeEnum>
list={[
{
title: t('dataset:paragraph_split'),
value: DataChunkSplitModeEnum.paragraph,
tooltip: t('dataset:paragraph_split_tip')
},
{
title: t('dataset:split_chunk_size'),
value: DataChunkSplitModeEnum.size
@@ -321,30 +364,76 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
}}
/>
{chunkSplitMode === DataChunkSplitModeEnum.size && (
<Box
mt={1.5}
css={{
'& > span': {
display: 'block'
}
}}
>
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', {
min: minChunkSizeValue,
max: maxChunkSize
})}
>
{chunkSplitMode === DataChunkSplitModeEnum.paragraph && (
<>
<Box mt={1.5}>
<Box>{t('dataset:paragraph_max_deep')}</Box>
<MyNumberInput
register={register}
name={'chunkSize'}
min={minChunkSizeValue}
max={maxChunkSize}
size={'sm'}
step={100}
bg={'myGray.50'}
register={register}
name={'paragraphChunkDeep'}
min={1}
max={8}
step={1}
h={'32px'}
/>
</MyTooltip>
</Box>
<Box mt={1.5}>
<Box>{t('dataset:max_chunk_size')}</Box>
<Box
css={{
'& > span': {
display: 'block'
}
}}
>
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', {
min: minChunkSizeValue,
max: maxChunkSize
})}
>
<MyNumberInput
register={register}
name={'chunkSize'}
min={minChunkSizeValue}
max={maxChunkSize}
size={'sm'}
step={100}
/>
</MyTooltip>
</Box>
</Box>
</>
)}
{chunkSplitMode === DataChunkSplitModeEnum.size && (
<Box mt={1.5}>
<Box>{t('dataset:chunk_size')}</Box>
<Box
css={{
'& > span': {
display: 'block'
}
}}
>
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', {
min: minChunkSizeValue,
max: maxChunkSize
})}
>
<MyNumberInput
register={register}
name={'chunkSize'}
min={minChunkSizeValue}
max={maxChunkSize}
size={'sm'}
step={100}
/>
</MyTooltip>
</Box>
</Box>
)}
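
The paragraph mode surfaced here exposes two knobs: `paragraphChunkDeep` (1–8, the paragraph_max_deep field) and a maximum chunk size that reuses the existing `chunkSize` field. A minimal sketch of what depth-limited paragraph splitting could look like, assuming it groups markdown text under headings up to the configured depth; `splitByHeadingDepth` only illustrates the setting's intent and is not FastGPT's actual splitter:

```ts
// Illustrative only: split markdown into paragraphs at headings whose level
// is <= maxDeep (paragraphChunkDeep), keeping deeper headings inside a chunk.
function splitByHeadingDepth(markdown: string, maxDeep: number): string[] {
  const lines = markdown.split('\n');
  const chunks: string[] = [];
  let current: string[] = [];

  for (const line of lines) {
    const match = line.match(/^(#{1,6})\s/);
    const level = match ? match[1].length : Infinity;

    if (level <= maxDeep && current.length > 0) {
      chunks.push(current.join('\n'));
      current = [];
    }
    current.push(line);
  }
  if (current.length > 0) chunks.push(current.join('\n'));

  return chunks;
}

// With paragraphChunkDeep = 5 (the new default), '#' through '#####' start a
// new chunk, while '######' sections stay attached to their parent text.
splitByHeadingDepth('# A\ntext\n## B\nmore\n###### note\ntail', 5);
// -> ['# A\ntext', '## B\nmore\n###### note\ntail']
```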
@@ -358,6 +447,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
value={customListSelectValue}
h={'32px'}
onChange={(val) => {
if (val === 'Other') {
setValue('chunkSplitter', '');
} else {
setValue('chunkSplitter', val);
}
setCustomListSelectValue(val);
}}
/>
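
One small but necessary change at the top of this file's diff: `ChunkTriggerConfigTypeEnum` moved out of the `import type` clause because the new `chunkTriggerSelectList` reads its members at runtime, and type-only imports are erased during compilation. A minimal sketch of the difference:

```ts
// import type { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';
// const v = ChunkTriggerConfigTypeEnum.minSize;
// -> error: 'ChunkTriggerConfigTypeEnum' cannot be used as a value because it
//    was imported using 'import type'.

// A value import keeps the enum object available at runtime:
import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';

const defaultTrigger = ChunkTriggerConfigTypeEnum.minSize; // ok
```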

View File

@@ -51,11 +51,10 @@ export const defaultFormData: ImportFormType = {
autoIndexes: false,
chunkSettingMode: ChunkSettingModeEnum.auto,
chunkSplitMode: DataChunkSplitModeEnum.size,
chunkSplitMode: DataChunkSplitModeEnum.paragraph,
paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
paragraphChunkDeep: 4,
paragraphChunkDeep: 5,
paragraphChunkMinSize: 100,
paragraphChunkMaxSize: chunkAutoChunkSize,
chunkSize: chunkAutoChunkSize,
chunkSplitter: '',

View File

@@ -8,10 +8,8 @@ import { useRouter } from 'next/router';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { getDatasetCollectionById } from '@/web/core/dataset/api';
import MyBox from '@fastgpt/web/components/common/MyBox';
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
import { Box } from '@chakra-ui/react';
import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
const Upload = dynamic(() => import('../commonProgress/Upload'));
@@ -68,8 +66,6 @@ const ReTraining = () => {
paragraphChunkDeep: collection.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
paragraphChunkMinSize:
collection.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
paragraphChunkMaxSize:
collection.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,
chunkSize: collection.chunkSize || defaultFormData.chunkSize,
@@ -85,11 +81,13 @@ const ReTraining = () => {
return (
<MyBox isLoading={loading} h={'100%'}>
<Box h={'100%'} overflow={'auto'}>
{activeStep === 0 && <DataProcess />}
{activeStep === 1 && <PreviewData />}
{activeStep === 2 && <Upload />}
</Box>
{!loading && (
<Box h={'100%'} overflow={'auto'}>
{activeStep === 0 && <DataProcess />}
{activeStep === 1 && <PreviewData />}
{activeStep === 2 && <Upload />}
</Box>
)}
</MyBox>
);
};
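
The re-training page now keeps the step components unmounted until the collection request finishes, presumably to avoid rendering the wizard with stale defaults before the collection data is available. A minimal sketch of the gating pattern with placeholder step components (the real page renders DataProcess, PreviewData and Upload):

```tsx
import React from 'react';

// Placeholder props/components; only the `!loading` gate matters here.
type Props = { loading: boolean; activeStep: number };

const StepsSketch = ({ loading, activeStep }: Props) => (
  <>
    {/* Nothing mounts until the collection data has arrived, so child forms
        initialize from the fetched values instead of resetting afterwards. */}
    {!loading && (
      <>
        {activeStep === 0 && <div>DataProcess</div>}
        {activeStep === 1 && <div>PreviewData</div>}
        {activeStep === 2 && <div>Upload</div>}
      </>
    )}
  </>
);

export default StepsSketch;
```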

View File

@@ -1,7 +1,6 @@
import {
type ChunkSettingModeEnum,
type DataChunkSplitModeEnum,
type DatasetCollectionDataProcessModeEnum
ChunkSettingModeEnum,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
@@ -16,25 +15,21 @@ import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import {
computeChunkSize,
computeChunkSplitter,
computeParagraphChunkDeep,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { getLLMModel } from '@fastgpt/service/core/ai/model';
import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
export type PostPreviewFilesChunksProps = {
export type PostPreviewFilesChunksProps = ChunkSettingsType & {
datasetId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
customPdfParse?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
// Chunk settings
chunkSettingMode: ChunkSettingModeEnum;
chunkSplitMode: DataChunkSplitModeEnum;
chunkSize: number;
chunkSplitter?: string;
overlapRatio: number;
// Read params
@@ -57,9 +52,15 @@ async function handler(
sourceId,
customPdfParse = false,
trainingType,
chunkSettingMode,
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
chunkTriggerType,
chunkTriggerMinSize,
chunkSettingMode = ChunkSettingModeEnum.auto,
chunkSplitMode,
paragraphChunkDeep,
paragraphChunkMinSize,
chunkSize,
chunkSplitter,
@@ -103,12 +104,16 @@ async function handler(
chunkSize,
llmModel: getLLMModel(dataset.agentModel)
});
chunkSplitter = computeChunkSplitter({
chunkSettingMode,
chunkSplitMode,
chunkSplitter
});
paragraphChunkDeep = computeParagraphChunkDeep({
chunkSettingMode,
chunkSplitMode,
paragraphChunkDeep
});
const { rawText } = await readDatasetSourceRawText({
teamId,
@@ -125,7 +130,11 @@ async function handler(
const chunks = rawText2Chunks({
rawText,
chunkTriggerType,
chunkTriggerMinSize,
chunkSize,
paragraphChunkDeep,
paragraphChunkMinSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio,
customReg: chunkSplitter ? [chunkSplitter] : []
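
On the server side, the preview endpoint now accepts the full `ChunkSettingsType` and normalizes the paragraph depth with the new `computeParagraphChunkDeep` helper before handing everything, including `chunkTriggerType` and `chunkTriggerMinSize`, to `rawText2Chunks`. The diff only shows the call site, so the following is a guess at the helper's shape rather than the real implementation, assuming auto mode and non-paragraph split modes fall back to fixed depths:

```ts
import {
  ChunkSettingModeEnum,
  DataChunkSplitModeEnum
} from '@fastgpt/global/core/dataset/constants';

// Sketch only: the real computeParagraphChunkDeep lives in
// @fastgpt/global/core/dataset/training/utils and may differ.
export function computeParagraphChunkDeepSketch({
  chunkSettingMode,
  chunkSplitMode,
  paragraphChunkDeep = 5
}: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  paragraphChunkDeep?: number;
}): number {
  // Auto mode: use a default depth regardless of what the client sent.
  if (chunkSettingMode === ChunkSettingModeEnum.auto) return 5;
  // Custom mode with paragraph split: respect the configured depth.
  if (chunkSplitMode === DataChunkSplitModeEnum.paragraph) return paragraphChunkDeep;
  // Other split modes do not split by headings at all.
  return 0;
}
```

The chunk trigger fields passed alongside it presumably let `rawText2Chunks` skip splitting entirely when the raw text is shorter than `chunkTriggerMinSize`, or always split under `forceChunk`.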

View File

@@ -16,7 +16,6 @@ import type {
ApiDatasetCreateDatasetCollectionParams,
CreateDatasetCollectionParams,
CreateDatasetCollectionTagParams,
CsvTableCreateDatasetCollectionParams,
DatasetUpdateBody,
ExternalFileCreateDatasetCollectionParams,
FileIdCreateDatasetCollectionParams,