mirror of https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00

perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split
* update max size computed
* perf: i18n
* remove table
@@ -100,8 +100,6 @@ const WebsiteConfigModal = ({
      paragraphChunkDeep: chunkSettings?.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
      paragraphChunkMinSize:
        chunkSettings?.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
      paragraphChunkMaxSize:
        chunkSettings?.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,

      chunkSize: chunkSettings?.chunkSize || defaultFormData.chunkSize,
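A note on the fallback pattern in this hunk: `||` treats every falsy value as unset, so a saved `0` silently falls back to the default. That is harmless here (a zero size is not meaningful), but `??` is the operator to reach for when falsy values must survive. A minimal sketch of the difference, with illustrative values:

```ts
// Illustrative only: how || and ?? differ when merging a saved
// setting with a default. A saved 0 survives ??, but not ||.
const saved: { paragraphChunkMinSize?: number } = { paragraphChunkMinSize: 0 };
const defaults = { paragraphChunkMinSize: 100 };

const viaOr = saved.paragraphChunkMinSize || defaults.paragraphChunkMinSize; // 100
const viaNullish = saved.paragraphChunkMinSize ?? defaults.paragraphChunkMinSize; // 0

console.log(viaOr, viaNullish);
```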
@@ -17,10 +17,8 @@ import {
  } from '@chakra-ui/react';
  import MyIcon from '@fastgpt/web/components/common/Icon';
  import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
- import type {
-   ChunkTriggerConfigTypeEnum,
-   ParagraphChunkAIModeEnum
- } from '@fastgpt/global/core/dataset/constants';
+ import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
+ import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';
  import {
    DataChunkSplitModeEnum,
    DatasetCollectionDataProcessModeEnum,
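The import reshuffle above is not cosmetic: `ChunkTriggerConfigTypeEnum` is now read at runtime (its members populate the new trigger select list below), and a type-only import is erased at compile time, so the enum has to move to a value import. `ParagraphChunkAIModeEnum` stays type-only because it is still used purely in type positions. The distinction in miniature:

```ts
// Type-only imports are erased during compilation and cannot supply
// runtime values; an enum used as a value needs a regular import.
import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';

let aiMode: ParagraphChunkAIModeEnum; // OK: type position only
const trigger = ChunkTriggerConfigTypeEnum.minSize; // OK: value position needs the value import
```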
@@ -108,7 +106,6 @@ export type CollectionChunkFormType = {
    paragraphChunkAIMode: ParagraphChunkAIModeEnum;
    paragraphChunkDeep: number; // Paragraph depth
    paragraphChunkMinSize: number; // Paragraph min size; if too small, it will be merged
    paragraphChunkMaxSize: number; // Paragraph max size; if too large, it will be split

    // Size split
    chunkSize: number;
    // Char split
@@ -130,6 +127,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
    const { setValue, register, watch, getValues } = form;

    const trainingType = watch('trainingType');
+   const chunkTriggerType = watch('chunkTriggerType');
    const chunkSettingMode = watch('chunkSettingMode');
    const chunkSplitMode = watch('chunkSplitMode');
    const autoIndexes = watch('autoIndexes');
@@ -151,6 +149,14 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
      }));
    }, [t]);

+   // Chunk trigger
+   const chunkTriggerSelectList = [
+     { label: t('dataset:chunk_trigger_min_size'), value: ChunkTriggerConfigTypeEnum.minSize },
+     { label: t('dataset:chunk_trigger_max_size'), value: ChunkTriggerConfigTypeEnum.maxSize },
+     { label: t('dataset:chunk_trigger_force_chunk'), value: ChunkTriggerConfigTypeEnum.forceChunk }
+   ];
+
    // Form max or min value
    const {
      maxChunkSize,
      minChunkSize: minChunkSizeValue,
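The three trigger options map to three chunking policies: split only past a minimum size, split against a maximum size, or always split. The enum definition itself is not part of this diff; a plausible shape, where the member names come from this hunk but the string values are an assumption:

```ts
// Sketch of the enum referenced above. Only the member names appear
// in this diff; the string values are assumed for illustration.
enum ChunkTriggerConfigTypeEnum {
  minSize = 'minSize', // keep short texts whole; split once they exceed a minimum size
  maxSize = 'maxSize', // split relative to a maximum size
  forceChunk = 'forceChunk' // always split, regardless of text length
}
```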
@@ -189,14 +195,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
      { label: '=====', value: '=====' },
      { label: t('dataset:split_sign_custom'), value: 'Other' }
    ];
-   const [customListSelectValue, setCustomListSelectValue] = useState(getValues('chunkSplitter'));
-   useEffect(() => {
-     if (customListSelectValue === 'Other') {
-       setValue('chunkSplitter', '');
-     } else {
-       setValue('chunkSplitter', customListSelectValue);
-     }
-   }, [customListSelectValue, setValue]);
+   const [customListSelectValue, setCustomListSelectValue] = useState(
+     customSplitList.some((item) => item.value === getValues('chunkSplitter'))
+       ? getValues('chunkSplitter')
+       : 'Other'
+   );

    // Index size
    const indexSizeSeletorList = useMemo(() => getIndexSizeSelectList(maxIndexSize), [maxIndexSize]);
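This hunk replaces a `useState` + `useEffect` pair with a single computed initializer. The old version seeded the selector with the raw `chunkSplitter` value and relied on an effect to write it back into the form, which ran on mount and could clear a saved splitter; it also had no way to show 'Other' for a custom splitter loaded from storage. The new version classifies the saved value once, and the write-back moves into the select's `onChange` (see the later hunk). The pattern in isolation, with an illustrative preset list:

```ts
// Pattern in isolation: derive local UI state from persisted form
// state once, at mount, instead of syncing it back with an effect.
import { useState } from 'react';

const presetSplitters = ['\n\n', '====='] as const; // illustrative presets

export function useSplitterSelect(savedSplitter: string) {
  // If the saved splitter matches a preset, select it directly;
  // anything else falls into the 'Other' (custom input) branch.
  const [selected, setSelected] = useState<string>(
    (presetSplitters as readonly string[]).includes(savedSplitter) ? savedSplitter : 'Other'
  );
  return { selected, setSelected };
}
```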
@@ -243,6 +246,41 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
          gridTemplateColumns={'repeat(2, 1fr)'}
        />
      </Box>

+     {trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
+       <Box mt={6}>
+         <HStack fontSize={'sm'} mb={2} color={'myGray.600'} spacing={1}>
+           <Box>{t('dataset:chunk_trigger')}</Box>
+           <QuestionTip label={t('dataset:chunk_trigger_tips')} />
+         </HStack>
+         <HStack>
+           <Box flex={'1 0 0'} h={'34px'}>
+             <MySelect
+               borderRadius={'md'}
+               list={chunkTriggerSelectList}
+               value={chunkTriggerType}
+               onChange={(e) => {
+                 setValue('chunkTriggerType', e);
+               }}
+             />
+           </Box>
+           {chunkTriggerType === ChunkTriggerConfigTypeEnum.minSize && (
+             <Box flex={'1 0 0'}>
+               <MyNumberInput
+                 h={'34px'}
+                 bg={'white'}
+                 min={100}
+                 max={100000}
+                 register={register}
+                 name={'chunkTriggerMinSize'}
+                 step={100}
+               />
+             </Box>
+           )}
+         </HStack>
+       </Box>
+     )}
+
      {trainingType === DatasetCollectionDataProcessModeEnum.chunk &&
        feConfigs?.show_dataset_enhance !== false && (
          <Box mt={6}>
@@ -287,7 +325,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
      )}
      <Box mt={6}>
        <Box fontSize={'sm'} mb={2} color={'myGray.600'}>
-         {t('dataset:params_setting')}
+         {t('dataset:chunk_process_params')}
        </Box>
        <LeftRadio<ChunkSettingModeEnum>
          list={[
@@ -305,6 +343,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
      <Box>
        <RadioGroup<DataChunkSplitModeEnum>
          list={[
+           {
+             title: t('dataset:paragraph_split'),
+             value: DataChunkSplitModeEnum.paragraph,
+             tooltip: t('dataset:paragraph_split_tip')
+           },
            {
              title: t('dataset:split_chunk_size'),
              value: DataChunkSplitModeEnum.size
@@ -321,30 +364,76 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
        }}
      />

-     {chunkSplitMode === DataChunkSplitModeEnum.size && (
-       <Box
-         mt={1.5}
-         css={{
-           '& > span': {
-             display: 'block'
-           }
-         }}
-       >
-         <MyTooltip
-           label={t('common:core.dataset.import.Chunk Range', {
-             min: minChunkSizeValue,
-             max: maxChunkSize
-           })}
-         >
-           <MyNumberInput
-             register={register}
-             name={'chunkSize'}
-             min={minChunkSizeValue}
-             max={maxChunkSize}
-             size={'sm'}
-             step={100}
-           />
-         </MyTooltip>
-       </Box>
-     )}
+     {chunkSplitMode === DataChunkSplitModeEnum.paragraph && (
+       <>
+         <Box mt={1.5}>
+           <Box>{t('dataset:paragraph_max_deep')}</Box>
+           <MyNumberInput
+             bg={'myGray.50'}
+             register={register}
+             name={'paragraphChunkDeep'}
+             min={1}
+             max={8}
+             step={1}
+             h={'32px'}
+           />
+         </Box>
+         <Box mt={1.5}>
+           <Box>{t('dataset:max_chunk_size')}</Box>
+           <Box
+             css={{
+               '& > span': {
+                 display: 'block'
+               }
+             }}
+           >
+             <MyTooltip
+               label={t('common:core.dataset.import.Chunk Range', {
+                 min: minChunkSizeValue,
+                 max: maxChunkSize
+               })}
+             >
+               <MyNumberInput
+                 register={register}
+                 name={'chunkSize'}
+                 min={minChunkSizeValue}
+                 max={maxChunkSize}
+                 size={'sm'}
+                 step={100}
+               />
+             </MyTooltip>
+           </Box>
+         </Box>
+       </>
+     )}
+
+     {chunkSplitMode === DataChunkSplitModeEnum.size && (
+       <Box mt={1.5}>
+         <Box>{t('dataset:chunk_size')}</Box>
+         <Box
+           css={{
+             '& > span': {
+               display: 'block'
+             }
+           }}
+         >
+           <MyTooltip
+             label={t('common:core.dataset.import.Chunk Range', {
+               min: minChunkSizeValue,
+               max: maxChunkSize
+             })}
+           >
+             <MyNumberInput
+               register={register}
+               name={'chunkSize'}
+               min={minChunkSizeValue}
+               max={maxChunkSize}
+               size={'sm'}
+               step={100}
+             />
+           </MyTooltip>
+         </Box>
+       </Box>
+     )}
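The new paragraph mode is the larger change here: instead of cutting purely by size, it walks the document's heading structure down to `paragraphChunkDeep` levels and only falls back to size-based splitting inside oversized paragraphs. The real splitter lives in `rawText2Chunks` and is not shown in this diff; a much-simplified sketch of heading-depth splitting, under the assumption that 'deep' means the deepest Markdown heading level that still opens a new chunk:

```ts
// Simplified sketch of heading-depth paragraph splitting. FastGPT's
// actual implementation (rawText2Chunks) is not part of this diff.
function splitByHeadings(rawText: string, paragraphChunkDeep: number): string[] {
  const chunks: string[] = [];
  let current: string[] = [];

  for (const line of rawText.split('\n')) {
    const heading = line.match(/^(#{1,6})\s/); // Markdown heading marker
    // A heading at or above the configured depth starts a new chunk.
    if (heading && heading[1].length <= paragraphChunkDeep && current.length > 0) {
      chunks.push(current.join('\n'));
      current = [];
    }
    current.push(line);
  }
  if (current.length > 0) chunks.push(current.join('\n'));
  return chunks;
}

// Chunks above paragraphChunkMaxSize would then be re-split by size, and
// those below paragraphChunkMinSize merged forward -- both omitted here.
```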
@@ -358,6 +447,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
      value={customListSelectValue}
      h={'32px'}
      onChange={(val) => {
+       if (val === 'Other') {
+         setValue('chunkSplitter', '');
+       } else {
+         setValue('chunkSplitter', val);
+       }
        setCustomListSelectValue(val);
      }}
    />
@@ -51,11 +51,10 @@ export const defaultFormData: ImportFormType = {
    autoIndexes: false,

    chunkSettingMode: ChunkSettingModeEnum.auto,
-   chunkSplitMode: DataChunkSplitModeEnum.size,
+   chunkSplitMode: DataChunkSplitModeEnum.paragraph,
    paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
-   paragraphChunkDeep: 4,
+   paragraphChunkDeep: 5,
    paragraphChunkMinSize: 100,
    paragraphChunkMaxSize: chunkAutoChunkSize,

    chunkSize: chunkAutoChunkSize,
    chunkSplitter: '',
@@ -8,10 +8,8 @@ import { useRouter } from 'next/router';
  import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
  import { getDatasetCollectionById } from '@/web/core/dataset/api';
  import MyBox from '@fastgpt/web/components/common/MyBox';
- import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
  import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
  import { Box } from '@chakra-ui/react';
- import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
  import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';

  const Upload = dynamic(() => import('../commonProgress/Upload'));
@@ -68,8 +66,6 @@ const ReTraining = () => {
      paragraphChunkDeep: collection.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
      paragraphChunkMinSize:
        collection.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
      paragraphChunkMaxSize:
        collection.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,

      chunkSize: collection.chunkSize || defaultFormData.chunkSize,
@@ -85,11 +81,13 @@ const ReTraining = () => {

    return (
      <MyBox isLoading={loading} h={'100%'}>
-       <Box h={'100%'} overflow={'auto'}>
-         {activeStep === 0 && <DataProcess />}
-         {activeStep === 1 && <PreviewData />}
-         {activeStep === 2 && <Upload />}
-       </Box>
+       {!loading && (
+         <Box h={'100%'} overflow={'auto'}>
+           {activeStep === 0 && <DataProcess />}
+           {activeStep === 1 && <PreviewData />}
+           {activeStep === 2 && <Upload />}
+         </Box>
+       )}
      </MyBox>
    );
  };
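Wrapping the steps in `{!loading && ...}` turns "rendered but covered by a spinner" into "not mounted until the collection has loaded", so `DataProcess` no longer initializes its form from data that has not arrived. The general pattern, with illustrative stand-in components:

```tsx
import React from 'react';

// Illustrative stand-ins for the real components.
const Spinner = () => <div>loading…</div>;
const Content = ({ data }: { data: string }) => <div>{data}</div>;

// Gating on `loading` means Content only mounts once data exists, so
// anything it computes on mount (e.g. form defaults) sees real data.
export function Page({ loading, data }: { loading: boolean; data?: string }) {
  return <div>{!loading && data !== undefined ? <Content data={data} /> : <Spinner />}</div>;
}
```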
@@ -1,7 +1,6 @@
  import {
-   type ChunkSettingModeEnum,
-   type DataChunkSplitModeEnum,
-   type DatasetCollectionDataProcessModeEnum
+   ChunkSettingModeEnum,
+   DatasetCollectionDataProcessModeEnum
  } from '@fastgpt/global/core/dataset/constants';
  import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
  import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
@@ -16,25 +15,21 @@ import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
  import {
    computeChunkSize,
    computeChunkSplitter,
+   computeParagraphChunkDeep,
    getLLMMaxChunkSize
  } from '@fastgpt/global/core/dataset/training/utils';
  import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
  import { getLLMModel } from '@fastgpt/service/core/ai/model';
+ import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type';

- export type PostPreviewFilesChunksProps = {
+ export type PostPreviewFilesChunksProps = ChunkSettingsType & {
    datasetId: string;
    type: DatasetSourceReadTypeEnum;
    sourceId: string;

    customPdfParse?: boolean;

-   trainingType: DatasetCollectionDataProcessModeEnum;
-
-   // Chunk settings
-   chunkSettingMode: ChunkSettingModeEnum;
-   chunkSplitMode: DataChunkSplitModeEnum;
-   chunkSize: number;
-   chunkSplitter?: string;
    overlapRatio: number;

    // Read params
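Folding the individually listed chunk fields into `ChunkSettingsType &` keeps the endpoint's contract in one place: a chunk setting added to the shared type is accepted here without touching this file again. The intersection pattern in miniature (field names mirror this diff; the real `ChunkSettingsType` lives in `@fastgpt/global/core/dataset/type` and is not shown):

```ts
// Intersection pattern: shared settings + endpoint-specific fields.
// This is a reduced stand-in, not the real ChunkSettingsType.
type ChunkSettingsSketch = {
  chunkSettingMode?: 'auto' | 'custom';
  chunkSplitMode?: 'paragraph' | 'size' | 'char';
  chunkSize?: number;
  chunkSplitter?: string;
  paragraphChunkDeep?: number;
  paragraphChunkMinSize?: number;
};

type PreviewChunksPropsSketch = ChunkSettingsSketch & {
  datasetId: string;
  sourceId: string;
  customPdfParse?: boolean;
  overlapRatio: number;
};
```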
@@ -57,9 +52,15 @@ async function handler(
    sourceId,
    customPdfParse = false,

-   trainingType,
-   chunkSettingMode,
+   trainingType = DatasetCollectionDataProcessModeEnum.chunk,
+
+   chunkTriggerType,
+   chunkTriggerMinSize,
+
+   chunkSettingMode = ChunkSettingModeEnum.auto,
    chunkSplitMode,
+   paragraphChunkDeep,
+   paragraphChunkMinSize,
    chunkSize,
    chunkSplitter,
@@ -103,12 +104,16 @@ async function handler(
      chunkSize,
      llmModel: getLLMModel(dataset.agentModel)
    });

    chunkSplitter = computeChunkSplitter({
      chunkSettingMode,
      chunkSplitMode,
      chunkSplitter
    });
+   paragraphChunkDeep = computeParagraphChunkDeep({
+     chunkSettingMode,
+     chunkSplitMode,
+     paragraphChunkDeep
+   });

    const { rawText } = await readDatasetSourceRawText({
      teamId,
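`computeParagraphChunkDeep` mirrors the existing `computeChunkSize`/`computeChunkSplitter` helpers: normalize a user-supplied value against the selected mode before it reaches the splitter. Its body is not in this diff; a conjectural sketch, inferred only from its inputs and the new defaults:

```ts
// Conjectural: the real computeParagraphChunkDeep lives in
// @fastgpt/global/core/dataset/training/utils and is not shown here.
function computeParagraphChunkDeepSketch({
  chunkSettingMode,
  chunkSplitMode,
  paragraphChunkDeep = 5
}: {
  chunkSettingMode: 'auto' | 'custom';
  chunkSplitMode: 'paragraph' | 'size' | 'char';
  paragraphChunkDeep?: number;
}): number {
  if (chunkSettingMode === 'auto') return 5; // assumed: matches the new defaultFormData depth
  if (chunkSplitMode === 'paragraph') return paragraphChunkDeep;
  return 0; // assumed: paragraph splitting disabled in other modes
}
```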
@@ -125,7 +130,11 @@ async function handler(

    const chunks = rawText2Chunks({
      rawText,
+     chunkTriggerType,
+     chunkTriggerMinSize,
      chunkSize,
+     paragraphChunkDeep,
+     paragraphChunkMinSize,
      maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
      overlapRatio,
      customReg: chunkSplitter ? [chunkSplitter] : []
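With `chunkTriggerType` and `chunkTriggerMinSize` now threaded into `rawText2Chunks`, the preview honors the same trigger policy as ingestion. The option labels suggest the gate's intent: `minSize` keeps short texts whole, `forceChunk` always splits, and `maxSize` defers to the model's maximum chunk size. A conjectural sketch of that gate (the real check sits inside `rawText2Chunks`, outside this diff):

```ts
// Conjectural gate, inferred from the option labels; not the actual
// rawText2Chunks logic.
function shouldSplit(params: {
  rawText: string;
  chunkTriggerType: 'minSize' | 'maxSize' | 'forceChunk';
  chunkTriggerMinSize: number;
  maxSize: number;
}): boolean {
  const { rawText, chunkTriggerType, chunkTriggerMinSize, maxSize } = params;
  if (chunkTriggerType === 'forceChunk') return true; // always split
  if (chunkTriggerType === 'minSize') return rawText.length > chunkTriggerMinSize;
  return rawText.length > maxSize; // 'maxSize': split only past the model limit
}
```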
@@ -16,7 +16,6 @@ import type {
    ApiDatasetCreateDatasetCollectionParams,
    CreateDatasetCollectionParams,
    CreateDatasetCollectionTagParams,
-   CsvTableCreateDatasetCollectionParams,
    DatasetUpdateBody,
    ExternalFileCreateDatasetCollectionParams,
    FileIdCreateDatasetCollectionParams,