feat: chunk index independent config (#4271)

* sync collection

* remove lock

* feat: chunk index independent config

* feat: add max chunksize to split chunk function

* remove log

* update doc

* remove

* remove log
This commit is contained in:
Archer
2025-03-21 16:44:25 +08:00
committed by archer
parent 222ff0d49a
commit e812ad6e84
47 changed files with 784 additions and 443 deletions

View File

@@ -71,7 +71,7 @@ const EditResourceModal = ({
{...register('name', { required: true })}
bg={'myGray.50'}
autoFocus
maxLength={20}
maxLength={100}
/>
</HStack>
</Box>

View File

@@ -338,7 +338,7 @@ function EditKeyModal({
<FormLabel flex={'0 0 90px'}>{t('common:Name')}</FormLabel>
<Input
placeholder={t('publish:key_alias') || 'key_alias'}
maxLength={20}
maxLength={100}
{...register('name', {
required: t('common:common.name_is_empty') || 'name_is_empty'
})}

View File

@@ -117,7 +117,7 @@ function EditModal({
ml={4}
autoFocus
bg={'myWhite.600'}
maxLength={20}
maxLength={100}
placeholder={t('user:team.Team Name')}
{...register('name', {
required: t('common:common.Please Input Name')

View File

@@ -326,7 +326,7 @@ function EditLinkModal({
<FormLabel flex={'0 0 90px'}>{t('common:Name')}</FormLabel>
<Input
placeholder={t('publish:link_name')}
maxLength={20}
maxLength={100}
{...register('name', {
required: t('common:common.name_is_empty')
})}

View File

@@ -26,7 +26,7 @@ function BasicInfo({
</FormLabel>
<Input
placeholder={t('publish:publish_name')}
maxLength={20}
maxLength={100}
{...register('name', {
required: t('common:common.name_is_empty')
})}

View File

@@ -96,7 +96,7 @@ const ExtractFieldModal = ({
<Input
bg={'myGray.50'}
placeholder="name/age/sql"
maxLength={20}
maxLength={100}
{...register('key', { required: true })}
/>
</Flex>

View File

@@ -418,7 +418,7 @@ const NodeCard = (props: Props) => {
{RenderToolHandle}
<ConfirmSyncModal />
<EditTitleModal maxLength={50} />
<EditTitleModal maxLength={100} />
</Flex>
);
};

View File

@@ -319,7 +319,7 @@ const TemplateMarketModal = ({
onChange={(e) => setCurrentSearch(e.target.value)}
h={8}
bg={'myGray.50'}
maxLength={20}
maxLength={100}
borderRadius={'sm'}
/>
</Box>

View File

@@ -49,7 +49,7 @@ const EditFolderModal = ({
defaultValue={name}
placeholder={t('common:dataset.Folder Name') || ''}
autoFocus
maxLength={20}
maxLength={100}
/>
</ModalBody>
<ModalFooter>

View File

@@ -10,11 +10,21 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep';
import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon';
import { TabEnum } from '../NavBar';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { UseFormReturn, useForm } from 'react-hook-form';
import { ImportSourceItemType } from '@/web/core/dataset/type';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
import {
getMaxChunkSize,
getLLMDefaultChunkSize,
getLLMMaxChunkSize,
chunkAutoChunkSize,
minChunkSize,
getAutoIndexSize,
getMaxIndexSize
} from '@fastgpt/global/core/dataset/training/utils';
type TrainingFiledType = {
chunkOverlapRatio: number;
@@ -22,6 +32,9 @@ type TrainingFiledType = {
minChunkSize: number;
autoChunkSize: number;
chunkSize: number;
maxIndexSize?: number;
indexSize?: number;
autoIndexSize?: number;
charsPointsPrice: number;
priceTip: string;
uploadRate: number;
@@ -47,9 +60,13 @@ export type ImportFormType = {
autoIndexes: boolean;
chunkSettingMode: ChunkSettingModeEnum;
chunkSplitMode: DataChunkSplitModeEnum;
embeddingChunkSize: number;
qaChunkSize: number;
customSplitChar: string;
chunkSplitter: string;
indexSize: number;
qaPrompt: string;
webSelector: string;
};
@@ -199,9 +216,12 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
trainingType: DatasetCollectionDataProcessModeEnum.chunk,
chunkSettingMode: ChunkSettingModeEnum.auto,
embeddingChunkSize: vectorModel?.defaultToken || 512,
qaChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7),
customSplitChar: '',
chunkSplitMode: DataChunkSplitModeEnum.size,
embeddingChunkSize: 2000,
indexSize: vectorModel?.defaultToken || 512,
qaChunkSize: getLLMDefaultChunkSize(agentModel),
chunkSplitter: '',
qaPrompt: Prompt_AgentQA.description,
webSelector: '',
customPdfParse: false
@@ -215,17 +235,18 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
const qaChunkSize = processParamsForm.watch('qaChunkSize');
const customSplitChar = processParamsForm.watch('customSplitChar');
const chunkSplitter = processParamsForm.watch('chunkSplitter');
const autoIndexes = processParamsForm.watch('autoIndexes');
const indexSize = processParamsForm.watch('indexSize');
const TrainingModeMap = useMemo<TrainingFiledType>(() => {
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return {
chunkSizeField: 'qaChunkSize',
chunkOverlapRatio: 0,
maxChunkSize: Math.min(agentModel.maxResponse * 4, agentModel.maxContext * 0.7),
minChunkSize: 4000,
autoChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7),
maxChunkSize: getLLMMaxChunkSize(agentModel),
minChunkSize: 1000,
autoChunkSize: getLLMDefaultChunkSize(agentModel),
chunkSize: qaChunkSize,
charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
@@ -237,10 +258,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
return {
chunkSizeField: 'embeddingChunkSize',
chunkOverlapRatio: 0.2,
maxChunkSize: 2048,
minChunkSize: 100,
autoChunkSize: vectorModel?.defaultToken ? vectorModel.defaultToken * 2 : 1024,
maxChunkSize: getMaxChunkSize(agentModel),
minChunkSize: minChunkSize,
autoChunkSize: chunkAutoChunkSize,
chunkSize: embeddingChunkSize,
maxIndexSize: getMaxIndexSize(vectorModel),
autoIndexSize: getAutoIndexSize(vectorModel),
indexSize,
charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
price: agentModel.charsPointsPrice
@@ -251,10 +275,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
return {
chunkSizeField: 'embeddingChunkSize',
chunkOverlapRatio: 0.2,
maxChunkSize: vectorModel?.maxToken || 512,
minChunkSize: 100,
autoChunkSize: vectorModel?.defaultToken || 512,
maxChunkSize: getMaxChunkSize(agentModel),
minChunkSize: minChunkSize,
autoChunkSize: chunkAutoChunkSize,
chunkSize: embeddingChunkSize,
maxIndexSize: getMaxIndexSize(vectorModel),
autoIndexSize: getAutoIndexSize(vectorModel),
indexSize,
charsPointsPrice: vectorModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Embedding Estimated Price Tips', {
price: vectorModel.charsPointsPrice
@@ -265,30 +292,36 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
}, [
trainingType,
autoIndexes,
agentModel.maxResponse,
agentModel.maxContext,
agentModel.charsPointsPrice,
agentModel,
qaChunkSize,
t,
vectorModel.defaultToken,
vectorModel?.maxToken,
vectorModel.charsPointsPrice,
embeddingChunkSize
embeddingChunkSize,
vectorModel,
indexSize
]);
const chunkSettingModeMap = useMemo(() => {
if (chunkSettingMode === ChunkSettingModeEnum.auto) {
return {
chunkSize: TrainingModeMap.autoChunkSize,
customSplitChar: ''
indexSize: TrainingModeMap.autoIndexSize,
chunkSplitter: ''
};
} else {
return {
chunkSize: TrainingModeMap.chunkSize,
customSplitChar
indexSize: TrainingModeMap.indexSize,
chunkSplitter
};
}
}, [chunkSettingMode, TrainingModeMap.autoChunkSize, TrainingModeMap.chunkSize, customSplitChar]);
}, [
chunkSettingMode,
TrainingModeMap.autoChunkSize,
TrainingModeMap.autoIndexSize,
TrainingModeMap.chunkSize,
TrainingModeMap.indexSize,
chunkSplitter
]);
const contextValue = {
...TrainingModeMap,

View File

@@ -20,10 +20,11 @@ import MyIcon from '@fastgpt/web/components/common/Icon';
import { useTranslation } from 'next-i18next';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import {
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum,
DatasetCollectionDataProcessModeMap
} from '@fastgpt/global/core/dataset/constants';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
import { useSystemStore } from '@/web/common/system/useSystemStore';
import MyModal from '@fastgpt/web/components/common/MyModal';
@@ -37,25 +38,39 @@ import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
import { shadowLight } from '@fastgpt/web/styles/theme';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import MySelect from '@fastgpt/web/components/common/MySelect';
import { getIndexSizeSelectList } from '@fastgpt/global/core/dataset/training/utils';
import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';
function DataProcess() {
const { t } = useTranslation();
const { feConfigs } = useSystemStore();
const { goToNext, processParamsForm, chunkSizeField, minChunkSize, maxChunkSize } =
useContextSelector(DatasetImportContext, (v) => v);
const {
goToNext,
processParamsForm,
chunkSizeField,
minChunkSize,
maxChunkSize,
maxIndexSize,
indexSize
} = useContextSelector(DatasetImportContext, (v) => v);
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const { setValue, register, watch, getValues } = processParamsForm;
const trainingType = watch('trainingType');
const chunkSettingMode = watch('chunkSettingMode');
const trainingModeList = useMemo(() => {
const list = Object.entries(DatasetCollectionDataProcessModeMap);
return list
.filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto)
.map(([key, value]) => ({
title: t(value.label as any),
value: key as DatasetCollectionDataProcessModeEnum,
tooltip: t(value.tooltip as any)
}));
}, [t]);
const qaPrompt = watch('qaPrompt');
const {
isOpen: isOpenCustomPrompt,
onOpen: onOpenCustomPrompt,
onClose: onCloseCustomPrompt
} = useDisclosure();
const chunkSettingMode = watch('chunkSettingMode');
const chunkSplitMode = watch('chunkSplitMode');
const customSplitList = [
{ label: t('dataset:split_sign_null'), value: '' },
@@ -69,25 +84,25 @@ function DataProcess() {
{ label: t('dataset:split_sign_custom'), value: 'Other' }
];
const [customListSelectValue, setCustomListSelectValue] = useState(getValues('customSplitChar'));
const [customListSelectValue, setCustomListSelectValue] = useState(getValues('chunkSplitter'));
useEffect(() => {
if (customListSelectValue === 'Other') {
setValue('customSplitChar', '');
setValue('chunkSplitter', '');
} else {
setValue('customSplitChar', customListSelectValue);
setValue('chunkSplitter', customListSelectValue);
}
}, [customListSelectValue, setValue]);
const trainingModeList = useMemo(() => {
const list = Object.entries(DatasetCollectionDataProcessModeMap);
return list
.filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto)
.map(([key, value]) => ({
title: t(value.label as any),
value: key as DatasetCollectionDataProcessModeEnum,
tooltip: t(value.tooltip as any)
}));
}, [t]);
// Index size
const indexSizeSeletorList = useMemo(() => getIndexSizeSelectList(maxIndexSize), [maxIndexSize]);
// QA
const qaPrompt = watch('qaPrompt');
const {
isOpen: isOpenCustomPrompt,
onOpen: onOpenCustomPrompt,
onClose: onCloseCustomPrompt
} = useDisclosure();
const Title = useCallback(({ title }: { title: string }) => {
return (
@@ -237,67 +252,97 @@ function DataProcess() {
children: chunkSettingMode === ChunkSettingModeEnum.custom && (
<Box mt={5}>
<Box>
<Flex alignItems={'center'}>
<Box>{t('dataset:ideal_chunk_length')}</Box>
<QuestionTip label={t('dataset:ideal_chunk_length_tips')} />
</Flex>
<Box
mt={1}
css={{
'& > span': {
display: 'block'
<RadioGroup<DataChunkSplitModeEnum>
list={[
{
title: t('dataset:split_chunk_size'),
value: DataChunkSplitModeEnum.size
},
{
title: t('dataset:split_chunk_char'),
value: DataChunkSplitModeEnum.char,
tooltip: t('dataset:custom_split_sign_tip')
}
]}
value={chunkSplitMode}
onChange={(e) => {
setValue('chunkSplitMode', e);
}}
>
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', {
min: minChunkSize,
max: maxChunkSize
})}
/>
{chunkSplitMode === DataChunkSplitModeEnum.size && (
<Box
mt={1.5}
css={{
'& > span': {
display: 'block'
}
}}
>
<MyNumberInput
register={register}
name={chunkSizeField}
min={minChunkSize}
max={maxChunkSize}
size={'sm'}
step={100}
/>
</MyTooltip>
</Box>
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', {
min: minChunkSize,
max: maxChunkSize
})}
>
<MyNumberInput
register={register}
name={chunkSizeField}
min={minChunkSize}
max={maxChunkSize}
size={'sm'}
step={100}
/>
</MyTooltip>
</Box>
)}
{chunkSplitMode === DataChunkSplitModeEnum.char && (
<HStack mt={1.5}>
<Box flex={'1 0 0'}>
<MySelect<string>
list={customSplitList}
size={'sm'}
bg={'myGray.50'}
value={customListSelectValue}
h={'32px'}
onChange={(val) => {
setCustomListSelectValue(val);
}}
/>
</Box>
{customListSelectValue === 'Other' && (
<Input
flex={'1 0 0'}
h={'32px'}
size={'sm'}
bg={'myGray.50'}
placeholder="\n;======;==SPLIT=="
{...register('chunkSplitter')}
/>
)}
</HStack>
)}
</Box>
<Box mt={3}>
{trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
<Box>
{t('common:core.dataset.import.Custom split char')}
<QuestionTip label={t('dataset:custom_split_sign_tip')} />
</Box>
<HStack mt={1}>
<Box flex={'1 0 0'}>
<MySelect<string>
list={customSplitList}
size={'sm'}
<Flex alignItems={'center'} mt={3}>
<Box>{t('dataset:index_size')}</Box>
<QuestionTip label={t('dataset:index_size_tips')} />
</Flex>
<Box mt={1}>
<MySelect<number>
bg={'myGray.50'}
value={customListSelectValue}
h={'32px'}
list={indexSizeSeletorList}
value={indexSize}
onChange={(val) => {
setCustomListSelectValue(val);
setValue('indexSize', val);
}}
/>
</Box>
{customListSelectValue === 'Other' && (
<Input
flex={'1 0 0'}
h={'32px'}
size={'sm'}
bg={'myGray.50'}
placeholder="\n;======;==SPLIT=="
{...register('customSplitChar')}
/>
)}
</HStack>
</Box>
</Box>
)}
{showQAPromptInput && (
<Box mt={3}>

View File

@@ -16,6 +16,7 @@ import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContex
import MyBox from '@fastgpt/web/components/common/MyBox';
import Markdown from '@/components/Markdown';
import { useToast } from '@fastgpt/web/hooks/useToast';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
const PreviewData = () => {
const { t } = useTranslation();
@@ -23,6 +24,7 @@ const PreviewData = () => {
const goToNext = useContextSelector(DatasetImportContext, (v) => v.goToNext);
const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
@@ -36,12 +38,13 @@ const PreviewData = () => {
async () => {
if (!previewFile) return;
if (importSource === ImportDataSourceEnum.fileCustom) {
const customSplitChar = processParamsForm.getValues('customSplitChar');
const chunkSplitter = processParamsForm.getValues('chunkSplitter');
const { chunks } = splitText2Chunks({
text: previewFile.rawText || '',
chunkLen: chunkSize,
chunkSize,
maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
overlapRatio: chunkOverlapRatio,
customReg: customSplitChar ? [customSplitChar] : []
customReg: chunkSplitter ? [chunkSplitter] : []
});
return chunks.map((chunk) => ({
q: chunk,
@@ -61,9 +64,12 @@ const PreviewData = () => {
customPdfParse: processParamsForm.getValues('customPdfParse'),
trainingType: processParamsForm.getValues('trainingType'),
chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
chunkSize,
chunkSplitter: processParamsForm.getValues('chunkSplitter'),
overlapRatio: chunkOverlapRatio,
customSplitChar: processParamsForm.getValues('customSplitChar'),
selector: processParamsForm.getValues('webSelector'),
isQAImport: importSource === ImportDataSourceEnum.csvTable,

View File

@@ -49,7 +49,7 @@ const Upload = () => {
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const retrainNewCollectionId = useRef('');
const { importSource, parentId, sources, setSources, processParamsForm, chunkSize } =
const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } =
useContextSelector(DatasetImportContext, (v) => v);
const { handleSubmit } = processParamsForm;
@@ -81,7 +81,7 @@ const Upload = () => {
}, [waitingFilesCount, totalFilesCount, allFinished, t]);
const { runAsync: startUpload, loading: isLoading } = useRequest2(
async ({ trainingType, customSplitChar, qaPrompt, webSelector }: ImportFormType) => {
async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => {
if (sources.length === 0) return;
const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@@ -111,10 +111,16 @@ const Upload = () => {
trainingType,
imageIndex: processParamsForm.getValues('imageIndex'),
autoIndexes: processParamsForm.getValues('autoIndexes'),
chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
chunkSize,
chunkSplitter: customSplitChar,
indexSize,
chunkSplitter,
qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
};
if (importSource === ImportDataSourceEnum.reTraining) {
const res = await postReTrainingDatasetFileCollection({
...commonParams,

View File

@@ -1,102 +0,0 @@
import React from 'react';
import { Box } from '@chakra-ui/react';
import { ImportSourceItemType } from '@/web/core/dataset/type';
import MyRightDrawer from '@fastgpt/web/components/common/MyDrawer/MyRightDrawer';
import { getPreviewChunks } from '@/web/core/dataset/api';
import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { getPreviewSourceReadType } from '../utils';
const PreviewChunks = ({
previewSource,
onClose
}: {
previewSource: ImportSourceItemType;
onClose: () => void;
}) => {
const { importSource, chunkSize, chunkOverlapRatio, processParamsForm } = useContextSelector(
DatasetImportContext,
(v) => v
);
const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
const { data = [], loading: isLoading } = useRequest2(
async () => {
if (importSource === ImportDataSourceEnum.fileCustom) {
const customSplitChar = processParamsForm.getValues('customSplitChar');
const { chunks } = splitText2Chunks({
text: previewSource.rawText || '',
chunkLen: chunkSize,
overlapRatio: chunkOverlapRatio,
customReg: customSplitChar ? [customSplitChar] : []
});
return chunks.map((chunk) => ({
q: chunk,
a: ''
}));
}
return getPreviewChunks({
datasetId,
type: getPreviewSourceReadType(previewSource),
sourceId:
previewSource.dbFileId ||
previewSource.link ||
previewSource.externalFileUrl ||
previewSource.apiFileId ||
'',
chunkSize,
overlapRatio: chunkOverlapRatio,
customSplitChar: processParamsForm.getValues('customSplitChar'),
selector: processParamsForm.getValues('webSelector'),
isQAImport: importSource === ImportDataSourceEnum.csvTable,
externalFileId: previewSource.externalFileId
});
},
{
manual: false
}
);
return (
<MyRightDrawer
onClose={onClose}
iconSrc={previewSource.icon}
title={previewSource.sourceName}
isLoading={isLoading}
maxW={['90vw', '40vw']}
px={0}
>
<Box overflowY={'auto'} px={5} fontSize={'sm'}>
{data.map((item, index) => (
<Box
key={index}
whiteSpace={'pre-wrap'}
fontSize={'sm'}
p={4}
bg={index % 2 === 0 ? 'white' : 'myWhite.600'}
mb={3}
borderRadius={'md'}
borderWidth={'1px'}
borderColor={'borderColor.low'}
boxShadow={'2'}
_notLast={{
mb: 2
}}
>
<Box color={'myGray.900'}>{item.q}</Box>
<Box color={'myGray.500'}>{item.a}</Box>
</Box>
))}
</Box>
</MyRightDrawer>
);
};
export default React.memo(PreviewChunks);

View File

@@ -8,10 +8,11 @@ import { useRouter } from 'next/router';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { getDatasetCollectionById } from '@/web/core/dataset/api';
import MyBox from '@fastgpt/web/components/common/MyBox';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { Box } from '@chakra-ui/react';
import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
const Upload = dynamic(() => import('../commonProgress/Upload'));
const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
@@ -23,7 +24,6 @@ const ReTraining = () => {
collectionId: string;
};
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const activeStep = useContextSelector(DatasetImportContext, (v) => v.activeStep);
const setSources = useContextSelector(DatasetImportContext, (v) => v.setSources);
const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
@@ -46,18 +46,21 @@ const ReTraining = () => {
uploadedFileRate: 100
}
]);
processParamsForm.reset({
customPdfParse: collection.customPdfParse,
trainingType: collection.trainingType,
imageIndex: collection.imageIndex,
autoIndexes: collection.autoIndexes,
chunkSettingMode: ChunkSettingModeEnum.auto,
chunkSettingMode: collection.chunkSettingMode || ChunkSettingModeEnum.auto,
chunkSplitMode: collection.chunkSplitMode || DataChunkSplitModeEnum.size,
embeddingChunkSize: collection.chunkSize,
qaChunkSize: collection.chunkSize,
customSplitChar: collection.chunkSplitter,
qaPrompt: collection.qaPrompt,
webSelector: collection.metadata?.webPageSelector
indexSize: collection.indexSize || 512,
chunkSplitter: collection.chunkSplitter,
webSelector: collection.metadata?.webPageSelector,
qaPrompt: collection.qaPrompt || Prompt_AgentQA.description
});
}
});

View File

@@ -294,7 +294,7 @@ const MyInfo = ({ onOpenContact }: { onOpenContact: () => void }) => {
title={t('account_info:click_modify_nickname')}
borderColor={'transparent'}
transform={'translateX(-11px)'}
maxLength={20}
maxLength={100}
onBlur={async (e) => {
const val = e.target.value;
if (val === userInfo?.team?.memberName) return;

View File

@@ -2,8 +2,7 @@ import { reTrainingDatasetFileCollectionParams } from '@fastgpt/global/core/data
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
DatasetCollectionTypeEnum,
DatasetSourceReadTypeEnum,
TrainingModeEnum
DatasetSourceReadTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { hashStr } from '@fastgpt/global/common/string/tools';

View File

@@ -4,7 +4,7 @@
*/
import type { NextApiRequest } from 'next';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken/index';
import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
import { hasSameValue } from '@/service/core/dataset/data/utils';
import { insertData2Dataset } from '@/service/core/dataset/data/controller';
import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth';
@@ -16,6 +16,7 @@ import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit
import { NextAPI } from '@/service/middleware/entry';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
async function handler(req: NextApiRequest) {
const { collectionId, q, a, indexes } = req.body as InsertOneDatasetDataProps;
@@ -45,7 +46,7 @@ async function handler(req: NextApiRequest) {
// auth collection and get dataset
const [
{
dataset: { _id: datasetId, vectorModel }
dataset: { _id: datasetId, vectorModel, agentModel }
}
] = await Promise.all([getCollectionWithDataset(collectionId)]);
@@ -60,9 +61,11 @@ async function handler(req: NextApiRequest) {
// token check
const token = await countPromptTokens(formatQ + formatA, '');
const vectorModelData = getEmbeddingModel(vectorModel);
const llmModelData = getLLMModel(agentModel);
const maxChunkSize = getLLMMaxChunkSize(llmModelData);
if (token > vectorModelData.maxToken) {
return Promise.reject('Q Over Tokens');
if (token > maxChunkSize) {
return Promise.reject(`Content over max chunk size: ${maxChunkSize}`);
}
// Duplicate data check
@@ -82,7 +85,7 @@ async function handler(req: NextApiRequest) {
q: formatQ,
a: formatA,
chunkIndex: 0,
model: vectorModelData.model,
embeddingModel: vectorModelData.model,
indexes: formatIndexes
});

View File

@@ -1,4 +1,9 @@
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import {
ChunkSettingModeEnum,
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum,
DatasetSourceReadTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
import { NextAPI } from '@/service/middleware/entry';
import { ApiRequestProps } from '@fastgpt/service/type/next';
@@ -8,17 +13,30 @@ import {
} from '@fastgpt/global/support/permission/constant';
import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import {
computeChunkSize,
computeChunkSplitter,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { getLLMModel } from '@fastgpt/service/core/ai/model';
export type PostPreviewFilesChunksProps = {
datasetId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
chunkSize: number;
overlapRatio: number;
customSplitChar?: string;
customPdfParse?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
// Chunk settings
chunkSettingMode: ChunkSettingModeEnum;
chunkSplitMode: DataChunkSplitModeEnum;
chunkSize: number;
chunkSplitter?: string;
overlapRatio: number;
// Read params
selector?: string;
isQAImport?: boolean;
@@ -32,55 +50,64 @@ export type PreviewChunksResponse = {
async function handler(
req: ApiRequestProps<PostPreviewFilesChunksProps>
): Promise<PreviewChunksResponse> {
const {
let {
type,
sourceId,
customPdfParse = false,
trainingType,
chunkSettingMode,
chunkSplitMode,
chunkSize,
customSplitChar,
chunkSplitter,
overlapRatio,
selector,
isQAImport,
datasetId,
externalFileId,
customPdfParse = false
externalFileId
} = req.body;
if (!sourceId) {
throw new Error('sourceId is empty');
}
if (chunkSize > 30000) {
throw new Error('chunkSize is too large, should be less than 30000');
const fileAuthRes =
type === DatasetSourceReadTypeEnum.fileLocal
? await authCollectionFile({
req,
authToken: true,
authApiKey: true,
fileId: sourceId,
per: OwnerPermissionVal
})
: undefined;
const { dataset, teamId, tmbId } = await authDataset({
req,
authApiKey: true,
authToken: true,
datasetId,
per: WritePermissionVal
});
if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
return Promise.reject(CommonErrEnum.unAuthFile);
}
const { teamId, tmbId, apiServer, feishuServer, yuqueServer } = await (async () => {
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const res = await authCollectionFile({
req,
authToken: true,
authApiKey: true,
fileId: sourceId,
per: OwnerPermissionVal
});
return {
teamId: res.teamId,
tmbId: res.tmbId
};
}
const { dataset, teamId, tmbId } = await authDataset({
req,
authApiKey: true,
authToken: true,
datasetId,
per: WritePermissionVal
});
return {
teamId,
tmbId,
apiServer: dataset.apiServer,
feishuServer: dataset.feishuServer,
yuqueServer: dataset.yuqueServer
};
})();
chunkSize = computeChunkSize({
trainingType,
chunkSettingMode,
chunkSplitMode,
chunkSize,
llmModel: getLLMModel(dataset.agentModel)
});
chunkSplitter = computeChunkSplitter({
chunkSettingMode,
chunkSplitMode,
chunkSplitter
});
const { rawText } = await readDatasetSourceRawText({
teamId,
@@ -89,18 +116,19 @@ async function handler(
sourceId,
selector,
isQAImport,
apiServer,
feishuServer,
yuqueServer,
apiServer: dataset.apiServer,
feishuServer: dataset.feishuServer,
yuqueServer: dataset.yuqueServer,
externalFileId,
customPdfParse
});
return rawText2Chunks({
rawText,
chunkLen: chunkSize,
chunkSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio,
customReg: customSplitChar ? [customSplitChar] : [],
customReg: chunkSplitter ? [chunkSplitter] : [],
isQAImport: isQAImport
}).slice(0, 10);
}

View File

@@ -5,25 +5,63 @@ import {
UpdateDatasetDataProps
} from '@fastgpt/global/core/dataset/controller';
import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
import { jiebaSplit } from '@fastgpt/service/common/string/jieba/index';
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type';
import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { ClientSession } from '@fastgpt/service/common/mongo';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
const formatIndexes = ({
const formatIndexes = async ({
indexes,
q,
a = ''
a = '',
indexSize
}: {
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
q: string;
a?: string;
}) => {
indexSize: number;
}): Promise<
{
type: `${DatasetDataIndexTypeEnum}`;
text: string;
dataId?: string;
}[]
> => {
/* get dataset data default index */
const getDefaultIndex = ({
q = '',
a,
indexSize
}: {
q?: string;
a?: string;
indexSize: number;
}) => {
const qChunks = splitText2Chunks({
text: q,
chunkSize: indexSize
}).chunks;
const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
return [
...qChunks.map((text) => ({
text,
type: DatasetDataIndexTypeEnum.default
})),
...aChunks.map((text) => ({
text,
type: DatasetDataIndexTypeEnum.default
}))
];
};
indexes = indexes || [];
// If index not type, set it to custom
indexes = indexes
@@ -35,7 +73,7 @@ const formatIndexes = ({
.filter((item) => !!item.text.trim());
// Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds
const defaultIndexes = getDefaultIndex({ q, a });
const defaultIndexes = getDefaultIndex({ q, a, indexSize });
const concatDefaultIndexes = defaultIndexes.map((item) => {
const oldIndex = indexes!.find((index) => index.text === item.text);
if (oldIndex) {
@@ -56,11 +94,24 @@ const formatIndexes = ({
(item, index, self) => index === self.findIndex((t) => t.text === item.text)
);
return indexes.map((index) => ({
type: index.type,
text: index.text,
dataId: index.dataId
}));
// Enforce the index size limit: any index whose token count exceeds
// `indexSize` is re-split into smaller chunks before vectorization, so no
// single index can overflow the embedding model's input window.
const checkedIndexes = (
  await Promise.all(
    indexes.map(async (item) => {
      // If oversize tokens, split it
      const tokens = await countPromptTokens(item.text);
      if (tokens > indexSize) {
        // Fix: split by the configured indexSize rather than a hard-coded 512.
        // With a small embedding model (indexSize < 512) the old constant
        // still produced oversize chunks, defeating this check.
        const splitText = splitText2Chunks({ text: item.text, chunkSize: indexSize }).chunks;
        return splitText.map((text) => ({
          text,
          type: item.type
        }));
      }
      return item;
    })
  )
).flat();

return checkedIndexes;
};
/* insert data.
* 1. create data id
@@ -75,30 +126,40 @@ export async function insertData2Dataset({
q,
a = '',
chunkIndex = 0,
indexSize = 512,
indexes,
model,
embeddingModel,
session
}: CreateDatasetDataProps & {
model: string;
embeddingModel: string;
indexSize?: number;
session?: ClientSession;
}) {
if (!q || !datasetId || !collectionId || !model) {
return Promise.reject('q, datasetId, collectionId, model is required');
if (!q || !datasetId || !collectionId || !embeddingModel) {
return Promise.reject('q, datasetId, collectionId, embeddingModel is required');
}
if (String(teamId) === String(tmbId)) {
return Promise.reject("teamId and tmbId can't be the same");
}
const embModel = getEmbeddingModel(embeddingModel);
indexSize = Math.min(embModel.maxToken, indexSize);
// 1. Get vector indexes and insert
// Empty indexes check, if empty, create default index
const newIndexes = formatIndexes({ indexes, q, a });
const newIndexes = await formatIndexes({
indexes,
q,
a,
indexSize
});
// insert to vector store
const result = await Promise.all(
newIndexes.map(async (item) => {
const result = await insertDatasetDataVector({
query: item.text,
model: getEmbeddingModel(model),
model: embModel,
teamId,
datasetId,
collectionId
@@ -163,8 +224,9 @@ export async function updateData2Dataset({
q = '',
a,
indexes,
model
}: UpdateDatasetDataProps & { model: string }) {
model,
indexSize = 512
}: UpdateDatasetDataProps & { model: string; indexSize?: number }) {
if (!Array.isArray(indexes)) {
return Promise.reject('indexes is required');
}
@@ -174,7 +236,7 @@ export async function updateData2Dataset({
if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// 2. Compute indexes
const formatIndexesResult = formatIndexes({ indexes, q, a });
const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
// 3. Patch indexes, create, update, delete
const patchResult: PatchIndexesProps[] = [];

View File

@@ -21,6 +21,11 @@ import {
llmCompletionsBodyFormat,
llmStreamResponseToAnswerText
} from '@fastgpt/service/core/ai/utils';
import { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
import {
chunkAutoChunkSize,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
const reduceQueue = () => {
global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -129,7 +134,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
});
const answer = await llmStreamResponseToAnswerText(chatResponse);
const qaArr = formatSplitText(answer, text); // 格式化后的QA对
const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // 格式化后的QA对
addLog.info(`[QA Queue] Finish`, {
time: Date.now() - startTime,
@@ -180,10 +185,18 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
}
// Format qa answer
function formatSplitText(text: string, rawText: string) {
text = text.replace(/\\n/g, '\n'); // 将换行符替换为空格
function formatSplitText({
answer,
rawText,
llmModel
}: {
answer: string;
rawText: string;
llmModel: LLMModelItemType;
}) {
answer = answer.replace(/\\n/g, '\n'); // 将换行符替换为空格
const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // 匹配Q和A的正则表达式
const matches = text.matchAll(regex); // 获取所有匹配到的结果
const matches = answer.matchAll(regex); // 获取所有匹配到的结果
const result: PushDatasetDataChunkProps[] = []; // 存储最终的结果
for (const match of matches) {
@@ -199,7 +212,11 @@ function formatSplitText(text: string, rawText: string) {
// empty result. direct split chunk
if (result.length === 0) {
const { chunks } = splitText2Chunks({ text: rawText, chunkLen: 512 });
const { chunks } = splitText2Chunks({
text: rawText,
chunkSize: chunkAutoChunkSize,
maxSize: getLLMMaxChunkSize(llmModel)
});
chunks.forEach((chunk) => {
result.push({
q: chunk,

View File

@@ -245,7 +245,7 @@ const insertData = async ({
a: trainingData.a,
chunkIndex: trainingData.chunkIndex,
indexes: trainingData.indexes,
model: trainingData.model,
embeddingModel: trainingData.model,
session
});
// delete data from training

View File

@@ -60,15 +60,11 @@ export const defaultCollectionDetail: DatasetCollectionItemType = {
createTime: new Date(),
trainingType: DatasetCollectionDataProcessModeEnum.chunk,
chunkSize: 0,
indexSize: 512,
permission: new DatasetPermission(),
indexAmount: 0
};
// Chunk-configuration mode selected during dataset import.
// Presumably 'auto' applies system defaults while 'custom' exposes manual
// chunk settings — TODO confirm against the import UI that consumes it.
// NOTE(review): this diff relocates the enum to
// '@fastgpt/global/core/dataset/constants' (see the updated import below).
export enum ChunkSettingModeEnum {
  auto = 'auto',
  custom = 'custom'
}
export const datasetTypeCourseMap: Record<`${DatasetTypeEnum}`, string> = {
[DatasetTypeEnum.folder]: '',
[DatasetTypeEnum.dataset]: '',

View File

@@ -1,6 +1,6 @@
import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ChunkSettingModeEnum } from './constants';
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { UseFormReturn } from 'react-hook-form';
import { APIFileItem } from '@fastgpt/global/core/dataset/apiDataset';
@@ -41,7 +41,7 @@ export type ImportSourceParamsType = UseFormReturn<
{
chunkSize: number;
chunkOverlapRatio: number;
customSplitChar: string;
chunkSplitter: string;
prompt: string;
mode: TrainingModeEnum;
way: ChunkSettingModeEnum;