mirror of
https://github.com/labring/FastGPT.git
synced 2026-05-05 01:02:59 +08:00
perf: password special chars;feat: llm paragraph;perf: chunk setting params;perf: text splitter worker (#4984)
* perf: password special chars * feat: llm paragraph;perf: chunk setting params * perf: text splitter worker * perf: get rawtext buffer * fix: test * fix: test * doc * min chunk size
This commit is contained in:
@@ -9,25 +9,14 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep';
|
||||
import MyDivider from '@fastgpt/web/components/common/MyDivider';
|
||||
import React from 'react';
|
||||
import { Box, Link, Input, Button, ModalBody, ModalFooter, Stack } from '@chakra-ui/react';
|
||||
import {
|
||||
DataChunkSplitModeEnum,
|
||||
DatasetCollectionDataProcessModeEnum
|
||||
} from '@fastgpt/global/core/dataset/constants';
|
||||
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
|
||||
import { useContextSelector } from 'use-context-selector';
|
||||
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
|
||||
import CollectionChunkForm, {
|
||||
collectionChunkForm2StoreChunkData,
|
||||
type CollectionChunkFormType
|
||||
} from '../Form/CollectionChunkForm';
|
||||
import {
|
||||
getAutoIndexSize,
|
||||
getLLMDefaultChunkSize
|
||||
} from '@fastgpt/global/core/dataset/training/utils';
|
||||
import CollectionChunkForm, { type CollectionChunkFormType } from '../Form/CollectionChunkForm';
|
||||
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
|
||||
import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm';
|
||||
import { defaultFormData } from '../Import/Context';
|
||||
import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';
|
||||
|
||||
export type WebsiteConfigFormType = {
|
||||
websiteConfig: {
|
||||
@@ -80,7 +69,7 @@ const WebsiteConfigModal = ({
|
||||
|
||||
const form = useForm<CollectionChunkFormType>({
|
||||
defaultValues: {
|
||||
trainingType: chunkSettings?.trainingType,
|
||||
trainingType: chunkSettings?.trainingType || defaultFormData.trainingType,
|
||||
|
||||
chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType,
|
||||
chunkTriggerMinSize:
|
||||
@@ -204,9 +193,9 @@ const WebsiteConfigModal = ({
|
||||
form.handleSubmit((data) =>
|
||||
onSuccess({
|
||||
websiteConfig: websiteInfoGetValues(),
|
||||
chunkSettings: collectionChunkForm2StoreChunkData({
|
||||
chunkSettings: computedCollectionChunkSettings({
|
||||
...data,
|
||||
agentModel: datasetDetail.agentModel,
|
||||
llmModel: datasetDetail.agentModel,
|
||||
vectorModel: datasetDetail.vectorModel
|
||||
})
|
||||
})
|
||||
|
||||
@@ -17,7 +17,7 @@ import {
|
||||
} from '@chakra-ui/react';
|
||||
import MyIcon from '@fastgpt/web/components/common/Icon';
|
||||
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
|
||||
import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import {
|
||||
DataChunkSplitModeEnum,
|
||||
@@ -133,6 +133,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
const autoIndexes = watch('autoIndexes');
|
||||
const indexSize = watch('indexSize');
|
||||
const imageIndex = watch('imageIndex');
|
||||
const paragraphChunkAIMode = watch('paragraphChunkAIMode');
|
||||
|
||||
const trainingModeList = useMemo(() => {
|
||||
const list = {
|
||||
@@ -362,11 +363,35 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
onChange={(e) => {
|
||||
setValue('chunkSplitMode', e);
|
||||
}}
|
||||
fontSize={'md'}
|
||||
/>
|
||||
|
||||
{chunkSplitMode === DataChunkSplitModeEnum.paragraph && (
|
||||
<>
|
||||
<Box mt={1.5}>
|
||||
<Box mt={3}>
|
||||
<Box fontSize={'sm'}>{t('dataset:llm_paragraph_mode')}</Box>
|
||||
<MySelect<ParagraphChunkAIModeEnum>
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
value={paragraphChunkAIMode}
|
||||
onChange={(e) => {
|
||||
setValue('paragraphChunkAIMode', e);
|
||||
}}
|
||||
list={[
|
||||
{
|
||||
label: t('dataset:llm_paragraph_mode_forbid'),
|
||||
value: ParagraphChunkAIModeEnum.forbid,
|
||||
description: t('dataset:llm_paragraph_mode_forbid_desc')
|
||||
},
|
||||
{
|
||||
label: t('dataset:llm_paragraph_mode_auto'),
|
||||
value: ParagraphChunkAIModeEnum.auto,
|
||||
description: t('dataset:llm_paragraph_mode_auto_desc')
|
||||
}
|
||||
]}
|
||||
/>
|
||||
</Box>
|
||||
<Box mt={2} fontSize={'sm'}>
|
||||
<Box>{t('dataset:paragraph_max_deep')}</Box>
|
||||
<MyNumberInput
|
||||
size={'sm'}
|
||||
@@ -379,7 +404,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
h={'32px'}
|
||||
/>
|
||||
</Box>
|
||||
<Box mt={1.5}>
|
||||
<Box mt={2} fontSize={'sm'}>
|
||||
<Box>{t('dataset:max_chunk_size')}</Box>
|
||||
<Box
|
||||
css={{
|
||||
@@ -409,7 +434,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
)}
|
||||
|
||||
{chunkSplitMode === DataChunkSplitModeEnum.size && (
|
||||
<Box mt={1.5}>
|
||||
<Box mt={3} fontSize={'sm'}>
|
||||
<Box>{t('dataset:chunk_size')}</Box>
|
||||
<Box
|
||||
css={{
|
||||
@@ -438,45 +463,48 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
)}
|
||||
|
||||
{chunkSplitMode === DataChunkSplitModeEnum.char && (
|
||||
<HStack mt={1.5}>
|
||||
<Box flex={'1 0 0'}>
|
||||
<MySelect<string>
|
||||
list={customSplitList}
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
value={customListSelectValue}
|
||||
h={'32px'}
|
||||
onChange={(val) => {
|
||||
if (val === 'Other') {
|
||||
setValue('chunkSplitter', '');
|
||||
} else {
|
||||
setValue('chunkSplitter', val);
|
||||
}
|
||||
setCustomListSelectValue(val);
|
||||
}}
|
||||
/>
|
||||
</Box>
|
||||
{customListSelectValue === 'Other' && (
|
||||
<Input
|
||||
flex={'1 0 0'}
|
||||
h={'32px'}
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
placeholder="\n;======;==SPLIT=="
|
||||
{...register('chunkSplitter')}
|
||||
/>
|
||||
)}
|
||||
</HStack>
|
||||
<Box mt={3} fontSize={'sm'}>
|
||||
<Box>{t('dataset:custom_split_char')}</Box>
|
||||
<HStack>
|
||||
<Box flex={'1 0 0'}>
|
||||
<MySelect<string>
|
||||
list={customSplitList}
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
value={customListSelectValue}
|
||||
h={'32px'}
|
||||
onChange={(val) => {
|
||||
if (val === 'Other') {
|
||||
setValue('chunkSplitter', '');
|
||||
} else {
|
||||
setValue('chunkSplitter', val);
|
||||
}
|
||||
setCustomListSelectValue(val);
|
||||
}}
|
||||
/>
|
||||
</Box>
|
||||
{customListSelectValue === 'Other' && (
|
||||
<Input
|
||||
flex={'1 0 0'}
|
||||
h={'32px'}
|
||||
size={'sm'}
|
||||
bg={'myGray.50'}
|
||||
placeholder="\n;======;==SPLIT=="
|
||||
{...register('chunkSplitter')}
|
||||
/>
|
||||
)}
|
||||
</HStack>
|
||||
</Box>
|
||||
)}
|
||||
</Box>
|
||||
|
||||
{trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
|
||||
<Box>
|
||||
<Flex alignItems={'center'} mt={3}>
|
||||
<Box fontSize={'sm'} mt={2}>
|
||||
<Flex alignItems={'center'}>
|
||||
<Box>{t('dataset:index_size')}</Box>
|
||||
<QuestionTip label={t('dataset:index_size_tips')} />
|
||||
</Flex>
|
||||
<Box mt={1}>
|
||||
<Box>
|
||||
<MySelect<number>
|
||||
bg={'myGray.50'}
|
||||
list={indexSizeSeletorList}
|
||||
@@ -490,7 +518,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
)}
|
||||
|
||||
{showQAPromptInput && (
|
||||
<Box mt={3}>
|
||||
<Box mt={2}>
|
||||
<Box>{t('common:core.dataset.collection.QA Prompt')}</Box>
|
||||
<Box
|
||||
position={'relative'}
|
||||
@@ -570,83 +598,3 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
|
||||
};
|
||||
|
||||
export default CollectionChunkForm;
|
||||
|
||||
// Get chunk settings from form
|
||||
export const collectionChunkForm2StoreChunkData = ({
|
||||
agentModel,
|
||||
vectorModel,
|
||||
...data
|
||||
}: CollectionChunkFormType & {
|
||||
agentModel: LLMModelItemType;
|
||||
vectorModel: EmbeddingModelItemType;
|
||||
}): CollectionChunkFormType => {
|
||||
const {
|
||||
trainingType,
|
||||
autoIndexes,
|
||||
chunkSettingMode,
|
||||
chunkSize,
|
||||
chunkSplitter,
|
||||
indexSize,
|
||||
qaPrompt
|
||||
} = data;
|
||||
|
||||
// 根据处理方式,获取 auto 和 custom 的参数。
|
||||
const trainingModeSize: {
|
||||
autoChunkSize: number;
|
||||
autoIndexSize: number;
|
||||
chunkSize: number;
|
||||
indexSize: number;
|
||||
} = (() => {
|
||||
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
|
||||
return {
|
||||
autoChunkSize: getLLMDefaultChunkSize(agentModel),
|
||||
autoIndexSize: getMaxIndexSize(vectorModel),
|
||||
chunkSize,
|
||||
indexSize: getMaxIndexSize(vectorModel)
|
||||
};
|
||||
} else if (autoIndexes) {
|
||||
return {
|
||||
autoChunkSize: chunkAutoChunkSize,
|
||||
autoIndexSize: getAutoIndexSize(vectorModel),
|
||||
chunkSize,
|
||||
indexSize
|
||||
};
|
||||
} else {
|
||||
return {
|
||||
autoChunkSize: chunkAutoChunkSize,
|
||||
autoIndexSize: getAutoIndexSize(vectorModel),
|
||||
chunkSize,
|
||||
indexSize
|
||||
};
|
||||
}
|
||||
})();
|
||||
|
||||
// 获取真实参数
|
||||
const {
|
||||
chunkSize: formatChunkIndex,
|
||||
indexSize: formatIndexSize,
|
||||
chunkSplitter: formatChunkSplitter
|
||||
} = (() => {
|
||||
if (chunkSettingMode === ChunkSettingModeEnum.auto) {
|
||||
return {
|
||||
chunkSize: trainingModeSize.autoChunkSize,
|
||||
indexSize: trainingModeSize.autoIndexSize,
|
||||
chunkSplitter: ''
|
||||
};
|
||||
} else {
|
||||
return {
|
||||
chunkSize: trainingModeSize.chunkSize,
|
||||
indexSize: trainingModeSize.indexSize,
|
||||
chunkSplitter
|
||||
};
|
||||
}
|
||||
})();
|
||||
|
||||
return {
|
||||
...data,
|
||||
chunkSize: formatChunkIndex,
|
||||
indexSize: formatIndexSize,
|
||||
chunkSplitter: formatChunkSplitter,
|
||||
qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
|
||||
};
|
||||
};
|
||||
|
||||
@@ -52,7 +52,7 @@ export const defaultFormData: ImportFormType = {
|
||||
|
||||
chunkSettingMode: ChunkSettingModeEnum.auto,
|
||||
chunkSplitMode: DataChunkSplitModeEnum.paragraph,
|
||||
paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
|
||||
paragraphChunkAIMode: ParagraphChunkAIModeEnum.forbid,
|
||||
paragraphChunkDeep: 5,
|
||||
paragraphChunkMinSize: 100,
|
||||
|
||||
@@ -198,10 +198,10 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
|
||||
const vectorModel = datasetDetail.vectorModel;
|
||||
|
||||
const processParamsForm = useForm<ImportFormType>({
|
||||
defaultValues: {
|
||||
defaultValues: (() => ({
|
||||
...defaultFormData,
|
||||
indexSize: getAutoIndexSize(vectorModel)
|
||||
}
|
||||
}))()
|
||||
});
|
||||
|
||||
const [sources, setSources] = useState<ImportSourceItemType[]>([]);
|
||||
|
||||
+1
-6
@@ -17,7 +17,6 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
|
||||
import Markdown from '@/components/Markdown';
|
||||
import { useToast } from '@fastgpt/web/hooks/useToast';
|
||||
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
|
||||
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
|
||||
|
||||
const PreviewData = () => {
|
||||
const { t } = useTranslation();
|
||||
@@ -37,11 +36,7 @@ const PreviewData = () => {
|
||||
async () => {
|
||||
if (!previewFile) return { chunks: [], total: 0 };
|
||||
|
||||
const chunkData = collectionChunkForm2StoreChunkData({
|
||||
...processParamsForm.getValues(),
|
||||
vectorModel: datasetDetail.vectorModel,
|
||||
agentModel: datasetDetail.agentModel
|
||||
});
|
||||
const chunkData = processParamsForm.getValues();
|
||||
|
||||
if (importSource === ImportDataSourceEnum.fileCustom) {
|
||||
const chunkSplitter = processParamsForm.getValues('chunkSplitter');
|
||||
|
||||
@@ -37,7 +37,6 @@ import { useContextSelector } from 'use-context-selector';
|
||||
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
|
||||
import { DatasetImportContext, type ImportFormType } from '../Context';
|
||||
import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
|
||||
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
|
||||
|
||||
const Upload = () => {
|
||||
const { t } = useTranslation();
|
||||
@@ -82,12 +81,6 @@ const Upload = () => {
|
||||
|
||||
const { runAsync: startUpload, loading: isLoading } = useRequest2(
|
||||
async ({ customPdfParse, webSelector, ...data }: ImportFormType) => {
|
||||
const chunkData = collectionChunkForm2StoreChunkData({
|
||||
...data,
|
||||
vectorModel: datasetDetail.vectorModel,
|
||||
agentModel: datasetDetail.agentModel
|
||||
});
|
||||
|
||||
if (sources.length === 0) return;
|
||||
const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
|
||||
|
||||
@@ -108,7 +101,7 @@ const Upload = () => {
|
||||
const commonParams: ApiCreateDatasetCollectionParams & {
|
||||
name: string;
|
||||
} = {
|
||||
...chunkData,
|
||||
...data,
|
||||
parentId,
|
||||
datasetId: datasetDetail._id,
|
||||
name: item.sourceName,
|
||||
|
||||
Reference in New Issue
Block a user