Mirror of https://github.com/labring/FastGPT.git, synced 2025-07-23 13:03:50 +00:00
perf: dataset import params code (#4875)
* perf: dataset import params code
* perf: api dataset code
* model
@@ -21,9 +21,13 @@ import CollectionChunkForm, {
  collectionChunkForm2StoreChunkData,
  type CollectionChunkFormType
} from '../Form/CollectionChunkForm';
import { getLLMDefaultChunkSize } from '@fastgpt/global/core/dataset/training/utils';
import {
  getAutoIndexSize,
  getLLMDefaultChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm';
import { defaultFormData } from '../Import/Context';

export type WebsiteConfigFormType = {
  websiteConfig: {
@@ -76,17 +80,35 @@ const WebsiteConfigModal = ({

const form = useForm<CollectionChunkFormType>({
  defaultValues: {
    trainingType: chunkSettings?.trainingType || DatasetCollectionDataProcessModeEnum.chunk,
    imageIndex: chunkSettings?.imageIndex || false,
    autoIndexes: chunkSettings?.autoIndexes || false,
    trainingType: chunkSettings?.trainingType,

    chunkSettingMode: chunkSettings?.chunkSettingMode || ChunkSettingModeEnum.auto,
    chunkSplitMode: chunkSettings?.chunkSplitMode || DataChunkSplitModeEnum.size,
    embeddingChunkSize: chunkSettings?.chunkSize || 2000,
    qaChunkSize: chunkSettings?.chunkSize || getLLMDefaultChunkSize(datasetDetail.agentModel),
    indexSize: chunkSettings?.indexSize || datasetDetail.vectorModel?.defaultToken || 512,
    chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType,
    chunkTriggerMinSize:
      chunkSettings?.chunkTriggerMinSize || defaultFormData.chunkTriggerMinSize,

    dataEnhanceCollectionName:
      chunkSettings?.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,

    imageIndex: chunkSettings?.imageIndex || defaultFormData.imageIndex,
    autoIndexes: chunkSettings?.autoIndexes || defaultFormData.autoIndexes,

    chunkSettingMode: chunkSettings?.chunkSettingMode || defaultFormData.chunkSettingMode,
    chunkSplitMode: chunkSettings?.chunkSplitMode || defaultFormData.chunkSplitMode,

    paragraphChunkAIMode:
      chunkSettings?.paragraphChunkAIMode || defaultFormData.paragraphChunkAIMode,
    paragraphChunkDeep: chunkSettings?.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
    paragraphChunkMinSize:
      chunkSettings?.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
    paragraphChunkMaxSize:
      chunkSettings?.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,

    chunkSize: chunkSettings?.chunkSize || defaultFormData.chunkSize,

    chunkSplitter: chunkSettings?.chunkSplitter || defaultFormData.chunkSplitter,

    indexSize: chunkSettings?.indexSize || defaultFormData.indexSize,

    chunkSplitter: chunkSettings?.chunkSplitter || '',
    qaPrompt: chunkSettings?.qaPrompt || Prompt_AgentQA.description
  }
});
@@ -17,6 +17,10 @@ import {
} from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import type {
  ChunkTriggerConfigTypeEnum,
  ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants';
import {
  DataChunkSplitModeEnum,
  DatasetCollectionDataProcessModeEnum,
@@ -42,7 +46,6 @@ import {
  minChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
import type { LLMModelItemType, EmbeddingModelItemType } from '@fastgpt/global/core/ai/model.d';

const PromptTextarea = ({
@@ -86,19 +89,35 @@ const PromptTextarea = ({

export type CollectionChunkFormType = {
  trainingType: DatasetCollectionDataProcessModeEnum;

  // Chunk trigger
  chunkTriggerType: ChunkTriggerConfigTypeEnum;
  chunkTriggerMinSize: number; // maxSize from agent model, not store

  // Data enhance
  dataEnhanceCollectionName: boolean; // Auto add collection name to data

  // Index enhance
  imageIndex: boolean;
  autoIndexes: boolean;

  chunkSettingMode: ChunkSettingModeEnum;

  // Chunk setting
  chunkSettingMode: ChunkSettingModeEnum; // system parameters / custom parameters
  chunkSplitMode: DataChunkSplitModeEnum;
  embeddingChunkSize: number;
  qaChunkSize: number;
  chunkSplitter?: string;
  // Paragraph split
  paragraphChunkAIMode: ParagraphChunkAIModeEnum;
  paragraphChunkDeep: number; // Paragraph deep
  paragraphChunkMinSize: number; // Paragraph min size, if too small, it will merge
  paragraphChunkMaxSize: number; // Paragraph max size, if too large, it will split
  // Size split
  chunkSize: number;
  // Char split
  chunkSplitter: string;
  indexSize: number;

  qaPrompt?: string;
};

const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkFormType> }) => {
  const { t } = useTranslation();
  const { feConfigs } = useSystemStore();
@@ -131,29 +150,26 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
    tooltip: t(value.tooltip as any)
  }));
}, [t]);

const {
  chunkSizeField,
  maxChunkSize,
  minChunkSize: minChunkSizeValue,
  maxIndexSize
} = useMemo(() => {
  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
    return {
      chunkSizeField: 'qaChunkSize',
      maxChunkSize: getLLMMaxChunkSize(agentModel),
      minChunkSize: 1000,
      maxIndexSize: 1000
    };
  } else if (autoIndexes) {
    return {
      chunkSizeField: 'embeddingChunkSize',
      maxChunkSize: getMaxChunkSize(agentModel),
      minChunkSize: minChunkSize,
      maxIndexSize: getMaxIndexSize(vectorModel)
    };
  } else {
    return {
      chunkSizeField: 'embeddingChunkSize',
      maxChunkSize: getMaxChunkSize(agentModel),
      minChunkSize: minChunkSize,
      maxIndexSize: getMaxIndexSize(vectorModel)
@@ -216,6 +232,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
  value={trainingType}
  onChange={(e) => {
    setValue('trainingType', e);
    if (e === DatasetCollectionDataProcessModeEnum.qa) {
      setValue('chunkSize', getLLMDefaultChunkSize(agentModel));
    } else {
      setValue('chunkSize', chunkAutoChunkSize);
    }
  }}
  defaultBg="white"
  activeBg="white"
@@ -317,7 +338,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
>
  <MyNumberInput
    register={register}
    name={chunkSizeField}
    name={'chunkSize'}
    min={minChunkSizeValue}
    max={maxChunkSize}
    size={'sm'}
@@ -456,24 +477,26 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm

export default CollectionChunkForm;

// Get chunk settings from form
export const collectionChunkForm2StoreChunkData = ({
  trainingType,
  imageIndex,
  autoIndexes,
  chunkSettingMode,
  chunkSplitMode,
  embeddingChunkSize,
  qaChunkSize,
  chunkSplitter,
  indexSize,
  qaPrompt,

  agentModel,
  vectorModel
  vectorModel,
  ...data
}: CollectionChunkFormType & {
  agentModel: LLMModelItemType;
  vectorModel: EmbeddingModelItemType;
}): ChunkSettingsType => {
}): CollectionChunkFormType => {
  const {
    trainingType,
    autoIndexes,
    chunkSettingMode,
    chunkSize,
    chunkSplitter,
    indexSize,
    qaPrompt
  } = data;
  // Resolve the auto and custom parameters based on the processing mode.
  const trainingModeSize: {
    autoChunkSize: number;
    autoIndexSize: number;
@@ -483,53 +506,53 @@ export const collectionChunkForm2StoreChunkData = ({
  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
    return {
      autoChunkSize: getLLMDefaultChunkSize(agentModel),
      autoIndexSize: 512,
      chunkSize: qaChunkSize,
      indexSize: 512
      autoIndexSize: getMaxIndexSize(vectorModel),
      chunkSize,
      indexSize: getMaxIndexSize(vectorModel)
    };
  } else if (autoIndexes) {
    return {
      autoChunkSize: chunkAutoChunkSize,
      autoIndexSize: getAutoIndexSize(vectorModel),
      chunkSize: embeddingChunkSize,
      chunkSize,
      indexSize
    };
  } else {
    return {
      autoChunkSize: chunkAutoChunkSize,
      autoIndexSize: getAutoIndexSize(vectorModel),
      chunkSize: embeddingChunkSize,
      chunkSize,
      indexSize
    };
  }
})();

const { chunkSize: formatChunkIndex, indexSize: formatIndexSize } = (() => {
  // Resolve the actual parameters.
  const {
    chunkSize: formatChunkIndex,
    indexSize: formatIndexSize,
    chunkSplitter: formatChunkSplitter
  } = (() => {
    if (chunkSettingMode === ChunkSettingModeEnum.auto) {
      return {
        chunkSize: trainingModeSize.autoChunkSize,
        indexSize: trainingModeSize.autoIndexSize
        indexSize: trainingModeSize.autoIndexSize,
        chunkSplitter: ''
      };
    } else {
      return {
        chunkSize: trainingModeSize.chunkSize,
        indexSize: trainingModeSize.indexSize
        indexSize: trainingModeSize.indexSize,
        chunkSplitter
      };
    }
  })();

  return {
    trainingType,
    imageIndex,
    autoIndexes,

    chunkSettingMode,
    chunkSplitMode,

    ...data,
    chunkSize: formatChunkIndex,
    indexSize: formatIndexSize,

    chunkSplitter,
    chunkSplitter: formatChunkSplitter,
    qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
  };
};
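Editor's note: a minimal, self-contained TypeScript sketch of the behaviour the refactored converter above implements, not the FastGPT code itself. Model helpers such as getAutoIndexSize and chunkAutoChunkSize are replaced by plain numbers, and every name ending in Sketch is hypothetical.

// Simplified stand-in types; the real code uses CollectionChunkFormType plus model objects.
type ChunkSettingModeSketch = 'auto' | 'custom';

interface FormSketch {
  trainingType: 'chunk' | 'qa';
  chunkSettingMode: ChunkSettingModeSketch;
  chunkSize: number;
  indexSize: number;
  chunkSplitter: string;
  qaPrompt?: string;
}

interface AutoDefaultsSketch {
  autoChunkSize: number; // e.g. the LLM default in QA mode, a fixed auto size otherwise
  autoIndexSize: number; // e.g. derived from the vector model
}

function resolveChunkSettingsSketch(form: FormSketch, auto: AutoDefaultsSketch): FormSketch {
  // Auto mode: model-derived sizes win and any custom splitter is dropped.
  // Custom mode: the form values pass through unchanged.
  const resolved =
    form.chunkSettingMode === 'auto'
      ? { chunkSize: auto.autoChunkSize, indexSize: auto.autoIndexSize, chunkSplitter: '' }
      : { chunkSize: form.chunkSize, indexSize: form.indexSize, chunkSplitter: form.chunkSplitter };

  return {
    ...form,
    ...resolved,
    // qaPrompt only matters for QA training, mirroring the guard in the diff.
    qaPrompt: form.trainingType === 'qa' ? form.qaPrompt : undefined
  };
}

// Example: auto mode ignores the hand-edited size and splitter.
console.log(
  resolveChunkSettingsSketch(
    { trainingType: 'chunk', chunkSettingMode: 'auto', chunkSize: 700, indexSize: 900, chunkSplitter: '##', qaPrompt: 'x' },
    { autoChunkSize: 1000, autoIndexSize: 512 }
  )
);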
@@ -3,8 +3,10 @@ import { type SetStateAction, useMemo, useState } from 'react';
import { useTranslation } from 'next-i18next';
import { createContext, useContextSelector } from 'use-context-selector';
import {
  ChunkTriggerConfigTypeEnum,
  DatasetCollectionDataProcessModeEnum,
  ImportDataSourceEnum
  ImportDataSourceEnum,
  ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { useMyStep } from '@fastgpt/web/hooks/useStep';
import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
@@ -16,38 +18,14 @@ import { type ImportSourceItemType } from '@/web/core/dataset/type';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
import {
  getMaxChunkSize,
  getLLMDefaultChunkSize,
  getLLMMaxChunkSize,
  chunkAutoChunkSize,
  minChunkSize,
  getAutoIndexSize,
  getMaxIndexSize
} from '@fastgpt/global/core/dataset/training/utils';
import { chunkAutoChunkSize, getAutoIndexSize } from '@fastgpt/global/core/dataset/training/utils';
import { type CollectionChunkFormType } from '../Form/CollectionChunkForm';

type ChunkSizeFieldType = 'embeddingChunkSize' | 'qaChunkSize';
export type ImportFormType = {
  customPdfParse: boolean;

  webSelector: string;
} & CollectionChunkFormType;

type TrainingFiledType = {
  chunkOverlapRatio: number;
  maxChunkSize: number;
  minChunkSize: number;
  autoChunkSize: number;
  chunkSize: number;
  maxIndexSize?: number;
  indexSize?: number;
  autoIndexSize?: number;
  charsPointsPrice: number;
  priceTip: string;
  uploadRate: number;
  chunkSizeField: ChunkSizeFieldType;
};
type DatasetImportContextType = {
  importSource: ImportDataSourceEnum;
  parentId: string | undefined;
@@ -57,7 +35,35 @@ type DatasetImportContextType = {
  processParamsForm: UseFormReturn<ImportFormType, any>;
  sources: ImportSourceItemType[];
  setSources: React.Dispatch<React.SetStateAction<ImportSourceItemType[]>>;
} & TrainingFiledType;
};

export const defaultFormData: ImportFormType = {
  customPdfParse: false,

  trainingType: DatasetCollectionDataProcessModeEnum.chunk,

  chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize: chunkAutoChunkSize,

  dataEnhanceCollectionName: false,

  imageIndex: false,
  autoIndexes: false,

  chunkSettingMode: ChunkSettingModeEnum.auto,
  chunkSplitMode: DataChunkSplitModeEnum.size,
  paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
  paragraphChunkDeep: 4,
  paragraphChunkMinSize: 100,
  paragraphChunkMaxSize: chunkAutoChunkSize,

  chunkSize: chunkAutoChunkSize,
  chunkSplitter: '',
  indexSize: getAutoIndexSize(),

  qaPrompt: Prompt_AgentQA.description,
  webSelector: ''
};

export const DatasetImportContext = createContext<DatasetImportContextType>({
  importSource: ImportDataSourceEnum.fileLocal,
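Editor's note: a rough sketch of the pattern this hunk introduces, assuming simplified types and illustrative field names. One defaultFormData object now seeds the import form; the real code spreads it into useForm and falls back to it field by field when re-training, whereas this sketch approximates that with a single object spread.

import { useForm } from 'react-hook-form';

// Illustrative subset of the import form; the real ImportFormType has many more fields.
type ImportFormSketch = {
  trainingType: string;
  chunkSize: number;
  indexSize: number;
  chunkSplitter: string;
  webSelector: string;
};

const defaultFormDataSketch: ImportFormSketch = {
  trainingType: 'chunk',
  chunkSize: 1000,
  indexSize: 512,
  chunkSplitter: '',
  webSelector: ''
};

// One source of defaults: spread it first, then let any stored values win where present.
export function useImportParamsFormSketch(stored: Partial<ImportFormSketch> = {}) {
  return useForm<ImportFormSketch>({
    defaultValues: { ...defaultFormDataSketch, ...stored }
  });
}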
@@ -75,12 +81,9 @@ export const DatasetImportContext = createContext<DatasetImportContextType>({
  },
  chunkSize: 0,
  chunkOverlapRatio: 0,
  uploadRate: 0,
  //@ts-ignore
  processParamsForm: undefined,
  autoChunkSize: 0,
  charsPointsPrice: 0,
  priceTip: ''
  autoChunkSize: 0
});

const DatasetImportContextProvider = ({ children }: { children: React.ReactNode }) => {
@@ -180,119 +183,17 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
});

const vectorModel = datasetDetail.vectorModel;
const agentModel = datasetDetail.agentModel;

const processParamsForm = useForm<ImportFormType>({
  defaultValues: {
    imageIndex: false,
    autoIndexes: false,

    trainingType: DatasetCollectionDataProcessModeEnum.chunk,

    chunkSettingMode: ChunkSettingModeEnum.auto,

    chunkSplitMode: DataChunkSplitModeEnum.size,
    embeddingChunkSize: chunkAutoChunkSize,
    indexSize: vectorModel?.defaultToken || 512,
    qaChunkSize: getLLMDefaultChunkSize(agentModel),
    chunkSplitter: '',
    qaPrompt: Prompt_AgentQA.description,
    webSelector: '',
    customPdfParse: false
    ...defaultFormData,
    indexSize: getAutoIndexSize(vectorModel)
  }
});

const [sources, setSources] = useState<ImportSourceItemType[]>([]);

// watch form
const trainingType = processParamsForm.watch('trainingType');
const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
const qaChunkSize = processParamsForm.watch('qaChunkSize');
const chunkSplitter = processParamsForm.watch('chunkSplitter');
const autoIndexes = processParamsForm.watch('autoIndexes');
const indexSize = processParamsForm.watch('indexSize');

const TrainingModeMap = useMemo<TrainingFiledType>(() => {
  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
    return {
      chunkSizeField: 'qaChunkSize',
      chunkOverlapRatio: 0,
      maxChunkSize: getLLMMaxChunkSize(agentModel),
      minChunkSize: 1000,
      autoChunkSize: getLLMDefaultChunkSize(agentModel),
      chunkSize: qaChunkSize,
      charsPointsPrice: agentModel.charsPointsPrice || 0,
      priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
        price: agentModel.charsPointsPrice
      }),
      uploadRate: 30
    };
  } else if (autoIndexes) {
    return {
      chunkSizeField: 'embeddingChunkSize',
      chunkOverlapRatio: 0.2,
      maxChunkSize: getMaxChunkSize(agentModel),
      minChunkSize: minChunkSize,
      autoChunkSize: chunkAutoChunkSize,
      chunkSize: embeddingChunkSize,
      maxIndexSize: getMaxIndexSize(vectorModel),
      autoIndexSize: getAutoIndexSize(vectorModel),
      indexSize,
      charsPointsPrice: agentModel.charsPointsPrice || 0,
      priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
        price: agentModel.charsPointsPrice
      }),
      uploadRate: 100
    };
  } else {
    return {
      chunkSizeField: 'embeddingChunkSize',
      chunkOverlapRatio: 0.2,
      maxChunkSize: getMaxChunkSize(agentModel),
      minChunkSize: minChunkSize,
      autoChunkSize: chunkAutoChunkSize,
      chunkSize: embeddingChunkSize,
      maxIndexSize: getMaxIndexSize(vectorModel),
      autoIndexSize: getAutoIndexSize(vectorModel),
      indexSize,
      charsPointsPrice: vectorModel.charsPointsPrice || 0,
      priceTip: t('dataset:import.Embedding Estimated Price Tips', {
        price: vectorModel.charsPointsPrice
      }),
      uploadRate: 150
    };
  }
}, [
  trainingType,
  autoIndexes,
  agentModel,
  qaChunkSize,
  t,
  embeddingChunkSize,
  vectorModel,
  indexSize
]);

const chunkSettingModeMap = useMemo(() => {
  if (chunkSettingMode === ChunkSettingModeEnum.auto) {
    return {
      chunkSize: TrainingModeMap.autoChunkSize,
      indexSize: TrainingModeMap.autoIndexSize,
      chunkSplitter: ''
    };
  } else {
    return {
      chunkSize: TrainingModeMap.chunkSize,
      indexSize: TrainingModeMap.indexSize,
      chunkSplitter
    };
  }
}, [chunkSettingMode, TrainingModeMap, chunkSplitter]);

const contextValue = {
  ...TrainingModeMap,
  ...chunkSettingModeMap,
  importSource: source,
  parentId,
  activeStep,
@@ -17,6 +17,7 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
import Markdown from '@/components/Markdown';
import { useToast } from '@fastgpt/web/hooks/useToast';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';

const PreviewData = () => {
  const { t } = useTranslation();
@@ -28,8 +29,6 @@ const PreviewData = () => {

const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
const chunkSize = useContextSelector(DatasetImportContext, (v) => v.chunkSize);
const chunkOverlapRatio = useContextSelector(DatasetImportContext, (v) => v.chunkOverlapRatio);
const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);

const [previewFile, setPreviewFile] = useState<ImportSourceItemType>();
@@ -37,13 +36,20 @@ const PreviewData = () => {
const { data = { chunks: [], total: 0 }, loading: isLoading } = useRequest2(
  async () => {
    if (!previewFile) return { chunks: [], total: 0 };

    const chunkData = collectionChunkForm2StoreChunkData({
      ...processParamsForm.getValues(),
      vectorModel: datasetDetail.vectorModel,
      agentModel: datasetDetail.agentModel
    });

    if (importSource === ImportDataSourceEnum.fileCustom) {
      const chunkSplitter = processParamsForm.getValues('chunkSplitter');
      const { chunks } = splitText2Chunks({
        text: previewFile.rawText || '',
        chunkSize,
        chunkSize: chunkData.chunkSize,
        maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
        overlapRatio: chunkOverlapRatio,
        overlapRatio: 0.2,
        customReg: chunkSplitter ? [chunkSplitter] : []
      });
      return {
@@ -64,18 +70,12 @@ const PreviewData = () => {
    previewFile.externalFileUrl ||
    previewFile.apiFileId ||
    '',
  externalFileId: previewFile.externalFileId,

  customPdfParse: processParamsForm.getValues('customPdfParse'),

  trainingType: processParamsForm.getValues('trainingType'),
  chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
  chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
  chunkSize,
  chunkSplitter: processParamsForm.getValues('chunkSplitter'),
  overlapRatio: chunkOverlapRatio,

  ...chunkData,
  selector: processParamsForm.getValues('webSelector'),
  externalFileId: previewFile.externalFileId
  customPdfParse: processParamsForm.getValues('customPdfParse'),
  overlapRatio: 0.2
});
},
{
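Editor's note: the hunks above hard-code overlapRatio: 0.2 for the preview. As a purely illustrative sketch (not the real splitText2Chunks, which also honors the custom splitter regex and a model-derived maxSize), size-based chunking with a 20% overlap looks roughly like this:

// Cut text into windows of `chunkSize` characters where consecutive windows share 20%.
function previewChunksSketch(rawText: string, chunkSize: number, overlapRatio = 0.2): string[] {
  const step = Math.max(1, Math.floor(chunkSize * (1 - overlapRatio)));
  const chunks: string[] = [];
  for (let i = 0; i < rawText.length; i += step) {
    chunks.push(rawText.slice(i, i + chunkSize));
    if (i + chunkSize >= rawText.length) break; // the last window already covers the tail
  }
  return chunks;
}

// Example: 10-char chunks over a 20-char string advance by 8 characters each time.
console.log(previewChunksSketch('abcdefghijklmnopqrst', 10));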
@@ -37,6 +37,7 @@ import { useContextSelector } from 'use-context-selector';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DatasetImportContext, type ImportFormType } from '../Context';
import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';

const Upload = () => {
  const { t } = useTranslation();
@@ -48,10 +49,10 @@ const Upload = () => {
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const retrainNewCollectionId = useRef('');

const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } =
  useContextSelector(DatasetImportContext, (v) => v);

const { handleSubmit } = processParamsForm;
const { importSource, parentId, sources, setSources, processParamsForm } = useContextSelector(
  DatasetImportContext,
  (v) => v
);

const { totalFilesCount, waitingFilesCount, allFinished, hasCreatingFiles } = useMemo(() => {
  const totalFilesCount = sources.length;
@@ -80,7 +81,13 @@ const Upload = () => {
}, [waitingFilesCount, totalFilesCount, allFinished, t]);

const { runAsync: startUpload, loading: isLoading } = useRequest2(
  async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => {
  async ({ customPdfParse, webSelector, ...data }: ImportFormType) => {
    const chunkData = collectionChunkForm2StoreChunkData({
      ...data,
      vectorModel: datasetDetail.vectorModel,
      agentModel: datasetDetail.agentModel
    });

    if (sources.length === 0) return;
    const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@@ -101,23 +108,12 @@ const Upload = () => {
const commonParams: ApiCreateDatasetCollectionParams & {
  name: string;
} = {
  ...chunkData,
  parentId,
  datasetId: datasetDetail._id,
  name: item.sourceName,

  customPdfParse: processParamsForm.getValues('customPdfParse'),

  trainingType,
  imageIndex: processParamsForm.getValues('imageIndex'),
  autoIndexes: processParamsForm.getValues('autoIndexes'),

  chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
  chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),

  chunkSize,
  indexSize,
  chunkSplitter,
  qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
  customPdfParse
};

if (importSource === ImportDataSourceEnum.reTraining) {
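Editor's note: a hedged sketch of the payload assembly the hunk above moves to: the converted chunk data is spread first and only per-source fields are layered on top. The types and the buildCommonParamsSketch helper are simplified stand-ins, not the actual ApiCreateDatasetCollectionParams type.

// Stand-in for the converted settings returned by collectionChunkForm2StoreChunkData.
interface StoreChunkDataSketch {
  trainingType: string;
  chunkSize: number;
  indexSize: number;
  chunkSplitter: string;
  qaPrompt?: string;
}

interface CreateCollectionParamsSketch extends StoreChunkDataSketch {
  parentId?: string;
  datasetId: string;
  name: string;
  customPdfParse: boolean;
}

function buildCommonParamsSketch(
  chunkData: StoreChunkDataSketch,
  source: { parentId?: string; datasetId: string; sourceName: string },
  customPdfParse: boolean
): CreateCollectionParamsSketch {
  return {
    ...chunkData, // chunk/QA settings resolved once, upstream of the per-file loop
    parentId: source.parentId,
    datasetId: source.datasetId,
    name: source.sourceName,
    customPdfParse
  };
}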
@@ -280,7 +276,10 @@ const Upload = () => {
</TableContainer>

<Flex justifyContent={'flex-end'} mt={4}>
  <Button isLoading={isLoading} onClick={handleSubmit((data) => startUpload(data))}>
  <Button
    isLoading={isLoading}
    onClick={processParamsForm.handleSubmit((data) => startUpload(data))}
  >
    {totalFilesCount > 0 &&
      `${t('dataset:total_num_files', {
        total: totalFilesCount
@@ -1,6 +1,6 @@
import React from 'react';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
import { DatasetImportContext, defaultFormData } from '../Context';

import dynamic from 'next/dynamic';
import DataProcess from '../commonProgress/DataProcess';
@@ -48,18 +48,36 @@ const ReTraining = () => {
]);

processParamsForm.reset({
  customPdfParse: collection.customPdfParse,
  customPdfParse: collection.customPdfParse || false,
  trainingType: collection.trainingType,
  imageIndex: collection.imageIndex,
  autoIndexes: collection.autoIndexes,

  chunkSettingMode: collection.chunkSettingMode || ChunkSettingModeEnum.auto,
  chunkSplitMode: collection.chunkSplitMode || DataChunkSplitModeEnum.size,
  embeddingChunkSize: collection.chunkSize,
  qaChunkSize: collection.chunkSize,
  indexSize: collection.indexSize || 512,
  chunkSplitter: collection.chunkSplitter,
  webSelector: collection.metadata?.webPageSelector,
  chunkTriggerType: collection.chunkTriggerType || defaultFormData.chunkTriggerType,
  chunkTriggerMinSize: collection.chunkTriggerMinSize || defaultFormData.chunkTriggerMinSize,

  dataEnhanceCollectionName:
    collection.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,

  imageIndex: collection.imageIndex || defaultFormData.imageIndex,
  autoIndexes: collection.autoIndexes || defaultFormData.autoIndexes,

  chunkSettingMode: collection.chunkSettingMode || defaultFormData.chunkSettingMode,
  chunkSplitMode: collection.chunkSplitMode || defaultFormData.chunkSplitMode,

  paragraphChunkAIMode:
    collection.paragraphChunkAIMode || defaultFormData.paragraphChunkAIMode,
  paragraphChunkDeep: collection.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
  paragraphChunkMinSize:
    collection.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
  paragraphChunkMaxSize:
    collection.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,

  chunkSize: collection.chunkSize || defaultFormData.chunkSize,

  chunkSplitter: collection.chunkSplitter || defaultFormData.chunkSplitter,

  indexSize: collection.indexSize || defaultFormData.indexSize,

  webSelector: collection.metadata?.webPageSelector || defaultFormData.webSelector,
  qaPrompt: collection.qaPrompt || Prompt_AgentQA.description
});
}
@@ -72,18 +72,26 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
  label: t('common:core.dataset.collection.metadata.Raw text length'),
  value: collection.rawTextLength ?? '-'
},
{
  label: t('dataset:collection_metadata_image_parse'),
  value: collection.imageIndex ? 'Yes' : 'No'
},
{
  label: t('dataset:auto_indexes'),
  value: collection.autoIndexes ? 'Yes' : 'No'
},
{
  label: t('dataset:collection.training_type'),
  value: t(DatasetCollectionDataProcessModeMap[collection.trainingType]?.label as any)
},
...(collection.imageIndex !== undefined
  ? [
      {
        label: t('dataset:data_index_image'),
        value: collection.imageIndex ? 'Yes' : 'No'
      }
    ]
  : []),
...(collection.autoIndexes !== undefined
  ? [
      {
        label: t('dataset:auto_indexes'),
        value: collection.autoIndexes ? 'Yes' : 'No'
      }
    ]
  : []),
...(collection.chunkSize
  ? [
      {