Mirror of https://github.com/labring/FastGPT.git, synced 2025-07-23 13:03:50 +00:00
perf: dataset import params code (#4875)
* perf: dataset import params code
* perf: api dataset code
* model
@@ -21,9 +21,13 @@ import CollectionChunkForm, {
  collectionChunkForm2StoreChunkData,
  type CollectionChunkFormType
} from '../Form/CollectionChunkForm';
import { getLLMDefaultChunkSize } from '@fastgpt/global/core/dataset/training/utils';
import {
  getAutoIndexSize,
  getLLMDefaultChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm';
import { defaultFormData } from '../Import/Context';

export type WebsiteConfigFormType = {
  websiteConfig: {
@@ -76,17 +80,35 @@ const WebsiteConfigModal = ({

const form = useForm<CollectionChunkFormType>({
  defaultValues: {
    trainingType: chunkSettings?.trainingType || DatasetCollectionDataProcessModeEnum.chunk,
    imageIndex: chunkSettings?.imageIndex || false,
    autoIndexes: chunkSettings?.autoIndexes || false,
    trainingType: chunkSettings?.trainingType,

    chunkSettingMode: chunkSettings?.chunkSettingMode || ChunkSettingModeEnum.auto,
    chunkSplitMode: chunkSettings?.chunkSplitMode || DataChunkSplitModeEnum.size,
    embeddingChunkSize: chunkSettings?.chunkSize || 2000,
    qaChunkSize: chunkSettings?.chunkSize || getLLMDefaultChunkSize(datasetDetail.agentModel),
    indexSize: chunkSettings?.indexSize || datasetDetail.vectorModel?.defaultToken || 512,
    chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType,
    chunkTriggerMinSize:
      chunkSettings?.chunkTriggerMinSize || defaultFormData.chunkTriggerMinSize,

    dataEnhanceCollectionName:
      chunkSettings?.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,

    imageIndex: chunkSettings?.imageIndex || defaultFormData.imageIndex,
    autoIndexes: chunkSettings?.autoIndexes || defaultFormData.autoIndexes,

    chunkSettingMode: chunkSettings?.chunkSettingMode || defaultFormData.chunkSettingMode,
    chunkSplitMode: chunkSettings?.chunkSplitMode || defaultFormData.chunkSplitMode,

    paragraphChunkAIMode:
      chunkSettings?.paragraphChunkAIMode || defaultFormData.paragraphChunkAIMode,
    paragraphChunkDeep: chunkSettings?.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
    paragraphChunkMinSize:
      chunkSettings?.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
    paragraphChunkMaxSize:
      chunkSettings?.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,

    chunkSize: chunkSettings?.chunkSize || defaultFormData.chunkSize,

    chunkSplitter: chunkSettings?.chunkSplitter || defaultFormData.chunkSplitter,

    indexSize: chunkSettings?.indexSize || defaultFormData.indexSize,

    chunkSplitter: chunkSettings?.chunkSplitter || '',
    qaPrompt: chunkSettings?.qaPrompt || Prompt_AgentQA.description
  }
});
@@ -17,6 +17,10 @@ import {
} from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import type {
  ChunkTriggerConfigTypeEnum,
  ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants';
import {
  DataChunkSplitModeEnum,
  DatasetCollectionDataProcessModeEnum,
@@ -42,7 +46,6 @@ import {
  minChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
import type { LLMModelItemType, EmbeddingModelItemType } from '@fastgpt/global/core/ai/model.d';

const PromptTextarea = ({
@@ -86,19 +89,35 @@ const PromptTextarea = ({

export type CollectionChunkFormType = {
  trainingType: DatasetCollectionDataProcessModeEnum;

  // Chunk trigger
  chunkTriggerType: ChunkTriggerConfigTypeEnum;
  chunkTriggerMinSize: number; // maxSize from agent model, not store

  // Data enhance
  dataEnhanceCollectionName: boolean; // Auto add collection name to data

  // Index enhance
  imageIndex: boolean;
  autoIndexes: boolean;

  chunkSettingMode: ChunkSettingModeEnum;

  // Chunk setting
  chunkSettingMode: ChunkSettingModeEnum; // system parameters / custom parameters
  chunkSplitMode: DataChunkSplitModeEnum;
  embeddingChunkSize: number;
  qaChunkSize: number;
  chunkSplitter?: string;
  // Paragraph split
  paragraphChunkAIMode: ParagraphChunkAIModeEnum;
  paragraphChunkDeep: number; // Paragraph deep
  paragraphChunkMinSize: number; // Paragraph min size, if too small, it will merge
  paragraphChunkMaxSize: number; // Paragraph max size, if too large, it will split
  // Size split
  chunkSize: number;
  // Char split
  chunkSplitter: string;
  indexSize: number;

  qaPrompt?: string;
};

const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkFormType> }) => {
  const { t } = useTranslation();
  const { feConfigs } = useSystemStore();
@@ -131,29 +150,26 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
    tooltip: t(value.tooltip as any)
  }));
}, [t]);

const {
  chunkSizeField,
  maxChunkSize,
  minChunkSize: minChunkSizeValue,
  maxIndexSize
} = useMemo(() => {
  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
    return {
      chunkSizeField: 'qaChunkSize',
      maxChunkSize: getLLMMaxChunkSize(agentModel),
      minChunkSize: 1000,
      maxIndexSize: 1000
    };
  } else if (autoIndexes) {
    return {
      chunkSizeField: 'embeddingChunkSize',
      maxChunkSize: getMaxChunkSize(agentModel),
      minChunkSize: minChunkSize,
      maxIndexSize: getMaxIndexSize(vectorModel)
    };
  } else {
    return {
      chunkSizeField: 'embeddingChunkSize',
      maxChunkSize: getMaxChunkSize(agentModel),
      minChunkSize: minChunkSize,
      maxIndexSize: getMaxIndexSize(vectorModel)
@@ -216,6 +232,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
  value={trainingType}
  onChange={(e) => {
    setValue('trainingType', e);
    if (e === DatasetCollectionDataProcessModeEnum.qa) {
      setValue('chunkSize', getLLMDefaultChunkSize(agentModel));
    } else {
      setValue('chunkSize', chunkAutoChunkSize);
    }
  }}
  defaultBg="white"
  activeBg="white"
@@ -317,7 +338,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
>
  <MyNumberInput
    register={register}
    name={chunkSizeField}
    name={'chunkSize'}
    min={minChunkSizeValue}
    max={maxChunkSize}
    size={'sm'}
@@ -456,24 +477,26 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm

export default CollectionChunkForm;

// Get chunk settings from form
export const collectionChunkForm2StoreChunkData = ({
  trainingType,
  imageIndex,
  autoIndexes,
  chunkSettingMode,
  chunkSplitMode,
  embeddingChunkSize,
  qaChunkSize,
  chunkSplitter,
  indexSize,
  qaPrompt,

  agentModel,
  vectorModel
  vectorModel,
  ...data
}: CollectionChunkFormType & {
  agentModel: LLMModelItemType;
  vectorModel: EmbeddingModelItemType;
}): ChunkSettingsType => {
}): CollectionChunkFormType => {
  const {
    trainingType,
    autoIndexes,
    chunkSettingMode,
    chunkSize,
    chunkSplitter,
    indexSize,
    qaPrompt
  } = data;
  // Resolve the auto and custom parameters based on the processing mode.
  const trainingModeSize: {
    autoChunkSize: number;
    autoIndexSize: number;
@@ -483,53 +506,53 @@ export const collectionChunkForm2StoreChunkData = ({
  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
    return {
      autoChunkSize: getLLMDefaultChunkSize(agentModel),
      autoIndexSize: 512,
      chunkSize: qaChunkSize,
      indexSize: 512
      autoIndexSize: getMaxIndexSize(vectorModel),
      chunkSize,
      indexSize: getMaxIndexSize(vectorModel)
    };
  } else if (autoIndexes) {
    return {
      autoChunkSize: chunkAutoChunkSize,
      autoIndexSize: getAutoIndexSize(vectorModel),
      chunkSize: embeddingChunkSize,
      chunkSize,
      indexSize
    };
  } else {
    return {
      autoChunkSize: chunkAutoChunkSize,
      autoIndexSize: getAutoIndexSize(vectorModel),
      chunkSize: embeddingChunkSize,
      chunkSize,
      indexSize
    };
  }
})();

const { chunkSize: formatChunkIndex, indexSize: formatIndexSize } = (() => {
  // Resolve the actual parameters.
  const {
    chunkSize: formatChunkIndex,
    indexSize: formatIndexSize,
    chunkSplitter: formatChunkSplitter
  } = (() => {
    if (chunkSettingMode === ChunkSettingModeEnum.auto) {
      return {
        chunkSize: trainingModeSize.autoChunkSize,
        indexSize: trainingModeSize.autoIndexSize
        indexSize: trainingModeSize.autoIndexSize,
        chunkSplitter: ''
      };
    } else {
      return {
        chunkSize: trainingModeSize.chunkSize,
        indexSize: trainingModeSize.indexSize
        indexSize: trainingModeSize.indexSize,
        chunkSplitter
      };
    }
  })();

  return {
    trainingType,
    imageIndex,
    autoIndexes,

    chunkSettingMode,
    chunkSplitMode,

    ...data,
    chunkSize: formatChunkIndex,
    indexSize: formatIndexSize,

    chunkSplitter,
    chunkSplitter: formatChunkSplitter,
    qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
  };
};
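Editor's note: a minimal, self-contained TypeScript sketch of the behaviour the refactored converter above implements, not the FastGPT code itself. Model helpers such as getAutoIndexSize and chunkAutoChunkSize are replaced by plain numbers, and every name ending in Sketch is hypothetical.

// Simplified stand-in types; the real code uses CollectionChunkFormType plus model objects.
type ChunkSettingModeSketch = 'auto' | 'custom';

interface FormSketch {
  trainingType: 'chunk' | 'qa';
  chunkSettingMode: ChunkSettingModeSketch;
  chunkSize: number;
  indexSize: number;
  chunkSplitter: string;
  qaPrompt?: string;
}

interface AutoDefaultsSketch {
  autoChunkSize: number; // e.g. the LLM default in QA mode, a fixed auto size otherwise
  autoIndexSize: number; // e.g. derived from the vector model
}

function resolveChunkSettingsSketch(form: FormSketch, auto: AutoDefaultsSketch): FormSketch {
  // Auto mode: model-derived sizes win and any custom splitter is dropped.
  // Custom mode: the form values pass through unchanged.
  const resolved =
    form.chunkSettingMode === 'auto'
      ? { chunkSize: auto.autoChunkSize, indexSize: auto.autoIndexSize, chunkSplitter: '' }
      : { chunkSize: form.chunkSize, indexSize: form.indexSize, chunkSplitter: form.chunkSplitter };

  return {
    ...form,
    ...resolved,
    // qaPrompt only matters for QA training, mirroring the guard in the diff.
    qaPrompt: form.trainingType === 'qa' ? form.qaPrompt : undefined
  };
}

// Example: auto mode ignores the hand-edited size and splitter.
console.log(
  resolveChunkSettingsSketch(
    { trainingType: 'chunk', chunkSettingMode: 'auto', chunkSize: 700, indexSize: 900, chunkSplitter: '##', qaPrompt: 'x' },
    { autoChunkSize: 1000, autoIndexSize: 512 }
  )
);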
@@ -3,8 +3,10 @@ import { type SetStateAction, useMemo, useState } from 'react';
import { useTranslation } from 'next-i18next';
import { createContext, useContextSelector } from 'use-context-selector';
import {
  ChunkTriggerConfigTypeEnum,
  DatasetCollectionDataProcessModeEnum,
  ImportDataSourceEnum
  ImportDataSourceEnum,
  ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { useMyStep } from '@fastgpt/web/hooks/useStep';
import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
@@ -16,38 +18,14 @@ import { type ImportSourceItemType } from '@/web/core/dataset/type';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
import {
  getMaxChunkSize,
  getLLMDefaultChunkSize,
  getLLMMaxChunkSize,
  chunkAutoChunkSize,
  minChunkSize,
  getAutoIndexSize,
  getMaxIndexSize
} from '@fastgpt/global/core/dataset/training/utils';
import { chunkAutoChunkSize, getAutoIndexSize } from '@fastgpt/global/core/dataset/training/utils';
import { type CollectionChunkFormType } from '../Form/CollectionChunkForm';

type ChunkSizeFieldType = 'embeddingChunkSize' | 'qaChunkSize';
export type ImportFormType = {
  customPdfParse: boolean;

  webSelector: string;
} & CollectionChunkFormType;

type TrainingFiledType = {
  chunkOverlapRatio: number;
  maxChunkSize: number;
  minChunkSize: number;
  autoChunkSize: number;
  chunkSize: number;
  maxIndexSize?: number;
  indexSize?: number;
  autoIndexSize?: number;
  charsPointsPrice: number;
  priceTip: string;
  uploadRate: number;
  chunkSizeField: ChunkSizeFieldType;
};
type DatasetImportContextType = {
  importSource: ImportDataSourceEnum;
  parentId: string | undefined;
@@ -57,7 +35,35 @@ type DatasetImportContextType = {
  processParamsForm: UseFormReturn<ImportFormType, any>;
  sources: ImportSourceItemType[];
  setSources: React.Dispatch<React.SetStateAction<ImportSourceItemType[]>>;
} & TrainingFiledType;
};

export const defaultFormData: ImportFormType = {
  customPdfParse: false,

  trainingType: DatasetCollectionDataProcessModeEnum.chunk,

  chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize: chunkAutoChunkSize,

  dataEnhanceCollectionName: false,

  imageIndex: false,
  autoIndexes: false,

  chunkSettingMode: ChunkSettingModeEnum.auto,
  chunkSplitMode: DataChunkSplitModeEnum.size,
  paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
  paragraphChunkDeep: 4,
  paragraphChunkMinSize: 100,
  paragraphChunkMaxSize: chunkAutoChunkSize,

  chunkSize: chunkAutoChunkSize,
  chunkSplitter: '',
  indexSize: getAutoIndexSize(),

  qaPrompt: Prompt_AgentQA.description,
  webSelector: ''
};

export const DatasetImportContext = createContext<DatasetImportContextType>({
  importSource: ImportDataSourceEnum.fileLocal,
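Editor's note: a rough sketch of the pattern this hunk introduces, assuming simplified types and illustrative field names. One defaultFormData object now seeds the import form; the real code spreads it into useForm and falls back to it field by field when re-training, whereas this sketch approximates that with a single object spread.

import { useForm } from 'react-hook-form';

// Illustrative subset of the import form; the real ImportFormType has many more fields.
type ImportFormSketch = {
  trainingType: string;
  chunkSize: number;
  indexSize: number;
  chunkSplitter: string;
  webSelector: string;
};

const defaultFormDataSketch: ImportFormSketch = {
  trainingType: 'chunk',
  chunkSize: 1000,
  indexSize: 512,
  chunkSplitter: '',
  webSelector: ''
};

// One source of defaults: spread it first, then let any stored values win where present.
export function useImportParamsFormSketch(stored: Partial<ImportFormSketch> = {}) {
  return useForm<ImportFormSketch>({
    defaultValues: { ...defaultFormDataSketch, ...stored }
  });
}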
@@ -75,12 +81,9 @@ export const DatasetImportContext = createContext<DatasetImportContextType>({
  },
  chunkSize: 0,
  chunkOverlapRatio: 0,
  uploadRate: 0,
  //@ts-ignore
  processParamsForm: undefined,
  autoChunkSize: 0,
  charsPointsPrice: 0,
  priceTip: ''
  autoChunkSize: 0
});

const DatasetImportContextProvider = ({ children }: { children: React.ReactNode }) => {
@@ -180,119 +183,17 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
});

const vectorModel = datasetDetail.vectorModel;
const agentModel = datasetDetail.agentModel;

const processParamsForm = useForm<ImportFormType>({
  defaultValues: {
    imageIndex: false,
    autoIndexes: false,

    trainingType: DatasetCollectionDataProcessModeEnum.chunk,

    chunkSettingMode: ChunkSettingModeEnum.auto,

    chunkSplitMode: DataChunkSplitModeEnum.size,
    embeddingChunkSize: chunkAutoChunkSize,
    indexSize: vectorModel?.defaultToken || 512,
    qaChunkSize: getLLMDefaultChunkSize(agentModel),
    chunkSplitter: '',
    qaPrompt: Prompt_AgentQA.description,
    webSelector: '',
    customPdfParse: false
    ...defaultFormData,
    indexSize: getAutoIndexSize(vectorModel)
  }
});

const [sources, setSources] = useState<ImportSourceItemType[]>([]);

// watch form
const trainingType = processParamsForm.watch('trainingType');
const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
const qaChunkSize = processParamsForm.watch('qaChunkSize');
const chunkSplitter = processParamsForm.watch('chunkSplitter');
const autoIndexes = processParamsForm.watch('autoIndexes');
const indexSize = processParamsForm.watch('indexSize');

const TrainingModeMap = useMemo<TrainingFiledType>(() => {
  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
    return {
      chunkSizeField: 'qaChunkSize',
      chunkOverlapRatio: 0,
      maxChunkSize: getLLMMaxChunkSize(agentModel),
      minChunkSize: 1000,
      autoChunkSize: getLLMDefaultChunkSize(agentModel),
      chunkSize: qaChunkSize,
      charsPointsPrice: agentModel.charsPointsPrice || 0,
      priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
        price: agentModel.charsPointsPrice
      }),
      uploadRate: 30
    };
  } else if (autoIndexes) {
    return {
      chunkSizeField: 'embeddingChunkSize',
      chunkOverlapRatio: 0.2,
      maxChunkSize: getMaxChunkSize(agentModel),
      minChunkSize: minChunkSize,
      autoChunkSize: chunkAutoChunkSize,
      chunkSize: embeddingChunkSize,
      maxIndexSize: getMaxIndexSize(vectorModel),
      autoIndexSize: getAutoIndexSize(vectorModel),
      indexSize,
      charsPointsPrice: agentModel.charsPointsPrice || 0,
      priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
        price: agentModel.charsPointsPrice
      }),
      uploadRate: 100
    };
  } else {
    return {
      chunkSizeField: 'embeddingChunkSize',
      chunkOverlapRatio: 0.2,
      maxChunkSize: getMaxChunkSize(agentModel),
      minChunkSize: minChunkSize,
      autoChunkSize: chunkAutoChunkSize,
      chunkSize: embeddingChunkSize,
      maxIndexSize: getMaxIndexSize(vectorModel),
      autoIndexSize: getAutoIndexSize(vectorModel),
      indexSize,
      charsPointsPrice: vectorModel.charsPointsPrice || 0,
      priceTip: t('dataset:import.Embedding Estimated Price Tips', {
        price: vectorModel.charsPointsPrice
      }),
      uploadRate: 150
    };
  }
}, [
  trainingType,
  autoIndexes,
  agentModel,
  qaChunkSize,
  t,
  embeddingChunkSize,
  vectorModel,
  indexSize
]);

const chunkSettingModeMap = useMemo(() => {
  if (chunkSettingMode === ChunkSettingModeEnum.auto) {
    return {
      chunkSize: TrainingModeMap.autoChunkSize,
      indexSize: TrainingModeMap.autoIndexSize,
      chunkSplitter: ''
    };
  } else {
    return {
      chunkSize: TrainingModeMap.chunkSize,
      indexSize: TrainingModeMap.indexSize,
      chunkSplitter
    };
  }
}, [chunkSettingMode, TrainingModeMap, chunkSplitter]);

const contextValue = {
  ...TrainingModeMap,
  ...chunkSettingModeMap,
  importSource: source,
  parentId,
  activeStep,
@@ -17,6 +17,7 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
import Markdown from '@/components/Markdown';
import { useToast } from '@fastgpt/web/hooks/useToast';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';

const PreviewData = () => {
  const { t } = useTranslation();
@@ -28,8 +29,6 @@ const PreviewData = () => {

const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
const chunkSize = useContextSelector(DatasetImportContext, (v) => v.chunkSize);
const chunkOverlapRatio = useContextSelector(DatasetImportContext, (v) => v.chunkOverlapRatio);
const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);

const [previewFile, setPreviewFile] = useState<ImportSourceItemType>();
@@ -37,13 +36,20 @@ const PreviewData = () => {
const { data = { chunks: [], total: 0 }, loading: isLoading } = useRequest2(
  async () => {
    if (!previewFile) return { chunks: [], total: 0 };

    const chunkData = collectionChunkForm2StoreChunkData({
      ...processParamsForm.getValues(),
      vectorModel: datasetDetail.vectorModel,
      agentModel: datasetDetail.agentModel
    });

    if (importSource === ImportDataSourceEnum.fileCustom) {
      const chunkSplitter = processParamsForm.getValues('chunkSplitter');
      const { chunks } = splitText2Chunks({
        text: previewFile.rawText || '',
        chunkSize,
        chunkSize: chunkData.chunkSize,
        maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
        overlapRatio: chunkOverlapRatio,
        overlapRatio: 0.2,
        customReg: chunkSplitter ? [chunkSplitter] : []
      });
      return {
@@ -64,18 +70,12 @@ const PreviewData = () => {
    previewFile.externalFileUrl ||
    previewFile.apiFileId ||
    '',
  externalFileId: previewFile.externalFileId,

  customPdfParse: processParamsForm.getValues('customPdfParse'),

  trainingType: processParamsForm.getValues('trainingType'),
  chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
  chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
  chunkSize,
  chunkSplitter: processParamsForm.getValues('chunkSplitter'),
  overlapRatio: chunkOverlapRatio,

  ...chunkData,
  selector: processParamsForm.getValues('webSelector'),
  externalFileId: previewFile.externalFileId
  customPdfParse: processParamsForm.getValues('customPdfParse'),
  overlapRatio: 0.2
});
},
{
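Editor's note: the hunks above hard-code overlapRatio: 0.2 for the preview. As a purely illustrative sketch (not the real splitText2Chunks, which also honors the custom splitter regex and a model-derived maxSize), size-based chunking with a 20% overlap looks roughly like this:

// Cut text into windows of `chunkSize` characters where consecutive windows share 20%.
function previewChunksSketch(rawText: string, chunkSize: number, overlapRatio = 0.2): string[] {
  const step = Math.max(1, Math.floor(chunkSize * (1 - overlapRatio)));
  const chunks: string[] = [];
  for (let i = 0; i < rawText.length; i += step) {
    chunks.push(rawText.slice(i, i + chunkSize));
    if (i + chunkSize >= rawText.length) break; // the last window already covers the tail
  }
  return chunks;
}

// Example: 10-char chunks over a 20-char string advance by 8 characters each time.
console.log(previewChunksSketch('abcdefghijklmnopqrst', 10));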
@@ -37,6 +37,7 @@ import { useContextSelector } from 'use-context-selector';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DatasetImportContext, type ImportFormType } from '../Context';
import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';

const Upload = () => {
  const { t } = useTranslation();
@@ -48,10 +49,10 @@ const Upload = () => {
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const retrainNewCollectionId = useRef('');

const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } =
  useContextSelector(DatasetImportContext, (v) => v);

const { handleSubmit } = processParamsForm;
const { importSource, parentId, sources, setSources, processParamsForm } = useContextSelector(
  DatasetImportContext,
  (v) => v
);

const { totalFilesCount, waitingFilesCount, allFinished, hasCreatingFiles } = useMemo(() => {
  const totalFilesCount = sources.length;
@@ -80,7 +81,13 @@ const Upload = () => {
}, [waitingFilesCount, totalFilesCount, allFinished, t]);

const { runAsync: startUpload, loading: isLoading } = useRequest2(
  async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => {
  async ({ customPdfParse, webSelector, ...data }: ImportFormType) => {
    const chunkData = collectionChunkForm2StoreChunkData({
      ...data,
      vectorModel: datasetDetail.vectorModel,
      agentModel: datasetDetail.agentModel
    });

    if (sources.length === 0) return;
    const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@@ -101,23 +108,12 @@ const Upload = () => {
const commonParams: ApiCreateDatasetCollectionParams & {
  name: string;
} = {
  ...chunkData,
  parentId,
  datasetId: datasetDetail._id,
  name: item.sourceName,

  customPdfParse: processParamsForm.getValues('customPdfParse'),

  trainingType,
  imageIndex: processParamsForm.getValues('imageIndex'),
  autoIndexes: processParamsForm.getValues('autoIndexes'),

  chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
  chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),

  chunkSize,
  indexSize,
  chunkSplitter,
  qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
  customPdfParse
};

if (importSource === ImportDataSourceEnum.reTraining) {
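Editor's note: a hedged sketch of the payload assembly the hunk above moves to: the converted chunk data is spread first and only per-source fields are layered on top. The types and the buildCommonParamsSketch helper are simplified stand-ins, not the actual ApiCreateDatasetCollectionParams type.

// Stand-in for the converted settings returned by collectionChunkForm2StoreChunkData.
interface StoreChunkDataSketch {
  trainingType: string;
  chunkSize: number;
  indexSize: number;
  chunkSplitter: string;
  qaPrompt?: string;
}

interface CreateCollectionParamsSketch extends StoreChunkDataSketch {
  parentId?: string;
  datasetId: string;
  name: string;
  customPdfParse: boolean;
}

function buildCommonParamsSketch(
  chunkData: StoreChunkDataSketch,
  source: { parentId?: string; datasetId: string; sourceName: string },
  customPdfParse: boolean
): CreateCollectionParamsSketch {
  return {
    ...chunkData, // chunk/QA settings resolved once, upstream of the per-file loop
    parentId: source.parentId,
    datasetId: source.datasetId,
    name: source.sourceName,
    customPdfParse
  };
}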
@@ -280,7 +276,10 @@ const Upload = () => {
</TableContainer>

<Flex justifyContent={'flex-end'} mt={4}>
  <Button isLoading={isLoading} onClick={handleSubmit((data) => startUpload(data))}>
  <Button
    isLoading={isLoading}
    onClick={processParamsForm.handleSubmit((data) => startUpload(data))}
  >
    {totalFilesCount > 0 &&
      `${t('dataset:total_num_files', {
        total: totalFilesCount
@@ -1,6 +1,6 @@
import React from 'react';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
import { DatasetImportContext, defaultFormData } from '../Context';

import dynamic from 'next/dynamic';
import DataProcess from '../commonProgress/DataProcess';
@@ -48,18 +48,36 @@ const ReTraining = () => {
]);

processParamsForm.reset({
  customPdfParse: collection.customPdfParse,
  customPdfParse: collection.customPdfParse || false,
  trainingType: collection.trainingType,
  imageIndex: collection.imageIndex,
  autoIndexes: collection.autoIndexes,

  chunkSettingMode: collection.chunkSettingMode || ChunkSettingModeEnum.auto,
  chunkSplitMode: collection.chunkSplitMode || DataChunkSplitModeEnum.size,
  embeddingChunkSize: collection.chunkSize,
  qaChunkSize: collection.chunkSize,
  indexSize: collection.indexSize || 512,
  chunkSplitter: collection.chunkSplitter,
  webSelector: collection.metadata?.webPageSelector,
  chunkTriggerType: collection.chunkTriggerType || defaultFormData.chunkTriggerType,
  chunkTriggerMinSize: collection.chunkTriggerMinSize || defaultFormData.chunkTriggerMinSize,

  dataEnhanceCollectionName:
    collection.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,

  imageIndex: collection.imageIndex || defaultFormData.imageIndex,
  autoIndexes: collection.autoIndexes || defaultFormData.autoIndexes,

  chunkSettingMode: collection.chunkSettingMode || defaultFormData.chunkSettingMode,
  chunkSplitMode: collection.chunkSplitMode || defaultFormData.chunkSplitMode,

  paragraphChunkAIMode:
    collection.paragraphChunkAIMode || defaultFormData.paragraphChunkAIMode,
  paragraphChunkDeep: collection.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
  paragraphChunkMinSize:
    collection.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
  paragraphChunkMaxSize:
    collection.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,

  chunkSize: collection.chunkSize || defaultFormData.chunkSize,

  chunkSplitter: collection.chunkSplitter || defaultFormData.chunkSplitter,

  indexSize: collection.indexSize || defaultFormData.indexSize,

  webSelector: collection.metadata?.webPageSelector || defaultFormData.webSelector,
  qaPrompt: collection.qaPrompt || Prompt_AgentQA.description
});
}
@@ -72,18 +72,26 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
  label: t('common:core.dataset.collection.metadata.Raw text length'),
  value: collection.rawTextLength ?? '-'
},
{
  label: t('dataset:collection_metadata_image_parse'),
  value: collection.imageIndex ? 'Yes' : 'No'
},
{
  label: t('dataset:auto_indexes'),
  value: collection.autoIndexes ? 'Yes' : 'No'
},
{
  label: t('dataset:collection.training_type'),
  value: t(DatasetCollectionDataProcessModeMap[collection.trainingType]?.label as any)
},
...(collection.imageIndex !== undefined
  ? [
      {
        label: t('dataset:data_index_image'),
        value: collection.imageIndex ? 'Yes' : 'No'
      }
    ]
  : []),
...(collection.autoIndexes !== undefined
  ? [
      {
        label: t('dataset:auto_indexes'),
        value: collection.autoIndexes ? 'Yes' : 'No'
      }
    ]
  : []),
...(collection.chunkSize
  ? [
      {