From fae76e887a0acfe380e6f062bcac179587e6e9c4 Mon Sep 17 00:00:00 2001
From: Archer <545436317@qq.com>
Date: Fri, 23 May 2025 10:40:25 +0800
Subject: [PATCH] perf: dataset import params code (#4875)

* perf: dataset import params code

* perf: api dataset code

* model

---
 .../zh-cn/docs/development/upgrading/4910.md  |   2 +
 packages/global/core/dataset/api.d.ts         |  30 ++-
 packages/global/core/dataset/constants.ts     |  10 +
 .../global/core/dataset/data/constants.ts     |   2 +-
 packages/global/core/dataset/type.d.ts        |  45 +++--
 .../core/ai/config/provider/Claude.json       |  48 +++++
 .../core/ai/config/provider/Gemini.json       |  24 +++
 .../core/dataset/collection/controller.ts     |   9 +
 packages/service/core/dataset/read.ts         |   6 +-
 packages/service/core/dataset/schema.ts       |  29 ++-
 packages/web/i18n/en/common.json              |   1 -
 packages/web/i18n/en/dataset.json             |   2 +-
 packages/web/i18n/zh-CN/common.json           |   1 -
 packages/web/i18n/zh-CN/dataset.json          |   2 +-
 packages/web/i18n/zh-Hant/common.json         |   1 -
 packages/web/i18n/zh-Hant/dataset.json        |   2 +-
 .../detail/CollectionCard/WebsiteConfig.tsx   |  42 ++++-
 .../detail/Form/CollectionChunkForm.tsx       | 105 ++++++-----
 .../dataset/detail/Import/Context.tsx         | 171 ++++--------------
 .../Import/commonProgress/PreviewData.tsx     |  28 +--
 .../detail/Import/commonProgress/Upload.tsx   |  37 ++--
 .../detail/Import/diffSource/ReTraining.tsx   |  40 ++--
 .../dataset/detail/MetaDataCard.tsx           |  24 ++-
 23 files changed, 366 insertions(+), 295 deletions(-)

diff --git a/docSite/content/zh-cn/docs/development/upgrading/4910.md b/docSite/content/zh-cn/docs/development/upgrading/4910.md
index a7e248cdb..92163f14e 100644
--- a/docSite/content/zh-cn/docs/development/upgrading/4910.md
+++ b/docSite/content/zh-cn/docs/development/upgrading/4910.md
@@ -11,6 +11,8 @@ weight: 790
 
 ## 🚀 新增内容
 
 1. 支持 PG 设置`systemEnv.hnswMaxScanTuples`参数,提高迭代搜索的数据总量。
+2. 开放飞书和语雀知识库到开源版。
+3. gemini 和 claude 最新模型预设。
 
 ## ⚙️ 优化
diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts
index a4df84df9..92dc32ed3 100644
--- a/packages/global/core/dataset/api.d.ts
+++ b/packages/global/core/dataset/api.d.ts
@@ -1,9 +1,11 @@
-import type { DatasetDataIndexItemType, DatasetSchemaType } from './type';
+import type { ChunkSettingsType, DatasetDataIndexItemType, DatasetSchemaType } from './type';
 import type {
   DatasetCollectionTypeEnum,
   DatasetCollectionDataProcessModeEnum,
   ChunkSettingModeEnum,
-  DataChunkSplitModeEnum
+  DataChunkSplitModeEnum,
+  ChunkTriggerConfigTypeEnum,
+  ParagraphChunkAIModeEnum
 } from './constants';
 import type { LLMModelItemType } from '../ai/model.d';
 import type { ParentIdType } from 'common/parentFolder/type';
@@ -32,26 +34,16 @@ export type DatasetUpdateBody = {
 };
 
 /* ================= collection ===================== */
-export type DatasetCollectionChunkMetadataType = {
+// Input + store params
+type DatasetCollectionStoreDataType = ChunkSettingsType & {
   parentId?: string;
-  customPdfParse?: boolean;
-  trainingType?: DatasetCollectionDataProcessModeEnum;
-  imageIndex?: boolean;
-  autoIndexes?: boolean;
-
-  chunkSettingMode?: ChunkSettingModeEnum;
-  chunkSplitMode?: DataChunkSplitModeEnum;
-
-  chunkSize?: number;
-  indexSize?: number;
-
-  chunkSplitter?: string;
-  qaPrompt?: string;
   metadata?: Record<string, any>;
+
+  customPdfParse?: boolean;
 };
 
 // create collection params
-export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
+export type CreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
   datasetId: string;
   name: string;
   type: DatasetCollectionTypeEnum;
@@ -72,7 +64,7 @@ export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType &
   nextSyncTime?: Date;
 };
 
-export type ApiCreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
+export type ApiCreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
   datasetId: string;
   tags?: string[];
 };
@@ -90,7 +82,7 @@ export type ApiDatasetCreateDatasetCollectionParams = ApiCreateDatasetCollection
 export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
   fileId: string;
 };
-export type reTrainingDatasetFileCollectionParams = DatasetCollectionChunkMetadataType & {
+export type reTrainingDatasetFileCollectionParams = DatasetCollectionStoreDataType & {
   datasetId: string;
   collectionId: string;
 };
diff --git a/packages/global/core/dataset/constants.ts b/packages/global/core/dataset/constants.ts
index 1afed622e..25acca563 100644
--- a/packages/global/core/dataset/constants.ts
+++ b/packages/global/core/dataset/constants.ts
@@ -143,15 +143,25 @@ export const DatasetCollectionDataProcessModeMap = {
   }
 };
 
+export enum ChunkTriggerConfigTypeEnum {
+  minSize = 'minSize',
+  forceChunk = 'forceChunk',
+  maxSize = 'maxSize'
+}
 export enum ChunkSettingModeEnum {
   auto = 'auto',
   custom = 'custom'
 }
 
 export enum DataChunkSplitModeEnum {
+  paragraph = 'paragraph',
   size = 'size',
   char = 'char'
 }
+export enum ParagraphChunkAIModeEnum {
+  auto = 'auto',
+  force = 'force'
+}
 
 /* ------------ data -------------- */
diff --git a/packages/global/core/dataset/data/constants.ts b/packages/global/core/dataset/data/constants.ts
index 7cb326fa2..2cc17562a 100644
--- a/packages/global/core/dataset/data/constants.ts
+++ b/packages/global/core/dataset/data/constants.ts
@@ -32,7 +32,7 @@ export const DatasetDataIndexMap: Record<
     color: 'red'
   },
   [DatasetDataIndexTypeEnum.image]: {
-    label: i18nT('common:data_index_image'),
+    label: i18nT('dataset:data_index_image'),
     color: 'purple'
   }
 };
diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts
index 9463c239a..a4d78057c 100644
--- a/packages/global/core/dataset/type.d.ts
+++ b/packages/global/core/dataset/type.d.ts
@@ -8,26 +8,42 @@ import type {
   DatasetStatusEnum,
   DatasetTypeEnum,
   SearchScoreTypeEnum,
-  TrainingModeEnum
+  TrainingModeEnum,
+  ChunkSettingModeEnum
 } from './constants';
 import type { DatasetPermission } from '../../support/permission/dataset/controller';
-import { Permission } from '../../support/permission/controller';
 import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
 import type { SourceMemberType } from 'support/user/type';
 import type { DatasetDataIndexTypeEnum } from './data/constants';
-import type { ChunkSettingModeEnum } from './constants';
 
 export type ChunkSettingsType = {
-  trainingType: DatasetCollectionDataProcessModeEnum;
-  autoIndexes?: boolean;
+  trainingType?: DatasetCollectionDataProcessModeEnum;
+
+  // Chunk trigger
+  chunkTriggerType?: ChunkTriggerConfigTypeEnum;
+  chunkTriggerMinSize?: number; // maxSize from agent model, not store
+
+  // Data enhance
+  dataEnhanceCollectionName?: boolean; // Auto add collection name to data
+
+  // Index enhance
   imageIndex?: boolean;
+  autoIndexes?: boolean;
 
-  chunkSettingMode?: ChunkSettingModeEnum;
+  // Chunk setting
+  chunkSettingMode?: ChunkSettingModeEnum; // 系统参数/自定义参数
   chunkSplitMode?: DataChunkSplitModeEnum;
-
+  // Paragraph split
+  paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
+  paragraphChunkDeep?: number; // Paragraph deep
+  paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
+  paragraphChunkMaxSize?: number; // Paragraph max size, if too large, it will split
+  // Size split
   chunkSize?: number;
-  indexSize?: number;
+  // Char split
   chunkSplitter?: string;
+  indexSize?: number;
+
   qaPrompt?: string;
 };
 
@@ -66,7 +82,7 @@ export type DatasetSchemaType = {
   defaultPermission?: number;
 };
 
-export type DatasetCollectionSchemaType = {
+export type DatasetCollectionSchemaType = ChunkSettingsType & {
   _id: string;
   teamId: string;
   tmbId: string;
@@ -101,18 +117,7 @@ export type DatasetCollectionSchemaType = {
   // Parse settings
   customPdfParse?: boolean;
 
-  // Chunk settings
-  autoIndexes?: boolean;
-  imageIndex?: boolean;
   trainingType: DatasetCollectionDataProcessModeEnum;
-
-  chunkSettingMode?: ChunkSettingModeEnum;
-  chunkSplitMode?: DataChunkSplitModeEnum;
-
-  chunkSize?: number;
-  indexSize?: number;
-  chunkSplitter?: string;
-  qaPrompt?: string;
 };
 
 export type DatasetCollectionTagsSchemaType = {
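The reshaped ChunkSettingsType above now carries every chunking knob (trigger, paragraph split, size split, char split). A minimal sketch of a custom paragraph-split configuration under the new fields — the values are illustrative, not defaults:

    import {
      ChunkSettingModeEnum,
      ChunkTriggerConfigTypeEnum,
      DataChunkSplitModeEnum,
      DatasetCollectionDataProcessModeEnum,
      ParagraphChunkAIModeEnum
    } from '@fastgpt/global/core/dataset/constants';
    import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type';

    const paragraphSettings: ChunkSettingsType = {
      trainingType: DatasetCollectionDataProcessModeEnum.chunk,
      chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize, // only chunk once text exceeds the min size
      chunkTriggerMinSize: 1000,
      chunkSettingMode: ChunkSettingModeEnum.custom,
      chunkSplitMode: DataChunkSplitModeEnum.paragraph, // the new paragraph split mode
      paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
      paragraphChunkDeep: 4, // split down to heading depth 4
      paragraphChunkMinSize: 100, // smaller paragraphs get merged
      paragraphChunkMaxSize: 1500 // larger paragraphs get re-split
    };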
"claude-opus-4-20250514", + "name": "claude-opus-4-20250514", + "maxContext": 200000, + "maxResponse": 4096, + "quoteMaxToken": 100000, + "maxTemperature": 1, + "showTopP": true, + "showStopSign": true, + "vision": true, + "toolChoice": true, + "functionCall": false, + "defaultSystemChatPrompt": "", + "datasetProcess": true, + "usedInClassify": true, + "customCQPrompt": "", + "usedInExtractFields": true, + "usedInQueryExtension": true, + "customExtractPrompt": "", + "usedInToolCall": true, + "defaultConfig": {}, + "fieldMap": {}, + "type": "llm" + }, { "model": "claude-3-7-sonnet-20250219", "name": "claude-3-7-sonnet-20250219", diff --git a/packages/service/core/ai/config/provider/Gemini.json b/packages/service/core/ai/config/provider/Gemini.json index 1df074c74..0c2431b55 100644 --- a/packages/service/core/ai/config/provider/Gemini.json +++ b/packages/service/core/ai/config/provider/Gemini.json @@ -25,6 +25,30 @@ "showTopP": true, "showStopSign": true }, + { + "model": "gemini-2.5-flash-preview-04-17", + "name": "gemini-2.5-flash-preview-04-17", + "maxContext": 1000000, + "maxResponse": 8000, + "quoteMaxToken": 60000, + "maxTemperature": 1, + "vision": true, + "toolChoice": true, + "functionCall": false, + "defaultSystemChatPrompt": "", + "datasetProcess": true, + "usedInClassify": true, + "customCQPrompt": "", + "usedInExtractFields": true, + "usedInQueryExtension": true, + "customExtractPrompt": "", + "usedInToolCall": true, + "defaultConfig": {}, + "fieldMap": {}, + "type": "llm", + "showTopP": true, + "showStopSign": true + }, { "model": "gemini-2.0-flash", "name": "gemini-2.0-flash", diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index 46ad1b8d1..3407ea85b 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -74,6 +74,15 @@ export const createCollectionAndInsertData = async ({ llmModel: getLLMModel(dataset.agentModel) }); const chunkSplitter = computeChunkSplitter(createCollectionParams); + if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { + delete createCollectionParams.chunkTriggerType; + delete createCollectionParams.chunkTriggerMinSize; + delete createCollectionParams.dataEnhanceCollectionName; + delete createCollectionParams.imageIndex; + delete createCollectionParams.autoIndexes; + delete createCollectionParams.indexSize; + delete createCollectionParams.qaPrompt; + } // 1. 
diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts
index 46ad1b8d1..3407ea85b 100644
--- a/packages/service/core/dataset/collection/controller.ts
+++ b/packages/service/core/dataset/collection/controller.ts
@@ -74,6 +74,15 @@ export const createCollectionAndInsertData = async ({
     llmModel: getLLMModel(dataset.agentModel)
   });
   const chunkSplitter = computeChunkSplitter(createCollectionParams);
+  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
+    delete createCollectionParams.chunkTriggerType;
+    delete createCollectionParams.chunkTriggerMinSize;
+    delete createCollectionParams.dataEnhanceCollectionName;
+    delete createCollectionParams.imageIndex;
+    delete createCollectionParams.autoIndexes;
+    delete createCollectionParams.indexSize;
+    delete createCollectionParams.qaPrompt;
+  }
 
   // 1. split chunks
   const chunks = rawText2Chunks({
diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts
index 073e6e8cd..9285c310a 100644
--- a/packages/service/core/dataset/read.ts
+++ b/packages/service/core/dataset/read.ts
@@ -163,7 +163,7 @@ export const readApiServerFileContent = async ({
   title?: string;
   rawText: string;
 }> => {
-  const data = (
+  return (
     await getApiDatasetRequest({
       apiServer,
       yuqueServer,
@@ -175,10 +175,6 @@ export const readApiServerFileContent = async ({
     apiFileId,
     customPdfParse
   });
-  if (data) {
-    return data;
-  }
-  return Promise.reject(Error);
 };
 
 export const rawText2Chunks = ({
diff --git a/packages/service/core/dataset/schema.ts b/packages/service/core/dataset/schema.ts
index e75b5c0ca..dada6ce5d 100644
--- a/packages/service/core/dataset/schema.ts
+++ b/packages/service/core/dataset/schema.ts
@@ -1,10 +1,12 @@
 import { getMongoModel, Schema } from '../../common/mongo';
 import {
   ChunkSettingModeEnum,
+  ChunkTriggerConfigTypeEnum,
   DataChunkSplitModeEnum,
   DatasetCollectionDataProcessModeEnum,
   DatasetTypeEnum,
-  DatasetTypeMap
+  DatasetTypeMap,
+  ParagraphChunkAIModeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import {
   TeamCollectionName,
@@ -15,12 +17,22 @@ import type { DatasetSchemaType } from '@fastgpt/global/core/dataset/type.d';
 export const DatasetCollectionName = 'datasets';
 
 export const ChunkSettings = {
-  imageIndex: Boolean,
-  autoIndexes: Boolean,
   trainingType: {
     type: String,
     enum: Object.values(DatasetCollectionDataProcessModeEnum)
   },
+
+  chunkTriggerType: {
+    type: String,
+    enum: Object.values(ChunkTriggerConfigTypeEnum)
+  },
+  chunkTriggerMinSize: Number,
+
+  dataEnhanceCollectionName: Boolean,
+
+  imageIndex: Boolean,
+  autoIndexes: Boolean,
+
   chunkSettingMode: {
     type: String,
     enum: Object.values(ChunkSettingModeEnum)
@@ -29,6 +41,13 @@ export const ChunkSettings = {
     type: String,
     enum: Object.values(DataChunkSplitModeEnum)
   },
+  paragraphChunkAIMode: {
+    type: String,
+    enum: Object.values(ParagraphChunkAIModeEnum)
+  },
+  paragraphChunkDeep: Number,
+  paragraphChunkMinSize: Number,
+  paragraphChunkMaxSize: Number,
 
   chunkSize: Number,
   chunkSplitter: String,
@@ -115,9 +134,7 @@ const DatasetSchema = new Schema({
 
   // abandoned
   autoSync: Boolean,
-  externalReadUrl: {
-    type: String
-  },
+  externalReadUrl: String,
   defaultPermission: Number
 });
diff --git a/packages/web/i18n/en/common.json b/packages/web/i18n/en/common.json
index d40141666..64a724b42 100644
--- a/packages/web/i18n/en/common.json
+++ b/packages/web/i18n/en/common.json
@@ -749,7 +749,6 @@
   "custom_title": "Custom Title",
   "data_index_custom": "Custom index",
   "data_index_default": "Default index",
-  "data_index_image": "Image Index",
   "data_index_question": "Inferred question index",
   "data_index_summary": "Summary Index",
   "data_not_found": "Data can't be found",
diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json
index 1421aa505..3ac5f9091 100644
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -22,7 +22,6 @@
   "collection.training_type": "Chunk type",
   "collection_data_count": "Data amount",
   "collection_metadata_custom_pdf_parse": "PDF enhancement analysis",
-  "collection_metadata_image_parse": "Image tagging",
   "collection_not_support_retraining": "This collection type does not support retuning parameters",
   "collection_not_support_sync": "This collection does not support synchronization",
   "collection_sync": "Sync data",
@@ -38,6 +37,7 @@
   "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
   "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
   "data_error_amount": "{{errorAmount}} Group training exception",
+  "data_index_image": "Image index",
   "data_index_num": "Index {{index}}",
   "data_process_params": "Params",
   "data_process_setting": "Processing config",
diff --git a/packages/web/i18n/zh-CN/common.json b/packages/web/i18n/zh-CN/common.json
index ce6ade586..a1561eb77 100644
--- a/packages/web/i18n/zh-CN/common.json
+++ b/packages/web/i18n/zh-CN/common.json
@@ -749,7 +749,6 @@
   "custom_title": "自定义标题",
   "data_index_custom": "自定义索引",
   "data_index_default": "默认索引",
-  "data_index_image": "图片索引",
   "data_index_question": "推测问题索引",
   "data_index_summary": "摘要索引",
   "data_not_found": "数据找不到了",
diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json
index 153e0cd91..6c1e1ce93 100644
--- a/packages/web/i18n/zh-CN/dataset.json
+++ b/packages/web/i18n/zh-CN/dataset.json
@@ -22,7 +22,6 @@
   "collection.training_type": "处理模式",
   "collection_data_count": "数据量",
   "collection_metadata_custom_pdf_parse": "PDF增强解析",
-  "collection_metadata_image_parse": "图片标注",
   "collection_not_support_retraining": "该集合类型不支持重新调整参数",
   "collection_not_support_sync": "该集合不支持同步",
   "collection_sync": "立即同步",
@@ -38,6 +37,7 @@
   "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。",
   "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
   "data_error_amount": "{{errorAmount}} 组训练异常",
+  "data_index_image": "图片索引",
   "data_index_num": "索引 {{index}}",
   "data_process_params": "处理参数",
   "data_process_setting": "数据处理配置",
diff --git a/packages/web/i18n/zh-Hant/common.json b/packages/web/i18n/zh-Hant/common.json
index dbde30e8b..c7372ee31 100644
--- a/packages/web/i18n/zh-Hant/common.json
+++ b/packages/web/i18n/zh-Hant/common.json
@@ -749,7 +749,6 @@
   "custom_title": "自訂標題",
   "data_index_custom": "自定義索引",
   "data_index_default": "預設索引",
-  "data_index_image": "圖片索引",
   "data_index_question": "推測問題索引",
   "data_index_summary": "摘要索引",
   "data_not_found": "數據找不到了",
diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json
index ab13a5ec5..d5659446a 100644
--- a/packages/web/i18n/zh-Hant/dataset.json
+++ b/packages/web/i18n/zh-Hant/dataset.json
@@ -21,7 +21,6 @@
   "collection.training_type": "處理模式",
   "collection_data_count": "資料量",
   "collection_metadata_custom_pdf_parse": "PDF 增強解析",
-  "collection_metadata_image_parse": "圖片標註",
   "collection_not_support_retraining": "此集合類型不支援重新調整參數",
   "collection_not_support_sync": "該集合不支援同步",
   "collection_sync": "立即同步",
@@ -37,6 +36,7 @@
   "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的資料,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.”表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如:* () [] {} 等。",
   "data_amount": "{{dataAmount}} 組資料,{{indexAmount}} 組索引",
   "data_error_amount": "{{errorAmount}} 組訓練異常",
+  "data_index_image": "圖片索引",
   "data_index_num": "索引 {{index}}",
   "data_process_params": "處理參數",
   "data_process_setting": "資料處理設定",
diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx
index 96df296cf..54f4c9b2c 100644
--- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx
@@ -21,9 +21,13 @@ import CollectionChunkForm, {
   collectionChunkForm2StoreChunkData,
   type CollectionChunkFormType
 } from '../Form/CollectionChunkForm';
-import { getLLMDefaultChunkSize } from '@fastgpt/global/core/dataset/training/utils';
+import {
+  getAutoIndexSize,
+  getLLMDefaultChunkSize
+} from '@fastgpt/global/core/dataset/training/utils';
 import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
 import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm';
+import { defaultFormData } from '../Import/Context';
 
 export type WebsiteConfigFormType = {
   websiteConfig: {
@@ -76,17 +80,35 @@ const WebsiteConfigModal = ({
 
   const form = useForm({
     defaultValues: {
-      trainingType: chunkSettings?.trainingType || DatasetCollectionDataProcessModeEnum.chunk,
-      imageIndex: chunkSettings?.imageIndex || false,
-      autoIndexes: chunkSettings?.autoIndexes || false,
+      trainingType: chunkSettings?.trainingType,
 
-      chunkSettingMode: chunkSettings?.chunkSettingMode || ChunkSettingModeEnum.auto,
-      chunkSplitMode: chunkSettings?.chunkSplitMode || DataChunkSplitModeEnum.size,
-      embeddingChunkSize: chunkSettings?.chunkSize || 2000,
-      qaChunkSize: chunkSettings?.chunkSize || getLLMDefaultChunkSize(datasetDetail.agentModel),
-      indexSize: chunkSettings?.indexSize || datasetDetail.vectorModel?.defaultToken || 512,
+      chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType,
+      chunkTriggerMinSize:
+        chunkSettings?.chunkTriggerMinSize || defaultFormData.chunkTriggerMinSize,
+
+      dataEnhanceCollectionName:
+        chunkSettings?.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,
+
+      imageIndex: chunkSettings?.imageIndex || defaultFormData.imageIndex,
+      autoIndexes: chunkSettings?.autoIndexes || defaultFormData.autoIndexes,
+
+      chunkSettingMode: chunkSettings?.chunkSettingMode || defaultFormData.chunkSettingMode,
+      chunkSplitMode: chunkSettings?.chunkSplitMode || defaultFormData.chunkSplitMode,
+
+      paragraphChunkAIMode:
+        chunkSettings?.paragraphChunkAIMode || defaultFormData.paragraphChunkAIMode,
+      paragraphChunkDeep: chunkSettings?.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
+      paragraphChunkMinSize:
+        chunkSettings?.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
+      paragraphChunkMaxSize:
+        chunkSettings?.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,
+
+      chunkSize: chunkSettings?.chunkSize || defaultFormData.chunkSize,
+
+      chunkSplitter: chunkSettings?.chunkSplitter || defaultFormData.chunkSplitter,
+
+      indexSize: chunkSettings?.indexSize || defaultFormData.indexSize,
 
-      chunkSplitter: chunkSettings?.chunkSplitter || '',
       qaPrompt: chunkSettings?.qaPrompt || Prompt_AgentQA.description
     }
   });
diff --git a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx
index f372199cc..826495859 100644
--- a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx
@@ -17,6 +17,10 @@ import {
 } from '@chakra-ui/react';
 import MyIcon from '@fastgpt/web/components/common/Icon';
 import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
+import type {
+  ChunkTriggerConfigTypeEnum,
+  ParagraphChunkAIModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import {
   DataChunkSplitModeEnum,
   DatasetCollectionDataProcessModeEnum,
@@ -42,7 +46,6 @@ import {
   minChunkSize
 } from '@fastgpt/global/core/dataset/training/utils';
 import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';
-import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
 import type { LLMModelItemType, EmbeddingModelItemType } from '@fastgpt/global/core/ai/model.d';
 
 const PromptTextarea = ({
@@ -86,19 +89,35 @@ const PromptTextarea = ({
 
 export type CollectionChunkFormType = {
   trainingType: DatasetCollectionDataProcessModeEnum;
+
+  // Chunk trigger
+  chunkTriggerType: ChunkTriggerConfigTypeEnum;
+  chunkTriggerMinSize: number; // maxSize from agent model, not store
+
+  // Data enhance
+  dataEnhanceCollectionName: boolean; // Auto add collection name to data
+
+  // Index enhance
   imageIndex: boolean;
   autoIndexes: boolean;
 
-  chunkSettingMode: ChunkSettingModeEnum;
-
+  // Chunk setting
+  chunkSettingMode: ChunkSettingModeEnum; // 系统参数/自定义参数
   chunkSplitMode: DataChunkSplitModeEnum;
-  embeddingChunkSize: number;
-  qaChunkSize: number;
-  chunkSplitter?: string;
+  // Paragraph split
+  paragraphChunkAIMode: ParagraphChunkAIModeEnum;
+  paragraphChunkDeep: number; // Paragraph deep
+  paragraphChunkMinSize: number; // Paragraph min size, if too small, it will merge
+  paragraphChunkMaxSize: number; // Paragraph max size, if too large, it will split
+  // Size split
+  chunkSize: number;
+  // Char split
+  chunkSplitter: string;
   indexSize: number;
 
   qaPrompt?: string;
 };
+
 const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkFormType> }) => {
   const { t } = useTranslation();
   const { feConfigs } = useSystemStore();
@@ -131,29 +150,26 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn
     if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
       return {
-        chunkSizeField: 'qaChunkSize',
         maxChunkSize: getLLMMaxChunkSize(agentModel),
         minChunkSize: 1000,
         maxIndexSize: 1000
       };
     } else if (autoIndexes) {
       return {
-        chunkSizeField: 'embeddingChunkSize',
         maxChunkSize: getMaxChunkSize(agentModel),
         minChunkSize: minChunkSize,
         maxIndexSize: getMaxIndexSize(vectorModel)
       };
     } else {
       return {
-        chunkSizeField: 'embeddingChunkSize',
         maxChunkSize: getMaxChunkSize(agentModel),
         minChunkSize: minChunkSize,
         maxIndexSize: getMaxIndexSize(vectorModel)
       };
     }
@@ -216,6 +232,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn
                   setValue('trainingType', e);
+                  if (e === DatasetCollectionDataProcessModeEnum.qa) {
+                    setValue('chunkSize', getLLMDefaultChunkSize(agentModel));
+                  } else {
+                    setValue('chunkSize', chunkAutoChunkSize);
+                  }
                 }}
                 defaultBg="white"
                 activeBg="white"
@@ -317,7 +338,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn
@@ -461,17 +474,28 @@ export const collectionChunkForm2StoreChunkData = ({
+}): CollectionChunkFormType => {
+  const {
+    trainingType,
+    autoIndexes,
+    chunkSettingMode,
+    chunkSize,
+    chunkSplitter,
+    indexSize,
+    qaPrompt
+  } = data;
+
+  // 根据处理方式,获取 auto 和 custom 的参数。
   const trainingModeSize: {
     autoChunkSize: number;
     autoIndexSize: number;
@@ -483,53 +506,53 @@ export const collectionChunkForm2StoreChunkData = ({
     if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
       return {
         autoChunkSize: getLLMDefaultChunkSize(agentModel),
-        autoIndexSize: 512,
-        chunkSize: qaChunkSize,
-        indexSize: 512
+        autoIndexSize: getMaxIndexSize(vectorModel),
+        chunkSize,
+        indexSize: getMaxIndexSize(vectorModel)
       };
     } else if (autoIndexes) {
       return {
         autoChunkSize: chunkAutoChunkSize,
         autoIndexSize: getAutoIndexSize(vectorModel),
-        chunkSize: embeddingChunkSize,
+        chunkSize,
         indexSize
       };
     } else {
       return {
         autoChunkSize: chunkAutoChunkSize,
         autoIndexSize: getAutoIndexSize(vectorModel),
-        chunkSize: embeddingChunkSize,
+        chunkSize,
         indexSize
       };
     }
   })();
 
-  const { chunkSize: formatChunkIndex, indexSize: formatIndexSize } = (() => {
+  // 获取真实参数
+  const {
+    chunkSize: formatChunkIndex,
+    indexSize: formatIndexSize,
+    chunkSplitter: formatChunkSplitter
+  } = (() => {
     if (chunkSettingMode === ChunkSettingModeEnum.auto) {
       return {
         chunkSize: trainingModeSize.autoChunkSize,
-        indexSize: trainingModeSize.autoIndexSize
+        indexSize: trainingModeSize.autoIndexSize,
+        chunkSplitter: ''
       };
     } else {
       return {
         chunkSize: trainingModeSize.chunkSize,
-        indexSize: trainingModeSize.indexSize
+        indexSize: trainingModeSize.indexSize,
+        chunkSplitter
       };
     }
   })();
 
   return {
-    trainingType,
-    imageIndex,
-    autoIndexes,
-
-    chunkSettingMode,
-    chunkSplitMode,
-
+    ...data,
     chunkSize: formatChunkIndex,
     indexSize: formatIndexSize,
-
-    chunkSplitter,
+    chunkSplitter: formatChunkSplitter,
     qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
   };
 };
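collectionChunkForm2StoreChunkData now takes the raw form values together with the dataset's models and returns a normalized CollectionChunkFormType. A minimal usage sketch, mirroring the PreviewData/Upload call sites later in this patch (datasetDetail and processParamsForm are assumed to be in scope):

    import { collectionChunkForm2StoreChunkData } from '../Form/CollectionChunkForm';

    // Auto mode resolves chunkSize/indexSize from the models and clears chunkSplitter;
    // custom mode passes the user's values through. qaPrompt is kept only for QA training.
    const chunkData = collectionChunkForm2StoreChunkData({
      ...processParamsForm.getValues(),
      agentModel: datasetDetail.agentModel,
      vectorModel: datasetDetail.vectorModel
    });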
diff --git a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx
index 46cb6f837..8b62e5db5 100644
--- a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx
@@ -3,8 +3,10 @@ import { type SetStateAction, useMemo, useState } from 'react';
 import { useTranslation } from 'next-i18next';
 import { createContext, useContextSelector } from 'use-context-selector';
 import {
+  ChunkTriggerConfigTypeEnum,
   DatasetCollectionDataProcessModeEnum,
-  ImportDataSourceEnum
+  ImportDataSourceEnum,
+  ParagraphChunkAIModeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import { useMyStep } from '@fastgpt/web/hooks/useStep';
 import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
@@ -16,38 +18,14 @@ import { type ImportSourceItemType } from '@/web/core/dataset/type';
 import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
 import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
 import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
-import {
-  getMaxChunkSize,
-  getLLMDefaultChunkSize,
-  getLLMMaxChunkSize,
-  chunkAutoChunkSize,
-  minChunkSize,
-  getAutoIndexSize,
-  getMaxIndexSize
-} from '@fastgpt/global/core/dataset/training/utils';
+import { chunkAutoChunkSize, getAutoIndexSize } from '@fastgpt/global/core/dataset/training/utils';
 import { type CollectionChunkFormType } from '../Form/CollectionChunkForm';
 
-type ChunkSizeFieldType = 'embeddingChunkSize' | 'qaChunkSize';
 export type ImportFormType = {
   customPdfParse: boolean;
-
   webSelector: string;
 } & CollectionChunkFormType;
 
-type TrainingFiledType = {
-  chunkOverlapRatio: number;
-  maxChunkSize: number;
-  minChunkSize: number;
-  autoChunkSize: number;
-  chunkSize: number;
-  maxIndexSize?: number;
-  indexSize?: number;
-  autoIndexSize?: number;
-  charsPointsPrice: number;
-  priceTip: string;
-  uploadRate: number;
-  chunkSizeField: ChunkSizeFieldType;
-};
 type DatasetImportContextType = {
   importSource: ImportDataSourceEnum;
   parentId: string | undefined;
@@ -57,7 +35,35 @@ type DatasetImportContextType = {
   processParamsForm: UseFormReturn<ImportFormType>;
   sources: ImportSourceItemType[];
   setSources: React.Dispatch<SetStateAction<ImportSourceItemType[]>>;
-} & TrainingFiledType;
+};
+
+export const defaultFormData: ImportFormType = {
+  customPdfParse: false,
+
+  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
+
+  chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
+  chunkTriggerMinSize: chunkAutoChunkSize,
+
+  dataEnhanceCollectionName: false,
+
+  imageIndex: false,
+  autoIndexes: false,
+
+  chunkSettingMode: ChunkSettingModeEnum.auto,
+  chunkSplitMode: DataChunkSplitModeEnum.size,
+  paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
+  paragraphChunkDeep: 4,
+  paragraphChunkMinSize: 100,
+  paragraphChunkMaxSize: chunkAutoChunkSize,
+
+  chunkSize: chunkAutoChunkSize,
+  chunkSplitter: '',
+  indexSize: getAutoIndexSize(),
+
+  qaPrompt: Prompt_AgentQA.description,
+  webSelector: ''
+};
 
 export const DatasetImportContext = createContext<DatasetImportContextType>({
   importSource: ImportDataSourceEnum.fileLocal,
@@ -75,12 +81,9 @@ export const DatasetImportContext = createContext<DatasetImportContextType>({
   },
   chunkSize: 0,
   chunkOverlapRatio: 0,
-  uploadRate: 0,
   //@ts-ignore
   processParamsForm: undefined,
-  autoChunkSize: 0,
-  charsPointsPrice: 0,
-  priceTip: ''
+  autoChunkSize: 0
 });
 
 const DatasetImportContextProvider = ({ children }: { children: React.ReactNode }) => {
@@ -180,119 +183,17 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
   });
 
   const vectorModel = datasetDetail.vectorModel;
-  const agentModel = datasetDetail.agentModel;
 
   const processParamsForm = useForm<ImportFormType>({
     defaultValues: {
-      imageIndex: false,
-      autoIndexes: false,
-
-      trainingType: DatasetCollectionDataProcessModeEnum.chunk,
-
-      chunkSettingMode: ChunkSettingModeEnum.auto,
-
-      chunkSplitMode: DataChunkSplitModeEnum.size,
-      embeddingChunkSize: chunkAutoChunkSize,
-      indexSize: vectorModel?.defaultToken || 512,
-      qaChunkSize: getLLMDefaultChunkSize(agentModel),
-      chunkSplitter: '',
-      qaPrompt: Prompt_AgentQA.description,
-      webSelector: '',
-      customPdfParse: false
+      ...defaultFormData,
+      indexSize: getAutoIndexSize(vectorModel)
     }
   });
 
   const [sources, setSources] = useState<ImportSourceItemType[]>([]);
 
-  // watch form
-  const trainingType = processParamsForm.watch('trainingType');
-  const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
-  const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
-  const qaChunkSize = processParamsForm.watch('qaChunkSize');
-  const chunkSplitter = processParamsForm.watch('chunkSplitter');
-  const autoIndexes = processParamsForm.watch('autoIndexes');
-  const indexSize = processParamsForm.watch('indexSize');
-
-  const TrainingModeMap = useMemo(() => {
-    if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
-      return {
-        chunkSizeField: 'qaChunkSize',
-        chunkOverlapRatio: 0,
-        maxChunkSize: getLLMMaxChunkSize(agentModel),
-        minChunkSize: 1000,
-        autoChunkSize: getLLMDefaultChunkSize(agentModel),
-        chunkSize: qaChunkSize,
-        charsPointsPrice: agentModel.charsPointsPrice || 0,
-        priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
-          price: agentModel.charsPointsPrice
-        }),
-        uploadRate: 30
-      };
-    } else if (autoIndexes) {
-      return {
-        chunkSizeField: 'embeddingChunkSize',
-        chunkOverlapRatio: 0.2,
-        maxChunkSize: getMaxChunkSize(agentModel),
-        minChunkSize: minChunkSize,
-        autoChunkSize: chunkAutoChunkSize,
-        chunkSize: embeddingChunkSize,
-        maxIndexSize: getMaxIndexSize(vectorModel),
-        autoIndexSize: getAutoIndexSize(vectorModel),
-        indexSize,
-        charsPointsPrice: agentModel.charsPointsPrice || 0,
-        priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
-          price: agentModel.charsPointsPrice
-        }),
-        uploadRate: 100
-      };
-    } else {
-      return {
-        chunkSizeField: 'embeddingChunkSize',
-        chunkOverlapRatio: 0.2,
-        maxChunkSize: getMaxChunkSize(agentModel),
-        minChunkSize: minChunkSize,
-        autoChunkSize: chunkAutoChunkSize,
-        chunkSize: embeddingChunkSize,
-        maxIndexSize: getMaxIndexSize(vectorModel),
-        autoIndexSize: getAutoIndexSize(vectorModel),
-        indexSize,
-        charsPointsPrice: vectorModel.charsPointsPrice || 0,
-        priceTip: t('dataset:import.Embedding Estimated Price Tips', {
-          price: vectorModel.charsPointsPrice
-        }),
-        uploadRate: 150
-      };
-    }
-  }, [
-    trainingType,
-    autoIndexes,
-    agentModel,
-    qaChunkSize,
-    t,
-    embeddingChunkSize,
-    vectorModel,
-    indexSize
-  ]);
-
-  const chunkSettingModeMap = useMemo(() => {
-    if (chunkSettingMode === ChunkSettingModeEnum.auto) {
-      return {
-        chunkSize: TrainingModeMap.autoChunkSize,
-        indexSize: TrainingModeMap.autoIndexSize,
-        chunkSplitter: ''
-      };
-    } else {
-      return {
-        chunkSize: TrainingModeMap.chunkSize,
-        indexSize: TrainingModeMap.indexSize,
-        chunkSplitter
-      };
-    }
-  }, [chunkSettingMode, TrainingModeMap, chunkSplitter]);
-
   const contextValue = {
-    ...TrainingModeMap,
-    ...chunkSettingModeMap,
     importSource: source,
     parentId,
     activeStep,
diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx
index 76e7f7d97..56dd189f8 100644
--- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx
@@ -17,6 +17,7 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
 import Markdown from '@/components/Markdown';
 import { useToast } from '@fastgpt/web/hooks/useToast';
 import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
+import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
 
 const PreviewData = () => {
   const { t } = useTranslation();
@@ -28,8 +29,6 @@ const PreviewData = () => {
   const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
   const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
 
-  const chunkSize = useContextSelector(DatasetImportContext, (v) => v.chunkSize);
-  const chunkOverlapRatio = useContextSelector(DatasetImportContext, (v) => v.chunkOverlapRatio);
   const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
 
   const [previewFile, setPreviewFile] = useState<ImportSourceItemType>();
@@ -37,13 +36,20 @@ const PreviewData = () => {
   const { data = { chunks: [], total: 0 }, loading: isLoading } = useRequest2(
     async () => {
       if (!previewFile) return { chunks: [], total: 0 };
+
+      const chunkData = collectionChunkForm2StoreChunkData({
+        ...processParamsForm.getValues(),
+        vectorModel: datasetDetail.vectorModel,
+        agentModel: datasetDetail.agentModel
+      });
+
       if (importSource === ImportDataSourceEnum.fileCustom) {
         const chunkSplitter = processParamsForm.getValues('chunkSplitter');
         const { chunks } = splitText2Chunks({
           text: previewFile.rawText || '',
-          chunkSize,
+          chunkSize: chunkData.chunkSize,
           maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
-          overlapRatio: chunkOverlapRatio,
+          overlapRatio: 0.2,
           customReg: chunkSplitter ? [chunkSplitter] : []
         });
         return {
@@ -64,18 +70,12 @@ const PreviewData = () => {
           previewFile.externalFileUrl ||
           previewFile.apiFileId ||
           '',
+        externalFileId: previewFile.externalFileId,
-        customPdfParse: processParamsForm.getValues('customPdfParse'),
-
-        trainingType: processParamsForm.getValues('trainingType'),
-        chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
-        chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
-        chunkSize,
-        chunkSplitter: processParamsForm.getValues('chunkSplitter'),
-        overlapRatio: chunkOverlapRatio,
-
+        ...chunkData,
         selector: processParamsForm.getValues('webSelector'),
-        externalFileId: previewFile.externalFileId
+        customPdfParse: processParamsForm.getValues('customPdfParse'),
+        overlapRatio: 0.2
       });
     },
     {
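For custom-text imports, the preview above now splits locally with a fixed 0.2 overlap ratio instead of a context-supplied one. A short sketch of how a user delimiter such as "。|." (the | joins several splitters, per the i18n tip) feeds splitText2Chunks, assuming the same chunkData as in the preview:

    const chunkSplitter = processParamsForm.getValues('chunkSplitter'); // e.g. '。|.'
    const { chunks } = splitText2Chunks({
      text: previewFile.rawText || '',
      chunkSize: chunkData.chunkSize,
      maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
      overlapRatio: 0.2,
      customReg: chunkSplitter ? [chunkSplitter] : [] // empty splitter falls back to size-based splitting
    });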
diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx
index ffe4947a1..bffc8e16e 100644
--- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx
@@ -37,6 +37,7 @@ import { useContextSelector } from 'use-context-selector';
 import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
 import { DatasetImportContext, type ImportFormType } from '../Context';
 import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
+import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
 
 const Upload = () => {
   const { t } = useTranslation();
@@ -48,10 +49,10 @@ const Upload = () => {
   const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
   const retrainNewCollectionId = useRef('');
 
-  const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } =
-    useContextSelector(DatasetImportContext, (v) => v);
-
-  const { handleSubmit } = processParamsForm;
+  const { importSource, parentId, sources, setSources, processParamsForm } = useContextSelector(
+    DatasetImportContext,
+    (v) => v
+  );
 
   const { totalFilesCount, waitingFilesCount, allFinished, hasCreatingFiles } = useMemo(() => {
     const totalFilesCount = sources.length;
@@ -80,7 +81,13 @@ const Upload = () => {
   }, [waitingFilesCount, totalFilesCount, allFinished, t]);
 
   const { runAsync: startUpload, loading: isLoading } = useRequest2(
-    async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => {
+    async ({ customPdfParse, webSelector, ...data }: ImportFormType) => {
+      const chunkData = collectionChunkForm2StoreChunkData({
+        ...data,
+        vectorModel: datasetDetail.vectorModel,
+        agentModel: datasetDetail.agentModel
+      });
+
       if (sources.length === 0) return;
       const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
 
@@ -101,23 +108,12 @@ const Upload = () => {
       const commonParams: ApiCreateDatasetCollectionParams & {
         name: string;
       } = {
+        ...chunkData,
         parentId,
         datasetId: datasetDetail._id,
         name: item.sourceName,
-        customPdfParse: processParamsForm.getValues('customPdfParse'),
-
-        trainingType,
-        imageIndex: processParamsForm.getValues('imageIndex'),
-        autoIndexes: processParamsForm.getValues('autoIndexes'),
-
-        chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
-        chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
-
-        chunkSize,
-        indexSize,
-        chunkSplitter,
-        qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
+        customPdfParse
       };
 
       if (importSource === ImportDataSourceEnum.reTraining) {
@@ -280,7 +276,10 @@ const Upload = () => {

-
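Both the import context and the website-sync modal now seed their forms from the shared defaultFormData, overriding only the per-dataset index size. A minimal sketch of that initialization pattern (a hypothetical call site; the imports mirror the hunks above):

    import { useForm } from 'react-hook-form';
    import { defaultFormData, type ImportFormType } from '../Import/Context';
    import { getAutoIndexSize } from '@fastgpt/global/core/dataset/training/utils';

    const form = useForm<ImportFormType>({
      defaultValues: {
        ...defaultFormData,
        indexSize: getAutoIndexSize(datasetDetail.vectorModel) // per-dataset embedding index size
      }
    });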