perf: dataset import params code (#4875)

* perf: dataset import params code

* perf: api dataset code

* model
This commit is contained in:
Archer
2025-05-23 10:40:25 +08:00
committed by GitHub
parent 9af92d1eae
commit fae76e887a
23 changed files with 366 additions and 295 deletions

View File

@@ -11,6 +11,8 @@ weight: 790
## 🚀 新增内容 ## 🚀 新增内容
1. 支持 PG 设置`systemEnv.hnswMaxScanTuples`参数,提高迭代搜索的数据总量。 1. 支持 PG 设置`systemEnv.hnswMaxScanTuples`参数,提高迭代搜索的数据总量。
2. 开放飞书和语雀知识库到开源版。
3. 新增 Gemini 和 Claude 最新模型预设。
## ⚙️ 优化 ## ⚙️ 优化

View File

@@ -1,9 +1,11 @@
import type { DatasetDataIndexItemType, DatasetSchemaType } from './type'; import type { ChunkSettingsType, DatasetDataIndexItemType, DatasetSchemaType } from './type';
import type { import type {
DatasetCollectionTypeEnum, DatasetCollectionTypeEnum,
DatasetCollectionDataProcessModeEnum, DatasetCollectionDataProcessModeEnum,
ChunkSettingModeEnum, ChunkSettingModeEnum,
DataChunkSplitModeEnum DataChunkSplitModeEnum,
ChunkTriggerConfigTypeEnum,
ParagraphChunkAIModeEnum
} from './constants'; } from './constants';
import type { LLMModelItemType } from '../ai/model.d'; import type { LLMModelItemType } from '../ai/model.d';
import type { ParentIdType } from 'common/parentFolder/type'; import type { ParentIdType } from 'common/parentFolder/type';
@@ -32,26 +34,16 @@ export type DatasetUpdateBody = {
}; };
/* ================= collection ===================== */ /* ================= collection ===================== */
export type DatasetCollectionChunkMetadataType = { // Input + store params
type DatasetCollectionStoreDataType = ChunkSettingsType & {
parentId?: string; parentId?: string;
customPdfParse?: boolean;
trainingType?: DatasetCollectionDataProcessModeEnum;
imageIndex?: boolean;
autoIndexes?: boolean;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSize?: number;
indexSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
metadata?: Record<string, any>; metadata?: Record<string, any>;
customPdfParse?: boolean;
}; };
// create collection params // create collection params
export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & { export type CreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
datasetId: string; datasetId: string;
name: string; name: string;
type: DatasetCollectionTypeEnum; type: DatasetCollectionTypeEnum;
@@ -72,7 +64,7 @@ export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType &
nextSyncTime?: Date; nextSyncTime?: Date;
}; };
export type ApiCreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & { export type ApiCreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
datasetId: string; datasetId: string;
tags?: string[]; tags?: string[];
}; };
@@ -90,7 +82,7 @@ export type ApiDatasetCreateDatasetCollectionParams = ApiCreateDatasetCollection
export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & { export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
fileId: string; fileId: string;
}; };
export type reTrainingDatasetFileCollectionParams = DatasetCollectionChunkMetadataType & { export type reTrainingDatasetFileCollectionParams = DatasetCollectionStoreDataType & {
datasetId: string; datasetId: string;
collectionId: string; collectionId: string;
}; };

View File

@@ -143,15 +143,25 @@ export const DatasetCollectionDataProcessModeMap = {
} }
}; };
/**
 * Allowed values for the `chunkTriggerType` chunk setting of a dataset collection.
 *
 * This is a string enum: each member's runtime value is the same text as its name,
 * so the persisted value is `'minSize'`, `'forceChunk'` or `'maxSize'`.
 *
 * NOTE(review): the member names suggest "chunk only when the text exceeds a
 * configured minimum size", "always chunk" and "chunk only when the text exceeds
 * the model maximum" — the code that interprets these values is not visible here,
 * so confirm the exact semantics before relying on this description.
 */
export enum ChunkTriggerConfigTypeEnum {
minSize = 'minSize',
forceChunk = 'forceChunk',
maxSize = 'maxSize'
}
export enum ChunkSettingModeEnum { export enum ChunkSettingModeEnum {
auto = 'auto', auto = 'auto',
custom = 'custom' custom = 'custom'
} }
export enum DataChunkSplitModeEnum { export enum DataChunkSplitModeEnum {
paragraph = 'paragraph',
size = 'size', size = 'size',
char = 'char' char = 'char'
} }
/**
 * Allowed values for the `paragraphChunkAIMode` chunk setting, which belongs to the
 * paragraph split options.
 *
 * This is a string enum: the runtime values are `'auto'` and `'force'`.
 *
 * NOTE(review): presumably `auto` lets the system decide whether AI-assisted
 * paragraph detection is used and `force` always applies it — the consumer of this
 * value is not visible here, confirm against the splitter implementation.
 */
export enum ParagraphChunkAIModeEnum {
auto = 'auto',
force = 'force'
}
/* ------------ data -------------- */ /* ------------ data -------------- */

View File

@@ -32,7 +32,7 @@ export const DatasetDataIndexMap: Record<
color: 'red' color: 'red'
}, },
[DatasetDataIndexTypeEnum.image]: { [DatasetDataIndexTypeEnum.image]: {
label: i18nT('common:data_index_image'), label: i18nT('dataset:data_index_image'),
color: 'purple' color: 'purple'
} }
}; };

View File

@@ -8,26 +8,42 @@ import type {
DatasetStatusEnum, DatasetStatusEnum,
DatasetTypeEnum, DatasetTypeEnum,
SearchScoreTypeEnum, SearchScoreTypeEnum,
TrainingModeEnum TrainingModeEnum,
ChunkSettingModeEnum
} from './constants'; } from './constants';
import type { DatasetPermission } from '../../support/permission/dataset/controller'; import type { DatasetPermission } from '../../support/permission/dataset/controller';
import { Permission } from '../../support/permission/controller';
import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset'; import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
import type { SourceMemberType } from 'support/user/type'; import type { SourceMemberType } from 'support/user/type';
import type { DatasetDataIndexTypeEnum } from './data/constants'; import type { DatasetDataIndexTypeEnum } from './data/constants';
import type { ChunkSettingModeEnum } from './constants';
export type ChunkSettingsType = { export type ChunkSettingsType = {
trainingType: DatasetCollectionDataProcessModeEnum; trainingType?: DatasetCollectionDataProcessModeEnum;
autoIndexes?: boolean;
// Chunk trigger
chunkTriggerType?: ChunkTriggerConfigTypeEnum;
chunkTriggerMinSize?: number; // maxSize from agent model, not store
// Data enhance
dataEnhanceCollectionName?: boolean; // Auto add collection name to data
// Index enhance
imageIndex?: boolean; imageIndex?: boolean;
autoIndexes?: boolean;
chunkSettingMode?: ChunkSettingModeEnum; // Chunk setting
chunkSettingMode?: ChunkSettingModeEnum; // 系统参数/自定义参数
chunkSplitMode?: DataChunkSplitModeEnum; chunkSplitMode?: DataChunkSplitModeEnum;
// Paragraph split
paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
paragraphChunkDeep?: number; // Paragraph deep
paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
paragraphChunkMaxSize?: number; // Paragraph max size, if too large, it will split
// Size split
chunkSize?: number; chunkSize?: number;
indexSize?: number; // Char split
chunkSplitter?: string; chunkSplitter?: string;
indexSize?: number;
qaPrompt?: string; qaPrompt?: string;
}; };
@@ -66,7 +82,7 @@ export type DatasetSchemaType = {
defaultPermission?: number; defaultPermission?: number;
}; };
export type DatasetCollectionSchemaType = { export type DatasetCollectionSchemaType = ChunkSettingsType & {
_id: string; _id: string;
teamId: string; teamId: string;
tmbId: string; tmbId: string;
@@ -101,18 +117,7 @@ export type DatasetCollectionSchemaType = {
// Parse settings // Parse settings
customPdfParse?: boolean; customPdfParse?: boolean;
// Chunk settings
autoIndexes?: boolean;
imageIndex?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum; trainingType: DatasetCollectionDataProcessModeEnum;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSize?: number;
indexSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
}; };
export type DatasetCollectionTagsSchemaType = { export type DatasetCollectionTagsSchemaType = {

View File

@@ -1,6 +1,54 @@
{ {
"provider": "Claude", "provider": "Claude",
"list": [ "list": [
{
"model": "claude-sonnet-4-20250514",
"name": "claude-sonnet-4-20250514",
"maxContext": 200000,
"maxResponse": 8000,
"quoteMaxToken": 100000,
"maxTemperature": 1,
"showTopP": true,
"showStopSign": true,
"vision": true,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm"
},
{
"model": "claude-opus-4-20250514",
"name": "claude-opus-4-20250514",
"maxContext": 200000,
"maxResponse": 4096,
"quoteMaxToken": 100000,
"maxTemperature": 1,
"showTopP": true,
"showStopSign": true,
"vision": true,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm"
},
{ {
"model": "claude-3-7-sonnet-20250219", "model": "claude-3-7-sonnet-20250219",
"name": "claude-3-7-sonnet-20250219", "name": "claude-3-7-sonnet-20250219",

View File

@@ -25,6 +25,30 @@
"showTopP": true, "showTopP": true,
"showStopSign": true "showStopSign": true
}, },
{
"model": "gemini-2.5-flash-preview-04-17",
"name": "gemini-2.5-flash-preview-04-17",
"maxContext": 1000000,
"maxResponse": 8000,
"quoteMaxToken": 60000,
"maxTemperature": 1,
"vision": true,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm",
"showTopP": true,
"showStopSign": true
},
{ {
"model": "gemini-2.0-flash", "model": "gemini-2.0-flash",
"name": "gemini-2.0-flash", "name": "gemini-2.0-flash",

View File

@@ -74,6 +74,15 @@ export const createCollectionAndInsertData = async ({
llmModel: getLLMModel(dataset.agentModel) llmModel: getLLMModel(dataset.agentModel)
}); });
const chunkSplitter = computeChunkSplitter(createCollectionParams); const chunkSplitter = computeChunkSplitter(createCollectionParams);
// QA processing mode: strip the chunk-trigger, data-enhance and index-enhance
// settings from the params object before the text is split, so these fields are
// absent from `createCollectionParams` for everything that runs after this point.
// The object is mutated in place.
// NOTE(review): `qaPrompt` is removed here even though this branch only runs in QA
// mode, while the form helper `collectionChunkForm2StoreChunkData` keeps `qaPrompt`
// only for QA mode. Confirm the QA prompt reaches training through another path;
// otherwise this delete drops the user's custom prompt.
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
delete createCollectionParams.chunkTriggerType;
delete createCollectionParams.chunkTriggerMinSize;
delete createCollectionParams.dataEnhanceCollectionName;
delete createCollectionParams.imageIndex;
delete createCollectionParams.autoIndexes;
delete createCollectionParams.indexSize;
delete createCollectionParams.qaPrompt;
}
// 1. split chunks // 1. split chunks
const chunks = rawText2Chunks({ const chunks = rawText2Chunks({

View File

@@ -163,7 +163,7 @@ export const readApiServerFileContent = async ({
title?: string; title?: string;
rawText: string; rawText: string;
}> => { }> => {
const data = ( return (
await getApiDatasetRequest({ await getApiDatasetRequest({
apiServer, apiServer,
yuqueServer, yuqueServer,
@@ -175,10 +175,6 @@ export const readApiServerFileContent = async ({
apiFileId, apiFileId,
customPdfParse customPdfParse
}); });
if (data) {
return data;
}
return Promise.reject(Error);
}; };
export const rawText2Chunks = ({ export const rawText2Chunks = ({

View File

@@ -1,10 +1,12 @@
import { getMongoModel, Schema } from '../../common/mongo'; import { getMongoModel, Schema } from '../../common/mongo';
import { import {
ChunkSettingModeEnum, ChunkSettingModeEnum,
ChunkTriggerConfigTypeEnum,
DataChunkSplitModeEnum, DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum, DatasetCollectionDataProcessModeEnum,
DatasetTypeEnum, DatasetTypeEnum,
DatasetTypeMap DatasetTypeMap,
ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants'; } from '@fastgpt/global/core/dataset/constants';
import { import {
TeamCollectionName, TeamCollectionName,
@@ -15,12 +17,22 @@ import type { DatasetSchemaType } from '@fastgpt/global/core/dataset/type.d';
export const DatasetCollectionName = 'datasets'; export const DatasetCollectionName = 'datasets';
export const ChunkSettings = { export const ChunkSettings = {
imageIndex: Boolean,
autoIndexes: Boolean,
trainingType: { trainingType: {
type: String, type: String,
enum: Object.values(DatasetCollectionDataProcessModeEnum) enum: Object.values(DatasetCollectionDataProcessModeEnum)
}, },
chunkTriggerType: {
type: String,
enum: Object.values(ChunkTriggerConfigTypeEnum)
},
chunkTriggerMinSize: Number,
dataEnhanceCollectionName: Boolean,
imageIndex: Boolean,
autoIndexes: Boolean,
chunkSettingMode: { chunkSettingMode: {
type: String, type: String,
enum: Object.values(ChunkSettingModeEnum) enum: Object.values(ChunkSettingModeEnum)
@@ -29,6 +41,13 @@ export const ChunkSettings = {
type: String, type: String,
enum: Object.values(DataChunkSplitModeEnum) enum: Object.values(DataChunkSplitModeEnum)
}, },
paragraphChunkAIMode: {
type: String,
enum: Object.values(ParagraphChunkAIModeEnum)
},
paragraphChunkDeep: Number,
paragraphChunkMinSize: Number,
paragraphChunkMaxSize: Number,
chunkSize: Number, chunkSize: Number,
chunkSplitter: String, chunkSplitter: String,
@@ -115,9 +134,7 @@ const DatasetSchema = new Schema({
// abandoned // abandoned
autoSync: Boolean, autoSync: Boolean,
externalReadUrl: { externalReadUrl: String,
type: String
},
defaultPermission: Number defaultPermission: Number
}); });

View File

@@ -749,7 +749,6 @@
"custom_title": "Custom Title", "custom_title": "Custom Title",
"data_index_custom": "Custom index", "data_index_custom": "Custom index",
"data_index_default": "Default index", "data_index_default": "Default index",
"data_index_image": "Image Index",
"data_index_question": "Inferred question index", "data_index_question": "Inferred question index",
"data_index_summary": "Summary Index", "data_index_summary": "Summary Index",
"data_not_found": "Data can't be found", "data_not_found": "Data can't be found",

View File

@@ -22,7 +22,6 @@
"collection.training_type": "Chunk type", "collection.training_type": "Chunk type",
"collection_data_count": "Data amount", "collection_data_count": "Data amount",
"collection_metadata_custom_pdf_parse": "PDF enhancement analysis", "collection_metadata_custom_pdf_parse": "PDF enhancement analysis",
"collection_metadata_image_parse": "Image tagging",
"collection_not_support_retraining": "This collection type does not support retuning parameters", "collection_not_support_retraining": "This collection type does not support retuning parameters",
"collection_not_support_sync": "This collection does not support synchronization", "collection_not_support_sync": "This collection does not support synchronization",
"collection_sync": "Sync data", "collection_sync": "Sync data",
@@ -38,6 +37,7 @@
"custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.", "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
"data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes", "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
"data_error_amount": "{{errorAmount}} Group training exception", "data_error_amount": "{{errorAmount}} Group training exception",
"data_index_image": "Image index",
"data_index_num": "Index {{index}}", "data_index_num": "Index {{index}}",
"data_process_params": "Params", "data_process_params": "Params",
"data_process_setting": "Processing config", "data_process_setting": "Processing config",

View File

@@ -749,7 +749,6 @@
"custom_title": "自定义标题", "custom_title": "自定义标题",
"data_index_custom": "自定义索引", "data_index_custom": "自定义索引",
"data_index_default": "默认索引", "data_index_default": "默认索引",
"data_index_image": "图片索引",
"data_index_question": "推测问题索引", "data_index_question": "推测问题索引",
"data_index_summary": "摘要索引", "data_index_summary": "摘要索引",
"data_not_found": "数据找不到了", "data_not_found": "数据找不到了",

View File

@@ -22,7 +22,6 @@
"collection.training_type": "处理模式", "collection.training_type": "处理模式",
"collection_data_count": "数据量", "collection_data_count": "数据量",
"collection_metadata_custom_pdf_parse": "PDF增强解析", "collection_metadata_custom_pdf_parse": "PDF增强解析",
"collection_metadata_image_parse": "图片标注",
"collection_not_support_retraining": "该集合类型不支持重新调整参数", "collection_not_support_retraining": "该集合类型不支持重新调整参数",
"collection_not_support_sync": "该集合不支持同步", "collection_not_support_sync": "该集合不支持同步",
"collection_sync": "立即同步", "collection_sync": "立即同步",
@@ -38,6 +37,7 @@
"custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号例如: * () [] {} 等。", "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号例如: * () [] {} 等。",
"data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引", "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
"data_error_amount": "{{errorAmount}} 组训练异常", "data_error_amount": "{{errorAmount}} 组训练异常",
"data_index_image": "图片索引",
"data_index_num": "索引 {{index}}", "data_index_num": "索引 {{index}}",
"data_process_params": "处理参数", "data_process_params": "处理参数",
"data_process_setting": "数据处理配置", "data_process_setting": "数据处理配置",

View File

@@ -749,7 +749,6 @@
"custom_title": "自訂標題", "custom_title": "自訂標題",
"data_index_custom": "自定義索引", "data_index_custom": "自定義索引",
"data_index_default": "預設索引", "data_index_default": "預設索引",
"data_index_image": "圖片索引",
"data_index_question": "推測問題索引", "data_index_question": "推測問題索引",
"data_index_summary": "摘要索引", "data_index_summary": "摘要索引",
"data_not_found": "數據找不到了", "data_not_found": "數據找不到了",

View File

@@ -21,7 +21,6 @@
"collection.training_type": "處理模式", "collection.training_type": "處理模式",
"collection_data_count": "資料量", "collection_data_count": "資料量",
"collection_metadata_custom_pdf_parse": "PDF 增強解析", "collection_metadata_custom_pdf_parse": "PDF 增強解析",
"collection_metadata_image_parse": "圖片標註",
"collection_not_support_retraining": "此集合類型不支援重新調整參數", "collection_not_support_retraining": "此集合類型不支援重新調整參數",
"collection_not_support_sync": "該集合不支援同步", "collection_not_support_sync": "該集合不支援同步",
"collection_sync": "立即同步", "collection_sync": "立即同步",
@@ -37,6 +36,7 @@
"custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的資料使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.”表示中英文句號。\n\n盡量避免使用正則相關特殊符號例如* () [] {} 等。", "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的資料使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.”表示中英文句號。\n\n盡量避免使用正則相關特殊符號例如* () [] {} 等。",
"data_amount": "{{dataAmount}} 組資料,{{indexAmount}} 組索引", "data_amount": "{{dataAmount}} 組資料,{{indexAmount}} 組索引",
"data_error_amount": "{{errorAmount}} 組訓練異常", "data_error_amount": "{{errorAmount}} 組訓練異常",
"data_index_image": "圖片索引",
"data_index_num": "索引 {{index}}", "data_index_num": "索引 {{index}}",
"data_process_params": "處理參數", "data_process_params": "處理參數",
"data_process_setting": "資料處理設定", "data_process_setting": "資料處理設定",

View File

@@ -21,9 +21,13 @@ import CollectionChunkForm, {
collectionChunkForm2StoreChunkData, collectionChunkForm2StoreChunkData,
type CollectionChunkFormType type CollectionChunkFormType
} from '../Form/CollectionChunkForm'; } from '../Form/CollectionChunkForm';
import { getLLMDefaultChunkSize } from '@fastgpt/global/core/dataset/training/utils'; import {
getAutoIndexSize,
getLLMDefaultChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type'; import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm'; import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm';
import { defaultFormData } from '../Import/Context';
export type WebsiteConfigFormType = { export type WebsiteConfigFormType = {
websiteConfig: { websiteConfig: {
@@ -76,17 +80,35 @@ const WebsiteConfigModal = ({
const form = useForm<CollectionChunkFormType>({ const form = useForm<CollectionChunkFormType>({
defaultValues: { defaultValues: {
trainingType: chunkSettings?.trainingType || DatasetCollectionDataProcessModeEnum.chunk, trainingType: chunkSettings?.trainingType,
imageIndex: chunkSettings?.imageIndex || false,
autoIndexes: chunkSettings?.autoIndexes || false,
chunkSettingMode: chunkSettings?.chunkSettingMode || ChunkSettingModeEnum.auto, chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType,
chunkSplitMode: chunkSettings?.chunkSplitMode || DataChunkSplitModeEnum.size, chunkTriggerMinSize:
embeddingChunkSize: chunkSettings?.chunkSize || 2000, chunkSettings?.chunkTriggerMinSize || defaultFormData.chunkTriggerMinSize,
qaChunkSize: chunkSettings?.chunkSize || getLLMDefaultChunkSize(datasetDetail.agentModel),
indexSize: chunkSettings?.indexSize || datasetDetail.vectorModel?.defaultToken || 512, dataEnhanceCollectionName:
chunkSettings?.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,
imageIndex: chunkSettings?.imageIndex || defaultFormData.imageIndex,
autoIndexes: chunkSettings?.autoIndexes || defaultFormData.autoIndexes,
chunkSettingMode: chunkSettings?.chunkSettingMode || defaultFormData.chunkSettingMode,
chunkSplitMode: chunkSettings?.chunkSplitMode || defaultFormData.chunkSplitMode,
paragraphChunkAIMode:
chunkSettings?.paragraphChunkAIMode || defaultFormData.paragraphChunkAIMode,
paragraphChunkDeep: chunkSettings?.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
paragraphChunkMinSize:
chunkSettings?.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
paragraphChunkMaxSize:
chunkSettings?.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,
chunkSize: chunkSettings?.chunkSize || defaultFormData.chunkSize,
chunkSplitter: chunkSettings?.chunkSplitter || defaultFormData.chunkSplitter,
indexSize: chunkSettings?.indexSize || defaultFormData.indexSize,
chunkSplitter: chunkSettings?.chunkSplitter || '',
qaPrompt: chunkSettings?.qaPrompt || Prompt_AgentQA.description qaPrompt: chunkSettings?.qaPrompt || Prompt_AgentQA.description
} }
}); });

View File

@@ -17,6 +17,10 @@ import {
} from '@chakra-ui/react'; } from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon'; import MyIcon from '@fastgpt/web/components/common/Icon';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio'; import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import type {
ChunkTriggerConfigTypeEnum,
ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { import {
DataChunkSplitModeEnum, DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum, DatasetCollectionDataProcessModeEnum,
@@ -42,7 +46,6 @@ import {
minChunkSize minChunkSize
} from '@fastgpt/global/core/dataset/training/utils'; } from '@fastgpt/global/core/dataset/training/utils';
import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup'; import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
import type { LLMModelItemType, EmbeddingModelItemType } from '@fastgpt/global/core/ai/model.d'; import type { LLMModelItemType, EmbeddingModelItemType } from '@fastgpt/global/core/ai/model.d';
const PromptTextarea = ({ const PromptTextarea = ({
@@ -86,19 +89,35 @@ const PromptTextarea = ({
export type CollectionChunkFormType = { export type CollectionChunkFormType = {
trainingType: DatasetCollectionDataProcessModeEnum; trainingType: DatasetCollectionDataProcessModeEnum;
// Chunk trigger
chunkTriggerType: ChunkTriggerConfigTypeEnum;
chunkTriggerMinSize: number; // maxSize from agent model, not store
// Data enhance
dataEnhanceCollectionName: boolean; // Auto add collection name to data
// Index enhance
imageIndex: boolean; imageIndex: boolean;
autoIndexes: boolean; autoIndexes: boolean;
chunkSettingMode: ChunkSettingModeEnum; // Chunk setting
chunkSettingMode: ChunkSettingModeEnum; // 系统参数/自定义参数
chunkSplitMode: DataChunkSplitModeEnum; chunkSplitMode: DataChunkSplitModeEnum;
embeddingChunkSize: number; // Paragraph split
qaChunkSize: number; paragraphChunkAIMode: ParagraphChunkAIModeEnum;
chunkSplitter?: string; paragraphChunkDeep: number; // Paragraph deep
paragraphChunkMinSize: number; // Paragraph min size, if too small, it will merge
paragraphChunkMaxSize: number; // Paragraph max size, if too large, it will split
// Size split
chunkSize: number;
// Char split
chunkSplitter: string;
indexSize: number; indexSize: number;
qaPrompt?: string; qaPrompt?: string;
}; };
const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkFormType> }) => { const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkFormType> }) => {
const { t } = useTranslation(); const { t } = useTranslation();
const { feConfigs } = useSystemStore(); const { feConfigs } = useSystemStore();
@@ -131,29 +150,26 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
tooltip: t(value.tooltip as any) tooltip: t(value.tooltip as any)
})); }));
}, [t]); }, [t]);
const { const {
chunkSizeField,
maxChunkSize, maxChunkSize,
minChunkSize: minChunkSizeValue, minChunkSize: minChunkSizeValue,
maxIndexSize maxIndexSize
} = useMemo(() => { } = useMemo(() => {
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return { return {
chunkSizeField: 'qaChunkSize',
maxChunkSize: getLLMMaxChunkSize(agentModel), maxChunkSize: getLLMMaxChunkSize(agentModel),
minChunkSize: 1000, minChunkSize: 1000,
maxIndexSize: 1000 maxIndexSize: 1000
}; };
} else if (autoIndexes) { } else if (autoIndexes) {
return { return {
chunkSizeField: 'embeddingChunkSize',
maxChunkSize: getMaxChunkSize(agentModel), maxChunkSize: getMaxChunkSize(agentModel),
minChunkSize: minChunkSize, minChunkSize: minChunkSize,
maxIndexSize: getMaxIndexSize(vectorModel) maxIndexSize: getMaxIndexSize(vectorModel)
}; };
} else { } else {
return { return {
chunkSizeField: 'embeddingChunkSize',
maxChunkSize: getMaxChunkSize(agentModel), maxChunkSize: getMaxChunkSize(agentModel),
minChunkSize: minChunkSize, minChunkSize: minChunkSize,
maxIndexSize: getMaxIndexSize(vectorModel) maxIndexSize: getMaxIndexSize(vectorModel)
@@ -216,6 +232,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
value={trainingType} value={trainingType}
onChange={(e) => { onChange={(e) => {
setValue('trainingType', e); setValue('trainingType', e);
if (e === DatasetCollectionDataProcessModeEnum.qa) {
setValue('chunkSize', getLLMDefaultChunkSize(agentModel));
} else {
setValue('chunkSize', chunkAutoChunkSize);
}
}} }}
defaultBg="white" defaultBg="white"
activeBg="white" activeBg="white"
@@ -317,7 +338,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
> >
<MyNumberInput <MyNumberInput
register={register} register={register}
name={chunkSizeField} name={'chunkSize'}
min={minChunkSizeValue} min={minChunkSizeValue}
max={maxChunkSize} max={maxChunkSize}
size={'sm'} size={'sm'}
@@ -456,24 +477,26 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
export default CollectionChunkForm; export default CollectionChunkForm;
// Get chunk settings from form
export const collectionChunkForm2StoreChunkData = ({ export const collectionChunkForm2StoreChunkData = ({
trainingType,
imageIndex,
autoIndexes,
chunkSettingMode,
chunkSplitMode,
embeddingChunkSize,
qaChunkSize,
chunkSplitter,
indexSize,
qaPrompt,
agentModel, agentModel,
vectorModel vectorModel,
...data
}: CollectionChunkFormType & { }: CollectionChunkFormType & {
agentModel: LLMModelItemType; agentModel: LLMModelItemType;
vectorModel: EmbeddingModelItemType; vectorModel: EmbeddingModelItemType;
}): ChunkSettingsType => { }): CollectionChunkFormType => {
const {
trainingType,
autoIndexes,
chunkSettingMode,
chunkSize,
chunkSplitter,
indexSize,
qaPrompt
} = data;
// 根据处理方式,获取 auto 和 custom 的参数。
const trainingModeSize: { const trainingModeSize: {
autoChunkSize: number; autoChunkSize: number;
autoIndexSize: number; autoIndexSize: number;
@@ -483,53 +506,53 @@ export const collectionChunkForm2StoreChunkData = ({
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return { return {
autoChunkSize: getLLMDefaultChunkSize(agentModel), autoChunkSize: getLLMDefaultChunkSize(agentModel),
autoIndexSize: 512, autoIndexSize: getMaxIndexSize(vectorModel),
chunkSize: qaChunkSize, chunkSize,
indexSize: 512 indexSize: getMaxIndexSize(vectorModel)
}; };
} else if (autoIndexes) { } else if (autoIndexes) {
return { return {
autoChunkSize: chunkAutoChunkSize, autoChunkSize: chunkAutoChunkSize,
autoIndexSize: getAutoIndexSize(vectorModel), autoIndexSize: getAutoIndexSize(vectorModel),
chunkSize: embeddingChunkSize, chunkSize,
indexSize indexSize
}; };
} else { } else {
return { return {
autoChunkSize: chunkAutoChunkSize, autoChunkSize: chunkAutoChunkSize,
autoIndexSize: getAutoIndexSize(vectorModel), autoIndexSize: getAutoIndexSize(vectorModel),
chunkSize: embeddingChunkSize, chunkSize,
indexSize indexSize
}; };
} }
})(); })();
const { chunkSize: formatChunkIndex, indexSize: formatIndexSize } = (() => { // 获取真实参数
const {
chunkSize: formatChunkIndex,
indexSize: formatIndexSize,
chunkSplitter: formatChunkSplitter
} = (() => {
if (chunkSettingMode === ChunkSettingModeEnum.auto) { if (chunkSettingMode === ChunkSettingModeEnum.auto) {
return { return {
chunkSize: trainingModeSize.autoChunkSize, chunkSize: trainingModeSize.autoChunkSize,
indexSize: trainingModeSize.autoIndexSize indexSize: trainingModeSize.autoIndexSize,
chunkSplitter: ''
}; };
} else { } else {
return { return {
chunkSize: trainingModeSize.chunkSize, chunkSize: trainingModeSize.chunkSize,
indexSize: trainingModeSize.indexSize indexSize: trainingModeSize.indexSize,
chunkSplitter
}; };
} }
})(); })();
return { return {
trainingType, ...data,
imageIndex,
autoIndexes,
chunkSettingMode,
chunkSplitMode,
chunkSize: formatChunkIndex, chunkSize: formatChunkIndex,
indexSize: formatIndexSize, indexSize: formatIndexSize,
chunkSplitter: formatChunkSplitter,
chunkSplitter,
qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
}; };
}; };

View File

@@ -3,8 +3,10 @@ import { type SetStateAction, useMemo, useState } from 'react';
import { useTranslation } from 'next-i18next'; import { useTranslation } from 'next-i18next';
import { createContext, useContextSelector } from 'use-context-selector'; import { createContext, useContextSelector } from 'use-context-selector';
import { import {
ChunkTriggerConfigTypeEnum,
DatasetCollectionDataProcessModeEnum, DatasetCollectionDataProcessModeEnum,
ImportDataSourceEnum ImportDataSourceEnum,
ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants'; } from '@fastgpt/global/core/dataset/constants';
import { useMyStep } from '@fastgpt/web/hooks/useStep'; import { useMyStep } from '@fastgpt/web/hooks/useStep';
import { Box, Button, Flex, IconButton } from '@chakra-ui/react'; import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
@@ -16,38 +18,14 @@ import { type ImportSourceItemType } from '@/web/core/dataset/type';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants'; import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
import { import { chunkAutoChunkSize, getAutoIndexSize } from '@fastgpt/global/core/dataset/training/utils';
getMaxChunkSize,
getLLMDefaultChunkSize,
getLLMMaxChunkSize,
chunkAutoChunkSize,
minChunkSize,
getAutoIndexSize,
getMaxIndexSize
} from '@fastgpt/global/core/dataset/training/utils';
import { type CollectionChunkFormType } from '../Form/CollectionChunkForm'; import { type CollectionChunkFormType } from '../Form/CollectionChunkForm';
type ChunkSizeFieldType = 'embeddingChunkSize' | 'qaChunkSize';
export type ImportFormType = { export type ImportFormType = {
customPdfParse: boolean; customPdfParse: boolean;
webSelector: string; webSelector: string;
} & CollectionChunkFormType; } & CollectionChunkFormType;
type TrainingFiledType = {
chunkOverlapRatio: number;
maxChunkSize: number;
minChunkSize: number;
autoChunkSize: number;
chunkSize: number;
maxIndexSize?: number;
indexSize?: number;
autoIndexSize?: number;
charsPointsPrice: number;
priceTip: string;
uploadRate: number;
chunkSizeField: ChunkSizeFieldType;
};
type DatasetImportContextType = { type DatasetImportContextType = {
importSource: ImportDataSourceEnum; importSource: ImportDataSourceEnum;
parentId: string | undefined; parentId: string | undefined;
@@ -57,7 +35,35 @@ type DatasetImportContextType = {
processParamsForm: UseFormReturn<ImportFormType, any>; processParamsForm: UseFormReturn<ImportFormType, any>;
sources: ImportSourceItemType[]; sources: ImportSourceItemType[];
setSources: React.Dispatch<React.SetStateAction<ImportSourceItemType[]>>; setSources: React.Dispatch<React.SetStateAction<ImportSourceItemType[]>>;
} & TrainingFiledType; };
export const defaultFormData: ImportFormType = {
customPdfParse: false,
trainingType: DatasetCollectionDataProcessModeEnum.chunk,
chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
chunkTriggerMinSize: chunkAutoChunkSize,
dataEnhanceCollectionName: false,
imageIndex: false,
autoIndexes: false,
chunkSettingMode: ChunkSettingModeEnum.auto,
chunkSplitMode: DataChunkSplitModeEnum.size,
paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
paragraphChunkDeep: 4,
paragraphChunkMinSize: 100,
paragraphChunkMaxSize: chunkAutoChunkSize,
chunkSize: chunkAutoChunkSize,
chunkSplitter: '',
indexSize: getAutoIndexSize(),
qaPrompt: Prompt_AgentQA.description,
webSelector: ''
};
export const DatasetImportContext = createContext<DatasetImportContextType>({ export const DatasetImportContext = createContext<DatasetImportContextType>({
importSource: ImportDataSourceEnum.fileLocal, importSource: ImportDataSourceEnum.fileLocal,
@@ -75,12 +81,9 @@ export const DatasetImportContext = createContext<DatasetImportContextType>({
}, },
chunkSize: 0, chunkSize: 0,
chunkOverlapRatio: 0, chunkOverlapRatio: 0,
uploadRate: 0,
//@ts-ignore //@ts-ignore
processParamsForm: undefined, processParamsForm: undefined,
autoChunkSize: 0, autoChunkSize: 0
charsPointsPrice: 0,
priceTip: ''
}); });
const DatasetImportContextProvider = ({ children }: { children: React.ReactNode }) => { const DatasetImportContextProvider = ({ children }: { children: React.ReactNode }) => {
@@ -180,119 +183,17 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
}); });
const vectorModel = datasetDetail.vectorModel; const vectorModel = datasetDetail.vectorModel;
const agentModel = datasetDetail.agentModel;
const processParamsForm = useForm<ImportFormType>({ const processParamsForm = useForm<ImportFormType>({
defaultValues: { defaultValues: {
imageIndex: false, ...defaultFormData,
autoIndexes: false, indexSize: getAutoIndexSize(vectorModel)
trainingType: DatasetCollectionDataProcessModeEnum.chunk,
chunkSettingMode: ChunkSettingModeEnum.auto,
chunkSplitMode: DataChunkSplitModeEnum.size,
embeddingChunkSize: chunkAutoChunkSize,
indexSize: vectorModel?.defaultToken || 512,
qaChunkSize: getLLMDefaultChunkSize(agentModel),
chunkSplitter: '',
qaPrompt: Prompt_AgentQA.description,
webSelector: '',
customPdfParse: false
} }
}); });
const [sources, setSources] = useState<ImportSourceItemType[]>([]); const [sources, setSources] = useState<ImportSourceItemType[]>([]);
// watch form
const trainingType = processParamsForm.watch('trainingType');
const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
const qaChunkSize = processParamsForm.watch('qaChunkSize');
const chunkSplitter = processParamsForm.watch('chunkSplitter');
const autoIndexes = processParamsForm.watch('autoIndexes');
const indexSize = processParamsForm.watch('indexSize');
const TrainingModeMap = useMemo<TrainingFiledType>(() => {
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return {
chunkSizeField: 'qaChunkSize',
chunkOverlapRatio: 0,
maxChunkSize: getLLMMaxChunkSize(agentModel),
minChunkSize: 1000,
autoChunkSize: getLLMDefaultChunkSize(agentModel),
chunkSize: qaChunkSize,
charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
price: agentModel.charsPointsPrice
}),
uploadRate: 30
};
} else if (autoIndexes) {
return {
chunkSizeField: 'embeddingChunkSize',
chunkOverlapRatio: 0.2,
maxChunkSize: getMaxChunkSize(agentModel),
minChunkSize: minChunkSize,
autoChunkSize: chunkAutoChunkSize,
chunkSize: embeddingChunkSize,
maxIndexSize: getMaxIndexSize(vectorModel),
autoIndexSize: getAutoIndexSize(vectorModel),
indexSize,
charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
price: agentModel.charsPointsPrice
}),
uploadRate: 100
};
} else {
return {
chunkSizeField: 'embeddingChunkSize',
chunkOverlapRatio: 0.2,
maxChunkSize: getMaxChunkSize(agentModel),
minChunkSize: minChunkSize,
autoChunkSize: chunkAutoChunkSize,
chunkSize: embeddingChunkSize,
maxIndexSize: getMaxIndexSize(vectorModel),
autoIndexSize: getAutoIndexSize(vectorModel),
indexSize,
charsPointsPrice: vectorModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Embedding Estimated Price Tips', {
price: vectorModel.charsPointsPrice
}),
uploadRate: 150
};
}
}, [
trainingType,
autoIndexes,
agentModel,
qaChunkSize,
t,
embeddingChunkSize,
vectorModel,
indexSize
]);
const chunkSettingModeMap = useMemo(() => {
if (chunkSettingMode === ChunkSettingModeEnum.auto) {
return {
chunkSize: TrainingModeMap.autoChunkSize,
indexSize: TrainingModeMap.autoIndexSize,
chunkSplitter: ''
};
} else {
return {
chunkSize: TrainingModeMap.chunkSize,
indexSize: TrainingModeMap.indexSize,
chunkSplitter
};
}
}, [chunkSettingMode, TrainingModeMap, chunkSplitter]);
const contextValue = { const contextValue = {
...TrainingModeMap,
...chunkSettingModeMap,
importSource: source, importSource: source,
parentId, parentId,
activeStep, activeStep,

View File

@@ -17,6 +17,7 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
import Markdown from '@/components/Markdown'; import Markdown from '@/components/Markdown';
import { useToast } from '@fastgpt/web/hooks/useToast'; import { useToast } from '@fastgpt/web/hooks/useToast';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
const PreviewData = () => { const PreviewData = () => {
const { t } = useTranslation(); const { t } = useTranslation();
@@ -28,8 +29,6 @@ const PreviewData = () => {
const sources = useContextSelector(DatasetImportContext, (v) => v.sources); const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource); const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
const chunkSize = useContextSelector(DatasetImportContext, (v) => v.chunkSize);
const chunkOverlapRatio = useContextSelector(DatasetImportContext, (v) => v.chunkOverlapRatio);
const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm); const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
const [previewFile, setPreviewFile] = useState<ImportSourceItemType>(); const [previewFile, setPreviewFile] = useState<ImportSourceItemType>();
@@ -37,13 +36,20 @@ const PreviewData = () => {
const { data = { chunks: [], total: 0 }, loading: isLoading } = useRequest2( const { data = { chunks: [], total: 0 }, loading: isLoading } = useRequest2(
async () => { async () => {
if (!previewFile) return { chunks: [], total: 0 }; if (!previewFile) return { chunks: [], total: 0 };
const chunkData = collectionChunkForm2StoreChunkData({
...processParamsForm.getValues(),
vectorModel: datasetDetail.vectorModel,
agentModel: datasetDetail.agentModel
});
if (importSource === ImportDataSourceEnum.fileCustom) { if (importSource === ImportDataSourceEnum.fileCustom) {
const chunkSplitter = processParamsForm.getValues('chunkSplitter'); const chunkSplitter = processParamsForm.getValues('chunkSplitter');
const { chunks } = splitText2Chunks({ const { chunks } = splitText2Chunks({
text: previewFile.rawText || '', text: previewFile.rawText || '',
chunkSize, chunkSize: chunkData.chunkSize,
maxSize: getLLMMaxChunkSize(datasetDetail.agentModel), maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
overlapRatio: chunkOverlapRatio, overlapRatio: 0.2,
customReg: chunkSplitter ? [chunkSplitter] : [] customReg: chunkSplitter ? [chunkSplitter] : []
}); });
return { return {
@@ -64,18 +70,12 @@ const PreviewData = () => {
previewFile.externalFileUrl || previewFile.externalFileUrl ||
previewFile.apiFileId || previewFile.apiFileId ||
'', '',
externalFileId: previewFile.externalFileId,
customPdfParse: processParamsForm.getValues('customPdfParse'), ...chunkData,
trainingType: processParamsForm.getValues('trainingType'),
chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
chunkSize,
chunkSplitter: processParamsForm.getValues('chunkSplitter'),
overlapRatio: chunkOverlapRatio,
selector: processParamsForm.getValues('webSelector'), selector: processParamsForm.getValues('webSelector'),
externalFileId: previewFile.externalFileId customPdfParse: processParamsForm.getValues('customPdfParse'),
overlapRatio: 0.2
}); });
}, },
{ {

View File

@@ -37,6 +37,7 @@ import { useContextSelector } from 'use-context-selector';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DatasetImportContext, type ImportFormType } from '../Context'; import { DatasetImportContext, type ImportFormType } from '../Context';
import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d'; import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
const Upload = () => { const Upload = () => {
const { t } = useTranslation(); const { t } = useTranslation();
@@ -48,10 +49,10 @@ const Upload = () => {
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail); const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const retrainNewCollectionId = useRef(''); const retrainNewCollectionId = useRef('');
const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } = const { importSource, parentId, sources, setSources, processParamsForm } = useContextSelector(
useContextSelector(DatasetImportContext, (v) => v); DatasetImportContext,
(v) => v
const { handleSubmit } = processParamsForm; );
const { totalFilesCount, waitingFilesCount, allFinished, hasCreatingFiles } = useMemo(() => { const { totalFilesCount, waitingFilesCount, allFinished, hasCreatingFiles } = useMemo(() => {
const totalFilesCount = sources.length; const totalFilesCount = sources.length;
@@ -80,7 +81,13 @@ const Upload = () => {
}, [waitingFilesCount, totalFilesCount, allFinished, t]); }, [waitingFilesCount, totalFilesCount, allFinished, t]);
const { runAsync: startUpload, loading: isLoading } = useRequest2( const { runAsync: startUpload, loading: isLoading } = useRequest2(
async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => { async ({ customPdfParse, webSelector, ...data }: ImportFormType) => {
const chunkData = collectionChunkForm2StoreChunkData({
...data,
vectorModel: datasetDetail.vectorModel,
agentModel: datasetDetail.agentModel
});
if (sources.length === 0) return; if (sources.length === 0) return;
const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting'); const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@@ -101,23 +108,12 @@ const Upload = () => {
const commonParams: ApiCreateDatasetCollectionParams & { const commonParams: ApiCreateDatasetCollectionParams & {
name: string; name: string;
} = { } = {
...chunkData,
parentId, parentId,
datasetId: datasetDetail._id, datasetId: datasetDetail._id,
name: item.sourceName, name: item.sourceName,
customPdfParse: processParamsForm.getValues('customPdfParse'), customPdfParse
trainingType,
imageIndex: processParamsForm.getValues('imageIndex'),
autoIndexes: processParamsForm.getValues('autoIndexes'),
chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
chunkSize,
indexSize,
chunkSplitter,
qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
}; };
if (importSource === ImportDataSourceEnum.reTraining) { if (importSource === ImportDataSourceEnum.reTraining) {
@@ -280,7 +276,10 @@ const Upload = () => {
</TableContainer> </TableContainer>
<Flex justifyContent={'flex-end'} mt={4}> <Flex justifyContent={'flex-end'} mt={4}>
<Button isLoading={isLoading} onClick={handleSubmit((data) => startUpload(data))}> <Button
isLoading={isLoading}
onClick={processParamsForm.handleSubmit((data) => startUpload(data))}
>
{totalFilesCount > 0 && {totalFilesCount > 0 &&
`${t('dataset:total_num_files', { `${t('dataset:total_num_files', {
total: totalFilesCount total: totalFilesCount

View File

@@ -1,6 +1,6 @@
import React from 'react'; import React from 'react';
import { useContextSelector } from 'use-context-selector'; import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context'; import { DatasetImportContext, defaultFormData } from '../Context';
import dynamic from 'next/dynamic'; import dynamic from 'next/dynamic';
import DataProcess from '../commonProgress/DataProcess'; import DataProcess from '../commonProgress/DataProcess';
@@ -48,18 +48,36 @@ const ReTraining = () => {
]); ]);
processParamsForm.reset({ processParamsForm.reset({
customPdfParse: collection.customPdfParse, customPdfParse: collection.customPdfParse || false,
trainingType: collection.trainingType, trainingType: collection.trainingType,
imageIndex: collection.imageIndex,
autoIndexes: collection.autoIndexes,
chunkSettingMode: collection.chunkSettingMode || ChunkSettingModeEnum.auto, chunkTriggerType: collection.chunkTriggerType || defaultFormData.chunkTriggerType,
chunkSplitMode: collection.chunkSplitMode || DataChunkSplitModeEnum.size, chunkTriggerMinSize: collection.chunkTriggerMinSize || defaultFormData.chunkTriggerMinSize,
embeddingChunkSize: collection.chunkSize,
qaChunkSize: collection.chunkSize, dataEnhanceCollectionName:
indexSize: collection.indexSize || 512, collection.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,
chunkSplitter: collection.chunkSplitter,
webSelector: collection.metadata?.webPageSelector, imageIndex: collection.imageIndex || defaultFormData.imageIndex,
autoIndexes: collection.autoIndexes || defaultFormData.autoIndexes,
chunkSettingMode: collection.chunkSettingMode || defaultFormData.chunkSettingMode,
chunkSplitMode: collection.chunkSplitMode || defaultFormData.chunkSplitMode,
paragraphChunkAIMode:
collection.paragraphChunkAIMode || defaultFormData.paragraphChunkAIMode,
paragraphChunkDeep: collection.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
paragraphChunkMinSize:
collection.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
paragraphChunkMaxSize:
collection.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,
chunkSize: collection.chunkSize || defaultFormData.chunkSize,
chunkSplitter: collection.chunkSplitter || defaultFormData.chunkSplitter,
indexSize: collection.indexSize || defaultFormData.indexSize,
webSelector: collection.metadata?.webPageSelector || defaultFormData.webSelector,
qaPrompt: collection.qaPrompt || Prompt_AgentQA.description qaPrompt: collection.qaPrompt || Prompt_AgentQA.description
}); });
} }

View File

@@ -72,18 +72,26 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
label: t('common:core.dataset.collection.metadata.Raw text length'), label: t('common:core.dataset.collection.metadata.Raw text length'),
value: collection.rawTextLength ?? '-' value: collection.rawTextLength ?? '-'
}, },
{
label: t('dataset:collection_metadata_image_parse'),
value: collection.imageIndex ? 'Yes' : 'No'
},
{
label: t('dataset:auto_indexes'),
value: collection.autoIndexes ? 'Yes' : 'No'
},
{ {
label: t('dataset:collection.training_type'), label: t('dataset:collection.training_type'),
value: t(DatasetCollectionDataProcessModeMap[collection.trainingType]?.label as any) value: t(DatasetCollectionDataProcessModeMap[collection.trainingType]?.label as any)
}, },
...(collection.imageIndex !== undefined
? [
{
label: t('dataset:data_index_image'),
value: collection.imageIndex ? 'Yes' : 'No'
}
]
: []),
...(collection.autoIndexes !== undefined
? [
{
label: t('dataset:auto_indexes'),
value: collection.autoIndexes ? 'Yes' : 'No'
}
]
: []),
...(collection.chunkSize ...(collection.chunkSize
? [ ? [
{ {