perf: dataset import params code (#4875)

* perf: dataset import params code
* perf: api dataset code
* model
@@ -11,6 +11,8 @@ weight: 790
 ## 🚀 New Features
 
 1. Support the PG setting `systemEnv.hnswMaxScanTuples` to raise the total amount of data scanned during iterative search.
+2. Open the Feishu and Yuque knowledge-base sources to the open-source edition.
+3. Presets for the latest Gemini and Claude models.
 
 ## ⚙️ Optimizations
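For context on item 1: a minimal TypeScript sketch of what a `systemEnv.hnswMaxScanTuples`-style knob controls on the PG side. It assumes pgvector >= 0.8.0 (iterative HNSW scans); the table, column, env var, and pool setup are illustrative assumptions, not FastGPT's actual wiring.

import { Pool } from 'pg';

// Assumed: a pgvector table named `modeldata` with a `vector` column.
const pool = new Pool({ connectionString: process.env.PG_URL });

export async function searchWithScanBudget(embedding: number[], limit = 20) {
  // Stand-in for a systemEnv.hnswMaxScanTuples setting.
  const maxScanTuples = Number(process.env.HNSW_MAX_SCAN_TUPLES ?? 100000);
  const client = await pool.connect();
  try {
    await client.query('BEGIN');
    // Enable iterative scanning, then cap how many tuples one scan may visit.
    await client.query(`SET LOCAL hnsw.iterative_scan = relaxed_order`);
    await client.query(`SET LOCAL hnsw.max_scan_tuples = ${maxScanTuples}`);
    const res = await client.query(
      `SELECT id, vector <#> $1::vector AS score
         FROM modeldata
        ORDER BY score
        LIMIT $2`,
      [`[${embedding.join(',')}]`, limit]
    );
    await client.query('COMMIT');
    return res.rows;
  } catch (err) {
    await client.query('ROLLBACK');
    throw err;
  } finally {
    client.release();
  }
}

Raising the cap lets the iterative scan keep fetching candidates past the normal search frontier, trading latency for recall.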
packages/global/core/dataset/api.d.ts
@@ -1,9 +1,11 @@
-import type { DatasetDataIndexItemType, DatasetSchemaType } from './type';
+import type { ChunkSettingsType, DatasetDataIndexItemType, DatasetSchemaType } from './type';
 import type {
   DatasetCollectionTypeEnum,
   DatasetCollectionDataProcessModeEnum,
   ChunkSettingModeEnum,
-  DataChunkSplitModeEnum
+  DataChunkSplitModeEnum,
+  ChunkTriggerConfigTypeEnum,
+  ParagraphChunkAIModeEnum
 } from './constants';
 import type { LLMModelItemType } from '../ai/model.d';
 import type { ParentIdType } from 'common/parentFolder/type';
@@ -32,26 +34,16 @@ export type DatasetUpdateBody = {
 };
 
 /* ================= collection ===================== */
-export type DatasetCollectionChunkMetadataType = {
+// Input + store params
+type DatasetCollectionStoreDataType = ChunkSettingsType & {
   parentId?: string;
-  customPdfParse?: boolean;
-  trainingType?: DatasetCollectionDataProcessModeEnum;
-  imageIndex?: boolean;
-  autoIndexes?: boolean;
-
-  chunkSettingMode?: ChunkSettingModeEnum;
-  chunkSplitMode?: DataChunkSplitModeEnum;
-
-  chunkSize?: number;
-  indexSize?: number;
-
-  chunkSplitter?: string;
-  qaPrompt?: string;
   metadata?: Record<string, any>;
+
+  customPdfParse?: boolean;
 };
 
 // create collection params
-export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
+export type CreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
   datasetId: string;
   name: string;
   type: DatasetCollectionTypeEnum;
@@ -72,7 +64,7 @@ export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType &
   nextSyncTime?: Date;
 };
 
-export type ApiCreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
+export type ApiCreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
   datasetId: string;
   tags?: string[];
 };
@@ -90,7 +82,7 @@ export type ApiDatasetCreateDatasetCollectionParams = ApiCreateDatasetCollection
 export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
   fileId: string;
 };
-export type reTrainingDatasetFileCollectionParams = DatasetCollectionChunkMetadataType & {
+export type reTrainingDatasetFileCollectionParams = DatasetCollectionStoreDataType & {
   datasetId: string;
   collectionId: string;
 };
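To make the rename concrete, a hedged example of a `CreateDatasetCollectionParams` value under the new `DatasetCollectionStoreDataType` composition (the id and field values are illustrative):

import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import {
  DatasetCollectionTypeEnum,
  DatasetCollectionDataProcessModeEnum,
  ChunkSettingModeEnum
} from '@fastgpt/global/core/dataset/constants';

// The chunk fields arrive via ChunkSettingsType -> DatasetCollectionStoreDataType;
// datasetId/name/type come from the intersection in api.d.ts.
const params: CreateDatasetCollectionParams = {
  datasetId: '65f0c0ffee0000000000abcd', // hypothetical id
  name: 'FAQ.md',
  type: DatasetCollectionTypeEnum.file,
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  chunkSettingMode: ChunkSettingModeEnum.auto,
  customPdfParse: false
};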
@@ -143,15 +143,25 @@ export const DatasetCollectionDataProcessModeMap = {
   }
 };
 
+export enum ChunkTriggerConfigTypeEnum {
+  minSize = 'minSize',
+  forceChunk = 'forceChunk',
+  maxSize = 'maxSize'
+}
 export enum ChunkSettingModeEnum {
   auto = 'auto',
   custom = 'custom'
 }
 
 export enum DataChunkSplitModeEnum {
+  paragraph = 'paragraph',
   size = 'size',
   char = 'char'
 }
+export enum ParagraphChunkAIModeEnum {
+  auto = 'auto',
+  force = 'force'
+}
 
 /* ------------ data -------------- */
 
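A short sketch of how the new trigger enum can be read — my interpretation of the semantics, not logic taken from this PR:

import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';

// Hedged interpretation: forceChunk always splits, minSize leaves short
// texts whole, maxSize splits only past the model's context budget.
const shouldChunk = (
  textLength: number,
  triggerType: ChunkTriggerConfigTypeEnum,
  minSize: number,
  modelMaxSize: number
): boolean => {
  switch (triggerType) {
    case ChunkTriggerConfigTypeEnum.forceChunk:
      return true;
    case ChunkTriggerConfigTypeEnum.maxSize:
      return textLength > modelMaxSize;
    case ChunkTriggerConfigTypeEnum.minSize:
    default:
      return textLength > minSize;
  }
};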
@@ -32,7 +32,7 @@ export const DatasetDataIndexMap: Record<
     color: 'red'
   },
   [DatasetDataIndexTypeEnum.image]: {
-    label: i18nT('common:data_index_image'),
+    label: i18nT('dataset:data_index_image'),
     color: 'purple'
   }
 };
packages/global/core/dataset/type.d.ts
@@ -8,26 +8,42 @@ import type {
   DatasetStatusEnum,
   DatasetTypeEnum,
   SearchScoreTypeEnum,
-  TrainingModeEnum
+  TrainingModeEnum,
+  ChunkSettingModeEnum
 } from './constants';
 import type { DatasetPermission } from '../../support/permission/dataset/controller';
-import { Permission } from '../../support/permission/controller';
 import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
 import type { SourceMemberType } from 'support/user/type';
 import type { DatasetDataIndexTypeEnum } from './data/constants';
-import type { ChunkSettingModeEnum } from './constants';
 
 export type ChunkSettingsType = {
-  trainingType: DatasetCollectionDataProcessModeEnum;
-  autoIndexes?: boolean;
+  trainingType?: DatasetCollectionDataProcessModeEnum;
+
+  // Chunk trigger
+  chunkTriggerType?: ChunkTriggerConfigTypeEnum;
+  chunkTriggerMinSize?: number; // maxSize from agent model, not store
+
+  // Data enhance
+  dataEnhanceCollectionName?: boolean; // Auto add collection name to data
+
+  // Index enhance
   imageIndex?: boolean;
+  autoIndexes?: boolean;
 
-  chunkSettingMode?: ChunkSettingModeEnum;
+  // Chunk setting
+  chunkSettingMode?: ChunkSettingModeEnum; // system params / custom params
   chunkSplitMode?: DataChunkSplitModeEnum;
+  // Paragraph split
+  paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
+  paragraphChunkDeep?: number; // Paragraph deep
+  paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
+  paragraphChunkMaxSize?: number; // Paragraph max size, if too large, it will split
+  // Size split
   chunkSize?: number;
-  indexSize?: number;
+  // Char split
   chunkSplitter?: string;
+  indexSize?: number;
 
   qaPrompt?: string;
 };
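For orientation, a fully-populated `ChunkSettingsType` under the new shape; every field is optional, and the values below are illustrative rather than prescribed by this PR:

import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
import {
  ChunkSettingModeEnum,
  ChunkTriggerConfigTypeEnum,
  DataChunkSplitModeEnum,
  DatasetCollectionDataProcessModeEnum,
  ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants';

const example: ChunkSettingsType = {
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize: 1000, // illustrative
  imageIndex: false,
  autoIndexes: false,
  chunkSettingMode: ChunkSettingModeEnum.custom,
  chunkSplitMode: DataChunkSplitModeEnum.paragraph,
  paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
  paragraphChunkDeep: 4,
  paragraphChunkMinSize: 100,
  paragraphChunkMaxSize: 1000,
  chunkSize: 1000,
  chunkSplitter: '',
  indexSize: 512,
  qaPrompt: ''
};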
@@ -66,7 +82,7 @@ export type DatasetSchemaType = {
   defaultPermission?: number;
 };
 
-export type DatasetCollectionSchemaType = {
+export type DatasetCollectionSchemaType = ChunkSettingsType & {
   _id: string;
   teamId: string;
   tmbId: string;
@@ -101,18 +117,7 @@ export type DatasetCollectionSchemaType = {
 
   // Parse settings
   customPdfParse?: boolean;
-  // Chunk settings
-  autoIndexes?: boolean;
-  imageIndex?: boolean;
   trainingType: DatasetCollectionDataProcessModeEnum;
-
-  chunkSettingMode?: ChunkSettingModeEnum;
-  chunkSplitMode?: DataChunkSplitModeEnum;
-
-  chunkSize?: number;
-  indexSize?: number;
-  chunkSplitter?: string;
-  qaPrompt?: string;
 };
 
 export type DatasetCollectionTagsSchemaType = {
@@ -1,6 +1,54 @@
 {
   "provider": "Claude",
   "list": [
+    {
+      "model": "claude-sonnet-4-20250514",
+      "name": "claude-sonnet-4-20250514",
+      "maxContext": 200000,
+      "maxResponse": 8000,
+      "quoteMaxToken": 100000,
+      "maxTemperature": 1,
+      "showTopP": true,
+      "showStopSign": true,
+      "vision": true,
+      "toolChoice": true,
+      "functionCall": false,
+      "defaultSystemChatPrompt": "",
+      "datasetProcess": true,
+      "usedInClassify": true,
+      "customCQPrompt": "",
+      "usedInExtractFields": true,
+      "usedInQueryExtension": true,
+      "customExtractPrompt": "",
+      "usedInToolCall": true,
+      "defaultConfig": {},
+      "fieldMap": {},
+      "type": "llm"
+    },
+    {
+      "model": "claude-opus-4-20250514",
+      "name": "claude-opus-4-20250514",
+      "maxContext": 200000,
+      "maxResponse": 4096,
+      "quoteMaxToken": 100000,
+      "maxTemperature": 1,
+      "showTopP": true,
+      "showStopSign": true,
+      "vision": true,
+      "toolChoice": true,
+      "functionCall": false,
+      "defaultSystemChatPrompt": "",
+      "datasetProcess": true,
+      "usedInClassify": true,
+      "customCQPrompt": "",
+      "usedInExtractFields": true,
+      "usedInQueryExtension": true,
+      "customExtractPrompt": "",
+      "usedInToolCall": true,
+      "defaultConfig": {},
+      "fieldMap": {},
+      "type": "llm"
+    },
     {
       "model": "claude-3-7-sonnet-20250219",
       "name": "claude-3-7-sonnet-20250219",
@@ -25,6 +25,30 @@
       "showTopP": true,
       "showStopSign": true
     },
+    {
+      "model": "gemini-2.5-flash-preview-04-17",
+      "name": "gemini-2.5-flash-preview-04-17",
+      "maxContext": 1000000,
+      "maxResponse": 8000,
+      "quoteMaxToken": 60000,
+      "maxTemperature": 1,
+      "vision": true,
+      "toolChoice": true,
+      "functionCall": false,
+      "defaultSystemChatPrompt": "",
+      "datasetProcess": true,
+      "usedInClassify": true,
+      "customCQPrompt": "",
+      "usedInExtractFields": true,
+      "usedInQueryExtension": true,
+      "customExtractPrompt": "",
+      "usedInToolCall": true,
+      "defaultConfig": {},
+      "fieldMap": {},
+      "type": "llm",
+      "showTopP": true,
+      "showStopSign": true
+    },
     {
       "model": "gemini-2.0-flash",
       "name": "gemini-2.0-flash",
@@ -74,6 +74,15 @@ export const createCollectionAndInsertData = async ({
     llmModel: getLLMModel(dataset.agentModel)
   });
   const chunkSplitter = computeChunkSplitter(createCollectionParams);
+  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
+    delete createCollectionParams.chunkTriggerType;
+    delete createCollectionParams.chunkTriggerMinSize;
+    delete createCollectionParams.dataEnhanceCollectionName;
+    delete createCollectionParams.imageIndex;
+    delete createCollectionParams.autoIndexes;
+    delete createCollectionParams.indexSize;
+    delete createCollectionParams.qaPrompt;
+  }
 
   // 1. split chunks
   const chunks = rawText2Chunks({
@@ -163,7 +163,7 @@ export const readApiServerFileContent = async ({
   title?: string;
   rawText: string;
 }> => {
-  const data = (
+  return (
     await getApiDatasetRequest({
       apiServer,
       yuqueServer,
@@ -175,10 +175,6 @@ export const readApiServerFileContent = async ({
     apiFileId,
     customPdfParse
   });
-  if (data) {
-    return data;
-  }
-  return Promise.reject(Error);
 };
 
 export const rawText2Chunks = ({
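The `readApiServerFileContent` change applies a general rule: an async wrapper that only forwards the awaited value can return the expression directly, and rejections still propagate (the removed guard also rejected with the `Error` constructor itself rather than an Error instance). A generic sketch:

// `fetchRawText` is a stand-in for the real request helper.
const fetchRawText = async (): Promise<string> => 'raw text';

// Before: a redundant guard, plus a subtle bug — rejecting with the `Error`
// constructor itself instead of `new Error(...)`.
const readBefore = async (): Promise<string> => {
  const data = await fetchRawText();
  if (data) return data;
  return Promise.reject(Error);
};

// After: return the awaited expression directly; errors still propagate.
const readAfter = async (): Promise<string> => fetchRawText();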
@@ -1,10 +1,12 @@
 import { getMongoModel, Schema } from '../../common/mongo';
 import {
   ChunkSettingModeEnum,
+  ChunkTriggerConfigTypeEnum,
   DataChunkSplitModeEnum,
   DatasetCollectionDataProcessModeEnum,
   DatasetTypeEnum,
-  DatasetTypeMap
+  DatasetTypeMap,
+  ParagraphChunkAIModeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import {
   TeamCollectionName,
@@ -15,12 +17,22 @@ import type { DatasetSchemaType } from '@fastgpt/global/core/dataset/type.d';
 export const DatasetCollectionName = 'datasets';
 
 export const ChunkSettings = {
-  imageIndex: Boolean,
-  autoIndexes: Boolean,
   trainingType: {
     type: String,
     enum: Object.values(DatasetCollectionDataProcessModeEnum)
   },
+
+  chunkTriggerType: {
+    type: String,
+    enum: Object.values(ChunkTriggerConfigTypeEnum)
+  },
+  chunkTriggerMinSize: Number,
+
+  dataEnhanceCollectionName: Boolean,
+
+  imageIndex: Boolean,
+  autoIndexes: Boolean,
+
   chunkSettingMode: {
     type: String,
     enum: Object.values(ChunkSettingModeEnum)
@@ -29,6 +41,13 @@ export const ChunkSettings = {
     type: String,
     enum: Object.values(DataChunkSplitModeEnum)
   },
+  paragraphChunkAIMode: {
+    type: String,
+    enum: Object.values(ParagraphChunkAIModeEnum)
+  },
+  paragraphChunkDeep: Number,
+  paragraphChunkMinSize: Number,
+  paragraphChunkMaxSize: Number,
   chunkSize: Number,
   chunkSplitter: String,
@@ -115,9 +134,7 @@ const DatasetSchema = new Schema({
 
   // abandoned
   autoSync: Boolean,
-  externalReadUrl: {
-    type: String
-  },
+  externalReadUrl: String,
   defaultPermission: Number
 });
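Because `ChunkSettings` is a plain field map rather than a compiled `Schema`, it can be spread into any schema that persists chunk settings. A hedged sketch of that reuse; the model name and the `ChunkSettings` import path are assumptions:

import { getMongoModel, Schema } from '../../common/mongo';
import { ChunkSettings } from './schema'; // assumed export location

// Illustrative only: embed the shared chunk-setting fields alongside
// this schema's own fields.
const ExampleCollectionSchema = new Schema({
  name: { type: String, required: true },
  ...ChunkSettings
});

export const MongoExampleCollection = getMongoModel(
  'example_collections',
  ExampleCollectionSchema
);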
@@ -749,7 +749,6 @@
   "custom_title": "Custom Title",
   "data_index_custom": "Custom index",
   "data_index_default": "Default index",
-  "data_index_image": "Image Index",
   "data_index_question": "Inferred question index",
   "data_index_summary": "Summary Index",
   "data_not_found": "Data can't be found",
@@ -22,7 +22,6 @@
   "collection.training_type": "Chunk type",
   "collection_data_count": "Data amount",
   "collection_metadata_custom_pdf_parse": "PDF enhancement analysis",
-  "collection_metadata_image_parse": "Image tagging",
   "collection_not_support_retraining": "This collection type does not support retuning parameters",
   "collection_not_support_sync": "This collection does not support synchronization",
   "collection_sync": "Sync data",
@@ -38,6 +37,7 @@
   "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
   "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
   "data_error_amount": "{{errorAmount}} Group training exception",
+  "data_index_image": "Image index",
   "data_index_num": "Index {{index}}",
   "data_process_params": "Params",
   "data_process_setting": "Processing config",
@@ -749,7 +749,6 @@
   "custom_title": "自定义标题",
   "data_index_custom": "自定义索引",
   "data_index_default": "默认索引",
-  "data_index_image": "图片索引",
   "data_index_question": "推测问题索引",
   "data_index_summary": "摘要索引",
   "data_not_found": "数据找不到了",
@@ -22,7 +22,6 @@
   "collection.training_type": "处理模式",
   "collection_data_count": "数据量",
   "collection_metadata_custom_pdf_parse": "PDF增强解析",
-  "collection_metadata_image_parse": "图片标注",
   "collection_not_support_retraining": "该集合类型不支持重新调整参数",
   "collection_not_support_sync": "该集合不支持同步",
   "collection_sync": "立即同步",
@@ -38,6 +37,7 @@
   "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。",
   "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
   "data_error_amount": "{{errorAmount}} 组训练异常",
+  "data_index_image": "图片索引",
   "data_index_num": "索引 {{index}}",
   "data_process_params": "处理参数",
   "data_process_setting": "数据处理配置",
@@ -749,7 +749,6 @@
   "custom_title": "自訂標題",
   "data_index_custom": "自定義索引",
   "data_index_default": "預設索引",
-  "data_index_image": "圖片索引",
   "data_index_question": "推測問題索引",
   "data_index_summary": "摘要索引",
   "data_not_found": "數據找不到了",
@@ -21,7 +21,6 @@
   "collection.training_type": "處理模式",
   "collection_data_count": "資料量",
   "collection_metadata_custom_pdf_parse": "PDF 增強解析",
-  "collection_metadata_image_parse": "圖片標註",
   "collection_not_support_retraining": "此集合類型不支援重新調整參數",
   "collection_not_support_sync": "該集合不支援同步",
   "collection_sync": "立即同步",
@@ -37,6 +36,7 @@
   "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的資料,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.”表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如:* () [] {} 等。",
   "data_amount": "{{dataAmount}} 組資料,{{indexAmount}} 組索引",
   "data_error_amount": "{{errorAmount}} 組訓練異常",
+  "data_index_image": "圖片索引",
   "data_index_num": "索引 {{index}}",
   "data_process_params": "處理參數",
   "data_process_setting": "資料處理設定",
@@ -21,9 +21,13 @@ import CollectionChunkForm, {
   collectionChunkForm2StoreChunkData,
   type CollectionChunkFormType
 } from '../Form/CollectionChunkForm';
-import { getLLMDefaultChunkSize } from '@fastgpt/global/core/dataset/training/utils';
+import {
+  getAutoIndexSize,
+  getLLMDefaultChunkSize
+} from '@fastgpt/global/core/dataset/training/utils';
 import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
 import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm';
+import { defaultFormData } from '../Import/Context';
 
 export type WebsiteConfigFormType = {
   websiteConfig: {
@@ -76,17 +80,35 @@ const WebsiteConfigModal = ({
 
   const form = useForm<CollectionChunkFormType>({
     defaultValues: {
-      trainingType: chunkSettings?.trainingType || DatasetCollectionDataProcessModeEnum.chunk,
-      imageIndex: chunkSettings?.imageIndex || false,
-      autoIndexes: chunkSettings?.autoIndexes || false,
-
-      chunkSettingMode: chunkSettings?.chunkSettingMode || ChunkSettingModeEnum.auto,
-      chunkSplitMode: chunkSettings?.chunkSplitMode || DataChunkSplitModeEnum.size,
-      embeddingChunkSize: chunkSettings?.chunkSize || 2000,
-      qaChunkSize: chunkSettings?.chunkSize || getLLMDefaultChunkSize(datasetDetail.agentModel),
-      indexSize: chunkSettings?.indexSize || datasetDetail.vectorModel?.defaultToken || 512,
-
-      chunkSplitter: chunkSettings?.chunkSplitter || '',
+      trainingType: chunkSettings?.trainingType,
+      chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType,
+      chunkTriggerMinSize:
+        chunkSettings?.chunkTriggerMinSize || defaultFormData.chunkTriggerMinSize,
+
+      dataEnhanceCollectionName:
+        chunkSettings?.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,
+
+      imageIndex: chunkSettings?.imageIndex || defaultFormData.imageIndex,
+      autoIndexes: chunkSettings?.autoIndexes || defaultFormData.autoIndexes,
+
+      chunkSettingMode: chunkSettings?.chunkSettingMode || defaultFormData.chunkSettingMode,
+      chunkSplitMode: chunkSettings?.chunkSplitMode || defaultFormData.chunkSplitMode,
+
+      paragraphChunkAIMode:
+        chunkSettings?.paragraphChunkAIMode || defaultFormData.paragraphChunkAIMode,
+      paragraphChunkDeep: chunkSettings?.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
+      paragraphChunkMinSize:
+        chunkSettings?.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
+      paragraphChunkMaxSize:
+        chunkSettings?.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,
+
+      chunkSize: chunkSettings?.chunkSize || defaultFormData.chunkSize,
+      chunkSplitter: chunkSettings?.chunkSplitter || defaultFormData.chunkSplitter,
+      indexSize: chunkSettings?.indexSize || defaultFormData.indexSize,
+
       qaPrompt: chunkSettings?.qaPrompt || Prompt_AgentQA.description
     }
   });
@@ -17,6 +17,10 @@ import {
 } from '@chakra-ui/react';
 import MyIcon from '@fastgpt/web/components/common/Icon';
 import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
+import type {
+  ChunkTriggerConfigTypeEnum,
+  ParagraphChunkAIModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import {
   DataChunkSplitModeEnum,
   DatasetCollectionDataProcessModeEnum,
@@ -42,7 +46,6 @@ import {
   minChunkSize
 } from '@fastgpt/global/core/dataset/training/utils';
 import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';
-import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
 import type { LLMModelItemType, EmbeddingModelItemType } from '@fastgpt/global/core/ai/model.d';
 
 const PromptTextarea = ({
@@ -86,19 +89,35 @@ const PromptTextarea = ({
 
 export type CollectionChunkFormType = {
   trainingType: DatasetCollectionDataProcessModeEnum;
+
+  // Chunk trigger
+  chunkTriggerType: ChunkTriggerConfigTypeEnum;
+  chunkTriggerMinSize: number; // maxSize from agent model, not store
+
+  // Data enhance
+  dataEnhanceCollectionName: boolean; // Auto add collection name to data
+
+  // Index enhance
   imageIndex: boolean;
   autoIndexes: boolean;
 
-  chunkSettingMode: ChunkSettingModeEnum;
+  // Chunk setting
+  chunkSettingMode: ChunkSettingModeEnum; // system params / custom params
   chunkSplitMode: DataChunkSplitModeEnum;
-  embeddingChunkSize: number;
-  qaChunkSize: number;
-  chunkSplitter?: string;
+  // Paragraph split
+  paragraphChunkAIMode: ParagraphChunkAIModeEnum;
+  paragraphChunkDeep: number; // Paragraph deep
+  paragraphChunkMinSize: number; // Paragraph min size, if too small, it will merge
+  paragraphChunkMaxSize: number; // Paragraph max size, if too large, it will split
+  // Size split
+  chunkSize: number;
+  // Char split
+  chunkSplitter: string;
   indexSize: number;
 
   qaPrompt?: string;
 };
 
 const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkFormType> }) => {
   const { t } = useTranslation();
   const { feConfigs } = useSystemStore();
@@ -131,29 +150,26 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
       tooltip: t(value.tooltip as any)
     }));
   }, [t]);
 
   const {
-    chunkSizeField,
     maxChunkSize,
     minChunkSize: minChunkSizeValue,
     maxIndexSize
   } = useMemo(() => {
     if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
       return {
-        chunkSizeField: 'qaChunkSize',
         maxChunkSize: getLLMMaxChunkSize(agentModel),
         minChunkSize: 1000,
         maxIndexSize: 1000
       };
     } else if (autoIndexes) {
       return {
-        chunkSizeField: 'embeddingChunkSize',
         maxChunkSize: getMaxChunkSize(agentModel),
         minChunkSize: minChunkSize,
         maxIndexSize: getMaxIndexSize(vectorModel)
       };
     } else {
       return {
-        chunkSizeField: 'embeddingChunkSize',
         maxChunkSize: getMaxChunkSize(agentModel),
         minChunkSize: minChunkSize,
         maxIndexSize: getMaxIndexSize(vectorModel)
@@ -216,6 +232,11 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
             value={trainingType}
             onChange={(e) => {
               setValue('trainingType', e);
+              if (e === DatasetCollectionDataProcessModeEnum.qa) {
+                setValue('chunkSize', getLLMDefaultChunkSize(agentModel));
+              } else {
+                setValue('chunkSize', chunkAutoChunkSize);
+              }
             }}
             defaultBg="white"
             activeBg="white"
@@ -317,7 +338,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
           >
             <MyNumberInput
               register={register}
-              name={chunkSizeField}
+              name={'chunkSize'}
               min={minChunkSizeValue}
               max={maxChunkSize}
               size={'sm'}
@@ -456,24 +477,26 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
 
 export default CollectionChunkForm;
 
+// Get chunk settings from form
 export const collectionChunkForm2StoreChunkData = ({
-  trainingType,
-  imageIndex,
-  autoIndexes,
-  chunkSettingMode,
-  chunkSplitMode,
-  embeddingChunkSize,
-  qaChunkSize,
-  chunkSplitter,
-  indexSize,
-  qaPrompt,
   agentModel,
-  vectorModel
+  vectorModel,
+  ...data
 }: CollectionChunkFormType & {
   agentModel: LLMModelItemType;
   vectorModel: EmbeddingModelItemType;
-}): ChunkSettingsType => {
+}): CollectionChunkFormType => {
+  const {
+    trainingType,
+    autoIndexes,
+    chunkSettingMode,
+    chunkSize,
+    chunkSplitter,
+    indexSize,
+    qaPrompt
+  } = data;
+
+  // Get the auto and custom params based on the processing mode.
   const trainingModeSize: {
     autoChunkSize: number;
     autoIndexSize: number;
@@ -483,53 +506,53 @@ export const collectionChunkForm2StoreChunkData = ({
     if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
       return {
         autoChunkSize: getLLMDefaultChunkSize(agentModel),
-        autoIndexSize: 512,
-        chunkSize: qaChunkSize,
-        indexSize: 512
+        autoIndexSize: getMaxIndexSize(vectorModel),
+        chunkSize,
+        indexSize: getMaxIndexSize(vectorModel)
       };
     } else if (autoIndexes) {
       return {
         autoChunkSize: chunkAutoChunkSize,
         autoIndexSize: getAutoIndexSize(vectorModel),
-        chunkSize: embeddingChunkSize,
+        chunkSize,
         indexSize
       };
     } else {
       return {
         autoChunkSize: chunkAutoChunkSize,
         autoIndexSize: getAutoIndexSize(vectorModel),
-        chunkSize: embeddingChunkSize,
+        chunkSize,
         indexSize
       };
     }
   })();
 
-  const { chunkSize: formatChunkIndex, indexSize: formatIndexSize } = (() => {
+  // Resolve the effective params.
+  const {
+    chunkSize: formatChunkIndex,
+    indexSize: formatIndexSize,
+    chunkSplitter: formatChunkSplitter
+  } = (() => {
     if (chunkSettingMode === ChunkSettingModeEnum.auto) {
       return {
         chunkSize: trainingModeSize.autoChunkSize,
-        indexSize: trainingModeSize.autoIndexSize
+        indexSize: trainingModeSize.autoIndexSize,
+        chunkSplitter: ''
       };
     } else {
       return {
         chunkSize: trainingModeSize.chunkSize,
-        indexSize: trainingModeSize.indexSize
+        indexSize: trainingModeSize.indexSize,
+        chunkSplitter
      };
     }
   })();
 
   return {
-    trainingType,
-    imageIndex,
-    autoIndexes,
-
-    chunkSettingMode,
-    chunkSplitMode,
-
+    ...data,
     chunkSize: formatChunkIndex,
     indexSize: formatIndexSize,
-    chunkSplitter,
+    chunkSplitter: formatChunkSplitter,
     qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
   };
 };
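Callers now hand the entire form value plus the two models to `collectionChunkForm2StoreChunkData` and receive resolved chunk params back. A usage sketch with the inputs declared as assumptions (the relative import path is illustrative):

import type { EmbeddingModelItemType, LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
import {
  collectionChunkForm2StoreChunkData,
  type CollectionChunkFormType
} from './CollectionChunkForm';

declare const formValues: CollectionChunkFormType; // e.g. form.getValues()
declare const datasetDetail: {
  agentModel: LLMModelItemType;
  vectorModel: EmbeddingModelItemType;
};

// In `auto` mode the returned chunkSize/indexSize/chunkSplitter come from
// the training-mode defaults; in `custom` mode they keep the form's values.
const chunkData = collectionChunkForm2StoreChunkData({
  ...formValues,
  agentModel: datasetDetail.agentModel,
  vectorModel: datasetDetail.vectorModel
});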
@@ -3,8 +3,10 @@ import { type SetStateAction, useMemo, useState } from 'react';
 import { useTranslation } from 'next-i18next';
 import { createContext, useContextSelector } from 'use-context-selector';
 import {
+  ChunkTriggerConfigTypeEnum,
   DatasetCollectionDataProcessModeEnum,
-  ImportDataSourceEnum
+  ImportDataSourceEnum,
+  ParagraphChunkAIModeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import { useMyStep } from '@fastgpt/web/hooks/useStep';
 import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
@@ -16,38 +18,14 @@ import { type ImportSourceItemType } from '@/web/core/dataset/type';
 import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
 import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
 import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
-import {
-  getMaxChunkSize,
-  getLLMDefaultChunkSize,
-  getLLMMaxChunkSize,
-  chunkAutoChunkSize,
-  minChunkSize,
-  getAutoIndexSize,
-  getMaxIndexSize
-} from '@fastgpt/global/core/dataset/training/utils';
+import { chunkAutoChunkSize, getAutoIndexSize } from '@fastgpt/global/core/dataset/training/utils';
 import { type CollectionChunkFormType } from '../Form/CollectionChunkForm';
 
-type ChunkSizeFieldType = 'embeddingChunkSize' | 'qaChunkSize';
 export type ImportFormType = {
   customPdfParse: boolean;
 
   webSelector: string;
 } & CollectionChunkFormType;
 
-type TrainingFiledType = {
-  chunkOverlapRatio: number;
-  maxChunkSize: number;
-  minChunkSize: number;
-  autoChunkSize: number;
-  chunkSize: number;
-  maxIndexSize?: number;
-  indexSize?: number;
-  autoIndexSize?: number;
-  charsPointsPrice: number;
-  priceTip: string;
-  uploadRate: number;
-  chunkSizeField: ChunkSizeFieldType;
-};
 type DatasetImportContextType = {
   importSource: ImportDataSourceEnum;
   parentId: string | undefined;
@@ -57,7 +35,35 @@ type DatasetImportContextType = {
   processParamsForm: UseFormReturn<ImportFormType, any>;
   sources: ImportSourceItemType[];
   setSources: React.Dispatch<React.SetStateAction<ImportSourceItemType[]>>;
-} & TrainingFiledType;
+};
+
+export const defaultFormData: ImportFormType = {
+  customPdfParse: false,
+
+  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
+
+  chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
+  chunkTriggerMinSize: chunkAutoChunkSize,
+
+  dataEnhanceCollectionName: false,
+
+  imageIndex: false,
+  autoIndexes: false,
+
+  chunkSettingMode: ChunkSettingModeEnum.auto,
+  chunkSplitMode: DataChunkSplitModeEnum.size,
+  paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
+  paragraphChunkDeep: 4,
+  paragraphChunkMinSize: 100,
+  paragraphChunkMaxSize: chunkAutoChunkSize,
+
+  chunkSize: chunkAutoChunkSize,
+  chunkSplitter: '',
+  indexSize: getAutoIndexSize(),
+
+  qaPrompt: Prompt_AgentQA.description,
+  webSelector: ''
+};
 
 export const DatasetImportContext = createContext<DatasetImportContextType>({
   importSource: ImportDataSourceEnum.fileLocal,
|
|||||||
},
|
},
|
||||||
chunkSize: 0,
|
chunkSize: 0,
|
||||||
chunkOverlapRatio: 0,
|
chunkOverlapRatio: 0,
|
||||||
uploadRate: 0,
|
|
||||||
//@ts-ignore
|
//@ts-ignore
|
||||||
processParamsForm: undefined,
|
processParamsForm: undefined,
|
||||||
autoChunkSize: 0,
|
autoChunkSize: 0
|
||||||
charsPointsPrice: 0,
|
|
||||||
priceTip: ''
|
|
||||||
});
|
});
|
||||||
|
|
||||||
const DatasetImportContextProvider = ({ children }: { children: React.ReactNode }) => {
|
const DatasetImportContextProvider = ({ children }: { children: React.ReactNode }) => {
|
||||||
@@ -180,119 +183,17 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
   });
 
   const vectorModel = datasetDetail.vectorModel;
-  const agentModel = datasetDetail.agentModel;
 
   const processParamsForm = useForm<ImportFormType>({
     defaultValues: {
-      imageIndex: false,
-      autoIndexes: false,
-
-      trainingType: DatasetCollectionDataProcessModeEnum.chunk,
-
-      chunkSettingMode: ChunkSettingModeEnum.auto,
-
-      chunkSplitMode: DataChunkSplitModeEnum.size,
-      embeddingChunkSize: chunkAutoChunkSize,
-      indexSize: vectorModel?.defaultToken || 512,
-      qaChunkSize: getLLMDefaultChunkSize(agentModel),
-      chunkSplitter: '',
-      qaPrompt: Prompt_AgentQA.description,
-      webSelector: '',
-      customPdfParse: false
+      ...defaultFormData,
+      indexSize: getAutoIndexSize(vectorModel)
     }
   });
 
   const [sources, setSources] = useState<ImportSourceItemType[]>([]);
 
-  // watch form
-  const trainingType = processParamsForm.watch('trainingType');
-  const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
-  const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
-  const qaChunkSize = processParamsForm.watch('qaChunkSize');
-  const chunkSplitter = processParamsForm.watch('chunkSplitter');
-  const autoIndexes = processParamsForm.watch('autoIndexes');
-  const indexSize = processParamsForm.watch('indexSize');
-
-  const TrainingModeMap = useMemo<TrainingFiledType>(() => {
-    if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
-      return {
-        chunkSizeField: 'qaChunkSize',
-        chunkOverlapRatio: 0,
-        maxChunkSize: getLLMMaxChunkSize(agentModel),
-        minChunkSize: 1000,
-        autoChunkSize: getLLMDefaultChunkSize(agentModel),
-        chunkSize: qaChunkSize,
-        charsPointsPrice: agentModel.charsPointsPrice || 0,
-        priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
-          price: agentModel.charsPointsPrice
-        }),
-        uploadRate: 30
-      };
-    } else if (autoIndexes) {
-      return {
-        chunkSizeField: 'embeddingChunkSize',
-        chunkOverlapRatio: 0.2,
-        maxChunkSize: getMaxChunkSize(agentModel),
-        minChunkSize: minChunkSize,
-        autoChunkSize: chunkAutoChunkSize,
-        chunkSize: embeddingChunkSize,
-        maxIndexSize: getMaxIndexSize(vectorModel),
-        autoIndexSize: getAutoIndexSize(vectorModel),
-        indexSize,
-        charsPointsPrice: agentModel.charsPointsPrice || 0,
-        priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
-          price: agentModel.charsPointsPrice
-        }),
-        uploadRate: 100
-      };
-    } else {
-      return {
-        chunkSizeField: 'embeddingChunkSize',
-        chunkOverlapRatio: 0.2,
-        maxChunkSize: getMaxChunkSize(agentModel),
-        minChunkSize: minChunkSize,
-        autoChunkSize: chunkAutoChunkSize,
-        chunkSize: embeddingChunkSize,
-        maxIndexSize: getMaxIndexSize(vectorModel),
-        autoIndexSize: getAutoIndexSize(vectorModel),
-        indexSize,
-        charsPointsPrice: vectorModel.charsPointsPrice || 0,
-        priceTip: t('dataset:import.Embedding Estimated Price Tips', {
-          price: vectorModel.charsPointsPrice
-        }),
-        uploadRate: 150
-      };
-    }
-  }, [
-    trainingType,
-    autoIndexes,
-    agentModel,
-    qaChunkSize,
-    t,
-    embeddingChunkSize,
-    vectorModel,
-    indexSize
-  ]);
-
-  const chunkSettingModeMap = useMemo(() => {
-    if (chunkSettingMode === ChunkSettingModeEnum.auto) {
-      return {
-        chunkSize: TrainingModeMap.autoChunkSize,
-        indexSize: TrainingModeMap.autoIndexSize,
-        chunkSplitter: ''
-      };
-    } else {
-      return {
-        chunkSize: TrainingModeMap.chunkSize,
-        indexSize: TrainingModeMap.indexSize,
-        chunkSplitter
-      };
-    }
-  }, [chunkSettingMode, TrainingModeMap, chunkSplitter]);
-
   const contextValue = {
-    ...TrainingModeMap,
-    ...chunkSettingModeMap,
     importSource: source,
     parentId,
     activeStep,
@@ -17,6 +17,7 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
 import Markdown from '@/components/Markdown';
 import { useToast } from '@fastgpt/web/hooks/useToast';
 import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
+import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
 
 const PreviewData = () => {
   const { t } = useTranslation();
@@ -28,8 +29,6 @@ const PreviewData = () => {
 
   const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
   const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
-  const chunkSize = useContextSelector(DatasetImportContext, (v) => v.chunkSize);
-  const chunkOverlapRatio = useContextSelector(DatasetImportContext, (v) => v.chunkOverlapRatio);
   const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
 
   const [previewFile, setPreviewFile] = useState<ImportSourceItemType>();
@@ -37,13 +36,20 @@ const PreviewData = () => {
   const { data = { chunks: [], total: 0 }, loading: isLoading } = useRequest2(
     async () => {
       if (!previewFile) return { chunks: [], total: 0 };
+
+      const chunkData = collectionChunkForm2StoreChunkData({
+        ...processParamsForm.getValues(),
+        vectorModel: datasetDetail.vectorModel,
+        agentModel: datasetDetail.agentModel
+      });
+
       if (importSource === ImportDataSourceEnum.fileCustom) {
         const chunkSplitter = processParamsForm.getValues('chunkSplitter');
         const { chunks } = splitText2Chunks({
           text: previewFile.rawText || '',
-          chunkSize,
+          chunkSize: chunkData.chunkSize,
           maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
-          overlapRatio: chunkOverlapRatio,
+          overlapRatio: 0.2,
           customReg: chunkSplitter ? [chunkSplitter] : []
         });
         return {
@@ -64,18 +70,12 @@ const PreviewData = () => {
           previewFile.externalFileUrl ||
           previewFile.apiFileId ||
           '',
+        externalFileId: previewFile.externalFileId,
 
-        customPdfParse: processParamsForm.getValues('customPdfParse'),
-
-        trainingType: processParamsForm.getValues('trainingType'),
-        chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
-        chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
-        chunkSize,
-        chunkSplitter: processParamsForm.getValues('chunkSplitter'),
-        overlapRatio: chunkOverlapRatio,
-
+        ...chunkData,
         selector: processParamsForm.getValues('webSelector'),
-        externalFileId: previewFile.externalFileId
+        customPdfParse: processParamsForm.getValues('customPdfParse'),
+        overlapRatio: 0.2
       });
     },
     {
@@ -37,6 +37,7 @@ import { useContextSelector } from 'use-context-selector';
 import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
 import { DatasetImportContext, type ImportFormType } from '../Context';
 import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
+import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
 
 const Upload = () => {
   const { t } = useTranslation();
@@ -48,10 +49,10 @@ const Upload = () => {
   const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
   const retrainNewCollectionId = useRef('');
 
-  const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } =
-    useContextSelector(DatasetImportContext, (v) => v);
-  const { handleSubmit } = processParamsForm;
+  const { importSource, parentId, sources, setSources, processParamsForm } = useContextSelector(
+    DatasetImportContext,
+    (v) => v
+  );
 
   const { totalFilesCount, waitingFilesCount, allFinished, hasCreatingFiles } = useMemo(() => {
     const totalFilesCount = sources.length;
@@ -80,7 +81,13 @@ const Upload = () => {
   }, [waitingFilesCount, totalFilesCount, allFinished, t]);
 
   const { runAsync: startUpload, loading: isLoading } = useRequest2(
-    async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => {
+    async ({ customPdfParse, webSelector, ...data }: ImportFormType) => {
+      const chunkData = collectionChunkForm2StoreChunkData({
+        ...data,
+        vectorModel: datasetDetail.vectorModel,
+        agentModel: datasetDetail.agentModel
+      });
+
       if (sources.length === 0) return;
       const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
 
@@ -101,23 +108,12 @@ const Upload = () => {
       const commonParams: ApiCreateDatasetCollectionParams & {
         name: string;
       } = {
+        ...chunkData,
         parentId,
        datasetId: datasetDetail._id,
         name: item.sourceName,
-
-        customPdfParse: processParamsForm.getValues('customPdfParse'),
-
-        trainingType,
-        imageIndex: processParamsForm.getValues('imageIndex'),
-        autoIndexes: processParamsForm.getValues('autoIndexes'),
-
-        chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
-        chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
-
-        chunkSize,
-        indexSize,
-        chunkSplitter,
-        qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
+        customPdfParse
       };
 
       if (importSource === ImportDataSourceEnum.reTraining) {
@@ -280,7 +276,10 @@ const Upload = () => {
         </TableContainer>
 
         <Flex justifyContent={'flex-end'} mt={4}>
-          <Button isLoading={isLoading} onClick={handleSubmit((data) => startUpload(data))}>
+          <Button
+            isLoading={isLoading}
+            onClick={processParamsForm.handleSubmit((data) => startUpload(data))}
+          >
             {totalFilesCount > 0 &&
               `${t('dataset:total_num_files', {
                 total: totalFilesCount
@@ -1,6 +1,6 @@
 import React from 'react';
 import { useContextSelector } from 'use-context-selector';
-import { DatasetImportContext } from '../Context';
+import { DatasetImportContext, defaultFormData } from '../Context';
 import dynamic from 'next/dynamic';
 import DataProcess from '../commonProgress/DataProcess';
@@ -48,18 +48,36 @@ const ReTraining = () => {
     ]);
 
     processParamsForm.reset({
-      customPdfParse: collection.customPdfParse,
+      customPdfParse: collection.customPdfParse || false,
       trainingType: collection.trainingType,
-      imageIndex: collection.imageIndex,
-      autoIndexes: collection.autoIndexes,
-
-      chunkSettingMode: collection.chunkSettingMode || ChunkSettingModeEnum.auto,
-      chunkSplitMode: collection.chunkSplitMode || DataChunkSplitModeEnum.size,
-      embeddingChunkSize: collection.chunkSize,
-      qaChunkSize: collection.chunkSize,
-      indexSize: collection.indexSize || 512,
-      chunkSplitter: collection.chunkSplitter,
-      webSelector: collection.metadata?.webPageSelector,
+      chunkTriggerType: collection.chunkTriggerType || defaultFormData.chunkTriggerType,
+      chunkTriggerMinSize: collection.chunkTriggerMinSize || defaultFormData.chunkTriggerMinSize,
+
+      dataEnhanceCollectionName:
+        collection.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,
+
+      imageIndex: collection.imageIndex || defaultFormData.imageIndex,
+      autoIndexes: collection.autoIndexes || defaultFormData.autoIndexes,
+
+      chunkSettingMode: collection.chunkSettingMode || defaultFormData.chunkSettingMode,
+      chunkSplitMode: collection.chunkSplitMode || defaultFormData.chunkSplitMode,
+
+      paragraphChunkAIMode:
+        collection.paragraphChunkAIMode || defaultFormData.paragraphChunkAIMode,
+      paragraphChunkDeep: collection.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
+      paragraphChunkMinSize:
+        collection.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
+      paragraphChunkMaxSize:
+        collection.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,
+
+      chunkSize: collection.chunkSize || defaultFormData.chunkSize,
+      chunkSplitter: collection.chunkSplitter || defaultFormData.chunkSplitter,
+      indexSize: collection.indexSize || defaultFormData.indexSize,
+
+      webSelector: collection.metadata?.webPageSelector || defaultFormData.webSelector,
       qaPrompt: collection.qaPrompt || Prompt_AgentQA.description
     });
   }
@@ -72,18 +72,26 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
       label: t('common:core.dataset.collection.metadata.Raw text length'),
       value: collection.rawTextLength ?? '-'
     },
-    {
-      label: t('dataset:collection_metadata_image_parse'),
-      value: collection.imageIndex ? 'Yes' : 'No'
-    },
-    {
-      label: t('dataset:auto_indexes'),
-      value: collection.autoIndexes ? 'Yes' : 'No'
-    },
     {
       label: t('dataset:collection.training_type'),
       value: t(DatasetCollectionDataProcessModeMap[collection.trainingType]?.label as any)
     },
+    ...(collection.imageIndex !== undefined
+      ? [
+          {
+            label: t('dataset:data_index_image'),
+            value: collection.imageIndex ? 'Yes' : 'No'
+          }
+        ]
+      : []),
+    ...(collection.autoIndexes !== undefined
+      ? [
+          {
+            label: t('dataset:auto_indexes'),
+            value: collection.autoIndexes ? 'Yes' : 'No'
+          }
+        ]
+      : []),
     ...(collection.chunkSize
      ? [
          {