perf: dataset import params code (#4875)

* perf: dataset import params code
* perf: api dataset code
* model

packages/global/core/dataset/api.d.ts (vendored, 30 lines changed)

@@ -1,9 +1,11 @@
-import type { DatasetDataIndexItemType, DatasetSchemaType } from './type';
+import type { ChunkSettingsType, DatasetDataIndexItemType, DatasetSchemaType } from './type';
 import type {
   DatasetCollectionTypeEnum,
   DatasetCollectionDataProcessModeEnum,
   ChunkSettingModeEnum,
-  DataChunkSplitModeEnum
+  DataChunkSplitModeEnum,
+  ChunkTriggerConfigTypeEnum,
+  ParagraphChunkAIModeEnum
 } from './constants';
 import type { LLMModelItemType } from '../ai/model.d';
 import type { ParentIdType } from 'common/parentFolder/type';
@@ -32,26 +34,16 @@ export type DatasetUpdateBody = {
 };

 /* ================= collection ===================== */
-export type DatasetCollectionChunkMetadataType = {
+// Input + store params
+type DatasetCollectionStoreDataType = ChunkSettingsType & {
   parentId?: string;
-  trainingType?: DatasetCollectionDataProcessModeEnum;
-  imageIndex?: boolean;
-  autoIndexes?: boolean;
-
-  chunkSettingMode?: ChunkSettingModeEnum;
-  chunkSplitMode?: DataChunkSplitModeEnum;
-
-  chunkSize?: number;
-  indexSize?: number;
-
-  chunkSplitter?: string;
-  qaPrompt?: string;
+  customPdfParse?: boolean;
   metadata?: Record<string, any>;
-
-  customPdfParse?: boolean;
 };

 // create collection params
-export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
+export type CreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
   datasetId: string;
   name: string;
@@ -72,7 +64,7 @@ export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType &
   nextSyncTime?: Date;
 };

-export type ApiCreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
+export type ApiCreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
   datasetId: string;
   tags?: string[];
 };
@@ -90,7 +82,7 @@ export type ApiDatasetCreateDatasetCollectionParams = ApiCreateDatasetCollection
 export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
   fileId: string;
 };
-export type reTrainingDatasetFileCollectionParams = DatasetCollectionChunkMetadataType & {
+export type reTrainingDatasetFileCollectionParams = DatasetCollectionStoreDataType & {
   datasetId: string;
   collectionId: string;
 };
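
The collapse of DatasetCollectionChunkMetadataType into DatasetCollectionStoreDataType means every create/retrain entry point now derives its chunk fields from the shared ChunkSettingsType. A minimal sketch of the composition, with local stand-ins for the imported types (illustrative only; enum members beyond `qa` are assumptions, not the real definitions):

// Local stand-ins for './type' and './constants' exports (illustrative only).
enum DatasetCollectionDataProcessModeEnum { chunk = 'chunk', qa = 'qa' }
type ChunkSettingsType = {
  trainingType?: DatasetCollectionDataProcessModeEnum;
  chunkSize?: number;
  qaPrompt?: string;
};

// Shape of the new shared store type: chunk settings plus store-only fields.
type DatasetCollectionStoreDataType = ChunkSettingsType & {
  parentId?: string;
  customPdfParse?: boolean;
  metadata?: Record<string, any>;
};

// Any params object for the create/retrain APIs now type-checks against one shape:
const params: DatasetCollectionStoreDataType = {
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  chunkSize: 512,
  customPdfParse: true
};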

packages/global/core/dataset/constants.ts

@@ -143,15 +143,25 @@ export const DatasetCollectionDataProcessModeMap = {
   }
 };

+export enum ChunkTriggerConfigTypeEnum {
+  minSize = 'minSize',
+  forceChunk = 'forceChunk',
+  maxSize = 'maxSize'
+}
 export enum ChunkSettingModeEnum {
   auto = 'auto',
   custom = 'custom'
 }

 export enum DataChunkSplitModeEnum {
+  paragraph = 'paragraph',
   size = 'size',
   char = 'char'
 }
+export enum ParagraphChunkAIModeEnum {
+  auto = 'auto',
+  force = 'force'
+}

 /* ------------ data -------------- */
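
ChunkTriggerConfigTypeEnum governs when a document gets chunked at all, while ParagraphChunkAIModeEnum controls whether an LLM assists paragraph splitting. A hedged sketch of how a trigger check might look (the helper and the exact threshold semantics are hypothetical, not FastGPT's implementation):

enum ChunkTriggerConfigTypeEnum {
  minSize = 'minSize',
  forceChunk = 'forceChunk',
  maxSize = 'maxSize'
}

// Hypothetical helper: decide whether raw text should be split into chunks.
function shouldChunk(
  trigger: ChunkTriggerConfigTypeEnum,
  textLength: number,
  threshold: number
): boolean {
  if (trigger === ChunkTriggerConfigTypeEnum.forceChunk) return true; // always split
  // minSize / maxSize: split only once the text exceeds a size threshold (semantics assumed)
  return textLength > threshold;
}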

packages/global/core/dataset/data/constants.ts

@@ -32,7 +32,7 @@ export const DatasetDataIndexMap: Record<
     color: 'red'
   },
   [DatasetDataIndexTypeEnum.image]: {
-    label: i18nT('common:data_index_image'),
+    label: i18nT('dataset:data_index_image'),
     color: 'purple'
   }
 };

packages/global/core/dataset/type.d.ts (vendored, 45 lines changed)

@@ -8,26 +8,42 @@ import type {
   DatasetStatusEnum,
   DatasetTypeEnum,
   SearchScoreTypeEnum,
-  TrainingModeEnum
+  TrainingModeEnum,
+  ChunkSettingModeEnum
 } from './constants';
 import type { DatasetPermission } from '../../support/permission/dataset/controller';
 import { Permission } from '../../support/permission/controller';
 import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
 import type { SourceMemberType } from 'support/user/type';
 import type { DatasetDataIndexTypeEnum } from './data/constants';
-import type { ChunkSettingModeEnum } from './constants';

 export type ChunkSettingsType = {
-  trainingType: DatasetCollectionDataProcessModeEnum;
-  autoIndexes?: boolean;
+  trainingType?: DatasetCollectionDataProcessModeEnum;
+
+  // Chunk trigger
+  chunkTriggerType?: ChunkTriggerConfigTypeEnum;
+  chunkTriggerMinSize?: number; // maxSize from agent model, not store
+
+  // Data enhance
+  dataEnhanceCollectionName?: boolean; // Auto add collection name to data
+
+  // Index enhance
   imageIndex?: boolean;
+  autoIndexes?: boolean;

-  chunkSettingMode?: ChunkSettingModeEnum;
+  // Chunk setting
+  chunkSettingMode?: ChunkSettingModeEnum; // system params / custom params
   chunkSplitMode?: DataChunkSplitModeEnum;

+  // Paragraph split
+  paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
+  paragraphChunkDeep?: number; // Paragraph depth
+  paragraphChunkMinSize?: number; // Paragraph min size; smaller paragraphs are merged
+  paragraphChunkMaxSize?: number; // Paragraph max size; larger paragraphs are split
+  // Size split
   chunkSize?: number;
-  indexSize?: number;
-
+  // Char split
   chunkSplitter?: string;
+  indexSize?: number;

   qaPrompt?: string;
 };
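
Grouped this way, each comment block corresponds to one strategy: trigger, paragraph split, size split, and char split. A sketch of a custom paragraph-mode settings object under this type (values and the per-field interpretations in the comments are illustrative assumptions):

import {
  ChunkSettingModeEnum,
  ChunkTriggerConfigTypeEnum,
  DataChunkSplitModeEnum,
  DatasetCollectionDataProcessModeEnum,
  ParagraphChunkAIModeEnum
} from './constants';

const settings: ChunkSettingsType = {
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize, // only chunk text above the min size
  chunkTriggerMinSize: 1000,
  chunkSettingMode: ChunkSettingModeEnum.custom,
  chunkSplitMode: DataChunkSplitModeEnum.paragraph,
  paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto, // let the model assist when needed
  paragraphChunkDeep: 4,
  paragraphChunkMinSize: 100,
  paragraphChunkMaxSize: 2048,
  indexSize: 512
};
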
@@ -66,7 +82,7 @@ export type DatasetSchemaType = {
   defaultPermission?: number;
 };

-export type DatasetCollectionSchemaType = {
+export type DatasetCollectionSchemaType = ChunkSettingsType & {
   _id: string;
   teamId: string;
   tmbId: string;
@@ -101,18 +117,7 @@ export type DatasetCollectionSchemaType = {

   // Parse settings
   customPdfParse?: boolean;
-  // Chunk settings
-  autoIndexes?: boolean;
-  imageIndex?: boolean;
-  trainingType: DatasetCollectionDataProcessModeEnum;
-
-  chunkSettingMode?: ChunkSettingModeEnum;
-  chunkSplitMode?: DataChunkSplitModeEnum;
-
-  chunkSize?: number;
-  indexSize?: number;
-  chunkSplitter?: string;
-  qaPrompt?: string;
 };

 export type DatasetCollectionTagsSchemaType = {
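
Because DatasetCollectionSchemaType now extends ChunkSettingsType, consumers keep reading the same optional fields; they are just declared once. A small illustrative accessor using the type above (the fallback value is made up):

// Hypothetical consumer: chunk fields still resolve on a collection document.
function getEffectiveChunkSize(collection: DatasetCollectionSchemaType): number {
  return collection.chunkSize ?? 512; // assumed default, for illustration
}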

Claude.json

@@ -1,6 +1,54 @@
 {
   "provider": "Claude",
   "list": [
+    {
+      "model": "claude-sonnet-4-20250514",
+      "name": "claude-sonnet-4-20250514",
+      "maxContext": 200000,
+      "maxResponse": 8000,
+      "quoteMaxToken": 100000,
+      "maxTemperature": 1,
+      "showTopP": true,
+      "showStopSign": true,
+      "vision": true,
+      "toolChoice": true,
+      "functionCall": false,
+      "defaultSystemChatPrompt": "",
+      "datasetProcess": true,
+      "usedInClassify": true,
+      "customCQPrompt": "",
+      "usedInExtractFields": true,
+      "usedInQueryExtension": true,
+      "customExtractPrompt": "",
+      "usedInToolCall": true,
+      "defaultConfig": {},
+      "fieldMap": {},
+      "type": "llm"
+    },
+    {
+      "model": "claude-opus-4-20250514",
+      "name": "claude-opus-4-20250514",
+      "maxContext": 200000,
+      "maxResponse": 4096,
+      "quoteMaxToken": 100000,
+      "maxTemperature": 1,
+      "showTopP": true,
+      "showStopSign": true,
+      "vision": true,
+      "toolChoice": true,
+      "functionCall": false,
+      "defaultSystemChatPrompt": "",
+      "datasetProcess": true,
+      "usedInClassify": true,
+      "customCQPrompt": "",
+      "usedInExtractFields": true,
+      "usedInQueryExtension": true,
+      "customExtractPrompt": "",
+      "usedInToolCall": true,
+      "defaultConfig": {},
+      "fieldMap": {},
+      "type": "llm"
+    },
     {
       "model": "claude-3-7-sonnet-20250219",
       "name": "claude-3-7-sonnet-20250219",

Gemini.json

@@ -25,6 +25,30 @@
       "showTopP": true,
       "showStopSign": true
     },
+    {
+      "model": "gemini-2.5-flash-preview-04-17",
+      "name": "gemini-2.5-flash-preview-04-17",
+      "maxContext": 1000000,
+      "maxResponse": 8000,
+      "quoteMaxToken": 60000,
+      "maxTemperature": 1,
+      "vision": true,
+      "toolChoice": true,
+      "functionCall": false,
+      "defaultSystemChatPrompt": "",
+      "datasetProcess": true,
+      "usedInClassify": true,
+      "customCQPrompt": "",
+      "usedInExtractFields": true,
+      "usedInQueryExtension": true,
+      "customExtractPrompt": "",
+      "usedInToolCall": true,
+      "defaultConfig": {},
+      "fieldMap": {},
+      "type": "llm",
+      "showTopP": true,
+      "showStopSign": true
+    },
     {
       "model": "gemini-2.0-flash",
       "name": "gemini-2.0-flash",
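
All of these entries share the LLM item shape (LLMModelItemType from packages/global/core/ai/model.d); flags such as "datasetProcess" and "usedInClassify" gate where a model can be selected in the app. A hedged sketch of filtering a provider list for dataset processing (types reduced for illustration; not the real loader API):

type LLMConfigItem = {
  model: string;
  maxContext: number;
  datasetProcess?: boolean;
};
type ProviderConfig = { provider: string; list: LLMConfigItem[] };

// Keep only the models that may be offered for dataset import/processing.
function datasetProcessModels(config: ProviderConfig): string[] {
  return config.list.filter((m) => !!m.datasetProcess).map((m) => m.model);
}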

@@ -74,6 +74,15 @@ export const createCollectionAndInsertData = async ({
     llmModel: getLLMModel(dataset.agentModel)
   });
   const chunkSplitter = computeChunkSplitter(createCollectionParams);
+  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
+    delete createCollectionParams.chunkTriggerType;
+    delete createCollectionParams.chunkTriggerMinSize;
+    delete createCollectionParams.dataEnhanceCollectionName;
+    delete createCollectionParams.imageIndex;
+    delete createCollectionParams.autoIndexes;
+    delete createCollectionParams.indexSize;
+    delete createCollectionParams.qaPrompt;
+  }

   // 1. split chunks
   const chunks = rawText2Chunks({
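
In QA mode the trigger, enhance, and index options have no effect, so they are stripped from the params before storage; the `delete` calls are safe because all of these keys are optional on ChunkSettingsType. The same pruning pattern in isolation (a reduced, hypothetical params type):

type QAIrrelevant = {
  chunkTriggerType?: string;
  imageIndex?: boolean;
  autoIndexes?: boolean;
  indexSize?: number;
};

function pruneForQA(params: QAIrrelevant): QAIrrelevant {
  // Deleting optional keys keeps the object assignable to its declared type.
  delete params.chunkTriggerType;
  delete params.imageIndex;
  delete params.autoIndexes;
  delete params.indexSize;
  return params;
}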

@@ -163,7 +163,7 @@ export const readApiServerFileContent = async ({
   title?: string;
   rawText: string;
 }> => {
-  const data = (
+  return (
     await getApiDatasetRequest({
       apiServer,
       yuqueServer,
@@ -175,10 +175,6 @@
     apiFileId,
     customPdfParse
   });
-  if (data) {
-    return data;
-  }
-  return Promise.reject(Error);
 };

 export const rawText2Chunks = ({
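
The refactor drops the intermediate `data` variable: returning the awaited call directly is equivalent, assuming the underlying request rejects on failure rather than resolving undefined, because that rejection already propagates to the caller and makes the manual `Promise.reject` branch dead weight. The same pattern in isolation (fetcher is a stand-in):

// Before: buffer the result, null-check, and reject by hand.
async function readBefore(fetcher: () => Promise<string | undefined>): Promise<string> {
  const data = await fetcher();
  if (data) return data;
  return Promise.reject(new Error('read failed'));
}

// After: return directly; failures surface as rejections from fetcher itself.
async function readAfter(fetcher: () => Promise<string>): Promise<string> {
  return fetcher();
}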

@@ -1,10 +1,12 @@
 import { getMongoModel, Schema } from '../../common/mongo';
 import {
   ChunkSettingModeEnum,
+  ChunkTriggerConfigTypeEnum,
   DataChunkSplitModeEnum,
   DatasetCollectionDataProcessModeEnum,
   DatasetTypeEnum,
-  DatasetTypeMap
+  DatasetTypeMap,
+  ParagraphChunkAIModeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import {
   TeamCollectionName,
@@ -15,12 +17,22 @@ import type { DatasetSchemaType } from '@fastgpt/global/core/dataset/type.d';
 export const DatasetCollectionName = 'datasets';

 export const ChunkSettings = {
-  imageIndex: Boolean,
-  autoIndexes: Boolean,
   trainingType: {
     type: String,
     enum: Object.values(DatasetCollectionDataProcessModeEnum)
   },
+
+  chunkTriggerType: {
+    type: String,
+    enum: Object.values(ChunkTriggerConfigTypeEnum)
+  },
+  chunkTriggerMinSize: Number,
+
+  dataEnhanceCollectionName: Boolean,
+
+  imageIndex: Boolean,
+  autoIndexes: Boolean,
+
   chunkSettingMode: {
     type: String,
     enum: Object.values(ChunkSettingModeEnum)
@@ -29,6 +41,13 @@ export const ChunkSettings = {
     type: String,
     enum: Object.values(DataChunkSplitModeEnum)
   },
+  paragraphChunkAIMode: {
+    type: String,
+    enum: Object.values(ParagraphChunkAIModeEnum)
+  },
+  paragraphChunkDeep: Number,
+  paragraphChunkMinSize: Number,
+  paragraphChunkMaxSize: Number,
   chunkSize: Number,
   chunkSplitter: String,
@@ -115,9 +134,7 @@ const DatasetSchema = new Schema({

   // abandoned
   autoSync: Boolean,
-  externalReadUrl: {
-    type: String
-  },
+  externalReadUrl: String,
   defaultPermission: Number
 });
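
Exporting ChunkSettings as a plain field map lets the same Mongoose definitions be spread into any schema that persists chunk settings (presumably both the dataset and the collection schema). The reuse pattern, sketched with an abridged field map:

import { Schema } from 'mongoose';

// Shared field map (abridged from the definition above).
const ChunkSettings = {
  chunkSize: Number,
  chunkSplitter: String
};

// Spread into every schema that stores chunk settings.
const DatasetSchema = new Schema({ name: String, ...ChunkSettings });
const CollectionSchema = new Schema({ datasetId: String, ...ChunkSettings });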

en/common.json

@@ -749,7 +749,6 @@
   "custom_title": "Custom Title",
   "data_index_custom": "Custom index",
   "data_index_default": "Default index",
-  "data_index_image": "Image Index",
   "data_index_question": "Inferred question index",
   "data_index_summary": "Summary Index",
   "data_not_found": "Data can't be found",

en/dataset.json

@@ -22,7 +22,6 @@
   "collection.training_type": "Chunk type",
   "collection_data_count": "Data amount",
   "collection_metadata_custom_pdf_parse": "PDF enhancement analysis",
-  "collection_metadata_image_parse": "Image tagging",
   "collection_not_support_retraining": "This collection type does not support retuning parameters",
   "collection_not_support_sync": "This collection does not support synchronization",
   "collection_sync": "Sync data",
@@ -38,6 +37,7 @@
   "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
   "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
   "data_error_amount": "{{errorAmount}} Group training exception",
+  "data_index_image": "Image index",
   "data_index_num": "Index {{index}}",
   "data_process_params": "Params",
   "data_process_setting": "Processing config",

zh-CN/common.json

@@ -749,7 +749,6 @@
   "custom_title": "自定义标题",
   "data_index_custom": "自定义索引",
   "data_index_default": "默认索引",
-  "data_index_image": "图片索引",
   "data_index_question": "推测问题索引",
   "data_index_summary": "摘要索引",
   "data_not_found": "数据找不到了",

zh-CN/dataset.json

@@ -22,7 +22,6 @@
   "collection.training_type": "处理模式",
   "collection_data_count": "数据量",
   "collection_metadata_custom_pdf_parse": "PDF增强解析",
-  "collection_metadata_image_parse": "图片标注",
   "collection_not_support_retraining": "该集合类型不支持重新调整参数",
   "collection_not_support_sync": "该集合不支持同步",
   "collection_sync": "立即同步",
@@ -38,6 +37,7 @@
   "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。",
   "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
   "data_error_amount": "{{errorAmount}} 组训练异常",
+  "data_index_image": "图片索引",
   "data_index_num": "索引 {{index}}",
   "data_process_params": "处理参数",
   "data_process_setting": "数据处理配置",

zh-Hant/common.json

@@ -749,7 +749,6 @@
   "custom_title": "自訂標題",
   "data_index_custom": "自定義索引",
   "data_index_default": "預設索引",
-  "data_index_image": "圖片索引",
   "data_index_question": "推測問題索引",
   "data_index_summary": "摘要索引",
   "data_not_found": "數據找不到了",

zh-Hant/dataset.json

@@ -21,7 +21,6 @@
   "collection.training_type": "處理模式",
   "collection_data_count": "資料量",
   "collection_metadata_custom_pdf_parse": "PDF 增強解析",
-  "collection_metadata_image_parse": "圖片標註",
   "collection_not_support_retraining": "此集合類型不支援重新調整參數",
   "collection_not_support_sync": "該集合不支援同步",
   "collection_sync": "立即同步",
@@ -37,6 +36,7 @@
   "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的資料,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.”表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如:* () [] {} 等。",
   "data_amount": "{{dataAmount}} 組資料,{{indexAmount}} 組索引",
   "data_error_amount": "{{errorAmount}} 組訓練異常",
+  "data_index_image": "圖片索引",
   "data_index_num": "索引 {{index}}",
   "data_process_params": "處理參數",
   "data_process_setting": "資料處理設定",
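
These locale edits relocate `data_index_image` from the `common` namespace into `dataset` in all three languages, matching the i18nT('dataset:data_index_image') call changed in data/constants above. The lookup convention, for reference (i18next-style namespace:key resolution assumed):

// i18next-style lookup: 'namespace:key' resolves inside that namespace's JSON file.
declare function i18nT(key: string): string;

i18nT('common:data_index_image');  // old key location (now removed)
i18nT('dataset:data_index_image'); // new key location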