perf: dataset import params code (#4875)

* perf: dataset import params code

* perf: api dataset code

* model
This commit is contained in:
Archer
2025-05-23 10:40:25 +08:00
committed by GitHub
parent 9af92d1eae
commit fae76e887a
23 changed files with 366 additions and 295 deletions

View File

@@ -1,9 +1,11 @@
import type { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import type { ChunkSettingsType, DatasetDataIndexItemType, DatasetSchemaType } from './type';
import type {
DatasetCollectionTypeEnum,
DatasetCollectionDataProcessModeEnum,
ChunkSettingModeEnum,
DataChunkSplitModeEnum
DataChunkSplitModeEnum,
ChunkTriggerConfigTypeEnum,
ParagraphChunkAIModeEnum
} from './constants';
import type { LLMModelItemType } from '../ai/model.d';
import type { ParentIdType } from 'common/parentFolder/type';
@@ -32,26 +34,16 @@ export type DatasetUpdateBody = {
};
/* ================= collection ===================== */
export type DatasetCollectionChunkMetadataType = {
// Input + store params
type DatasetCollectionStoreDataType = ChunkSettingsType & {
parentId?: string;
customPdfParse?: boolean;
trainingType?: DatasetCollectionDataProcessModeEnum;
imageIndex?: boolean;
autoIndexes?: boolean;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSize?: number;
indexSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
metadata?: Record<string, any>;
customPdfParse?: boolean;
};
// create collection params
export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
export type CreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
datasetId: string;
name: string;
type: DatasetCollectionTypeEnum;
@@ -72,7 +64,7 @@ export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType &
nextSyncTime?: Date;
};
export type ApiCreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
export type ApiCreateDatasetCollectionParams = DatasetCollectionStoreDataType & {
datasetId: string;
tags?: string[];
};
@@ -90,7 +82,7 @@ export type ApiDatasetCreateDatasetCollectionParams = ApiCreateDatasetCollection
/**
 * Request parameters for creating a dataset collection identified by a file id:
 * everything in ApiCreateDatasetCollectionParams plus a required `fileId`.
 * NOTE(review): presumably `fileId` refers to an already-uploaded file — confirm against the caller.
 */
export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
fileId: string;
};
export type reTrainingDatasetFileCollectionParams = DatasetCollectionChunkMetadataType & {
export type reTrainingDatasetFileCollectionParams = DatasetCollectionStoreDataType & {
datasetId: string;
collectionId: string;
};

View File

@@ -143,15 +143,25 @@ export const DatasetCollectionDataProcessModeMap = {
}
};
/**
 * Allowed values for the `chunkTriggerType` chunk setting.
 * Each member's runtime value is the same string as its name.
 * NOTE(review): the names suggest "chunk only above a minimum size", "always chunk",
 * and "chunk only above a maximum size" — confirm against the code that consumes this enum.
 */
export enum ChunkTriggerConfigTypeEnum {
minSize = 'minSize',
forceChunk = 'forceChunk',
maxSize = 'maxSize'
}
/**
 * Allowed values for the `chunkSettingMode` chunk setting.
 * The field's inline comment describes the choice as system parameters vs. custom parameters.
 */
export enum ChunkSettingModeEnum {
// Presumably: use the system-provided chunk parameters — confirm.
auto = 'auto',
// Presumably: use the caller-supplied chunk parameters — confirm.
custom = 'custom'
}
/**
 * Allowed values for the `chunkSplitMode` chunk setting.
 * ChunkSettingsType groups its option fields under matching headings
 * ("Paragraph split", "Size split", "Char split").
 */
export enum DataChunkSplitModeEnum {
paragraph = 'paragraph',
size = 'size',
char = 'char'
}
/**
 * Allowed values for the `paragraphChunkAIMode` chunk setting.
 * NOTE(review): presumably controls whether AI-assisted paragraph detection is
 * applied automatically or always — confirm against the code that consumes this enum.
 */
export enum ParagraphChunkAIModeEnum {
auto = 'auto',
force = 'force'
}
/* ------------ data -------------- */

View File

@@ -32,7 +32,7 @@ export const DatasetDataIndexMap: Record<
color: 'red'
},
[DatasetDataIndexTypeEnum.image]: {
label: i18nT('common:data_index_image'),
label: i18nT('dataset:data_index_image'),
color: 'purple'
}
};

View File

@@ -8,26 +8,42 @@ import type {
DatasetStatusEnum,
DatasetTypeEnum,
SearchScoreTypeEnum,
TrainingModeEnum
TrainingModeEnum,
ChunkSettingModeEnum
} from './constants';
import type { DatasetPermission } from '../../support/permission/dataset/controller';
import { Permission } from '../../support/permission/controller';
import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
import type { SourceMemberType } from 'support/user/type';
import type { DatasetDataIndexTypeEnum } from './data/constants';
import type { ChunkSettingModeEnum } from './constants';
/**
 * Chunking / training settings shared by the dataset collection types: it is
 * intersected into DatasetCollectionSchemaType and DatasetCollectionStoreDataType
 * (and, through the latter, into the collection create / retrain parameter types).
 *
 * NOTE(review): this listing looks like a diff rendered without +/- markers.
 * Several properties appear twice (trainingType, autoIndexes, chunkSettingMode,
 * indexSize), presumably as removed/added pairs — confirm against the real file
 * before relying on the required/optional status of any field.
 */
export type ChunkSettingsType = {
trainingType: DatasetCollectionDataProcessModeEnum;
autoIndexes?: boolean;
trainingType?: DatasetCollectionDataProcessModeEnum;
// Chunk trigger
chunkTriggerType?: ChunkTriggerConfigTypeEnum;
chunkTriggerMinSize?: number; // maxSize from agent model, not store
// Data enhance
dataEnhanceCollectionName?: boolean; // Auto add collection name to data
// Index enhance
imageIndex?: boolean;
autoIndexes?: boolean;
chunkSettingMode?: ChunkSettingModeEnum;
// Chunk setting
chunkSettingMode?: ChunkSettingModeEnum; // System parameters / custom parameters
chunkSplitMode?: DataChunkSplitModeEnum;
// Paragraph split
paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
paragraphChunkDeep?: number; // Paragraph depth
paragraphChunkMinSize?: number; // Paragraph min size; paragraphs smaller than this are merged
paragraphChunkMaxSize?: number; // Paragraph max size; paragraphs larger than this are split
// Size split
chunkSize?: number;
indexSize?: number;
// Char split
chunkSplitter?: string;
indexSize?: number;
qaPrompt?: string;
};
@@ -66,7 +82,7 @@ export type DatasetSchemaType = {
defaultPermission?: number;
};
export type DatasetCollectionSchemaType = {
export type DatasetCollectionSchemaType = ChunkSettingsType & {
_id: string;
teamId: string;
tmbId: string;
@@ -101,18 +117,7 @@ export type DatasetCollectionSchemaType = {
// Parse settings
customPdfParse?: boolean;
// Chunk settings
autoIndexes?: boolean;
imageIndex?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSize?: number;
indexSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
};
export type DatasetCollectionTagsSchemaType = {