Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -1,5 +1,5 @@
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
import type { LLMModelItemType } from '../ai/model.d';
import { ParentIdType } from 'common/parentFolder/type';
@@ -10,9 +10,11 @@ export type DatasetUpdateBody = {
name?: string;
avatar?: string;
intro?: string;
agentModel?: LLMModelItemType;
status?: DatasetSchemaType['status'];
agentModel?: string;
vlmModel?: string;
websiteConfig?: DatasetSchemaType['websiteConfig'];
externalReadUrl?: DatasetSchemaType['externalReadUrl'];
defaultPermission?: DatasetSchemaType['defaultPermission'];
@@ -27,7 +29,10 @@ export type DatasetUpdateBody = {
/* ================= collection ===================== */
export type DatasetCollectionChunkMetadataType = {
parentId?: string;
trainingType?: TrainingModeEnum;
customPdfParse?: boolean;
trainingType?: DatasetCollectionDataProcessModeEnum;
imageIndex?: boolean;
autoIndexes?: boolean;
chunkSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
@@ -131,9 +136,15 @@ export type PostWebsiteSyncParams = {
export type PushDatasetDataProps = {
collectionId: string;
data: PushDatasetDataChunkProps[];
trainingMode: TrainingModeEnum;
trainingType?: DatasetCollectionDataProcessModeEnum;
autoIndexes?: boolean;
imageIndex?: boolean;
prompt?: string;
billId?: string;
// Deprecated: kept only for backward compatibility; prefer trainingType
trainingMode?: DatasetCollectionDataProcessModeEnum;
};
export type PushDatasetDataResponse = {
insertLen: number;

View File

@@ -1,4 +1,4 @@
import { DatasetCollectionTypeEnum, TrainingModeEnum, TrainingTypeMap } from '../constants';
import { DatasetCollectionTypeEnum } from '../constants';
import { DatasetCollectionSchemaType } from '../type';
export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType) => {
@@ -16,9 +16,3 @@ export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType
/**
 * Tell whether a collection type is a container rather than a data source.
 * Both the `folder` and the `virtual` collection types count as folders.
 */
export const checkCollectionIsFolder = (type: DatasetCollectionTypeEnum) => {
  switch (type) {
    case DatasetCollectionTypeEnum.folder:
    case DatasetCollectionTypeEnum.virtual:
      return true;
    default:
      return false;
  }
};
/**
 * Resolve the display label of a training mode.
 * Returns an empty string when no mode is given or when the mode has no entry
 * in TrainingTypeMap.
 */
export const getTrainingTypeLabel = (type?: TrainingModeEnum) => {
  const entry = type ? TrainingTypeMap[type] : undefined;
  return entry ? entry.label : '';
};

View File

@@ -109,6 +109,26 @@ export const DatasetCollectionSyncResultMap = {
}
};
/**
 * Data process modes of a dataset collection.
 * Each member is also a key of DatasetCollectionDataProcessModeMap, which holds
 * its display label and tooltip.
 */
export enum DatasetCollectionDataProcessModeEnum {
chunk = 'chunk',
qa = 'qa',
auto = 'auto' // deprecated (marked "abandon" by the author); still declared and mapped
}
/**
 * Display metadata for every DatasetCollectionDataProcessModeEnum member.
 * Each entry carries a `label` and a `tooltip`, both produced by i18nT from a
 * `common:` translation key.
 */
export const DatasetCollectionDataProcessModeMap = {
[DatasetCollectionDataProcessModeEnum.chunk]: {
label: i18nT('common:core.dataset.training.Chunk mode'),
tooltip: i18nT('common:core.dataset.import.Chunk Split Tip')
},
[DatasetCollectionDataProcessModeEnum.qa]: {
label: i18nT('common:core.dataset.training.QA mode'),
tooltip: i18nT('common:core.dataset.import.QA Import Tip')
},
// The `auto` member is flagged as abandoned in the enum; its entry is still provided here.
[DatasetCollectionDataProcessModeEnum.auto]: {
label: i18nT('common:core.dataset.training.Auto mode'),
tooltip: i18nT('common:core.dataset.training.Auto mode Tip')
}
};
/* ------------ data -------------- */
/* ------------ training -------------- */
@@ -124,28 +144,11 @@ export enum ImportDataSourceEnum {
export enum TrainingModeEnum {
chunk = 'chunk',
qa = 'qa',
auto = 'auto',
qa = 'qa'
image = 'image'
}
/**
 * Display metadata keyed by TrainingModeEnum (chunk, auto, qa): an i18n label,
 * an i18n tooltip and an `openSource` flag.
 * NOTE(review): `openSource` presumably marks whether the mode is available in
 * the open-source edition — confirm against the consumers.
 */
export const TrainingTypeMap = {
[TrainingModeEnum.chunk]: {
label: i18nT('common:core.dataset.training.Chunk mode'),
tooltip: i18nT('common:core.dataset.import.Chunk Split Tip'),
openSource: true
},
[TrainingModeEnum.auto]: {
label: i18nT('common:core.dataset.training.Auto mode'),
tooltip: i18nT('common:core.dataset.training.Auto mode Tip'),
openSource: false
},
[TrainingModeEnum.qa]: {
label: i18nT('common:core.dataset.training.QA mode'),
tooltip: i18nT('common:core.dataset.import.QA Import Tip'),
openSource: true
}
};
/* ------------ search -------------- */
export enum DatasetSearchModeEnum {
embedding = 'embedding',

View File

@@ -20,9 +20,22 @@ export type UpdateDatasetDataProps = {
})[];
};
/**
 * One pending change to a data index.
 * `type` names the operation; `index` is a DatasetDataIndexItemType whose
 * `dataId` is optional regardless of the operation.
 */
export type PatchIndexesProps = {
type: 'create' | 'update' | 'delete' | 'unChange';
// Same fields as DatasetDataIndexItemType, but `dataId` may be absent.
index: Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string;
};
};
/**
 * One pending change to a data index, discriminated by `type`.
 * Only the 'create' variant may omit `index.dataId`; the 'update', 'delete'
 * and 'unChange' variants carry a full DatasetDataIndexItemType.
 */
export type PatchIndexesProps =
| {
type: 'create';
// `dataId` is optional for an index that is being created.
index: Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string;
};
}
| {
type: 'update';
index: DatasetDataIndexItemType;
}
| {
type: 'delete';
index: DatasetDataIndexItemType;
}
| {
type: 'unChange';
index: DatasetDataIndexItemType;
};

View File

@@ -0,0 +1,42 @@
import { i18nT } from '../../../../web/i18n/utils';
/**
 * Kinds of index a dataset data item can have.
 * Every member is also a key of DatasetDataIndexMap, which supplies its
 * display label and color.
 */
export enum DatasetDataIndexTypeEnum {
default = 'default',
custom = 'custom',
summary = 'summary',
question = 'question',
image = 'image'
}
/**
 * Display metadata for each index type: an i18n label (from the `dataset:`
 * namespace) and a color name.
 * The Record key type requires one entry per DatasetDataIndexTypeEnum value.
 * NOTE(review): `label` is typed `any`; a narrower type would be safer —
 * confirm what i18nT returns before tightening it.
 */
export const DatasetDataIndexMap: Record<
`${DatasetDataIndexTypeEnum}`,
{
label: any;
color: string;
}
> = {
[DatasetDataIndexTypeEnum.default]: {
label: i18nT('dataset:data_index_default'),
color: 'gray'
},
[DatasetDataIndexTypeEnum.custom]: {
label: i18nT('dataset:data_index_custom'),
color: 'blue'
},
[DatasetDataIndexTypeEnum.summary]: {
label: i18nT('dataset:data_index_summary'),
color: 'green'
},
[DatasetDataIndexTypeEnum.question]: {
label: i18nT('dataset:data_index_question'),
color: 'red'
},
[DatasetDataIndexTypeEnum.image]: {
label: i18nT('dataset:data_index_image'),
color: 'purple'
}
};
// Fallback display entry: the one registered for the `custom` index type.
export const defaultDatasetIndexData = DatasetDataIndexMap[DatasetDataIndexTypeEnum.custom];

/**
 * Look up the display entry (label and color) for an index type.
 * Falls back to defaultDatasetIndexData when the map has no entry for `type`.
 */
export const getDatasetIndexMapData = (type: `${DatasetDataIndexTypeEnum}`) => {
  const entry = DatasetDataIndexMap[type];
  if (entry) {
    return entry;
  }
  return defaultDatasetIndexData;
};

View File

@@ -0,0 +1,20 @@
import { PushDatasetDataChunkProps } from '../api';
import { TrainingModeEnum } from '../constants';
/**
 * Arguments describing a batch of data chunks to push to the training queue,
 * together with the owning team/member, the target dataset and collection,
 * and the model names to use.
 */
export type PushDataToTrainingQueueProps = {
teamId: string;
tmbId: string;
datasetId: string;
collectionId: string;
// Optional training mode; how an omitted value is handled is decided by the consumer.
mode?: TrainingModeEnum;
data: PushDatasetDataChunkProps[];
prompt?: string;
agentModel: string;
vectorModel: string;
vlmModel?: string;
billId?: string;
// NOTE(review): `ClientSession` has no import in the visible part of this file —
// confirm it resolves (presumably the database driver's session type).
session?: ClientSession;
};

View File

@@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
import { PermissionTypeEnum } from '../../support/permission/constant';
import { PushDatasetDataChunkProps } from './api';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum,
DatasetStatusEnum,
DatasetTypeEnum,
@@ -12,6 +13,7 @@ import { DatasetPermission } from '../../support/permission/dataset/controller';
import { Permission } from '../../support/permission/controller';
import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
import { SourceMemberType } from 'support/user/type';
import { DatasetDataIndexTypeEnum } from './data/constants';
export type DatasetSchemaType = {
_id: string;
@@ -23,11 +25,14 @@ export type DatasetSchemaType = {
avatar: string;
name: string;
vectorModel: string;
agentModel: string;
intro: string;
type: `${DatasetTypeEnum}`;
status: `${DatasetStatusEnum}`;
vectorModel: string;
agentModel: string;
vlmModel?: string;
websiteConfig?: {
url: string;
selector: string;
@@ -52,26 +57,22 @@ export type DatasetCollectionSchemaType = {
parentId?: string;
name: string;
type: DatasetCollectionTypeEnum;
createTime: Date;
updateTime: Date;
forbid?: boolean;
trainingType: TrainingModeEnum;
chunkSize: number;
chunkSplitter?: string;
qaPrompt?: string;
ocrParse?: boolean;
tags?: string[];
createTime: Date;
updateTime: Date;
// Status
forbid?: boolean;
nextSyncTime?: Date;
// Collection metadata
fileId?: string; // local file id
rawLink?: string; // link url
externalFileId?: string; //external file id
apiFileId?: string; // api file id
externalFileUrl?: string; // external import url
nextSyncTime?: Date;
rawTextLength?: number;
hashRawText?: string;
metadata?: {
@@ -80,6 +81,16 @@ export type DatasetCollectionSchemaType = {
[key: string]: any;
};
// Parse settings
customPdfParse?: boolean;
// Chunk settings
autoIndexes?: boolean;
imageIndex?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSize: number;
chunkSplitter?: string;
qaPrompt?: string;
};
export type DatasetCollectionTagsSchemaType = {
@@ -90,7 +101,7 @@ export type DatasetCollectionTagsSchemaType = {
};
export type DatasetDataIndexItemType = {
defaultIndex: boolean;
type: `${DatasetDataIndexTypeEnum}`;
dataId: string; // pg data id
text: string;
};
@@ -141,6 +152,7 @@ export type DatasetTrainingSchemaType = {
chunkIndex: number;
weight: number;
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
retryCount: number;
};
export type CollectionWithDatasetType = DatasetCollectionSchemaType & {
@@ -169,9 +181,10 @@ export type DatasetListItemType = {
sourceMember?: SourceMemberType;
};
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel'> & {
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel' | 'vlmModel'> & {
vectorModel: EmbeddingModelItemType;
agentModel: LLMModelItemType;
vlmModel?: LLMModelItemType;
permission: DatasetPermission;
};

View File

@@ -1,6 +1,7 @@
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { getFileIcon } from '../../common/file/icon';
import { strIsLink } from '../../common/string/tools';
import { DatasetDataIndexTypeEnum } from './data/constants';
export function getCollectionIcon(
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
@@ -38,14 +39,23 @@ export function getSourceNameIcon({
}
/* get dataset data default index */
/**
 * Build the single default index for a data item.
 *
 * @param props Optional question (`q`), answer (`a`) and existing `dataId`.
 * @returns An index flagged `defaultIndex: true`. Its text is the trimmed
 *   "question\nanswer" when an answer is present, otherwise the question
 *   exactly as given (empty string when missing).
 */
export function getDefaultIndex(props?: { q?: string; a?: string; dataId?: string }) {
  const { q: question = '', a: answer, dataId } = props || {};

  let text = question;
  if (answer) {
    // Only the combined form is trimmed; a lone question is kept verbatim.
    text = `${question}\n${answer}`.trim();
  }

  return { defaultIndex: true, text, dataId };
}
/**
 * Build the default indexes for a data item.
 *
 * @param props Optional question (`q`) and answer (`a`).
 * @returns An array that always holds one `default`-typed index for the
 *   question (empty text when missing), followed by a second `default`-typed
 *   index for the answer when the answer is non-empty.
 */
export function getDefaultIndex(props?: { q?: string; a?: string }) {
  const { q: question = '', a: answer } = props || {};

  const indexes = [{ text: question, type: DatasetDataIndexTypeEnum.default }];
  if (answer) {
    indexes.push({ text: answer, type: DatasetDataIndexTypeEnum.default });
  }
  return indexes;
}
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {