mirror of
https://github.com/labring/FastGPT.git
synced 2025-10-15 07:31:19 +00:00
Add image index and pdf parse (#3956)
* feat: think tag parse * feat: parse think tag test * feat: pdf parse ux * feat: doc2x parse * perf: rewrite training mode setting * feat: image parse queue * perf: image index * feat: image parse process * feat: add init sh * fix: ts
This commit is contained in:
19
packages/global/core/dataset/api.d.ts
vendored
19
packages/global/core/dataset/api.d.ts
vendored
@@ -1,5 +1,5 @@
|
||||
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
|
||||
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
|
||||
import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
|
||||
import type { LLMModelItemType } from '../ai/model.d';
|
||||
import { ParentIdType } from 'common/parentFolder/type';
|
||||
|
||||
@@ -10,9 +10,11 @@ export type DatasetUpdateBody = {
|
||||
name?: string;
|
||||
avatar?: string;
|
||||
intro?: string;
|
||||
agentModel?: LLMModelItemType;
|
||||
status?: DatasetSchemaType['status'];
|
||||
|
||||
agentModel?: string;
|
||||
vlmModel?: string;
|
||||
|
||||
websiteConfig?: DatasetSchemaType['websiteConfig'];
|
||||
externalReadUrl?: DatasetSchemaType['externalReadUrl'];
|
||||
defaultPermission?: DatasetSchemaType['defaultPermission'];
|
||||
@@ -27,7 +29,10 @@ export type DatasetUpdateBody = {
|
||||
/* ================= collection ===================== */
|
||||
export type DatasetCollectionChunkMetadataType = {
|
||||
parentId?: string;
|
||||
trainingType?: TrainingModeEnum;
|
||||
customPdfParse?: boolean;
|
||||
trainingType?: DatasetCollectionDataProcessModeEnum;
|
||||
imageIndex?: boolean;
|
||||
autoIndexes?: boolean;
|
||||
chunkSize?: number;
|
||||
chunkSplitter?: string;
|
||||
qaPrompt?: string;
|
||||
@@ -131,9 +136,15 @@ export type PostWebsiteSyncParams = {
|
||||
export type PushDatasetDataProps = {
|
||||
collectionId: string;
|
||||
data: PushDatasetDataChunkProps[];
|
||||
trainingMode: TrainingModeEnum;
|
||||
trainingType?: DatasetCollectionDataProcessModeEnum;
|
||||
autoIndexes?: boolean;
|
||||
imageIndex?: boolean;
|
||||
prompt?: string;
|
||||
|
||||
billId?: string;
|
||||
|
||||
// Abandon
|
||||
trainingMode?: DatasetCollectionDataProcessModeEnum;
|
||||
};
|
||||
export type PushDatasetDataResponse = {
|
||||
insertLen: number;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
import { DatasetCollectionTypeEnum, TrainingModeEnum, TrainingTypeMap } from '../constants';
|
||||
import { DatasetCollectionTypeEnum } from '../constants';
|
||||
import { DatasetCollectionSchemaType } from '../type';
|
||||
|
||||
export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType) => {
|
||||
@@ -16,9 +16,3 @@ export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType
|
||||
export const checkCollectionIsFolder = (type: DatasetCollectionTypeEnum) => {
|
||||
return type === DatasetCollectionTypeEnum.folder || type === DatasetCollectionTypeEnum.virtual;
|
||||
};
|
||||
|
||||
export const getTrainingTypeLabel = (type?: TrainingModeEnum) => {
|
||||
if (!type) return '';
|
||||
if (!TrainingTypeMap[type]) return '';
|
||||
return TrainingTypeMap[type].label;
|
||||
};
|
||||
|
@@ -109,6 +109,26 @@ export const DatasetCollectionSyncResultMap = {
|
||||
}
|
||||
};
|
||||
|
||||
export enum DatasetCollectionDataProcessModeEnum {
|
||||
chunk = 'chunk',
|
||||
qa = 'qa',
|
||||
auto = 'auto' // abandon
|
||||
}
|
||||
export const DatasetCollectionDataProcessModeMap = {
|
||||
[DatasetCollectionDataProcessModeEnum.chunk]: {
|
||||
label: i18nT('common:core.dataset.training.Chunk mode'),
|
||||
tooltip: i18nT('common:core.dataset.import.Chunk Split Tip')
|
||||
},
|
||||
[DatasetCollectionDataProcessModeEnum.qa]: {
|
||||
label: i18nT('common:core.dataset.training.QA mode'),
|
||||
tooltip: i18nT('common:core.dataset.import.QA Import Tip')
|
||||
},
|
||||
[DatasetCollectionDataProcessModeEnum.auto]: {
|
||||
label: i18nT('common:core.dataset.training.Auto mode'),
|
||||
tooltip: i18nT('common:core.dataset.training.Auto mode Tip')
|
||||
}
|
||||
};
|
||||
|
||||
/* ------------ data -------------- */
|
||||
|
||||
/* ------------ training -------------- */
|
||||
@@ -124,28 +144,11 @@ export enum ImportDataSourceEnum {
|
||||
|
||||
export enum TrainingModeEnum {
|
||||
chunk = 'chunk',
|
||||
qa = 'qa',
|
||||
auto = 'auto',
|
||||
qa = 'qa'
|
||||
image = 'image'
|
||||
}
|
||||
|
||||
export const TrainingTypeMap = {
|
||||
[TrainingModeEnum.chunk]: {
|
||||
label: i18nT('common:core.dataset.training.Chunk mode'),
|
||||
tooltip: i18nT('common:core.dataset.import.Chunk Split Tip'),
|
||||
openSource: true
|
||||
},
|
||||
[TrainingModeEnum.auto]: {
|
||||
label: i18nT('common:core.dataset.training.Auto mode'),
|
||||
tooltip: i18nT('common:core.dataset.training.Auto mode Tip'),
|
||||
openSource: false
|
||||
},
|
||||
[TrainingModeEnum.qa]: {
|
||||
label: i18nT('common:core.dataset.training.QA mode'),
|
||||
tooltip: i18nT('common:core.dataset.import.QA Import Tip'),
|
||||
openSource: true
|
||||
}
|
||||
};
|
||||
|
||||
/* ------------ search -------------- */
|
||||
export enum DatasetSearchModeEnum {
|
||||
embedding = 'embedding',
|
||||
|
25
packages/global/core/dataset/controller.d.ts
vendored
25
packages/global/core/dataset/controller.d.ts
vendored
@@ -20,9 +20,22 @@ export type UpdateDatasetDataProps = {
|
||||
})[];
|
||||
};
|
||||
|
||||
export type PatchIndexesProps = {
|
||||
type: 'create' | 'update' | 'delete' | 'unChange';
|
||||
index: Omit<DatasetDataIndexItemType, 'dataId'> & {
|
||||
dataId?: string;
|
||||
};
|
||||
};
|
||||
export type PatchIndexesProps =
|
||||
| {
|
||||
type: 'create';
|
||||
index: Omit<DatasetDataIndexItemType, 'dataId'> & {
|
||||
dataId?: string;
|
||||
};
|
||||
}
|
||||
| {
|
||||
type: 'update';
|
||||
index: DatasetDataIndexItemType;
|
||||
}
|
||||
| {
|
||||
type: 'delete';
|
||||
index: DatasetDataIndexItemType;
|
||||
}
|
||||
| {
|
||||
type: 'unChange';
|
||||
index: DatasetDataIndexItemType;
|
||||
};
|
||||
|
42
packages/global/core/dataset/data/constants.ts
Normal file
42
packages/global/core/dataset/data/constants.ts
Normal file
@@ -0,0 +1,42 @@
|
||||
import { i18nT } from '../../../../web/i18n/utils';
|
||||
|
||||
export enum DatasetDataIndexTypeEnum {
|
||||
default = 'default',
|
||||
custom = 'custom',
|
||||
summary = 'summary',
|
||||
question = 'question',
|
||||
image = 'image'
|
||||
}
|
||||
|
||||
export const DatasetDataIndexMap: Record<
|
||||
`${DatasetDataIndexTypeEnum}`,
|
||||
{
|
||||
label: any;
|
||||
color: string;
|
||||
}
|
||||
> = {
|
||||
[DatasetDataIndexTypeEnum.default]: {
|
||||
label: i18nT('dataset:data_index_default'),
|
||||
color: 'gray'
|
||||
},
|
||||
[DatasetDataIndexTypeEnum.custom]: {
|
||||
label: i18nT('dataset:data_index_custom'),
|
||||
color: 'blue'
|
||||
},
|
||||
[DatasetDataIndexTypeEnum.summary]: {
|
||||
label: i18nT('dataset:data_index_summary'),
|
||||
color: 'green'
|
||||
},
|
||||
[DatasetDataIndexTypeEnum.question]: {
|
||||
label: i18nT('dataset:data_index_question'),
|
||||
color: 'red'
|
||||
},
|
||||
[DatasetDataIndexTypeEnum.image]: {
|
||||
label: i18nT('dataset:data_index_image'),
|
||||
color: 'purple'
|
||||
}
|
||||
};
|
||||
export const defaultDatasetIndexData = DatasetDataIndexMap[DatasetDataIndexTypeEnum.custom];
|
||||
export const getDatasetIndexMapData = (type: `${DatasetDataIndexTypeEnum}`) => {
|
||||
return DatasetDataIndexMap[type] || defaultDatasetIndexData;
|
||||
};
|
20
packages/global/core/dataset/training/type.d.ts
vendored
Normal file
20
packages/global/core/dataset/training/type.d.ts
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
import { PushDatasetDataChunkProps } from '../api';
|
||||
import { TrainingModeEnum } from '../constants';
|
||||
|
||||
export type PushDataToTrainingQueueProps = {
|
||||
teamId: string;
|
||||
tmbId: string;
|
||||
datasetId: string;
|
||||
collectionId: string;
|
||||
|
||||
mode?: TrainingModeEnum;
|
||||
data: PushDatasetDataChunkProps[];
|
||||
prompt?: string;
|
||||
|
||||
agentModel: string;
|
||||
vectorModel: string;
|
||||
vlmModel?: string;
|
||||
|
||||
billId?: string;
|
||||
session?: ClientSession;
|
||||
};
|
45
packages/global/core/dataset/type.d.ts
vendored
45
packages/global/core/dataset/type.d.ts
vendored
@@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
|
||||
import { PermissionTypeEnum } from '../../support/permission/constant';
|
||||
import { PushDatasetDataChunkProps } from './api';
|
||||
import {
|
||||
DatasetCollectionDataProcessModeEnum,
|
||||
DatasetCollectionTypeEnum,
|
||||
DatasetStatusEnum,
|
||||
DatasetTypeEnum,
|
||||
@@ -12,6 +13,7 @@ import { DatasetPermission } from '../../support/permission/dataset/controller';
|
||||
import { Permission } from '../../support/permission/controller';
|
||||
import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
|
||||
import { SourceMemberType } from 'support/user/type';
|
||||
import { DatasetDataIndexTypeEnum } from './data/constants';
|
||||
|
||||
export type DatasetSchemaType = {
|
||||
_id: string;
|
||||
@@ -23,11 +25,14 @@ export type DatasetSchemaType = {
|
||||
|
||||
avatar: string;
|
||||
name: string;
|
||||
vectorModel: string;
|
||||
agentModel: string;
|
||||
intro: string;
|
||||
type: `${DatasetTypeEnum}`;
|
||||
status: `${DatasetStatusEnum}`;
|
||||
|
||||
vectorModel: string;
|
||||
agentModel: string;
|
||||
vlmModel?: string;
|
||||
|
||||
websiteConfig?: {
|
||||
url: string;
|
||||
selector: string;
|
||||
@@ -52,26 +57,22 @@ export type DatasetCollectionSchemaType = {
|
||||
parentId?: string;
|
||||
name: string;
|
||||
type: DatasetCollectionTypeEnum;
|
||||
createTime: Date;
|
||||
updateTime: Date;
|
||||
forbid?: boolean;
|
||||
|
||||
trainingType: TrainingModeEnum;
|
||||
chunkSize: number;
|
||||
chunkSplitter?: string;
|
||||
qaPrompt?: string;
|
||||
ocrParse?: boolean;
|
||||
|
||||
tags?: string[];
|
||||
|
||||
createTime: Date;
|
||||
updateTime: Date;
|
||||
|
||||
// Status
|
||||
forbid?: boolean;
|
||||
nextSyncTime?: Date;
|
||||
|
||||
// Collection metadata
|
||||
fileId?: string; // local file id
|
||||
rawLink?: string; // link url
|
||||
externalFileId?: string; //external file id
|
||||
apiFileId?: string; // api file id
|
||||
externalFileUrl?: string; // external import url
|
||||
|
||||
nextSyncTime?: Date;
|
||||
|
||||
rawTextLength?: number;
|
||||
hashRawText?: string;
|
||||
metadata?: {
|
||||
@@ -80,6 +81,16 @@ export type DatasetCollectionSchemaType = {
|
||||
|
||||
[key: string]: any;
|
||||
};
|
||||
|
||||
// Parse settings
|
||||
customPdfParse?: boolean;
|
||||
// Chunk settings
|
||||
autoIndexes?: boolean;
|
||||
imageIndex?: boolean;
|
||||
trainingType: DatasetCollectionDataProcessModeEnum;
|
||||
chunkSize: number;
|
||||
chunkSplitter?: string;
|
||||
qaPrompt?: string;
|
||||
};
|
||||
|
||||
export type DatasetCollectionTagsSchemaType = {
|
||||
@@ -90,7 +101,7 @@ export type DatasetCollectionTagsSchemaType = {
|
||||
};
|
||||
|
||||
export type DatasetDataIndexItemType = {
|
||||
defaultIndex: boolean;
|
||||
type: `${DatasetDataIndexTypeEnum}`;
|
||||
dataId: string; // pg data id
|
||||
text: string;
|
||||
};
|
||||
@@ -141,6 +152,7 @@ export type DatasetTrainingSchemaType = {
|
||||
chunkIndex: number;
|
||||
weight: number;
|
||||
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
|
||||
retryCount: number;
|
||||
};
|
||||
|
||||
export type CollectionWithDatasetType = DatasetCollectionSchemaType & {
|
||||
@@ -169,9 +181,10 @@ export type DatasetListItemType = {
|
||||
sourceMember?: SourceMemberType;
|
||||
};
|
||||
|
||||
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel'> & {
|
||||
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel' | 'vlmModel'> & {
|
||||
vectorModel: EmbeddingModelItemType;
|
||||
agentModel: LLMModelItemType;
|
||||
vlmModel?: LLMModelItemType;
|
||||
permission: DatasetPermission;
|
||||
};
|
||||
|
||||
|
@@ -1,6 +1,7 @@
|
||||
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
|
||||
import { getFileIcon } from '../../common/file/icon';
|
||||
import { strIsLink } from '../../common/string/tools';
|
||||
import { DatasetDataIndexTypeEnum } from './data/constants';
|
||||
|
||||
export function getCollectionIcon(
|
||||
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
|
||||
@@ -38,14 +39,23 @@ export function getSourceNameIcon({
|
||||
}
|
||||
|
||||
/* get dataset data default index */
|
||||
export function getDefaultIndex(props?: { q?: string; a?: string; dataId?: string }) {
|
||||
const { q = '', a, dataId } = props || {};
|
||||
const qaStr = `${q}\n${a}`.trim();
|
||||
return {
|
||||
defaultIndex: true,
|
||||
text: a ? qaStr : q,
|
||||
dataId
|
||||
};
|
||||
export function getDefaultIndex(props?: { q?: string; a?: string }) {
|
||||
const { q = '', a } = props || {};
|
||||
|
||||
return [
|
||||
{
|
||||
text: q,
|
||||
type: DatasetDataIndexTypeEnum.default
|
||||
},
|
||||
...(a
|
||||
? [
|
||||
{
|
||||
text: a,
|
||||
type: DatasetDataIndexTypeEnum.default
|
||||
}
|
||||
]
|
||||
: [])
|
||||
];
|
||||
}
|
||||
|
||||
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
|
||||
|
Reference in New Issue
Block a user