Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -1,5 +1,5 @@
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
import type { LLMModelItemType } from '../ai/model.d';
import { ParentIdType } from 'common/parentFolder/type';
@@ -10,9 +10,11 @@ export type DatasetUpdateBody = {
name?: string;
avatar?: string;
intro?: string;
agentModel?: LLMModelItemType;
status?: DatasetSchemaType['status'];
agentModel?: string;
vlmModel?: string;
websiteConfig?: DatasetSchemaType['websiteConfig'];
externalReadUrl?: DatasetSchemaType['externalReadUrl'];
defaultPermission?: DatasetSchemaType['defaultPermission'];
@@ -27,7 +29,10 @@ export type DatasetUpdateBody = {
/* ================= collection ===================== */
export type DatasetCollectionChunkMetadataType = {
parentId?: string;
trainingType?: TrainingModeEnum;
customPdfParse?: boolean;
trainingType?: DatasetCollectionDataProcessModeEnum;
imageIndex?: boolean;
autoIndexes?: boolean;
chunkSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
@@ -131,9 +136,15 @@ export type PostWebsiteSyncParams = {
export type PushDatasetDataProps = {
collectionId: string;
data: PushDatasetDataChunkProps[];
trainingMode: TrainingModeEnum;
trainingType?: DatasetCollectionDataProcessModeEnum;
autoIndexes?: boolean;
imageIndex?: boolean;
prompt?: string;
billId?: string;
// Abandon
trainingMode?: DatasetCollectionDataProcessModeEnum;
};
export type PushDatasetDataResponse = {
insertLen: number;