Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -0,0 +1,42 @@
import { i18nT } from '../../../../web/i18n/utils';
/**
 * Index type identifiers for dataset data.
 *
 * Every member's string value is identical to its member name. The
 * template-literal form of this enum (`${DatasetDataIndexTypeEnum}`) is the
 * key type of `DatasetDataIndexMap` and the parameter type of
 * `getDatasetIndexMapData`.
 *
 * NOTE(review): what each index type means for the indexing pipeline is not
 * visible in this file — confirm against the code that creates the indexes.
 */
export enum DatasetDataIndexTypeEnum {
default = 'default',
custom = 'custom',
summary = 'summary',
question = 'question',
image = 'image'
}
/** Display attributes attached to a single index type. */
type DatasetDataIndexDisplay = {
  // Value produced by `i18nT` for the type's translation key.
  label: any;
  // Color name associated with the type.
  color: string;
};

/**
 * Display attributes (label + color) for every `DatasetDataIndexTypeEnum` value.
 *
 * Keys are written as plain literals; the `Record` annotation still forces the
 * object to contain exactly one entry per enum string value, so a missing or
 * misspelled key is a compile error.
 */
export const DatasetDataIndexMap: Record<`${DatasetDataIndexTypeEnum}`, DatasetDataIndexDisplay> = {
  default: { label: i18nT('dataset:data_index_default'), color: 'gray' },
  custom: { label: i18nT('dataset:data_index_custom'), color: 'blue' },
  summary: { label: i18nT('dataset:data_index_summary'), color: 'green' },
  question: { label: i18nT('dataset:data_index_question'), color: 'red' },
  image: { label: i18nT('dataset:data_index_image'), color: 'purple' }
};
/**
 * Fallback display entry: the `custom` entry of `DatasetDataIndexMap`.
 * `getDatasetIndexMapData` returns it when its lookup yields a falsy value.
 */
export const defaultDatasetIndexData = DatasetDataIndexMap[DatasetDataIndexTypeEnum.custom];
/**
 * Look up the display entry (label and color) for an index type.
 *
 * @param type - Index type, given as one of the `DatasetDataIndexTypeEnum` string values.
 * @returns The matching `DatasetDataIndexMap` entry, or `defaultDatasetIndexData`
 *   when the lookup yields nothing (e.g. a value that bypassed the type check at runtime).
 */
export const getDatasetIndexMapData = (type: `${DatasetDataIndexTypeEnum}`) => {
  const entry = DatasetDataIndexMap[type];
  if (entry) {
    return entry;
  }
  return defaultDatasetIndexData;
};