Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -2,12 +2,17 @@ import { MongoDatasetCollection } from './schema';
import { ClientSession } from '../../../common/mongo';
import { MongoDatasetCollectionTags } from '../tag/schema';
import { readFromSecondary } from '../../../common/mongo/utils';
import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
import {
CollectionWithDatasetType,
DatasetCollectionSchemaType
} from '@fastgpt/global/core/dataset/type';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionSyncResultEnum,
DatasetCollectionTypeEnum,
DatasetSourceReadTypeEnum,
DatasetTypeEnum
DatasetTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import { readDatasetSourceRawText } from '../read';
@@ -160,6 +165,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
})();
const rawText = await readDatasetSourceRawText({
teamId: collection.teamId,
tmbId: collection.tmbId,
...sourceReadType
});
@@ -220,3 +226,24 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
return DatasetCollectionSyncResultEnum.success;
};
/*
QA: 独立进程
Chunk: Image Index -> Auto index -> chunk index
*/
export const getTrainingModeByCollection = (collection: {
trainingType: DatasetCollectionSchemaType['trainingType'];
autoIndexes?: DatasetCollectionSchemaType['autoIndexes'];
imageIndex?: DatasetCollectionSchemaType['imageIndex'];
}) => {
if (collection.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return TrainingModeEnum.qa;
}
if (collection.imageIndex && global.feConfigs?.isPlus) {
return TrainingModeEnum.image;
}
if (collection.autoIndexes && global.feConfigs?.isPlus) {
return TrainingModeEnum.auto;
}
return TrainingModeEnum.chunk;
};