Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -1,16 +1,16 @@
import { MongoDatasetTraining } from './schema';
import type {
PushDatasetDataChunkProps,
PushDatasetDataProps,
PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { ClientSession } from '../../../common/mongo';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { getCollectionWithDataset } from '../controller';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try {
@@ -28,20 +28,17 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
export const pushDataListToTrainingQueueByCollectionId = async ({
collectionId,
...props
}: {
teamId: string;
tmbId: string;
session?: ClientSession;
} & PushDatasetDataProps) => {
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
const {
dataset: { _id: datasetId, agentModel, vectorModel }
dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
} = await getCollectionWithDataset(collectionId);
return pushDataListToTrainingQueue({
...props,
datasetId,
collectionId,
vectorModel,
agentModel,
vectorModel
vlmModel
});
};
@@ -52,30 +49,30 @@ export async function pushDataListToTrainingQueue({
collectionId,
agentModel,
vectorModel,
vlmModel,
data,
prompt,
billId,
trainingMode = TrainingModeEnum.chunk,
mode = TrainingModeEnum.chunk,
session
}: {
teamId: string;
tmbId: string;
datasetId: string;
agentModel: string;
vectorModel: string;
session?: ClientSession;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
if (mode !== TrainingModeEnum.image) return mode;
// 检查内容中,是否包含 ![](xxx) 的图片格式
const text = data.q + data.a || '';
const regex = /!\[\]\((.*?)\)/g;
const match = text.match(regex);
if (match) {
return TrainingModeEnum.image;
}
return mode;
};
const { model, maxToken, weight } = await (async () => {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(`File model ${agentModel} is inValid`);
}
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Vector model ${vectorModel} is inValid`);
}
if (trainingMode === TrainingModeEnum.chunk) {
if (mode === TrainingModeEnum.chunk) {
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Vector model ${vectorModel} is inValid`);
}
return {
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model,
@@ -83,7 +80,11 @@ export async function pushDataListToTrainingQueue({
};
}
if (trainingMode === TrainingModeEnum.qa || trainingMode === TrainingModeEnum.auto) {
if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(`File model ${agentModel} is inValid`);
}
return {
maxToken: agentModelData.maxContext * 0.8,
model: agentModelData.model,
@@ -91,8 +92,24 @@ export async function pushDataListToTrainingQueue({
};
}
return Promise.reject(`Training mode "${trainingMode}" is inValid`);
if (mode === TrainingModeEnum.image) {
const vllmModelData = getVlmModel(vlmModel);
if (!vllmModelData) {
return Promise.reject(`Vlm model ${vlmModel} is inValid`);
}
return {
maxToken: vllmModelData.maxContext * 0.8,
model: vllmModelData.model,
weight: 0
};
}
return Promise.reject(`Training mode "${mode}" is inValid`);
})();
// Filter redundant params
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
prompt = undefined;
}
// filter repeat or equal content
const set = new Set();
@@ -158,7 +175,7 @@ export async function pushDataListToTrainingQueue({
datasetId,
collectionId,
billId,
mode: trainingMode,
mode: getImageChunkMode(item, mode),
prompt,
model,
q: item.q,

View File

@@ -1,14 +1,15 @@
/* 模型的知识库 */
import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema } = connectionMongo;
import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import { TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
export const DatasetTrainingCollectionName = 'dataset_trainings';
@@ -25,7 +26,6 @@ const TrainingDataSchema = new Schema({
},
datasetId: {
type: Schema.Types.ObjectId,
ref: DatasetCollectionName,
required: true
},
collectionId: {
@@ -33,15 +33,13 @@ const TrainingDataSchema = new Schema({
ref: DatasetColCollectionName,
required: true
},
billId: {
// concat bill
type: String
},
billId: String,
mode: {
type: String,
enum: Object.keys(TrainingTypeMap),
enum: Object.values(TrainingModeEnum),
required: true
},
expireAt: {
// It will be deleted after 7 days
type: Date,
@@ -88,6 +86,10 @@ const TrainingDataSchema = new Schema({
indexes: {
type: [
{
type: {
type: String,
enum: Object.values(DatasetDataIndexTypeEnum)
},
text: {
type: String,
required: true
@@ -98,6 +100,19 @@ const TrainingDataSchema = new Schema({
}
});
TrainingDataSchema.virtual('dataset', {
ref: DatasetCollectionName,
localField: 'datasetId',
foreignField: '_id',
justOne: true
});
TrainingDataSchema.virtual('collection', {
ref: DatasetColCollectionName,
localField: 'collectionId',
foreignField: '_id',
justOne: true
});
try {
// lock training data(teamId); delete training data
TrainingDataSchema.index({ teamId: 1, datasetId: 1 });