mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00
Add image index and pdf parse (#3956)
* feat: think tag parse * feat: parse think tag test * feat: pdf parse ux * feat: doc2x parse * perf: rewrite training mode setting * feat: image parse queue * perf: image index * feat: image parse process * feat: add init sh * fix: ts
This commit is contained in:
@@ -1,16 +1,16 @@
|
||||
import { MongoDatasetTraining } from './schema';
|
||||
import type {
|
||||
PushDatasetDataChunkProps,
|
||||
PushDatasetDataProps,
|
||||
PushDatasetDataResponse
|
||||
} from '@fastgpt/global/core/dataset/api.d';
|
||||
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { simpleText } from '@fastgpt/global/common/string/tools';
|
||||
import { ClientSession } from '../../../common/mongo';
|
||||
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
|
||||
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
|
||||
import { addLog } from '../../../common/system/log';
|
||||
import { getCollectionWithDataset } from '../controller';
|
||||
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
|
||||
import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
|
||||
|
||||
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
|
||||
try {
|
||||
@@ -28,20 +28,17 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
|
||||
export const pushDataListToTrainingQueueByCollectionId = async ({
|
||||
collectionId,
|
||||
...props
|
||||
}: {
|
||||
teamId: string;
|
||||
tmbId: string;
|
||||
session?: ClientSession;
|
||||
} & PushDatasetDataProps) => {
|
||||
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
|
||||
const {
|
||||
dataset: { _id: datasetId, agentModel, vectorModel }
|
||||
dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
|
||||
} = await getCollectionWithDataset(collectionId);
|
||||
return pushDataListToTrainingQueue({
|
||||
...props,
|
||||
datasetId,
|
||||
collectionId,
|
||||
vectorModel,
|
||||
agentModel,
|
||||
vectorModel
|
||||
vlmModel
|
||||
});
|
||||
};
|
||||
|
||||
@@ -52,30 +49,30 @@ export async function pushDataListToTrainingQueue({
|
||||
collectionId,
|
||||
agentModel,
|
||||
vectorModel,
|
||||
vlmModel,
|
||||
data,
|
||||
prompt,
|
||||
billId,
|
||||
trainingMode = TrainingModeEnum.chunk,
|
||||
mode = TrainingModeEnum.chunk,
|
||||
session
|
||||
}: {
|
||||
teamId: string;
|
||||
tmbId: string;
|
||||
datasetId: string;
|
||||
agentModel: string;
|
||||
vectorModel: string;
|
||||
session?: ClientSession;
|
||||
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
|
||||
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
|
||||
const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
|
||||
if (mode !== TrainingModeEnum.image) return mode;
|
||||
// 检查内容中,是否包含  的图片格式
|
||||
const text = data.q + data.a || '';
|
||||
const regex = /!\[\]\((.*?)\)/g;
|
||||
const match = text.match(regex);
|
||||
if (match) {
|
||||
return TrainingModeEnum.image;
|
||||
}
|
||||
return mode;
|
||||
};
|
||||
const { model, maxToken, weight } = await (async () => {
|
||||
const agentModelData = getLLMModel(agentModel);
|
||||
if (!agentModelData) {
|
||||
return Promise.reject(`File model ${agentModel} is inValid`);
|
||||
}
|
||||
const vectorModelData = getEmbeddingModel(vectorModel);
|
||||
if (!vectorModelData) {
|
||||
return Promise.reject(`Vector model ${vectorModel} is inValid`);
|
||||
}
|
||||
|
||||
if (trainingMode === TrainingModeEnum.chunk) {
|
||||
if (mode === TrainingModeEnum.chunk) {
|
||||
const vectorModelData = getEmbeddingModel(vectorModel);
|
||||
if (!vectorModelData) {
|
||||
return Promise.reject(`Vector model ${vectorModel} is inValid`);
|
||||
}
|
||||
return {
|
||||
maxToken: vectorModelData.maxToken * 1.5,
|
||||
model: vectorModelData.model,
|
||||
@@ -83,7 +80,11 @@ export async function pushDataListToTrainingQueue({
|
||||
};
|
||||
}
|
||||
|
||||
if (trainingMode === TrainingModeEnum.qa || trainingMode === TrainingModeEnum.auto) {
|
||||
if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
|
||||
const agentModelData = getLLMModel(agentModel);
|
||||
if (!agentModelData) {
|
||||
return Promise.reject(`File model ${agentModel} is inValid`);
|
||||
}
|
||||
return {
|
||||
maxToken: agentModelData.maxContext * 0.8,
|
||||
model: agentModelData.model,
|
||||
@@ -91,8 +92,24 @@ export async function pushDataListToTrainingQueue({
|
||||
};
|
||||
}
|
||||
|
||||
return Promise.reject(`Training mode "${trainingMode}" is inValid`);
|
||||
if (mode === TrainingModeEnum.image) {
|
||||
const vllmModelData = getVlmModel(vlmModel);
|
||||
if (!vllmModelData) {
|
||||
return Promise.reject(`Vlm model ${vlmModel} is inValid`);
|
||||
}
|
||||
return {
|
||||
maxToken: vllmModelData.maxContext * 0.8,
|
||||
model: vllmModelData.model,
|
||||
weight: 0
|
||||
};
|
||||
}
|
||||
|
||||
return Promise.reject(`Training mode "${mode}" is inValid`);
|
||||
})();
|
||||
// Filter redundant params
|
||||
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
|
||||
prompt = undefined;
|
||||
}
|
||||
|
||||
// filter repeat or equal content
|
||||
const set = new Set();
|
||||
@@ -158,7 +175,7 @@ export async function pushDataListToTrainingQueue({
|
||||
datasetId,
|
||||
collectionId,
|
||||
billId,
|
||||
mode: trainingMode,
|
||||
mode: getImageChunkMode(item, mode),
|
||||
prompt,
|
||||
model,
|
||||
q: item.q,
|
||||
|
@@ -1,14 +1,15 @@
|
||||
/* 模型的知识库 */
|
||||
import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
|
||||
const { Schema, model, models } = connectionMongo;
|
||||
import { connectionMongo, getMongoModel } from '../../../common/mongo';
|
||||
const { Schema } = connectionMongo;
|
||||
import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
|
||||
import { TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
|
||||
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { DatasetColCollectionName } from '../collection/schema';
|
||||
import { DatasetCollectionName } from '../schema';
|
||||
import {
|
||||
TeamCollectionName,
|
||||
TeamMemberCollectionName
|
||||
} from '@fastgpt/global/support/user/team/constant';
|
||||
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
|
||||
|
||||
export const DatasetTrainingCollectionName = 'dataset_trainings';
|
||||
|
||||
@@ -25,7 +26,6 @@ const TrainingDataSchema = new Schema({
|
||||
},
|
||||
datasetId: {
|
||||
type: Schema.Types.ObjectId,
|
||||
ref: DatasetCollectionName,
|
||||
required: true
|
||||
},
|
||||
collectionId: {
|
||||
@@ -33,15 +33,13 @@ const TrainingDataSchema = new Schema({
|
||||
ref: DatasetColCollectionName,
|
||||
required: true
|
||||
},
|
||||
billId: {
|
||||
// concat bill
|
||||
type: String
|
||||
},
|
||||
billId: String,
|
||||
mode: {
|
||||
type: String,
|
||||
enum: Object.keys(TrainingTypeMap),
|
||||
enum: Object.values(TrainingModeEnum),
|
||||
required: true
|
||||
},
|
||||
|
||||
expireAt: {
|
||||
// It will be deleted after 7 days
|
||||
type: Date,
|
||||
@@ -88,6 +86,10 @@ const TrainingDataSchema = new Schema({
|
||||
indexes: {
|
||||
type: [
|
||||
{
|
||||
type: {
|
||||
type: String,
|
||||
enum: Object.values(DatasetDataIndexTypeEnum)
|
||||
},
|
||||
text: {
|
||||
type: String,
|
||||
required: true
|
||||
@@ -98,6 +100,19 @@ const TrainingDataSchema = new Schema({
|
||||
}
|
||||
});
|
||||
|
||||
TrainingDataSchema.virtual('dataset', {
|
||||
ref: DatasetCollectionName,
|
||||
localField: 'datasetId',
|
||||
foreignField: '_id',
|
||||
justOne: true
|
||||
});
|
||||
TrainingDataSchema.virtual('collection', {
|
||||
ref: DatasetColCollectionName,
|
||||
localField: 'collectionId',
|
||||
foreignField: '_id',
|
||||
justOne: true
|
||||
});
|
||||
|
||||
try {
|
||||
// lock training data(teamId); delete training data
|
||||
TrainingDataSchema.index({ teamId: 1, datasetId: 1 });
|
||||
|
Reference in New Issue
Block a user