Mirror of https://github.com/labring/FastGPT.git
Add image index and pdf parse (#3956)
* feat: think tag parse
* feat: parse think tag test
* feat: pdf parse ux
* feat: doc2x parse
* perf: rewrite training mode setting
* feat: image parse queue
* perf: image index
* feat: image parse process
* feat: add init sh
* fix: ts
@@ -1,6 +1,6 @@
 import {
   DatasetCollectionTypeEnum,
-  TrainingModeEnum
+  DatasetCollectionDataProcessModeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
@@ -19,13 +19,14 @@ import { predictDataLimitLength } from '../../../../global/core/dataset/utils';
 import { mongoSessionRun } from '../../../common/mongo/sessionRun';
 import { createTrainingUsage } from '../../../support/wallet/usage/controller';
 import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
-import { getLLMModel, getEmbeddingModel } from '../../ai/model';
+import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
 import { pushDataListToTrainingQueue } from '../training/controller';
 import { MongoImage } from '../../../common/file/image/schema';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { addDays } from 'date-fns';
 import { MongoDatasetDataText } from '../data/dataTextSchema';
-import { delay, retryFn } from '@fastgpt/global/common/system/utils';
+import { retryFn } from '@fastgpt/global/common/system/utils';
+import { getTrainingModeByCollection } from './utils';

 export const createCollectionAndInsertData = async ({
   dataset,
@@ -33,6 +34,7 @@ export const createCollectionAndInsertData = async ({
   relatedId,
   createCollectionParams,
   isQAImport = false,
+  billId,
   session
 }: {
   dataset: DatasetSchemaType;
@@ -41,13 +43,21 @@ export const createCollectionAndInsertData = async ({
   createCollectionParams: CreateOneCollectionParams;

   isQAImport?: boolean;
+  billId?: string;
   session?: ClientSession;
 }) => {
+  // Adapter 4.9.0
+  if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) {
+    createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk;
+    createCollectionParams.autoIndexes = true;
+  }
+
   const teamId = createCollectionParams.teamId;
   const tmbId = createCollectionParams.tmbId;
   // Chunk split params
-  const trainingType = createCollectionParams.trainingType || TrainingModeEnum.chunk;
-  const chunkSize = createCollectionParams.chunkSize;
+  const trainingType =
+    createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
+  const chunkSize = createCollectionParams.chunkSize || 512;
   const chunkSplitter = createCollectionParams.chunkSplitter;
   const qaPrompt = createCollectionParams.qaPrompt;
   const usageName = createCollectionParams.name;
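The "Adapter 4.9.0" block above is the compatibility shim for the new mode model: a legacy `auto` trainingType is stored as `chunk` plus the new `autoIndexes` flag. A minimal standalone sketch of that mapping (the helper name `adaptLegacyTrainingType` is illustrative, not part of the commit):

import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';

// Pre-4.9.0 payloads may still carry trainingType = 'auto'.
// The stored trainingType stays chunk/qa; 'auto' becomes chunk + autoIndexes.
const adaptLegacyTrainingType = (params: {
  trainingType?: DatasetCollectionDataProcessModeEnum;
  autoIndexes?: boolean;
}) => {
  if (params.trainingType !== DatasetCollectionDataProcessModeEnum.auto) return params;
  return {
    ...params,
    trainingType: DatasetCollectionDataProcessModeEnum.chunk,
    autoIndexes: true
  };
};

The resulting flags are later resolved back into a queue mode by getTrainingModeByCollection (see the utils hunks below).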
@@ -56,7 +66,7 @@ export const createCollectionAndInsertData = async ({
   const chunks = rawText2Chunks({
     rawText,
     chunkLen: chunkSize,
-    overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
+    overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
     customReg: chunkSplitter ? [chunkSplitter] : [],
     isQAImport
   });
@@ -64,7 +74,14 @@ export const createCollectionAndInsertData = async ({
   // 2. auth limit
   await checkDatasetLimit({
     teamId,
-    insertLen: predictDataLimitLength(trainingType, chunks)
+    insertLen: predictDataLimitLength(
+      getTrainingModeByCollection({
+        trainingType,
+        autoIndexes: createCollectionParams.autoIndexes,
+        imageIndex: createCollectionParams.imageIndex
+      }),
+      chunks
+    )
   });

   const fn = async (session: ClientSession) => {
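The limit check now receives the resolved training mode instead of the raw collection trainingType, since the predicted insert length depends on that mode (QA expands each chunk into generated pairs, image/auto indexing adds extra index entries). As a rough illustration only, with made-up multipliers that are not FastGPT's actual values:

import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';

// Illustrative stand-in for a mode-aware length prediction; the factors are placeholders.
const estimateInsertLen = (mode: TrainingModeEnum, chunks: unknown[]) => {
  switch (mode) {
    case TrainingModeEnum.qa:
      return chunks.length * 20; // placeholder: QA generates several pairs per chunk
    case TrainingModeEnum.auto:
    case TrainingModeEnum.image:
      return chunks.length * 5; // placeholder: extra generated indexes per chunk
    default:
      return chunks.length; // chunk mode: roughly one data item per chunk
  }
};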
@@ -89,15 +106,20 @@ export const createCollectionAndInsertData = async ({
     });

     // 4. create training bill
-    const { billId } = await createTrainingUsage({
-      teamId,
-      tmbId,
-      appName: usageName,
-      billSource: UsageSourceEnum.training,
-      vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
-      agentModel: getLLMModel(dataset.agentModel)?.name,
-      session
-    });
+    const traingBillId = await (async () => {
+      if (billId) return billId;
+      const { billId: newBillId } = await createTrainingUsage({
+        teamId,
+        tmbId,
+        appName: usageName,
+        billSource: UsageSourceEnum.training,
+        vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
+        agentModel: getLLMModel(dataset.agentModel)?.name,
+        vllmModel: getVlmModel(dataset.vlmModel)?.name,
+        session
+      });
+      return newBillId;
+    })();

     // 5. insert to training queue
     const insertResults = await pushDataListToTrainingQueue({
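The new optional billId parameter pairs with the fallback above: if the caller already opened a training usage record it is reused (`if (billId) return billId;`), otherwise a new one is created, now also recording the vision model via getVlmModel. A hypothetical call site, where `existingBillId` is an assumed variable obtained earlier from createTrainingUsage:

// Hypothetical: reuse one usage record instead of opening a second bill.
await createCollectionAndInsertData({
  dataset,
  rawText,
  createCollectionParams,
  billId: existingBillId
});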
@@ -107,9 +129,14 @@
       collectionId,
       agentModel: dataset.agentModel,
       vectorModel: dataset.vectorModel,
-      trainingMode: trainingType,
+      vlmModel: dataset.vlmModel,
+      mode: getTrainingModeByCollection({
+        trainingType,
+        autoIndexes: createCollectionParams.autoIndexes,
+        imageIndex: createCollectionParams.imageIndex
+      }),
       prompt: qaPrompt,
-      billId,
+      billId: traingBillId,
       data: chunks.map((item, index) => ({
         ...item,
         chunkIndex: index
@@ -161,10 +188,15 @@ export async function createOneCollection({
   datasetId,
   type,

-  trainingType = TrainingModeEnum.chunk,
-  chunkSize = 512,
-  chunkSplitter,
-  qaPrompt,
+  createTime,
+  updateTime,
+
+  hashRawText,
+  rawTextLength,
+  metadata = {},
+  tags,
+
+  nextSyncTime,

   fileId,
   rawLink,
@@ -172,15 +204,18 @@
   externalFileUrl,
   apiFileId,

-  hashRawText,
-  rawTextLength,
-  metadata = {},
-  session,
-  tags,
+  // Parse settings
+  customPdfParse,
+  imageIndex,

-  createTime,
-  updateTime,
-  nextSyncTime
+  // Chunk settings
+  trainingType = DatasetCollectionDataProcessModeEnum.chunk,
+  autoIndexes,
+  chunkSize = 512,
+  chunkSplitter,
+  qaPrompt,
+
+  session
 }: CreateOneCollectionParams) {
   // Create collection tags
   const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -196,25 +231,31 @@
         name,
         type,

-        trainingType,
-        chunkSize,
-        chunkSplitter,
-        qaPrompt,
-        rawTextLength,
-        hashRawText,
-        tags: collectionTags,
         metadata,

-        createTime,
-        updateTime,
-        nextSyncTime,

         ...(fileId ? { fileId } : {}),
         ...(rawLink ? { rawLink } : {}),
         ...(externalFileId ? { externalFileId } : {}),
         ...(externalFileUrl ? { externalFileUrl } : {}),
         ...(apiFileId ? { apiFileId } : {}),

+        rawTextLength,
+        hashRawText,
+        tags: collectionTags,
+        // Parse settings
+        customPdfParse,
+        imageIndex,
+
+        createTime,
+        updateTime,
+        nextSyncTime,
+        // Chunk settings
+        trainingType,
+        autoIndexes,
+        chunkSize,
+        chunkSplitter,
+        qaPrompt
       }
     ],
     { session, ordered: true }
@@ -1,7 +1,10 @@
 import { connectionMongo, getMongoModel } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
-import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
+import {
+  DatasetCollectionTypeMap,
+  DatasetCollectionDataProcessModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import { DatasetCollectionName } from '../schema';
 import {
   TeamCollectionName,
@@ -31,6 +34,8 @@ const DatasetCollectionSchema = new Schema({
     ref: DatasetCollectionName,
     required: true
   },
+
+  // Basic info
   type: {
     type: String,
     enum: Object.keys(DatasetCollectionTypeMap),
@@ -40,6 +45,11 @@ const DatasetCollectionSchema = new Schema({
     type: String,
     required: true
   },
+  tags: {
+    type: [String],
+    default: []
+  },
+
   createTime: {
     type: Date,
     default: () => new Date()
@@ -48,33 +58,8 @@ const DatasetCollectionSchema = new Schema({
     type: Date,
     default: () => new Date()
   },
-  forbid: {
-    type: Boolean,
-    default: false
-  },
-
-  // chunk filed
-  trainingType: {
-    type: String,
-    enum: Object.keys(TrainingTypeMap)
-  },
-  chunkSize: {
-    type: Number,
-    required: true
-  },
-  chunkSplitter: {
-    type: String
-  },
-  qaPrompt: {
-    type: String
-  },
-  ocrParse: Boolean,
-
-  tags: {
-    type: [String],
-    default: []
-  },

+  // Metadata
   // local file collection
   fileId: {
     type: Schema.Types.ObjectId,
@@ -82,22 +67,39 @@ const DatasetCollectionSchema = new Schema({
   },
   // web link collection
   rawLink: String,
-  // api collection
+  // Api collection
   apiFileId: String,
-  // external collection
+  // external collection(Abandoned)
   externalFileId: String,
   externalFileUrl: String, // external import url

-  // next sync time
-  nextSyncTime: Date,
-
-  // metadata
   rawTextLength: Number,
   hashRawText: String,
   metadata: {
     type: Object,
     default: {}
-  }
+  },
+
+  forbid: Boolean,
+  // next sync time
+  nextSyncTime: Date,
+
+  // Parse settings
+  customPdfParse: Boolean,
+
+  // Chunk settings
+  imageIndex: Boolean,
+  autoIndexes: Boolean,
+  trainingType: {
+    type: String,
+    enum: Object.values(DatasetCollectionDataProcessModeEnum)
+  },
+  chunkSize: {
+    type: Number,
+    required: true
+  },
+  chunkSplitter: String,
+  qaPrompt: String
 });

 DatasetCollectionSchema.virtual('dataset', {
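Taken together, the schema now groups source metadata, parse settings, and chunk settings directly on the collection. For orientation only, a document under the reshaped schema might look roughly like this (field names from the schema above, values invented for the example):

import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';

// Illustrative document shape; the concrete values are made up.
const exampleCollection = {
  name: 'product-manual.pdf',
  type: 'file',
  // Parse settings
  customPdfParse: true, // route the PDF through the custom parser pipeline
  // Chunk settings
  imageIndex: true, // queue extracted images for image-index training
  autoIndexes: false,
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  chunkSize: 512
};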
@@ -2,12 +2,17 @@ import { MongoDatasetCollection } from './schema';
 import { ClientSession } from '../../../common/mongo';
 import { MongoDatasetCollectionTags } from '../tag/schema';
 import { readFromSecondary } from '../../../common/mongo/utils';
-import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
+import {
+  CollectionWithDatasetType,
+  DatasetCollectionSchemaType
+} from '@fastgpt/global/core/dataset/type';
 import {
+  DatasetCollectionDataProcessModeEnum,
   DatasetCollectionSyncResultEnum,
   DatasetCollectionTypeEnum,
   DatasetSourceReadTypeEnum,
-  DatasetTypeEnum
+  DatasetTypeEnum,
+  TrainingModeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
 import { readDatasetSourceRawText } from '../read';
@@ -160,6 +165,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
   })();
   const rawText = await readDatasetSourceRawText({
     teamId: collection.teamId,
+    tmbId: collection.tmbId,
     ...sourceReadType
   });

@@ -220,3 +226,24 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {

   return DatasetCollectionSyncResultEnum.success;
 };
+
+/*
+  QA: separate process
+  Chunk: image index -> auto index -> chunk index
+*/
+export const getTrainingModeByCollection = (collection: {
+  trainingType: DatasetCollectionSchemaType['trainingType'];
+  autoIndexes?: DatasetCollectionSchemaType['autoIndexes'];
+  imageIndex?: DatasetCollectionSchemaType['imageIndex'];
+}) => {
+  if (collection.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
+    return TrainingModeEnum.qa;
+  }
+  if (collection.imageIndex && global.feConfigs?.isPlus) {
+    return TrainingModeEnum.image;
+  }
+  if (collection.autoIndexes && global.feConfigs?.isPlus) {
+    return TrainingModeEnum.auto;
+  }
+  return TrainingModeEnum.chunk;
+};
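The helper above is the single place where the collection flags are turned into a queue mode: QA short-circuits everything, then (on the commercial isPlus build) image index wins over auto index, and plain chunk is the fallback. A few illustrative calls:

import { getTrainingModeByCollection } from './utils';
import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';

// Assumes global.feConfigs?.isPlus is truthy; without it, both flags fall back to chunk.
getTrainingModeByCollection({
  trainingType: DatasetCollectionDataProcessModeEnum.qa
}); // => TrainingModeEnum.qa

getTrainingModeByCollection({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  imageIndex: true,
  autoIndexes: true
}); // => TrainingModeEnum.image (image indexing is queued first)

getTrainingModeByCollection({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  autoIndexes: true
}); // => TrainingModeEnum.auto

getTrainingModeByCollection({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk
}); // => TrainingModeEnum.chunk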