Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -13,6 +13,11 @@ export const getDatasetModel = (model?: string) => {
?.find((item) => item.model === model || item.name === model) ?? getDefaultLLMModel()
);
};
export const getVlmModel = (model?: string) => {
return Array.from(global.llmModelMap.values())
?.filter((item) => item.vision)
?.find((item) => item.model === model || item.name === model);
};
export const getDefaultEmbeddingModel = () => global?.systemDefaultModel.embedding!;
export const getEmbeddingModel = (model?: string) => {
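getVlmModel mirrors the existing model getters but only considers vision-capable entries, returning undefined when no vision model matches the given id or name. A minimal usage sketch, assuming a dataset document that carries the vlmModel field added later in this commit (the error message and helper name are illustrative):

// Sketch only: resolve a dataset's vision model before queueing image-index training.
const resolveVisionModel = (dataset: { vlmModel?: string }) => {
  const vlm = getVlmModel(dataset.vlmModel);
  if (!vlm) {
    // No vision-capable model matched; callers should fail or fall back.
    throw new Error(`No vision model found for "${dataset.vlmModel}"`);
  }
  return vlm;
};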

View File

@@ -9,10 +9,9 @@ import type {
} from '@fastgpt/global/core/ai/type.d';
import axios from 'axios';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import { getFileContentTypeFromHeader, guessBase64ImageType } from '../../common/file/utils';
import { serverRequestBaseUrl } from '../../common/api/serverRequest';
import { i18nT } from '../../../web/i18n/utils';
import { addLog } from '../../common/system/log';
import { getImageBase64 } from '../../common/file/image/utils';
export const filterGPTMessageByMaxContext = async ({
messages = [],
@@ -166,25 +165,13 @@ export const loadRequestMessages = async ({
try {
// If imgUrl is a local path, load the image from the local server and set the url to base64
if (imgUrl.startsWith('/') || process.env.MULTIPLE_DATA_TO_BASE64 === 'true') {
addLog.debug('Load image from local server', {
baseUrl: serverRequestBaseUrl,
requestUrl: imgUrl
});
const response = await axios.get(imgUrl, {
baseURL: serverRequestBaseUrl,
responseType: 'arraybuffer',
proxy: false
});
const base64 = Buffer.from(response.data, 'binary').toString('base64');
const imageType =
getFileContentTypeFromHeader(response.headers['content-type']) ||
guessBase64ImageType(base64);
const base64 = await getImageBase64(imgUrl);
return {
...item,
image_url: {
...item.image_url,
url: `data:${imageType};base64,${base64}`
url: base64
}
};
}
@@ -223,7 +210,8 @@ export const loadRequestMessages = async ({
await Promise.all(
content.map(async (item) => {
if (item.type === 'text') {
if (item.text) return parseStringWithImages(item.text);
// If it is an array, there is no need to parse images
if (item.text) return item;
return;
}
if (item.type === 'file_url') return; // LLMs do not support file_url
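The inline axios download and MIME-type guessing are replaced by a single getImageBase64 call whose result is used directly as image_url.url, so it presumably returns a complete data: URL. A rough sketch of such a helper under that assumption; the actual implementation in common/file/image/utils may differ:

import axios from 'axios';
import { serverRequestBaseUrl } from '../../common/api/serverRequest'; // import path is illustrative

// Assumed shape: fetch a (possibly local) image and return a full data URL.
export const getImageBase64 = async (url: string): Promise<string> => {
  const response = await axios.get(url, {
    baseURL: serverRequestBaseUrl,
    responseType: 'arraybuffer',
    proxy: false
  });
  const base64 = Buffer.from(response.data, 'binary').toString('base64');
  const mime = response.headers['content-type'] || 'image/jpeg'; // fallback type is an assumption
  return `data:${mime};base64,${base64}`;
};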

View File

@@ -108,7 +108,15 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
return formattedFiles;
};
const getFileContent = async ({ teamId, apiFileId }: { teamId: string; apiFileId: string }) => {
const getFileContent = async ({
teamId,
tmbId,
apiFileId
}: {
teamId: string;
tmbId: string;
apiFileId: string;
}) => {
const data = await request<APIFileContentResponse>(
`/v1/file/content`,
{ id: apiFileId },
@@ -123,6 +131,7 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
if (previewUrl) {
const rawText = await readFileRawTextByUrl({
teamId,
tmbId,
url: previewUrl,
relatedId: apiFileId
});

View File

@@ -1,6 +1,6 @@
import {
DatasetCollectionTypeEnum,
TrainingModeEnum
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
@@ -19,13 +19,14 @@ import { predictDataLimitLength } from '../../../../global/core/dataset/utils';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { createTrainingUsage } from '../../../support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { pushDataListToTrainingQueue } from '../training/controller';
import { MongoImage } from '../../../common/file/image/schema';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { addDays } from 'date-fns';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { delay, retryFn } from '@fastgpt/global/common/system/utils';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils';
export const createCollectionAndInsertData = async ({
dataset,
@@ -33,6 +34,7 @@ export const createCollectionAndInsertData = async ({
relatedId,
createCollectionParams,
isQAImport = false,
billId,
session
}: {
dataset: DatasetSchemaType;
@@ -41,13 +43,21 @@ export const createCollectionAndInsertData = async ({
createCollectionParams: CreateOneCollectionParams;
isQAImport?: boolean;
billId?: string;
session?: ClientSession;
}) => {
// Adapter 4.9.0
if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) {
createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk;
createCollectionParams.autoIndexes = true;
}
const teamId = createCollectionParams.teamId;
const tmbId = createCollectionParams.tmbId;
// Chunk split params
const trainingType = createCollectionParams.trainingType || TrainingModeEnum.chunk;
const chunkSize = createCollectionParams.chunkSize;
const trainingType =
createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
const chunkSize = createCollectionParams.chunkSize || 512;
const chunkSplitter = createCollectionParams.chunkSplitter;
const qaPrompt = createCollectionParams.qaPrompt;
const usageName = createCollectionParams.name;
@@ -56,7 +66,7 @@ export const createCollectionAndInsertData = async ({
const chunks = rawText2Chunks({
rawText,
chunkLen: chunkSize,
overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [],
isQAImport
});
@@ -64,7 +74,14 @@ export const createCollectionAndInsertData = async ({
// 2. auth limit
await checkDatasetLimit({
teamId,
insertLen: predictDataLimitLength(trainingType, chunks)
insertLen: predictDataLimitLength(
getTrainingModeByCollection({
trainingType,
autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex
}),
chunks
)
});
const fn = async (session: ClientSession) => {
@@ -89,15 +106,20 @@ export const createCollectionAndInsertData = async ({
});
// 4. create training bill
const { billId } = await createTrainingUsage({
teamId,
tmbId,
appName: usageName,
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name,
session
});
const traingBillId = await (async () => {
if (billId) return billId;
const { billId: newBillId } = await createTrainingUsage({
teamId,
tmbId,
appName: usageName,
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name,
vllmModel: getVlmModel(dataset.vlmModel)?.name,
session
});
return newBillId;
})();
// 5. insert to training queue
const insertResults = await pushDataListToTrainingQueue({
@@ -107,9 +129,14 @@ export const createCollectionAndInsertData = async ({
collectionId,
agentModel: dataset.agentModel,
vectorModel: dataset.vectorModel,
trainingMode: trainingType,
vlmModel: dataset.vlmModel,
mode: getTrainingModeByCollection({
trainingType,
autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex
}),
prompt: qaPrompt,
billId,
billId: traingBillId,
data: chunks.map((item, index) => ({
...item,
chunkIndex: index
@@ -161,10 +188,15 @@ export async function createOneCollection({
datasetId,
type,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
createTime,
updateTime,
hashRawText,
rawTextLength,
metadata = {},
tags,
nextSyncTime,
fileId,
rawLink,
@@ -172,15 +204,18 @@ export async function createOneCollection({
externalFileUrl,
apiFileId,
hashRawText,
rawTextLength,
metadata = {},
session,
tags,
// Parse settings
customPdfParse,
imageIndex,
createTime,
updateTime,
nextSyncTime
// Chunk settings
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
autoIndexes,
chunkSize = 512,
chunkSplitter,
qaPrompt,
session
}: CreateOneCollectionParams) {
// Create collection tags
const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -196,25 +231,31 @@ export async function createOneCollection({
name,
type,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
rawTextLength,
hashRawText,
tags: collectionTags,
metadata,
createTime,
updateTime,
nextSyncTime,
...(fileId ? { fileId } : {}),
...(rawLink ? { rawLink } : {}),
...(externalFileId ? { externalFileId } : {}),
...(externalFileUrl ? { externalFileUrl } : {}),
...(apiFileId ? { apiFileId } : {}),
rawTextLength,
hashRawText,
tags: collectionTags,
// Parse settings
customPdfParse,
imageIndex,
createTime,
updateTime,
nextSyncTime
// Chunk settings
trainingType,
autoIndexes,
chunkSize,
chunkSplitter,
qaPrompt
}
],
{ session, ordered: true }
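createOneCollection (and createCollectionAndInsertData above it) now groups its inputs into parse settings (customPdfParse, imageIndex) and chunk settings (trainingType, autoIndexes, chunkSize, chunkSplitter, qaPrompt), and the 4.9.0 adapter rewrites the legacy auto training type into chunk plus autoIndexes. A hedged sketch of a caller passing the new fields; all concrete values and surrounding variable names (dataset, teamId, tmbId, rawText) are placeholders, not taken from this commit:

// Illustrative only.
await createCollectionAndInsertData({
  dataset,
  rawText,
  createCollectionParams: {
    teamId,
    tmbId,
    datasetId: dataset._id,
    name: 'manual.pdf',
    type: DatasetCollectionTypeEnum.file,
    // Parse settings
    customPdfParse: true, // e.g. route PDFs through the doc2x parser
    imageIndex: true,     // also build indexes for images found in the document
    // Chunk settings
    trainingType: DatasetCollectionDataProcessModeEnum.chunk,
    autoIndexes: false,
    chunkSize: 512
  }
});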

View File

@@ -1,7 +1,10 @@
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
import {
DatasetCollectionTypeMap,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
@@ -31,6 +34,8 @@ const DatasetCollectionSchema = new Schema({
ref: DatasetCollectionName,
required: true
},
// Basic info
type: {
type: String,
enum: Object.keys(DatasetCollectionTypeMap),
@@ -40,6 +45,11 @@ const DatasetCollectionSchema = new Schema({
type: String,
required: true
},
tags: {
type: [String],
default: []
},
createTime: {
type: Date,
default: () => new Date()
@@ -48,33 +58,8 @@ const DatasetCollectionSchema = new Schema({
type: Date,
default: () => new Date()
},
forbid: {
type: Boolean,
default: false
},
// chunk field
trainingType: {
type: String,
enum: Object.keys(TrainingTypeMap)
},
chunkSize: {
type: Number,
required: true
},
chunkSplitter: {
type: String
},
qaPrompt: {
type: String
},
ocrParse: Boolean,
tags: {
type: [String],
default: []
},
// Metadata
// local file collection
fileId: {
type: Schema.Types.ObjectId,
@@ -82,22 +67,39 @@ const DatasetCollectionSchema = new Schema({
},
// web link collection
rawLink: String,
// api collection
// Api collection
apiFileId: String,
// external collection
// external collection (abandoned)
externalFileId: String,
externalFileUrl: String, // external import url
// next sync time
nextSyncTime: Date,
// metadata
rawTextLength: Number,
hashRawText: String,
metadata: {
type: Object,
default: {}
}
},
forbid: Boolean,
// next sync time
nextSyncTime: Date,
// Parse settings
customPdfParse: Boolean,
// Chunk settings
imageIndex: Boolean,
autoIndexes: Boolean,
trainingType: {
type: String,
enum: Object.values(DatasetCollectionDataProcessModeEnum)
},
chunkSize: {
type: Number,
required: true
},
chunkSplitter: String,
qaPrompt: String
});
DatasetCollectionSchema.virtual('dataset', {

View File

@@ -2,12 +2,17 @@ import { MongoDatasetCollection } from './schema';
import { ClientSession } from '../../../common/mongo';
import { MongoDatasetCollectionTags } from '../tag/schema';
import { readFromSecondary } from '../../../common/mongo/utils';
import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
import {
CollectionWithDatasetType,
DatasetCollectionSchemaType
} from '@fastgpt/global/core/dataset/type';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionSyncResultEnum,
DatasetCollectionTypeEnum,
DatasetSourceReadTypeEnum,
DatasetTypeEnum
DatasetTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import { readDatasetSourceRawText } from '../read';
@@ -160,6 +165,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
})();
const rawText = await readDatasetSourceRawText({
teamId: collection.teamId,
tmbId: collection.tmbId,
...sourceReadType
});
@@ -220,3 +226,24 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
return DatasetCollectionSyncResultEnum.success;
};
/*
QA: independent process
Chunk: image index -> auto index -> chunk index
*/
export const getTrainingModeByCollection = (collection: {
trainingType: DatasetCollectionSchemaType['trainingType'];
autoIndexes?: DatasetCollectionSchemaType['autoIndexes'];
imageIndex?: DatasetCollectionSchemaType['imageIndex'];
}) => {
if (collection.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return TrainingModeEnum.qa;
}
if (collection.imageIndex && global.feConfigs?.isPlus) {
return TrainingModeEnum.image;
}
if (collection.autoIndexes && global.feConfigs?.isPlus) {
return TrainingModeEnum.auto;
}
return TrainingModeEnum.chunk;
};
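The helper encodes the priority described in the comment above: QA always gets its own queue, then on plus builds an image index beats an auto index, and everything else falls back to plain chunking. A few worked calls, assuming global.feConfigs?.isPlus is true:

getTrainingModeByCollection({ trainingType: DatasetCollectionDataProcessModeEnum.qa });
// -> TrainingModeEnum.qa (QA runs independently regardless of other flags)

getTrainingModeByCollection({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  imageIndex: true,
  autoIndexes: true
});
// -> TrainingModeEnum.image (image indexing is queued first; auto/chunk indexing follow later)

getTrainingModeByCollection({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  autoIndexes: true
});
// -> TrainingModeEnum.auto

getTrainingModeByCollection({ trainingType: DatasetCollectionDataProcessModeEnum.chunk });
// -> TrainingModeEnum.chunk (also the result on non-plus builds, whatever the flags)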

View File

@@ -7,6 +7,7 @@ import {
} from '@fastgpt/global/support/user/team/constant';
import { DatasetCollectionName } from '../schema';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
export const DatasetDataCollectionName = 'dataset_datas';
@@ -42,10 +43,16 @@ const DatasetDataSchema = new Schema({
indexes: {
type: [
{
// Abandoned
defaultIndex: {
type: Boolean,
default: false
},
type: {
type: String,
enum: Object.values(DatasetDataIndexTypeEnum),
default: DatasetDataIndexTypeEnum.custom
},
dataId: {
type: String,
required: true

View File

@@ -13,11 +13,15 @@ import { POST } from '../../common/api/plusRequest';
export const readFileRawTextByUrl = async ({
teamId,
tmbId,
url,
customPdfParse,
relatedId
}: {
teamId: string;
tmbId: string;
url: string;
customPdfParse?: boolean;
relatedId: string; // externalFileId / apiFileId
}) => {
const response = await axios({
@@ -30,8 +34,11 @@ export const readFileRawTextByUrl = async ({
const buffer = Buffer.from(response.data, 'binary');
const { rawText } = await readRawContentByFileBuffer({
customPdfParse,
isQAImport: false,
extension,
teamId,
tmbId,
buffer,
encoding: 'utf-8',
metadata: {
@@ -49,6 +56,7 @@ export const readFileRawTextByUrl = async ({
*/
export const readDatasetSourceRawText = async ({
teamId,
tmbId,
type,
sourceId,
isQAImport,
@@ -56,11 +64,14 @@ export const readDatasetSourceRawText = async ({
externalFileId,
apiServer,
feishuServer,
yuqueServer
yuqueServer,
customPdfParse
}: {
teamId: string;
tmbId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
customPdfParse?: boolean;
isQAImport?: boolean; // csv data
selector?: string; // link selector
@@ -72,9 +83,11 @@ export const readDatasetSourceRawText = async ({
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const { rawText } = await readFileContentFromMongo({
teamId,
tmbId,
bucketName: BucketNameEnum.dataset,
fileId: sourceId,
isQAImport
isQAImport,
customPdfParse
});
return rawText;
} else if (type === DatasetSourceReadTypeEnum.link) {
@@ -88,8 +101,10 @@ export const readDatasetSourceRawText = async ({
if (!externalFileId) return Promise.reject('FileId not found');
const rawText = await readFileRawTextByUrl({
teamId,
tmbId,
url: sourceId,
relatedId: externalFileId
relatedId: externalFileId,
customPdfParse
});
return rawText;
} else if (type === DatasetSourceReadTypeEnum.apiFile) {
@@ -98,7 +113,8 @@ export const readDatasetSourceRawText = async ({
feishuServer,
yuqueServer,
apiFileId: sourceId,
teamId
teamId,
tmbId
});
return rawText;
}
@@ -110,16 +126,18 @@ export const readApiServerFileContent = async ({
feishuServer,
yuqueServer,
apiFileId,
teamId
teamId,
tmbId
}: {
apiServer?: APIFileServer;
feishuServer?: FeishuServer;
yuqueServer?: YuqueServer;
apiFileId: string;
teamId: string;
tmbId: string;
}) => {
if (apiServer) {
return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, apiFileId });
return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, tmbId, apiFileId });
}
if (feishuServer || yuqueServer) {

View File

@@ -67,6 +67,7 @@ const DatasetSchema = new Schema({
required: true,
default: 'gpt-4o-mini'
},
vlmModel: String,
intro: {
type: String,
default: ''

View File

@@ -1,16 +1,16 @@
import { MongoDatasetTraining } from './schema';
import type {
PushDatasetDataChunkProps,
PushDatasetDataProps,
PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { ClientSession } from '../../../common/mongo';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { getCollectionWithDataset } from '../controller';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try {
@@ -28,20 +28,17 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
export const pushDataListToTrainingQueueByCollectionId = async ({
collectionId,
...props
}: {
teamId: string;
tmbId: string;
session?: ClientSession;
} & PushDatasetDataProps) => {
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
const {
dataset: { _id: datasetId, agentModel, vectorModel }
dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
} = await getCollectionWithDataset(collectionId);
return pushDataListToTrainingQueue({
...props,
datasetId,
collectionId,
vectorModel,
agentModel,
vectorModel
vlmModel
});
};
@@ -52,30 +49,30 @@ export async function pushDataListToTrainingQueue({
collectionId,
agentModel,
vectorModel,
vlmModel,
data,
prompt,
billId,
trainingMode = TrainingModeEnum.chunk,
mode = TrainingModeEnum.chunk,
session
}: {
teamId: string;
tmbId: string;
datasetId: string;
agentModel: string;
vectorModel: string;
session?: ClientSession;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
if (mode !== TrainingModeEnum.image) return mode;
// Check whether the content contains markdown images in the ![](xxx) format
const text = data.q + data.a || '';
const regex = /!\[\]\((.*?)\)/g;
const match = text.match(regex);
if (match) {
return TrainingModeEnum.image;
}
return mode;
};
const { model, maxToken, weight } = await (async () => {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(`File model ${agentModel} is invalid`);
}
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Vector model ${vectorModel} is invalid`);
}
if (trainingMode === TrainingModeEnum.chunk) {
if (mode === TrainingModeEnum.chunk) {
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Vector model ${vectorModel} is invalid`);
}
return {
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model,
@@ -83,7 +80,11 @@ export async function pushDataListToTrainingQueue({
};
}
if (trainingMode === TrainingModeEnum.qa || trainingMode === TrainingModeEnum.auto) {
if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(`File model ${agentModel} is invalid`);
}
return {
maxToken: agentModelData.maxContext * 0.8,
model: agentModelData.model,
@@ -91,8 +92,24 @@ export async function pushDataListToTrainingQueue({
};
}
return Promise.reject(`Training mode "${trainingMode}" is invalid`);
if (mode === TrainingModeEnum.image) {
const vllmModelData = getVlmModel(vlmModel);
if (!vllmModelData) {
return Promise.reject(`Vlm model ${vlmModel} is invalid`);
}
return {
maxToken: vllmModelData.maxContext * 0.8,
model: vllmModelData.model,
weight: 0
};
}
return Promise.reject(`Training mode "${mode}" is invalid`);
})();
// Filter redundant params
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
prompt = undefined;
}
// filter out repeated or identical content
const set = new Set();
@@ -158,7 +175,7 @@ export async function pushDataListToTrainingQueue({
datasetId,
collectionId,
billId,
mode: trainingMode,
mode: getImageChunkMode(item, mode),
prompt,
model,
q: item.q,

View File

@@ -1,14 +1,15 @@
/* Knowledge base (dataset) for models */
import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema } = connectionMongo;
import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import { TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
export const DatasetTrainingCollectionName = 'dataset_trainings';
@@ -25,7 +26,6 @@ const TrainingDataSchema = new Schema({
},
datasetId: {
type: Schema.Types.ObjectId,
ref: DatasetCollectionName,
required: true
},
collectionId: {
@@ -33,15 +33,13 @@ const TrainingDataSchema = new Schema({
ref: DatasetColCollectionName,
required: true
},
billId: {
// concat bill
type: String
},
billId: String,
mode: {
type: String,
enum: Object.keys(TrainingTypeMap),
enum: Object.values(TrainingModeEnum),
required: true
},
expireAt: {
// It will be deleted after 7 days
type: Date,
@@ -88,6 +86,10 @@ const TrainingDataSchema = new Schema({
indexes: {
type: [
{
type: {
type: String,
enum: Object.values(DatasetDataIndexTypeEnum)
},
text: {
type: String,
required: true
@@ -98,6 +100,19 @@ const TrainingDataSchema = new Schema({
}
});
TrainingDataSchema.virtual('dataset', {
ref: DatasetCollectionName,
localField: 'datasetId',
foreignField: '_id',
justOne: true
});
TrainingDataSchema.virtual('collection', {
ref: DatasetColCollectionName,
localField: 'collectionId',
foreignField: '_id',
justOne: true
});
try {
// lock training data(teamId); delete training data
TrainingDataSchema.index({ teamId: 1, datasetId: 1 });

View File

@@ -1,6 +1,7 @@
import { NodeOutputKeyEnum } from '@fastgpt/global/core/workflow/constants';
import { DispatchNodeResponseKeyEnum } from '@fastgpt/global/core/workflow/runtime/constants';
import type {
ChatDispatchProps,
DispatchNodeResultType,
RuntimeNodeItemType
} from '@fastgpt/global/core/workflow/runtime/type';
@@ -46,7 +47,7 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
query,
requestOrigin,
chatConfig,
runningAppInfo: { teamId },
runningUserInfo,
externalProvider,
params: {
model,
@@ -99,10 +100,10 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
const globalFiles = chatValue2RuntimePrompt(query).files;
const { documentQuoteText, userFiles } = await getMultiInput({
runningUserInfo,
histories: chatHistories,
requestOrigin,
maxFiles: chatConfig?.fileSelectConfig?.maxFiles || 20,
teamId,
fileLinks,
inputFiles: globalFiles,
hasReadFilesTool
@@ -289,19 +290,19 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
};
const getMultiInput = async ({
runningUserInfo,
histories,
fileLinks,
requestOrigin,
maxFiles,
teamId,
inputFiles,
hasReadFilesTool
}: {
runningUserInfo: ChatDispatchProps['runningUserInfo'];
histories: ChatItemType[];
fileLinks?: string[];
requestOrigin?: string;
maxFiles: number;
teamId: string;
inputFiles: UserChatItemValueItemType['file'][];
hasReadFilesTool: boolean;
}) => {
@@ -329,7 +330,8 @@ const getMultiInput = async ({
urls,
requestOrigin,
maxFiles,
teamId
teamId: runningUserInfo.teamId,
tmbId: runningUserInfo.tmbId
});
return {

View File

@@ -11,7 +11,10 @@ import { formatModelChars2Points } from '../../../../support/wallet/usage/utils'
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
import { postTextCensor } from '../../../../common/api/requestPlusApi';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import type { DispatchNodeResultType } from '@fastgpt/global/core/workflow/runtime/type';
import type {
ChatDispatchProps,
DispatchNodeResultType
} from '@fastgpt/global/core/workflow/runtime/type';
import { countGptMessagesTokens } from '../../../../common/string/tiktoken/index';
import {
chats2GPTMessages,
@@ -69,7 +72,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
histories,
node: { name },
query,
runningAppInfo: { teamId },
runningUserInfo,
workflowStreamResponse,
chatConfig,
params: {
@@ -121,7 +124,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
stringQuoteText,
requestOrigin,
maxFiles: chatConfig?.fileSelectConfig?.maxFiles || 20,
teamId
runningUserInfo
})
]);
@@ -355,7 +358,7 @@ async function getMultiInput({
stringQuoteText,
requestOrigin,
maxFiles,
teamId
runningUserInfo
}: {
histories: ChatItemType[];
inputFiles: UserChatItemValueItemType['file'][];
@@ -363,7 +366,7 @@ async function getMultiInput({
stringQuoteText?: string; // file quote
requestOrigin?: string;
maxFiles: number;
teamId: string;
runningUserInfo: ChatDispatchProps['runningUserInfo'];
}) {
// Legacy version adaptation ====>
if (stringQuoteText) {
@@ -400,7 +403,8 @@ async function getMultiInput({
urls,
requestOrigin,
maxFiles,
teamId
teamId: runningUserInfo.teamId,
tmbId: runningUserInfo.tmbId
});
return {

View File

@@ -45,7 +45,7 @@ ${content.slice(0, 100)}${content.length > 100 ? '......' : ''}
export const dispatchReadFiles = async (props: Props): Promise<Response> => {
const {
requestOrigin,
runningAppInfo: { teamId },
runningUserInfo: { teamId, tmbId },
histories,
chatConfig,
node: { version },
@@ -61,7 +61,8 @@ export const dispatchReadFiles = async (props: Props): Promise<Response> => {
urls: [...fileUrlList, ...filesFromHistories],
requestOrigin,
maxFiles,
teamId
teamId,
tmbId
});
return {
@@ -105,12 +106,14 @@ export const getFileContentFromLinks = async ({
urls,
requestOrigin,
maxFiles,
teamId
teamId,
tmbId
}: {
urls: string[];
requestOrigin?: string;
maxFiles: number;
teamId: string;
tmbId: string;
}) => {
const parseUrlList = urls
// Remove invalid urls
@@ -205,6 +208,7 @@ export const getFileContentFromLinks = async ({
extension,
isQAImport: false,
teamId,
tmbId,
buffer,
encoding
});
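Both the read-files node and getFileContentFromLinks now require tmbId next to teamId and thread it down to readRawContentByFileBuffer, presumably so the heavier parsing paths (such as the custom PDF parser) can be attributed to a team member. A sketch of a call with the new signature; the URL and limits are placeholders:

// teamId/tmbId would come from the authenticated request context.
const contents = await getFileContentFromLinks({
  urls: ['https://example.com/files/report.pdf'],
  requestOrigin: 'https://example.com',
  maxFiles: 20,
  teamId,
  tmbId // newly required by this commit
});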