perf: image index (#5071)

* doc

* perf: img cite prompt

* perf: image index

* perf: add api key to retrain api
This commit is contained in:
Archer
2025-06-20 13:25:32 +08:00
committed by GitHub
parent be72dda47e
commit 3ed3f2ad01
22 changed files with 153 additions and 55 deletions

View File

@@ -1,23 +1,54 @@
import { addEndpointToImageUrl } from '../../../common/file/image/utils';
import { getDatasetImagePreviewUrl } from '../image/utils';
import type { DatasetCiteItemType, DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type';
export const formatDatasetDataValue = ({
teamId,
datasetId,
q,
a,
imageId,
teamId,
datasetId
imageDescMap
}: {
teamId: string;
datasetId: string;
q: string;
a?: string;
imageId?: string;
teamId: string;
datasetId: string;
imageDescMap?: Record<string, string>;
}): {
q: string;
a?: string;
imagePreivewUrl?: string;
} => {
// Add image description to image markdown
if (imageDescMap) {
/**
 * Rewrites every markdown image (`![alt](url)`) found in `text` so that its
 * alt text carries the description registered for that url in `imageDescMap`
 * (taken from the enclosing scope).
 *
 * - If the map holds no description for the url, the image markdown is
 *   returned unchanged.
 * - Existing alt text is preserved; the description is appended as
 *   `alt - description`.
 * - Newlines are removed from the resulting alt text.
 *
 * @param text Markdown text that may contain image references.
 * @returns The text with image alt texts enriched by their descriptions.
 */
const replaceImageMarkdown = (text: string): string => {
  return text.replace(
    /!\[([^\]]*)\]\(([^)]+)\)/g,
    (match: string, altText: string, url: string): string => {
      // Only read the map's own keys. A plain index lookup also walks the
      // prototype chain, so a url such as `constructor` or `toString` would
      // resolve to a function and either throw on `.replace` or leak its
      // source text into the alt text.
      const description = Object.prototype.hasOwnProperty.call(imageDescMap, url)
        ? imageDescMap[url]
        : undefined;

      // Missing, empty or non-string entries leave the markdown untouched.
      if (typeof description !== 'string' || !description) {
        return match;
      }

      // Add description to alt text, keeping the original alt text if present.
      const newAltText = altText ? `${altText} - ${description}` : description;
      return `![${newAltText.replace(/\n/g, '')}](${url})`;
    }
  );
};
// Apply replacement to both q and a
q = replaceImageMarkdown(q);
if (a) {
a = replaceImageMarkdown(a);
}
}
// Add image base url
q = addEndpointToImageUrl(q);
if (a) {
a = addEndpointToImageUrl(a);
}
if (!imageId) {
return {
q,
@@ -33,7 +64,7 @@ export const formatDatasetDataValue = ({
});
return {
q: `![${q.replaceAll('\n', '\\n')}](${previewUrl})`,
q: `![${q.replaceAll('\n', '')}](${previewUrl})`,
a,
imagePreivewUrl: previewUrl
};

View File

@@ -39,6 +39,8 @@ const DatasetDataSchema = new Schema({
a: {
type: String
},
imageId: String,
imageDescMap: Object,
history: {
type: [
{
@@ -73,9 +75,6 @@ const DatasetDataSchema = new Schema({
default: []
},
imageId: {
type: String
},
updateTime: {
type: Date,
default: () => new Date()

View File

@@ -4,6 +4,7 @@ import { deleteDatasetImage } from './controller';
import { MongoDatasetImageSchema } from './schema';
import { addMinutes } from 'date-fns';
import jwt from 'jsonwebtoken';
import { EndpointUrl } from '@fastgpt/global/common/file/constants';
export const removeDatasetImageExpiredTime = async ({
ids = [],
@@ -51,17 +52,19 @@ export const getDatasetImagePreviewUrl = ({
{
teamId: String(teamId),
datasetId: String(datasetId),
imageId: String(imageId),
exp: expiredTime
},
key
);
return `/api/core/dataset/image/${imageId}?token=${token}`;
return `${EndpointUrl}/api/file/datasetImg/${token}.jpeg`;
};
export const authDatasetImagePreviewUrl = (token?: string) =>
new Promise<{
teamId: string;
datasetId: string;
imageId: string;
}>((resolve, reject) => {
if (!token) {
return reject(ERROR_ENUM.unAuthFile);
@@ -75,7 +78,8 @@ export const authDatasetImagePreviewUrl = (token?: string) =>
}
resolve({
teamId: decoded.teamId,
datasetId: decoded.datasetId
datasetId: decoded.datasetId,
imageId: decoded.imageId
});
});
});

View File

@@ -178,7 +178,7 @@ export async function searchDatasetData(
// Constants data
const datasetDataSelectField =
'_id datasetId collectionId updateTime q a imageId chunkIndex indexes';
'_id datasetId collectionId updateTime q a imageId imageDescMap chunkIndex indexes';
const datsaetCollectionSelectField =
'_id name fileId rawLink apiFileId externalFileId externalFileUrl';
@@ -506,7 +506,8 @@ export async function searchDatasetData(
datasetId: data.datasetId,
q: data.q,
a: data.a,
imageId: data.imageId
imageId: data.imageId,
imageDescMap: data.imageDescMap
}),
chunkIndex: data.chunkIndex,
datasetId: String(data.datasetId),
@@ -647,7 +648,8 @@ export async function searchDatasetData(
datasetId: data.datasetId,
q: data.q,
a: data.a,
imageId: data.imageId
imageId: data.imageId,
imageDescMap: data.imageDescMap
}),
chunkIndex: data.chunkIndex,
indexes: data.indexes,

View File

@@ -41,18 +41,6 @@ export async function pushDataListToTrainingQueue({
indexSize,
session
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
/**
 * Resolves the training mode to record for a pushed item.
 *
 * Any mode other than `TrainingModeEnum.image` is returned as-is. For image
 * mode, the concatenated `q` + `a` text is scanned for empty-alt markdown
 * images (`![](url)`).
 *
 * NOTE(review): both outcomes of that scan return the image mode, so the
 * result always equals `mode` — confirm whether a fallback mode was intended.
 */
const formatTrainingMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
  if (mode === TrainingModeEnum.image) {
    // Check whether the content contains an image in the ![](xxx) format
    const content = `${data.q || ''}${data.a || ''}`;
    const imageMatches = content.match(/!\[\]\((.*?)\)/g);
    return imageMatches ? TrainingModeEnum.image : mode;
  }
  return mode;
};
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(i18nT('common:error_embedding_not_config'));
@@ -130,7 +118,7 @@ export async function pushDataListToTrainingQueue({
datasetId: datasetId,
collectionId: collectionId,
billId,
mode: formatTrainingMode(item, mode),
mode,
...(item.q && { q: item.q }),
...(item.a && { a: item.a }),
...(item.imageId && { imageId: item.imageId }),

View File

@@ -64,6 +64,7 @@ const TrainingDataSchema = new Schema({
default: ''
},
imageId: String,
imageDescMap: Object,
chunkIndex: {
type: Number,
default: 0