perf: image index (#5071)

* doc

* perf: img cite prompt

* perf: image index

* perf: add api key to retrain api
This commit is contained in:
Archer
2025-06-20 13:25:32 +08:00
committed by GitHub
parent be72dda47e
commit 3ed3f2ad01
22 changed files with 153 additions and 55 deletions

View File

@@ -17,5 +17,9 @@ weight: 787
1. 统一知识库训练队列代码逻辑。
2. 输入框 UX。
3. 图片知识库自动去除介绍中的换行,避免模型输出换行导致无法显示图片。
4. 图片索引过程会单独描述图片内容,并在检索后会将图片描述赋予检索结果,使语言模型也可以对图片进行理解。
## 🐛 修复
1. 知识库数据输入,识别 QA 模式错误。

View File

@@ -16,7 +16,8 @@ export const bucketNameMap = {
}
};
export const ReadFileBaseUrl = `${process.env.FILE_DOMAIN || process.env.FE_DOMAIN || ''}${process.env.NEXT_PUBLIC_BASE_URL || ''}/api/common/file/read`;
export const EndpointUrl = `${process.env.FILE_DOMAIN || process.env.FE_DOMAIN || ''}${process.env.NEXT_PUBLIC_BASE_URL || ''}`;
export const ReadFileBaseUrl = `${EndpointUrl}/api/common/file/read`;
export const documentFileType = '.txt, .docx, .csv, .xlsx, .pdf, .md, .html, .pptx';
export const imageFileType =

View File

@@ -22,9 +22,9 @@ export const Prompt_userQuotePromptList: PromptTemplateItem[] = [
## 通用规则
- 如果你不清楚答案,你需要澄清。
- 避免提及你是从 <Cites></Cites> 获取的知识。
- 保持答案与 <Cites></Cites> 中描述的一致。
- 保持答案与 <Cites></Cites> 中描述的一致。但是要避免提及你是从 <Cites></Cites> 获取的知识。
- 使用 Markdown 语法优化回答格式。尤其是图片、表格、序列号等内容,需严格完整输出。
- 如果有合适的图片作为回答,则必须输出图片。输出图片时,仅需输出图片的 url不要输出图片描述例如[](url)。
- 使用与问题相同的语言回答。
<Cites>
@@ -84,9 +84,9 @@ export const Prompt_userQuotePromptList: PromptTemplateItem[] = [
## 通用规则
- 如果你不清楚答案,你需要澄清。
- 避免提及你是从 <Cites></Cites> 获取的知识。
- 保持答案与 <Cites></Cites> 中描述的一致。
- 保持答案与 <Cites></Cites> 中描述的一致。但是要避免提及你是从 <Cites></Cites> 获取的知识。
- 使用 Markdown 语法优化回答格式。尤其是图片、表格、序列号等内容,需严格完整输出。
- 如果有合适的图片作为回答,则必须输出图片。输出图片时,仅需输出图片的 url不要输出图片描述例如[](url)。
- 使用与问题相同的语言回答。
## 严格要求
@@ -157,9 +157,9 @@ export const Prompt_systemQuotePromptList: PromptTemplateItem[] = [
## 通用规则
- 如果你不清楚答案,你需要澄清。
- 避免提及你是从 <Cites></Cites> 获取的知识。
- 保持答案与 <Cites></Cites> 中描述的一致。
- 保持答案与 <Cites></Cites> 中描述的一致。但是要避免提及你是从 <Cites></Cites> 获取的知识。
- 使用 Markdown 语法优化回答格式。尤其是图片、表格、序列号等内容,需严格完整输出。
- 如果有合适的图片作为回答,则必须输出图片。输出图片时,仅需输出图片的 url不要输出图片描述例如[](url)。
- 使用与问题相同的语言回答。
<Cites>
@@ -205,9 +205,9 @@ export const Prompt_systemQuotePromptList: PromptTemplateItem[] = [
## 通用规则
- 如果你不清楚答案,你需要澄清。
- 避免提及你是从 <Cites></Cites> 获取的知识。
- 保持答案与 <Cites></Cites> 中描述的一致。
- 保持答案与 <Cites></Cites> 中描述的一致。但是要避免提及你是从 <Cites></Cites> 获取的知识。
- 使用 Markdown 语法优化回答格式。尤其是图片、表格、序列号等内容,需严格完整输出。
- 如果有合适的图片作为回答,则必须输出图片。输出图片时,仅需输出图片的 url不要输出图片描述例如[](url)。
- 使用与问题相同的语言回答。
## 严格要求

View File

@@ -163,6 +163,7 @@ export type DatasetDataSchemaType = DatasetDataFieldType & {
fullTextToken: string;
indexes: DatasetDataIndexItemType[];
rebuilding?: boolean;
imageDescMap?: Record<string, string>;
};
export type DatasetDataTextSchemaType = {
@@ -189,6 +190,7 @@ export type DatasetTrainingSchemaType = {
q: string;
a: string;
imageId?: string;
imageDescMap?: Record<string, string>;
chunkIndex: number;
indexSize?: number;
weight: number;

View File

@@ -11,7 +11,7 @@ import axios from 'axios';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import { i18nT } from '../../../web/i18n/utils';
import { addLog } from '../../common/system/log';
import { addEndpointToImageUrl, getImageBase64 } from '../../common/file/image/utils';
import { getImageBase64 } from '../../common/file/image/utils';
export const filterGPTMessageByMaxContext = async ({
messages = [],
@@ -100,12 +100,12 @@ export const loadRequestMessages = async ({
): string | ChatCompletionContentPartText[] | undefined => {
if (typeof content === 'string') {
if (!content) return;
return addEndpointToImageUrl(content);
return content;
}
const arrayContent = content
.filter((item) => item.text)
.map((item) => addEndpointToImageUrl(item.text))
.map((item) => item.text)
.join('\n');
return arrayContent;

View File

@@ -1,23 +1,54 @@
import { addEndpointToImageUrl } from '../../../common/file/image/utils';
import { getDatasetImagePreviewUrl } from '../image/utils';
import type { DatasetCiteItemType, DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type';
export const formatDatasetDataValue = ({
teamId,
datasetId,
q,
a,
imageId,
teamId,
datasetId
imageDescMap
}: {
teamId: string;
datasetId: string;
q: string;
a?: string;
imageId?: string;
teamId: string;
datasetId: string;
imageDescMap?: Record<string, string>;
}): {
q: string;
a?: string;
imagePreivewUrl?: string;
} => {
// Add image description to image markdown
if (imageDescMap) {
// Helper function to replace image markdown with description
const replaceImageMarkdown = (text: string): string => {
return text.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (match, altText, url) => {
const description = imageDescMap[url];
if (description) {
// Add description to alt text, keeping original if exists
const newAltText = altText ? `${altText} - ${description}` : description;
return `![${newAltText.replace(/\n/g, '')}](${url})`;
}
return match; // Return original if no description found
});
};
// Apply replacement to both q and a
q = replaceImageMarkdown(q);
if (a) {
a = replaceImageMarkdown(a);
}
}
// Add image base url
q = addEndpointToImageUrl(q);
if (a) {
a = addEndpointToImageUrl(a);
}
if (!imageId) {
return {
q,
@@ -33,7 +64,7 @@ export const formatDatasetDataValue = ({
});
return {
q: `![${q.replaceAll('\n', '\\n')}](${previewUrl})`,
q: `![${q.replaceAll('\n', '')}](${previewUrl})`,
a,
imagePreivewUrl: previewUrl
};

View File

@@ -39,6 +39,8 @@ const DatasetDataSchema = new Schema({
a: {
type: String
},
imageId: String,
imageDescMap: Object,
history: {
type: [
{
@@ -73,9 +75,6 @@ const DatasetDataSchema = new Schema({
default: []
},
imageId: {
type: String
},
updateTime: {
type: Date,
default: () => new Date()

View File

@@ -4,6 +4,7 @@ import { deleteDatasetImage } from './controller';
import { MongoDatasetImageSchema } from './schema';
import { addMinutes } from 'date-fns';
import jwt from 'jsonwebtoken';
import { EndpointUrl } from '@fastgpt/global/common/file/constants';
export const removeDatasetImageExpiredTime = async ({
ids = [],
@@ -51,17 +52,19 @@ export const getDatasetImagePreviewUrl = ({
{
teamId: String(teamId),
datasetId: String(datasetId),
imageId: String(imageId),
exp: expiredTime
},
key
);
return `/api/core/dataset/image/${imageId}?token=${token}`;
return `${EndpointUrl}/api/file/datasetImg/${token}.jpeg`;
};
export const authDatasetImagePreviewUrl = (token?: string) =>
new Promise<{
teamId: string;
datasetId: string;
imageId: string;
}>((resolve, reject) => {
if (!token) {
return reject(ERROR_ENUM.unAuthFile);
@@ -75,7 +78,8 @@ export const authDatasetImagePreviewUrl = (token?: string) =>
}
resolve({
teamId: decoded.teamId,
datasetId: decoded.datasetId
datasetId: decoded.datasetId,
imageId: decoded.imageId
});
});
});

View File

@@ -178,7 +178,7 @@ export async function searchDatasetData(
// Constants data
const datasetDataSelectField =
'_id datasetId collectionId updateTime q a imageId chunkIndex indexes';
'_id datasetId collectionId updateTime q a imageId imageDescMap chunkIndex indexes';
const datsaetCollectionSelectField =
'_id name fileId rawLink apiFileId externalFileId externalFileUrl';
@@ -506,7 +506,8 @@ export async function searchDatasetData(
datasetId: data.datasetId,
q: data.q,
a: data.a,
imageId: data.imageId
imageId: data.imageId,
imageDescMap: data.imageDescMap
}),
chunkIndex: data.chunkIndex,
datasetId: String(data.datasetId),
@@ -647,7 +648,8 @@ export async function searchDatasetData(
datasetId: data.datasetId,
q: data.q,
a: data.a,
imageId: data.imageId
imageId: data.imageId,
imageDescMap: data.imageDescMap
}),
chunkIndex: data.chunkIndex,
indexes: data.indexes,

View File

@@ -41,18 +41,6 @@ export async function pushDataListToTrainingQueue({
indexSize,
session
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
const formatTrainingMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
if (mode !== TrainingModeEnum.image) return mode;
// 检查内容中,是否包含 ![](xxx) 的图片格式
const text = (data.q || '') + (data.a || '');
const regex = /!\[\]\((.*?)\)/g;
const match = text.match(regex);
if (match) {
return TrainingModeEnum.image;
}
return mode;
};
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(i18nT('common:error_embedding_not_config'));
@@ -130,7 +118,7 @@ export async function pushDataListToTrainingQueue({
datasetId: datasetId,
collectionId: collectionId,
billId,
mode: formatTrainingMode(item, mode),
mode,
...(item.q && { q: item.q }),
...(item.a && { a: item.a }),
...(item.imageId && { imageId: item.imageId }),

View File

@@ -64,6 +64,7 @@ const TrainingDataSchema = new Schema({
default: ''
},
imageId: String,
imageDescMap: Object,
chunkIndex: {
type: Number,
default: 0

View File

@@ -16,7 +16,6 @@ import { MongoDataset } from '../../../dataset/schema';
import { i18nT } from '../../../../../web/i18n/utils';
import { filterDatasetsByTmbId } from '../../../dataset/utils';
import { ModelTypeEnum } from '@fastgpt/global/core/ai/model';
import { addEndpointToImageUrl } from '../../../../common/file/image/utils';
import { getDatasetSearchToolResponsePrompt } from '../../../../../global/core/ai/prompt/dataset';
type DatasetSearchProps = ModuleDispatchProps<{
@@ -272,7 +271,7 @@ export async function dispatchDatasetSearch(
id: item.id,
sourceName: item.sourceName,
updateTime: item.updateTime,
content: addEndpointToImageUrl(`${item.q}\n${item.a}`.trim())
content: `${item.q}\n${item.a}`.trim()
}))
}
};

View File

@@ -17,6 +17,7 @@ import { DatasetTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { type ParentIdType } from '@fastgpt/global/common/parentFolder/type';
import { DatasetDefaultPermissionVal } from '@fastgpt/global/support/permission/dataset/constant';
import { getDatasetImagePreviewUrl } from '../../../core/dataset/image/utils';
import { i18nT } from '../../../../web/i18n/utils';
export const authDatasetByTmbId = async ({
tmbId,
@@ -254,7 +255,7 @@ export async function authDatasetData({
const datasetData = await MongoDatasetData.findById(dataId);
if (!datasetData) {
return Promise.reject('core.dataset.error.Data not found');
return Promise.reject(i18nT('common:core.dataset.error.Data not found'));
}
const result = await authDatasetCollection({

View File

@@ -49,7 +49,7 @@ export const defaultFormData: ImportFormType = {
imageIndex: false,
autoIndexes: false,
indexPrefixTitle: true,
indexPrefixTitle: false,
chunkSettingMode: ChunkSettingModeEnum.auto,
chunkSplitMode: DataChunkSplitModeEnum.paragraph,

View File

@@ -43,9 +43,10 @@ async function handler(
const { collection } = await authDatasetCollection({
req,
collectionId,
per: ReadPermissionVal,
authToken: true,
collectionId: collectionId as string,
per: ReadPermissionVal
authApiKey: true
});
const match = {

View File

@@ -9,6 +9,7 @@ import { type ApiRequestProps } from '@fastgpt/service/type/next';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { ChatErrEnum } from '@fastgpt/global/common/error/code/chat';
import { i18nT } from '@fastgpt/web/i18n/utils';
import { formatDatasetDataValue } from '@fastgpt/service/core/dataset/data/controller';
export type GetQuoteDataResponse = {
collection: DatasetCollectionSchemaType;
@@ -78,8 +79,13 @@ async function handler(req: ApiRequestProps<GetQuoteDataProps>): Promise<GetQuot
return {
collection,
q: datasetData.q,
a: datasetData.a
...formatDatasetDataValue({
teamId: datasetData.teamId,
datasetId: datasetData.datasetId,
q: datasetData.q,
a: datasetData.a,
imageId: datasetData.imageId
})
};
} else {
const { datasetData, collection } = await authDatasetData({
@@ -91,8 +97,13 @@ async function handler(req: ApiRequestProps<GetQuoteDataProps>): Promise<GetQuot
});
return {
collection,
q: datasetData.q,
a: datasetData.a
...formatDatasetDataValue({
teamId: datasetData.teamId,
datasetId: datasetData.datasetId,
q: datasetData.q,
a: datasetData.a,
imageId: datasetData.imageId
})
};
}
})();

View File

@@ -1,14 +1,9 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import type { NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { getDownloadStream, getFileById } from '@fastgpt/service/common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import type { ApiRequestProps } from '@fastgpt/service/type/next';
import { authDatasetImagePreviewUrl } from '@fastgpt/service/core/dataset/image/utils';
import { getDatasetImageReadData } from '@fastgpt/service/core/dataset/image/controller';
const previewableExtensions = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp'];
export default async function handler(
req: ApiRequestProps<
{},

View File

@@ -32,6 +32,7 @@ async function handler(
const { teamId } = await authDatasetCollection({
req,
authToken: true,
authApiKey: true,
collectionId,
per: ReadPermissionVal
});

View File

@@ -21,6 +21,7 @@ async function handler(req: ApiRequestProps<getTrainingErrorBody, {}>) {
const { collection } = await authDatasetCollection({
req,
authToken: true,
authApiKey: true,
collectionId,
per: ReadPermissionVal
});

View File

@@ -0,0 +1,53 @@
import type { NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import type { ApiRequestProps } from '@fastgpt/service/type/next';
import { authDatasetImagePreviewUrl } from '@fastgpt/service/core/dataset/image/utils';
import { getDatasetImageReadData } from '@fastgpt/service/core/dataset/image/controller';
// Serves dataset images referenced by signed preview URLs of the form
// `${EndpointUrl}/api/file/datasetImg/${token}.jpeg` (see getDatasetImagePreviewUrl).
// The trailing ".jpeg" suffix exists only so the URL looks like a static image
// file; the token itself is presumably a JWT carrying teamId/datasetId/imageId
// — TODO confirm against authDatasetImagePreviewUrl.
export default async function handler(
  req: ApiRequestProps<
    {},
    {
      token: string;
    }
  >,
  res: NextApiResponse<any>
) {
  try {
    const { token } = req.query;
    // Reject requests that carry no token at all.
    if (!token) {
      return jsonRes(res, {
        code: 401,
        error: 'ImageId not found'
      });
    }
    // Strip the cosmetic ".jpeg" extension to recover the raw token.
    const formatToken = token.replace(/\.jpeg$/, '');
    // Verify token and permissions
    const { imageId } = await authDatasetImagePreviewUrl(formatToken);
    // Resolve the image's metadata and a readable stream of its bytes.
    const { fileInfo, stream } = await getDatasetImageReadData(imageId);
    // Set response headers; max-age=31536000 (1 year) treats the image as immutable.
    res.setHeader('Content-Type', fileInfo.contentType);
    res.setHeader('Cache-Control', 'public, max-age=31536000');
    res.setHeader('Content-Length', fileInfo.length);
    stream.pipe(res);
    stream.on('error', (error) => {
      // If headers were already flushed we cannot change the status;
      // the client just sees a truncated body.
      if (!res.headersSent) {
        res.status(500).end();
      }
    });
    stream.on('end', () => {
      res.end();
    });
  } catch (error) {
    // Covers token verification failures and unreadable/missing image data.
    return jsonRes(res, {
      code: 500,
      error
    });
  }
}

View File

@@ -173,10 +173,12 @@ export async function insertData2Dataset({
indexes,
indexPrefix,
embeddingModel,
imageDescMap,
session
}: CreateDatasetDataProps & {
embeddingModel: string;
indexSize?: number;
imageDescMap?: Record<string, string>;
session?: ClientSession;
}) {
if (!q || !datasetId || !collectionId || !embeddingModel) {
@@ -234,9 +236,10 @@ export async function insertData2Dataset({
tmbId,
datasetId,
collectionId,
imageId,
q,
a,
imageId,
imageDescMap,
chunkIndex,
indexes: results.map((item) => item.index)
}

View File

@@ -279,6 +279,7 @@ const insertData = async ({ trainingData }: { trainingData: TrainingDataType })
q: trainingData.q,
a: trainingData.a,
imageId: trainingData.imageId,
imageDescMap: trainingData.imageDescMap,
chunkIndex: trainingData.chunkIndex,
indexSize:
trainingData.indexSize ||