diff --git a/docSite/content/zh-cn/docs/development/upgrading/4914.md b/docSite/content/zh-cn/docs/development/upgrading/4914.md
index 78d0d725d..894a1ef45 100644
--- a/docSite/content/zh-cn/docs/development/upgrading/4914.md
+++ b/docSite/content/zh-cn/docs/development/upgrading/4914.md
@@ -17,5 +17,9 @@ weight: 787
1. 统一知识库训练队列代码逻辑。
2. 输入框 UX。
+3. 图片知识库自动去除介绍中的换行,避免模型输出换行导致无法显示图片。
+4. 图片索引过程中会单独生成图片描述,并在检索后将该描述附加到检索结果中,使语言模型也可以理解图片内容。
## 🐛 修复
+
+1. 知识库数据输入时,QA 模式识别错误。
\ No newline at end of file
diff --git a/packages/global/common/file/constants.ts b/packages/global/common/file/constants.ts
index d98793b0b..ac48e3a3e 100644
--- a/packages/global/common/file/constants.ts
+++ b/packages/global/common/file/constants.ts
@@ -16,7 +16,8 @@ export const bucketNameMap = {
}
};
-export const ReadFileBaseUrl = `${process.env.FILE_DOMAIN || process.env.FE_DOMAIN || ''}${process.env.NEXT_PUBLIC_BASE_URL || ''}/api/common/file/read`;
+export const EndpointUrl = `${process.env.FILE_DOMAIN || process.env.FE_DOMAIN || ''}${process.env.NEXT_PUBLIC_BASE_URL || ''}`; // Domain prefix (FILE_DOMAIN, else FE_DOMAIN, else empty) followed by NEXT_PUBLIC_BASE_URL (or empty); may be an empty string when none are set
+export const ReadFileBaseUrl = `${EndpointUrl}/api/common/file/read`; // EndpointUrl with the file-read API path appended
export const documentFileType = '.txt, .docx, .csv, .xlsx, .pdf, .md, .html, .pptx';
export const imageFileType =
diff --git a/packages/global/core/ai/prompt/AIChat.ts b/packages/global/core/ai/prompt/AIChat.ts
index b3869ad42..a81bed8c0 100644
--- a/packages/global/core/ai/prompt/AIChat.ts
+++ b/packages/global/core/ai/prompt/AIChat.ts
@@ -22,9 +22,9 @@ export const Prompt_userQuotePromptList: PromptTemplateItem[] = [
## 通用规则
- 如果你不清楚答案,你需要澄清。
-- 避免提及你是从 获取的知识。
-- 保持答案与 中描述的一致。
+- 保持答案与 中描述的一致。但是要避免提及你是从 获取的知识。
- 使用 Markdown 语法优化回答格式。尤其是图片、表格、序列号等内容,需严格完整输出。
+- 如果有合适的图片作为回答,则必须输出图片。输出图片时,仅需输出图片的 url,不要输出图片描述,例如:[](url)。
- 使用与问题相同的语言回答。
@@ -84,9 +84,9 @@ export const Prompt_userQuotePromptList: PromptTemplateItem[] = [
## 通用规则
- 如果你不清楚答案,你需要澄清。
-- 避免提及你是从 获取的知识。
-- 保持答案与 中描述的一致。
+- 保持答案与 中描述的一致。但是要避免提及你是从 获取的知识。
- 使用 Markdown 语法优化回答格式。尤其是图片、表格、序列号等内容,需严格完整输出。
+- 如果有合适的图片作为回答,则必须输出图片。输出图片时,仅需输出图片的 url,不要输出图片描述,例如:[](url)。
- 使用与问题相同的语言回答。
## 严格要求
@@ -157,9 +157,9 @@ export const Prompt_systemQuotePromptList: PromptTemplateItem[] = [
## 通用规则
- 如果你不清楚答案,你需要澄清。
-- 避免提及你是从 获取的知识。
-- 保持答案与 中描述的一致。
+- 保持答案与 中描述的一致。但是要避免提及你是从 获取的知识。
- 使用 Markdown 语法优化回答格式。尤其是图片、表格、序列号等内容,需严格完整输出。
+- 如果有合适的图片作为回答,则必须输出图片。输出图片时,仅需输出图片的 url,不要输出图片描述,例如:[](url)。
- 使用与问题相同的语言回答。
@@ -205,9 +205,9 @@ export const Prompt_systemQuotePromptList: PromptTemplateItem[] = [
## 通用规则
- 如果你不清楚答案,你需要澄清。
-- 避免提及你是从 获取的知识。
-- 保持答案与 中描述的一致。
+- 保持答案与 中描述的一致。但是要避免提及你是从 获取的知识。
- 使用 Markdown 语法优化回答格式。尤其是图片、表格、序列号等内容,需严格完整输出。
+- 如果有合适的图片作为回答,则必须输出图片。输出图片时,仅需输出图片的 url,不要输出图片描述,例如:[](url)。
- 使用与问题相同的语言回答。
## 严格要求
diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts
index 37047705f..65030f76e 100644
--- a/packages/global/core/dataset/type.d.ts
+++ b/packages/global/core/dataset/type.d.ts
@@ -163,6 +163,7 @@ export type DatasetDataSchemaType = DatasetDataFieldType & {
fullTextToken: string;
indexes: DatasetDataIndexItemType[];
rebuilding?: boolean;
+ imageDescMap?: Record;
};
export type DatasetDataTextSchemaType = {
@@ -189,6 +190,7 @@ export type DatasetTrainingSchemaType = {
q: string;
a: string;
imageId?: string;
+ imageDescMap?: Record;
chunkIndex: number;
indexSize?: number;
weight: number;
diff --git a/packages/service/core/chat/utils.ts b/packages/service/core/chat/utils.ts
index 701a9f525..6d865be77 100644
--- a/packages/service/core/chat/utils.ts
+++ b/packages/service/core/chat/utils.ts
@@ -11,7 +11,7 @@ import axios from 'axios';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import { i18nT } from '../../../web/i18n/utils';
import { addLog } from '../../common/system/log';
-import { addEndpointToImageUrl, getImageBase64 } from '../../common/file/image/utils';
+import { getImageBase64 } from '../../common/file/image/utils';
export const filterGPTMessageByMaxContext = async ({
messages = [],
@@ -100,12 +100,12 @@ export const loadRequestMessages = async ({
): string | ChatCompletionContentPartText[] | undefined => {
if (typeof content === 'string') {
if (!content) return;
- return addEndpointToImageUrl(content);
+ return content;
}
const arrayContent = content
.filter((item) => item.text)
- .map((item) => addEndpointToImageUrl(item.text))
+ .map((item) => item.text)
.join('\n');
return arrayContent;
diff --git a/packages/service/core/dataset/data/controller.ts b/packages/service/core/dataset/data/controller.ts
index b1872318e..50c705007 100644
--- a/packages/service/core/dataset/data/controller.ts
+++ b/packages/service/core/dataset/data/controller.ts
@@ -1,23 +1,54 @@
+import { addEndpointToImageUrl } from '../../../common/file/image/utils';
import { getDatasetImagePreviewUrl } from '../image/utils';
import type { DatasetCiteItemType, DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type';
export const formatDatasetDataValue = ({
+ teamId,
+ datasetId,
q,
a,
imageId,
- teamId,
- datasetId
+ imageDescMap
}: {
+ teamId: string;
+ datasetId: string;
q: string;
a?: string;
imageId?: string;
- teamId: string;
- datasetId: string;
+ imageDescMap?: Record;
}): {
q: string;
a?: string;
imagePreivewUrl?: string;
} => {
+ // Add image description to image markdown
+ if (imageDescMap) {
+    /**
+     * Rewrites every markdown image (`![alt](url)`) in `text` so that its alt text
+     * carries the description stored for that url in `imageDescMap`.
+     *
+     * - An image whose url has no description is returned unchanged.
+     * - An existing alt text is kept and the description is appended after " - ".
+     *
+     * @param text Markdown text that may contain image tags.
+     * @returns The text with image alt texts enriched; the image url is always preserved.
+     */
+    const replaceImageMarkdown = (text: string): string => {
+      return text.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (match, altText, url) => {
+        // Own-property lookup only, so a url such as "constructor" cannot resolve
+        // to a member inherited from Object.prototype.
+        const description = Object.prototype.hasOwnProperty.call(imageDescMap, url)
+          ? imageDescMap[url]
+          : undefined;
+        if (description) {
+          // Add description to alt text, keeping original if exists
+          const newAltText = altText ? `${altText} - ${description}` : description;
+          // Re-emit a complete image tag; returning an empty string here would
+          // delete the image from the text instead of annotating it.
+          return `![${newAltText}](${url})`;
+        }
+        return match; // Return original if no description found
+      });
+    };
+
+ // Apply replacement to both q and a
+ q = replaceImageMarkdown(q);
+ if (a) {
+ a = replaceImageMarkdown(a);
+ }
+ }
+
+ // Add image base url
+ q = addEndpointToImageUrl(q);
+ if (a) {
+ a = addEndpointToImageUrl(a);
+ }
+
if (!imageId) {
return {
q,
@@ -33,7 +64,7 @@ export const formatDatasetDataValue = ({
});
return {
- q: ``,
+ q: ``,
a,
imagePreivewUrl: previewUrl
};
diff --git a/packages/service/core/dataset/data/schema.ts b/packages/service/core/dataset/data/schema.ts
index 11cc34446..8d20378e6 100644
--- a/packages/service/core/dataset/data/schema.ts
+++ b/packages/service/core/dataset/data/schema.ts
@@ -39,6 +39,8 @@ const DatasetDataSchema = new Schema({
a: {
type: String
},
+ imageId: String,
+ imageDescMap: Object,
history: {
type: [
{
@@ -73,9 +75,6 @@ const DatasetDataSchema = new Schema({
default: []
},
- imageId: {
- type: String
- },
updateTime: {
type: Date,
default: () => new Date()
diff --git a/packages/service/core/dataset/image/utils.ts b/packages/service/core/dataset/image/utils.ts
index eed4a0b82..f20c28122 100644
--- a/packages/service/core/dataset/image/utils.ts
+++ b/packages/service/core/dataset/image/utils.ts
@@ -4,6 +4,7 @@ import { deleteDatasetImage } from './controller';
import { MongoDatasetImageSchema } from './schema';
import { addMinutes } from 'date-fns';
import jwt from 'jsonwebtoken';
+import { EndpointUrl } from '@fastgpt/global/common/file/constants';
export const removeDatasetImageExpiredTime = async ({
ids = [],
@@ -51,17 +52,19 @@ export const getDatasetImagePreviewUrl = ({
{
teamId: String(teamId),
datasetId: String(datasetId),
+ imageId: String(imageId),
exp: expiredTime
},
key
);
- return `/api/core/dataset/image/${imageId}?token=${token}`;
+ return `${EndpointUrl}/api/file/datasetImg/${token}.jpeg`;
};
export const authDatasetImagePreviewUrl = (token?: string) =>
new Promise<{
teamId: string;
datasetId: string;
+ imageId: string;
}>((resolve, reject) => {
if (!token) {
return reject(ERROR_ENUM.unAuthFile);
@@ -75,7 +78,8 @@ export const authDatasetImagePreviewUrl = (token?: string) =>
}
resolve({
teamId: decoded.teamId,
- datasetId: decoded.datasetId
+ datasetId: decoded.datasetId,
+ imageId: decoded.imageId
});
});
});
diff --git a/packages/service/core/dataset/search/controller.ts b/packages/service/core/dataset/search/controller.ts
index ffd94eca4..13170b830 100644
--- a/packages/service/core/dataset/search/controller.ts
+++ b/packages/service/core/dataset/search/controller.ts
@@ -178,7 +178,7 @@ export async function searchDatasetData(
// Constants data
const datasetDataSelectField =
- '_id datasetId collectionId updateTime q a imageId chunkIndex indexes';
+ '_id datasetId collectionId updateTime q a imageId imageDescMap chunkIndex indexes';
const datsaetCollectionSelectField =
'_id name fileId rawLink apiFileId externalFileId externalFileUrl';
@@ -506,7 +506,8 @@ export async function searchDatasetData(
datasetId: data.datasetId,
q: data.q,
a: data.a,
- imageId: data.imageId
+ imageId: data.imageId,
+ imageDescMap: data.imageDescMap
}),
chunkIndex: data.chunkIndex,
datasetId: String(data.datasetId),
@@ -647,7 +648,8 @@ export async function searchDatasetData(
datasetId: data.datasetId,
q: data.q,
a: data.a,
- imageId: data.imageId
+ imageId: data.imageId,
+ imageDescMap: data.imageDescMap
}),
chunkIndex: data.chunkIndex,
indexes: data.indexes,
diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts
index 5da0fb86c..9216e6488 100644
--- a/packages/service/core/dataset/training/controller.ts
+++ b/packages/service/core/dataset/training/controller.ts
@@ -41,18 +41,6 @@ export async function pushDataListToTrainingQueue({
indexSize,
session
}: PushDataToTrainingQueueProps): Promise {
- const formatTrainingMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
- if (mode !== TrainingModeEnum.image) return mode;
- // 检查内容中,是否包含  的图片格式
- const text = (data.q || '') + (data.a || '');
- const regex = /!\[\]\((.*?)\)/g;
- const match = text.match(regex);
- if (match) {
- return TrainingModeEnum.image;
- }
- return mode;
- };
-
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(i18nT('common:error_embedding_not_config'));
@@ -130,7 +118,7 @@ export async function pushDataListToTrainingQueue({
datasetId: datasetId,
collectionId: collectionId,
billId,
- mode: formatTrainingMode(item, mode),
+ mode,
...(item.q && { q: item.q }),
...(item.a && { a: item.a }),
...(item.imageId && { imageId: item.imageId }),
diff --git a/packages/service/core/dataset/training/schema.ts b/packages/service/core/dataset/training/schema.ts
index ae8001d4b..a8f723a37 100644
--- a/packages/service/core/dataset/training/schema.ts
+++ b/packages/service/core/dataset/training/schema.ts
@@ -64,6 +64,7 @@ const TrainingDataSchema = new Schema({
default: ''
},
imageId: String,
+ imageDescMap: Object,
chunkIndex: {
type: Number,
default: 0
diff --git a/packages/service/core/workflow/dispatch/dataset/search.ts b/packages/service/core/workflow/dispatch/dataset/search.ts
index d7ca421cf..80addbd73 100644
--- a/packages/service/core/workflow/dispatch/dataset/search.ts
+++ b/packages/service/core/workflow/dispatch/dataset/search.ts
@@ -16,7 +16,6 @@ import { MongoDataset } from '../../../dataset/schema';
import { i18nT } from '../../../../../web/i18n/utils';
import { filterDatasetsByTmbId } from '../../../dataset/utils';
import { ModelTypeEnum } from '@fastgpt/global/core/ai/model';
-import { addEndpointToImageUrl } from '../../../../common/file/image/utils';
import { getDatasetSearchToolResponsePrompt } from '../../../../../global/core/ai/prompt/dataset';
type DatasetSearchProps = ModuleDispatchProps<{
@@ -272,7 +271,7 @@ export async function dispatchDatasetSearch(
id: item.id,
sourceName: item.sourceName,
updateTime: item.updateTime,
- content: addEndpointToImageUrl(`${item.q}\n${item.a}`.trim())
+ content: `${item.q}\n${item.a}`.trim()
}))
}
};
diff --git a/packages/service/support/permission/dataset/auth.ts b/packages/service/support/permission/dataset/auth.ts
index 74980aac6..e983264a8 100644
--- a/packages/service/support/permission/dataset/auth.ts
+++ b/packages/service/support/permission/dataset/auth.ts
@@ -17,6 +17,7 @@ import { DatasetTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { type ParentIdType } from '@fastgpt/global/common/parentFolder/type';
import { DatasetDefaultPermissionVal } from '@fastgpt/global/support/permission/dataset/constant';
import { getDatasetImagePreviewUrl } from '../../../core/dataset/image/utils';
+import { i18nT } from '../../../../web/i18n/utils';
export const authDatasetByTmbId = async ({
tmbId,
@@ -254,7 +255,7 @@ export async function authDatasetData({
const datasetData = await MongoDatasetData.findById(dataId);
if (!datasetData) {
- return Promise.reject('core.dataset.error.Data not found');
+ return Promise.reject(i18nT('common:core.dataset.error.Data not found'));
}
const result = await authDatasetCollection({
diff --git a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx
index 44c6f364b..79c7cd012 100644
--- a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx
+++ b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx
@@ -49,7 +49,7 @@ export const defaultFormData: ImportFormType = {
imageIndex: false,
autoIndexes: false,
- indexPrefixTitle: true,
+ indexPrefixTitle: false,
chunkSettingMode: ChunkSettingModeEnum.auto,
chunkSplitMode: DataChunkSplitModeEnum.paragraph,
diff --git a/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts b/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts
index a3c9cd525..d656507c1 100644
--- a/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts
+++ b/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts
@@ -43,9 +43,10 @@ async function handler(
const { collection } = await authDatasetCollection({
req,
+ collectionId,
+ per: ReadPermissionVal,
authToken: true,
- collectionId: collectionId as string,
- per: ReadPermissionVal
+ authApiKey: true
});
const match = {
diff --git a/projects/app/src/pages/api/core/dataset/data/getQuoteData.ts b/projects/app/src/pages/api/core/dataset/data/getQuoteData.ts
index 20d584c99..b82f0092b 100644
--- a/projects/app/src/pages/api/core/dataset/data/getQuoteData.ts
+++ b/projects/app/src/pages/api/core/dataset/data/getQuoteData.ts
@@ -9,6 +9,7 @@ import { type ApiRequestProps } from '@fastgpt/service/type/next';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { ChatErrEnum } from '@fastgpt/global/common/error/code/chat';
import { i18nT } from '@fastgpt/web/i18n/utils';
+import { formatDatasetDataValue } from '@fastgpt/service/core/dataset/data/controller';
export type GetQuoteDataResponse = {
collection: DatasetCollectionSchemaType;
@@ -78,8 +79,13 @@ async function handler(req: ApiRequestProps): Promise): Promise) {
const { collection } = await authDatasetCollection({
req,
authToken: true,
+ authApiKey: true,
collectionId,
per: ReadPermissionVal
});
diff --git a/projects/app/src/pages/api/file/datasetImg/[token].ts b/projects/app/src/pages/api/file/datasetImg/[token].ts
new file mode 100644
index 000000000..78a8bd0d2
--- /dev/null
+++ b/projects/app/src/pages/api/file/datasetImg/[token].ts
@@ -0,0 +1,53 @@
+import type { NextApiResponse } from 'next';
+import { jsonRes } from '@fastgpt/service/common/response';
+import type { ApiRequestProps } from '@fastgpt/service/type/next';
+import { authDatasetImagePreviewUrl } from '@fastgpt/service/core/dataset/image/utils';
+import { getDatasetImageReadData } from '@fastgpt/service/core/dataset/image/controller';
+
+/**
+ * Streams a dataset image addressed by a signed preview token.
+ *
+ * The `token` route parameter is the signed token followed by a `.jpeg` suffix.
+ * The suffix is stripped, the token is verified with `authDatasetImagePreviewUrl`,
+ * and the image id returned by that check is used to open the image stream.
+ *
+ * Responses:
+ * - 401 (via `jsonRes`) when no token is supplied.
+ * - 500 (via `jsonRes`) when verification or opening the image fails.
+ * - Otherwise the image bytes, with content type and length taken from `fileInfo`.
+ */
+export default async function handler(
+  req: ApiRequestProps<
+    {},
+    {
+      token: string;
+    }
+  >,
+  res: NextApiResponse
+) {
+  try {
+    const { token } = req.query;
+
+    if (!token) {
+      return jsonRes(res, {
+        code: 401,
+        error: 'ImageId not found'
+      });
+    }
+
+    // Strip the `.jpeg` suffix that follows the token in the route parameter
+    const formatToken = token.replace(/\.jpeg$/, '');
+
+    // Verify token and permissions
+    const { imageId } = await authDatasetImagePreviewUrl(formatToken);
+
+    const { fileInfo, stream } = await getDatasetImageReadData(imageId);
+
+    // Attach the error listener before piping so no early failure is missed.
+    // pipe() does not close the destination when the source errors, so the
+    // response has to be finished explicitly in both cases below.
+    stream.on('error', () => {
+      if (res.headersSent) {
+        // Part of the body is already on the wire: abort instead of leaving the
+        // connection waiting for the remaining Content-Length bytes.
+        res.destroy();
+        return;
+      }
+      // Drop the success-only headers so the error reply is neither cached
+      // nor sent with a Content-Length that does not match its empty body.
+      res.removeHeader('Cache-Control');
+      res.removeHeader('Content-Length');
+      res.status(500).end();
+    });
+
+    // Set response headers
+    res.setHeader('Content-Type', fileInfo.contentType);
+    res.setHeader('Cache-Control', 'public, max-age=31536000');
+    res.setHeader('Content-Length', fileInfo.length);
+
+    // pipe() ends `res` once the stream ends, so no separate 'end' listener is needed
+    stream.pipe(res);
+  } catch (error) {
+    return jsonRes(res, {
+      code: 500,
+      error
+    });
+  }
+}
diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts
index 45021148c..556c14e00 100644
--- a/projects/app/src/service/core/dataset/data/controller.ts
+++ b/projects/app/src/service/core/dataset/data/controller.ts
@@ -173,10 +173,12 @@ export async function insertData2Dataset({
indexes,
indexPrefix,
embeddingModel,
+ imageDescMap,
session
}: CreateDatasetDataProps & {
embeddingModel: string;
indexSize?: number;
+ imageDescMap?: Record;
session?: ClientSession;
}) {
if (!q || !datasetId || !collectionId || !embeddingModel) {
@@ -234,9 +236,10 @@ export async function insertData2Dataset({
tmbId,
datasetId,
collectionId,
- imageId,
q,
a,
+ imageId,
+ imageDescMap,
chunkIndex,
indexes: results.map((item) => item.index)
}
diff --git a/projects/app/src/service/core/dataset/queues/generateVector.ts b/projects/app/src/service/core/dataset/queues/generateVector.ts
index d2dd94ace..66be3b477 100644
--- a/projects/app/src/service/core/dataset/queues/generateVector.ts
+++ b/projects/app/src/service/core/dataset/queues/generateVector.ts
@@ -279,6 +279,7 @@ const insertData = async ({ trainingData }: { trainingData: TrainingDataType })
q: trainingData.q,
a: trainingData.a,
imageId: trainingData.imageId,
+ imageDescMap: trainingData.imageDescMap,
chunkIndex: trainingData.chunkIndex,
indexSize:
trainingData.indexSize ||