mirror of https://github.com/labring/FastGPT.git
synced 2025-07-21 03:35:36 +00:00

perf: retry to load image; perf: default index check (#4004)

* perf: retry to load image
* perf: default index check
@@ -1063,10 +1063,12 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/collect
 | Field | Type | Description | Required |
 | --- | --- | --- | --- |
-| defaultIndex | Boolean | Whether this is the default index | ✅ |
-| dataId | String | Associated vector ID | ✅ |
+| type | String | Optional index type: default - default index; custom - custom index; summary - summary index; question - question index; image - image index | |
+| dataId | String | Associated vector ID. Pass this ID when updating data to perform an incremental update instead of a full rebuild | |
 | text | String | Text content | ✅ |
 
+If `type` is omitted, the index defaults to `custom`, and a default index is additionally built from q/a. If a default index is passed in explicitly, no extra one is created.
+
 ### Batch add data to a collection
 
 Note: at most 200 groups of data can be pushed per request.
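A minimal sketch of a push request that exercises the new index fields, assuming the documented `POST /api/core/dataset/data/pushData` endpoint and a Bearer API key (endpoint path and placeholder values follow the surrounding docs; adjust to your deployment):

// Hypothetical TypeScript client for the batch-push endpoint described above.
// `{{apikey}}` and `{{collectionId}}` are placeholders, not real values.
const pushData = async () => {
  const res = await fetch('http://localhost:3000/api/core/dataset/data/pushData', {
    method: 'POST',
    headers: {
      Authorization: 'Bearer {{apikey}}',
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      collectionId: '{{collectionId}}',
      data: [
        {
          q: 'What is FastGPT?',
          a: 'An open-source LLM knowledge-base platform.',
          // type omitted => treated as a custom index; a default index is
          // still generated from q/a because none is passed explicitly.
          indexes: [{ text: 'FastGPT intro, custom index' }]
        }
      ]
    })
  });
  return res.json();
};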
@@ -1298,8 +1300,7 @@ curl --location --request GET 'http://localhost:3000/api/core/dataset/data/detai
         "chunkIndex": 0,
         "indexes": [
           {
-            "defaultIndex": true,
-            "type": "chunk",
+            "type": "default",
             "dataId": "3720083",
             "text": "N o . 2 0 2 2 1 2中 国 信 息 通 信 研 究 院京东探索研究院2022年 9月人工智能生成内容(AIGC)白皮书(2022 年)版权声明本白皮书版权属于中国信息通信研究院和京东探索研究院,并受法律保护。转载、摘编或利用其它方式使用本白皮书文字或者观点的,应注明“来源:中国信息通信研究院和京东探索研究院”。违反上述声明者,编者将追究其相关法律责任。前 言习近平总书记曾指出,“数字技术正以新理念、新业态、新模式全面融入人类经济、政治、文化、社会、生态文明建设各领域和全过程”。在当前数字世界和物理世界加速融合的大背景下,人工智能生成内容(Artificial Intelligence Generated Content,简称 AIGC)正在悄然引导着一场深刻的变革,重塑甚至颠覆数字内容的生产方式和消费模式,将极大地丰富人们的数字生活,是未来全面迈向数字文明新时代不可或缺的支撑力量。",
             "_id": "65abd4b29d1448617cba61dc"
@@ -1335,12 +1336,18 @@ curl --location --request PUT 'http://localhost:3000/api/core/dataset/data/updat
     "a":"sss",
     "indexes":[
         {
-            "dataId": "xxx",
-            "defaultIndex":false,
-            "text":"custom index 1"
+            "dataId": "xxxx",
+            "type": "default",
+            "text": "default index"
         },
         {
-            "text":"modified custom index 2. (The original custom index 2 will be deleted and a new one inserted)"
+            "dataId": "xxx",
+            "type": "custom",
+            "text": "old custom index 1"
         },
+        {
+            "type":"custom",
+            "text":"newly added custom index"
+        }
     ]
 }'
@@ -168,7 +168,7 @@ export const markdownProcess = async ({
   return simpleMarkdownText(imageProcess);
 };
 
-export const matchMdImgTextAndUpload = (text: string) => {
+export const matchMdImg = (text: string) => {
   const base64Regex = /!\[([^\]]*)\]\((data:image\/[^;]+;base64[^)]+)\)/g;
   const imageList: ImageType[] = [];
@@ -6,6 +6,7 @@ import { guessBase64ImageType } from '../utils';
 import { readFromSecondary } from '../../mongo/utils';
 import { addHours } from 'date-fns';
 import { imageFileType } from '@fastgpt/global/common/file/constants';
+import { retryFn } from '@fastgpt/global/common/system/utils';
 
 export const maxImgSize = 1024 * 1024 * 12;
 const base64MimeRegex = /data:image\/([^\)]+);base64/;
@@ -40,13 +41,15 @@ export async function uploadMongoImg({
     return Promise.reject(`Invalid image file type: ${mime}`);
   }
 
-  const { _id } = await MongoImage.create({
-    teamId,
-    binary,
-    metadata: Object.assign({ mime }, metadata),
-    shareId,
-    expiredTime: forever ? undefined : addHours(new Date(), 1)
-  });
+  const { _id } = await retryFn(() =>
+    MongoImage.create({
+      teamId,
+      binary,
+      metadata: Object.assign({ mime }, metadata),
+      shareId,
+      expiredTime: forever ? undefined : addHours(new Date(), 1)
+    })
+  );
 
   return `${process.env.NEXT_PUBLIC_BASE_URL || ''}${imageBaseUrl}${String(_id)}.${extension}`;
 }
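`retryFn` comes from `@fastgpt/global/common/system/utils`, whose implementation is not part of this diff. A minimal sketch of what such a helper typically looks like, assuming a fixed retry count and a short delay (both assumptions):

// Hypothetical sketch only; the real retryFn may differ in signature and back-off.
const retryFn = async <T>(fn: () => Promise<T>, retryTimes = 3): Promise<T> => {
  try {
    return await fn();
  } catch (error) {
    if (retryTimes > 0) {
      // assumed fixed 500ms delay between attempts
      await new Promise((resolve) => setTimeout(resolve, 500));
      return retryFn(fn, retryTimes - 1);
    }
    return Promise.reject(error);
  }
};

Wrapping `MongoImage.create` this way means a transient Mongo hiccup no longer fails the whole image upload.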
@@ -2,23 +2,30 @@ import axios from 'axios';
 import { addLog } from '../../system/log';
 import { serverRequestBaseUrl } from '../../api/serverRequest';
 import { getFileContentTypeFromHeader, guessBase64ImageType } from '../utils';
+import { retryFn } from '@fastgpt/global/common/system/utils';
 
 export const getImageBase64 = async (url: string) => {
   addLog.debug(`Load image to base64: ${url}`);
 
   try {
-    const response = await axios.get(url, {
-      baseURL: serverRequestBaseUrl,
-      responseType: 'arraybuffer',
-      proxy: false
-    });
+    const response = await retryFn(() =>
+      axios.get(url, {
+        baseURL: serverRequestBaseUrl,
+        responseType: 'arraybuffer',
+        proxy: false
+      })
+    );
 
     const base64 = Buffer.from(response.data, 'binary').toString('base64');
     const imageType =
       getFileContentTypeFromHeader(response.headers['content-type']) ||
       guessBase64ImageType(base64);
 
-    return `data:${imageType};base64,${base64}`;
+    return {
+      completeBase64: `data:${imageType};base64,${base64}`,
+      base64,
+      mime: imageType
+    };
   } catch (error) {
     addLog.debug(`Load image to base64 failed: ${url}`);
     console.log(error);
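The return value of `getImageBase64` changes from a bare data-URL string to an object, so every caller must now destructure. A sketch of the return shape as inferred from this diff (the type alias name is hypothetical):

// Inferred from the diff; `ImageBase64Result` is a made-up name for illustration.
type ImageBase64Result = {
  completeBase64: string; // 'data:<mime>;base64,<payload>' — the old return value
  base64: string; // raw base64 payload without the data-URL prefix
  mime: string; // e.g. 'image/png'
};

// Before: const base64 = await getImageBase64(url);
// After:  const { completeBase64: base64 } = await getImageBase64(url);

Exposing `base64` and `mime` separately lets callers such as `parseTextImage` below reuse the download without re-parsing the data URL.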
@@ -6,11 +6,12 @@ import type { ImageType, ReadFileResponse } from '../../../worker/readFile/type'
 import axios from 'axios';
 import { addLog } from '../../system/log';
 import { batchRun } from '@fastgpt/global/common/system/utils';
-import { htmlTable2Md, matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
+import { htmlTable2Md, matchMdImg } from '@fastgpt/global/common/string/markdown';
 import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { delay } from '@fastgpt/global/common/system/utils';
 import { getNanoid } from '@fastgpt/global/common/string/tools';
+import { getImageBase64 } from '../image/utils';
 
 export type readRawTextByLocalFileParams = {
   teamId: string;
@@ -99,7 +100,7 @@ export const readRawContentByFileBuffer = async ({
       addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`);
 
       const rawText = response.markdown;
-      const { text, imageList } = matchMdImgTextAndUpload(rawText);
+      const { text, imageList } = matchMdImg(rawText);
 
       createPdfParseUsage({
         teamId,
@@ -120,8 +121,8 @@ export const readRawContentByFileBuffer = async ({
   const parseTextImage = async (text: string) => {
     // Extract image links and convert to base64
     const imageList: { id: string; url: string }[] = [];
-    const processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
-      const id = getNanoid();
+    let processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
+      const id = `IMAGE_${getNanoid()}_IMAGE`;
       imageList.push({
         id,
         url
@@ -129,22 +130,24 @@ export const readRawContentByFileBuffer = async ({
       return `![](${id})`;
     });
 
     // Get base64 from image url
     let resultImageList: ImageType[] = [];
-    await Promise.all(
-      imageList.map(async (item) => {
+    await batchRun(
+      imageList,
+      async (item) => {
         try {
-          const response = await axios.get(item.url, { responseType: 'arraybuffer' });
-          const mime = response.headers['content-type'] || 'image/jpeg';
-          const base64 = response.data.toString('base64');
+          const { base64, mime } = await getImageBase64(item.url);
           resultImageList.push({
             uuid: item.id,
             mime,
             base64
           });
         } catch (error) {
+          processedText = processedText.replace(item.id, item.url);
           addLog.warn(`Failed to get image from ${item.url}: ${getErrText(error)}`);
         }
-      })
+      },
+      5
     );
 
     return {
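`batchRun` (also from `@fastgpt/global/common/system/utils`, not shown in this diff) replaces the unbounded `Promise.all` so that at most 5 images download concurrently. A minimal sketch of a concurrency-limited runner under that assumption:

// Hypothetical sketch; the real batchRun may differ in signature and error handling.
const batchRun = async <T>(
  items: T[],
  fn: (item: T) => Promise<unknown>,
  concurrency = 10
): Promise<void> => {
  const queue = [...items];
  const worker = async (): Promise<void> => {
    const item = queue.shift();
    if (item === undefined) return;
    await fn(item); // here, the caller's fn catches its own errors
    return worker(); // pull the next item once this one finishes
  };
  // start at most `concurrency` workers draining a shared queue
  await Promise.all(
    Array.from({ length: Math.min(concurrency, items.length) }, () => worker())
  );
};

The other behavioral change is the catch branch: if an image cannot be fetched, its placeholder id is swapped back to the original URL instead of leaving a dangling `IMAGE_..._IMAGE` token in the text.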
@@ -312,14 +315,14 @@ export const readRawContentByFileBuffer = async ({
           return await uploadMongoImg({
             base64Img: `data:${item.mime};base64,${item.base64}`,
             teamId,
             // expiredTime: addHours(new Date(), 1),
             metadata: {
               ...metadata,
               mime: item.mime
             }
           });
         } catch (error) {
-          return '';
+          addLog.warn('Upload file image error', { error });
+          return 'Upload load image error';
         }
       })();
       rawText = rawText.replace(item.uuid, src);
@@ -165,7 +165,7 @@ export const loadRequestMessages = async ({
         try {
           // If imgUrl is a local path, load image from local, and set url to base64
           if (imgUrl.startsWith('/') || process.env.MULTIPLE_DATA_TO_BASE64 === 'true') {
-            const base64 = await getImageBase64(imgUrl);
+            const { completeBase64: base64 } = await getImageBase64(imgUrl);
 
             return {
               ...item,
@@ -1,6 +1,6 @@
 import TurndownService from 'turndown';
 import { ImageType } from '../readFile/type';
-import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
+import { matchMdImg } from '@fastgpt/global/common/string/markdown';
 import { getNanoid } from '@fastgpt/global/common/string/tools';
 // @ts-ignore
 const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
@@ -46,7 +46,7 @@ export const html2md = (
   // Base64 img to id, otherwise it will occupy memory when going to md
   const { processedHtml, images } = processBase64Images(html);
   const md = turndownService.turndown(processedHtml);
-  const { text, imageList } = matchMdImgTextAndUpload(md);
+  const { text, imageList } = matchMdImg(md);
 
   return {
     rawText: text,
@@ -1,5 +1,5 @@
 import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react';
-import { Box, Flex, Button, Textarea, useTheme } from '@chakra-ui/react';
+import { Box, Flex, Button, Textarea } from '@chakra-ui/react';
 import {
   FieldArrayWithId,
   UseFieldArrayRemove,
@@ -19,8 +19,7 @@ import MyModal from '@fastgpt/web/components/common/MyModal';
 import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
 import { useQuery } from '@tanstack/react-query';
 import { useTranslation } from 'next-i18next';
-import { useRequest, useRequest2 } from '@fastgpt/web/hooks/useRequest';
-import { useConfirm } from '@fastgpt/web/hooks/useConfirm';
+import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
 import { getSourceNameIcon } from '@fastgpt/global/core/dataset/utils';
 import { DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type';
 import DeleteIcon from '@fastgpt/web/components/common/Icon/delete';
@@ -30,10 +29,12 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { useSystemStore } from '@/web/common/system/useSystemStore';
 import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
 import { useSystem } from '@fastgpt/web/hooks/useSystem';
 import LightRowTabs from '@fastgpt/web/components/common/Tabs/LightRowTabs';
 import styles from './styles.module.scss';
-import { getDatasetIndexMapData } from '@fastgpt/global/core/dataset/data/constants';
+import {
+  DatasetDataIndexTypeEnum,
+  getDatasetIndexMapData
+} from '@fastgpt/global/core/dataset/data/constants';
 
 export type InputDataType = {
   q: string;
@@ -62,11 +63,10 @@ const InputDataModal = ({
   onSuccess: (data: InputDataType & { dataId: string }) => void;
 }) => {
   const { t } = useTranslation();
-  const theme = useTheme();
   const { toast } = useToast();
   const [currentTab, setCurrentTab] = useState(TabEnum.content);
   const { embeddingModelList, defaultModels } = useSystemStore();
   const { isPc } = useSystem();
 
   const { register, handleSubmit, reset, control } = useForm<InputDataType>();
   const {
     fields: indexes,
@@ -112,11 +112,6 @@ const InputDataModal = ({
     }
   ];
 
-  const { ConfirmModal, openConfirm } = useConfirm({
-    content: t('common:dataset.data.Delete Tip'),
-    type: 'delete'
-  });
-
   const { data: collection = defaultCollectionDetail } = useQuery(
     ['loadCollectionId', collectionId],
     () => {
@@ -163,8 +158,8 @@ const InputDataModal = ({
   }, [collection.dataset.vectorModel, defaultModels.embedding, embeddingModelList]);
 
   // import new data
-  const { mutate: sureImportData, isLoading: isImporting } = useRequest({
-    mutationFn: async (e: InputDataType) => {
+  const { runAsync: sureImportData, loading: isImporting } = useRequest2(
+    async (e: InputDataType) => {
       if (!e.q) {
         setCurrentTab(TabEnum.content);
         return Promise.reject(t('common:dataset.data.input is empty'));
@@ -181,12 +176,8 @@ const InputDataModal = ({
         collectionId: collection._id,
         q: e.q,
         a: e.a,
-        // remove dataId
-        indexes:
-          e.indexes?.map((index) => ({
-            ...index,
-            dataId: undefined
-          })) || []
+        // Contains no default index
+        indexes: e.indexes
       });
 
       return {
@@ -194,18 +185,20 @@ const InputDataModal = ({
         dataId
       };
     },
-    successToast: t('common:dataset.data.Input Success Tip'),
-    onSuccess(e) {
-      reset({
-        ...e,
-        q: '',
-        a: '',
-        indexes: []
-      });
-      onSuccess(e);
-    },
-    errorToast: t('common:common.error.unKnow')
-  });
+    {
+      successToast: t('common:dataset.data.Input Success Tip'),
+      onSuccess(e) {
+        reset({
+          ...e,
+          q: '',
+          a: '',
+          indexes: []
+        });
+        onSuccess(e);
+      },
+      errorToast: t('common:common.error.unKnow')
+    }
+  );
 
   // update
   const { runAsync: onUpdateData, loading: isUpdating } = useRequest2(
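The import hunk above drops `useRequest` in favor of `useRequest2`; the call shape changes from a single options object with `mutationFn` to a `(fn, options)` pair. Both hooks are FastGPT-internal, so this is only the usage pattern inferred from this diff, not their definition:

// Usage pattern inferred from the diff; names abbreviated for illustration.
const { runAsync, loading } = useRequest2(
  async (input: InputDataType) => {
    // perform the request and return its result
    return { ...input, dataId: 'xxx' };
  },
  {
    successToast: '...',
    errorToast: '...',
    onSuccess(result) {
      // reset the form and notify the parent
    }
  }
);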
@@ -239,6 +232,7 @@ const InputDataModal = ({
     () => getSourceNameIcon({ sourceName: collection.sourceName, sourceId: collection.sourceId }),
     [collection]
   );
 
   return (
     <MyModal
       isOpen={true}
@@ -291,9 +285,8 @@ const InputDataModal = ({
               p={0}
               onClick={() =>
                 appendIndexes({
-                  type: 'custom',
-                  text: '',
-                  dataId: `${Date.now()}`
+                  type: DatasetDataIndexTypeEnum.custom,
+                  text: ''
                 })
               }
             >
@@ -331,7 +324,6 @@ const InputDataModal = ({
           </MyTooltip>
         </Flex>
       </MyBox>
-      <ConfirmModal />
     </MyModal>
   );
 };
@@ -25,16 +25,35 @@ const formatIndexes = ({
   a?: string;
 }) => {
   indexes = indexes || [];
-  const defaultIndex = getDefaultIndex({ q, a });
+  // If index not type, set it to custom
+  indexes = indexes
+    .map((item) => ({
+      text: typeof item.text === 'string' ? item.text : String(item.text),
+      type: item.type || DatasetDataIndexTypeEnum.custom,
+      dataId: item.dataId
+    }))
+    .filter((item) => !!item.text.trim());
 
-  // 1. Reset default index
+  // Recompute default indexes; merge ids of the same index to reduce the number of rebuilds
+  const defaultIndexes = getDefaultIndex({ q, a });
+  const concatDefaultIndexes = defaultIndexes.map((item) => {
+    const oldIndex = indexes!.find((index) => index.text === item.text);
+    if (oldIndex) {
+      return {
+        type: DatasetDataIndexTypeEnum.default,
+        text: item.text,
+        dataId: oldIndex.dataId
+      };
+    } else {
+      return item;
+    }
+  });
   indexes = indexes.filter((item) => item.type !== DatasetDataIndexTypeEnum.default);
-  // 2. Add default index
-  indexes.unshift(...defaultIndex);
-  // 3. Filter same text
+  indexes.push(...concatDefaultIndexes);
 
+  // Filter same text
   indexes = indexes.filter(
-    (item, index, self) =>
-      !!item.text.trim() && index === self.findIndex((t) => t.text === item.text)
+    (item, index, self) => index === self.findIndex((t) => t.text === item.text)
   );
 
   return indexes.map((index) => ({
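A worked example of the new merge logic, assuming `getDefaultIndex({ q, a })` returns a single default index whose text is derived from q/a (the exact text format is an assumption):

// Stored indexes for an existing data item; q/a unchanged, one custom index edited.
const stored = [
  { type: 'default', text: 'q text\na text', dataId: 'vec-1' },
  { type: 'custom', text: 'edited custom index', dataId: 'vec-2' }
];
// 1. Types are normalized (missing type => 'custom') and empty texts dropped.
// 2. The stored default index is discarded, then recomputed from q/a. Because the
//    recomputed text matches the stored one, it inherits dataId 'vec-1', so its
//    vector is NOT rebuilt — the "default index check" in the commit title.
// 3. Result (custom indexes first, recomputed defaults appended):
//    [
//      { type: 'custom',  text: 'edited custom index', dataId: 'vec-2' },
//      { type: 'default', text: 'q text\na text',      dataId: 'vec-1' }
//    ]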
@@ -229,7 +248,7 @@ export async function updateData2Dataset({
   const newIndexes = patchResult
     .filter((item) => item.type !== 'delete')
     .map((item) => item.index) as DatasetDataIndexItemType[];
+  console.log(newIndexes, '---');
 
-  // console.log(clonePatchResult2Insert);
   await mongoSessionRun(async (session) => {
     // Update MongoData