From d05259deddd11443be31561117db1460577c276a Mon Sep 17 00:00:00 2001 From: Archer <545436317@qq.com> Date: Thu, 6 Mar 2025 18:27:47 +0800 Subject: [PATCH] =?UTF-8?q?perf:=20retry=20to=20load=20image=EF=BC=9Bperf:?= =?UTF-8?q?=20default=20index=20check=20(#4004)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * perf: retry to load image * perf: default index check --- .../zh-cn/docs/development/openapi/dataset.md | 23 ++++--- packages/global/common/string/markdown.ts | 2 +- .../service/common/file/image/controller.ts | 17 +++-- packages/service/common/file/image/utils.ts | 19 ++++-- packages/service/common/file/read/utils.ts | 27 ++++---- packages/service/core/chat/utils.ts | 2 +- packages/service/worker/htmlStr2Md/utils.ts | 4 +- .../dataset/detail/InputDataModal.tsx | 64 ++++++++----------- .../service/core/dataset/data/controller.ts | 35 +++++++--- 9 files changed, 112 insertions(+), 81 deletions(-) diff --git a/docSite/content/zh-cn/docs/development/openapi/dataset.md b/docSite/content/zh-cn/docs/development/openapi/dataset.md index d41cc080e..d43b2026d 100644 --- a/docSite/content/zh-cn/docs/development/openapi/dataset.md +++ b/docSite/content/zh-cn/docs/development/openapi/dataset.md @@ -1063,10 +1063,12 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/collect | 字段 | 类型 | 说明 | 必填 | | --- | --- | --- | --- | -| defaultIndex | Boolean | 是否为默认索引 | ✅ | -| dataId | String | 关联的向量ID | ✅ | +| type | String | 可选索引类型:default-默认索引; custom-自定义索引; summary-总结索引; question-问题索引; image-图片索引 | | +| dataId | String | 关联的向量ID,变更数据时候传入该 ID,会进行差量更新,而不是全量更新 | | | text | String | 文本内容 | ✅ | +`type` 不填则默认为 `custom` 索引,还会基于 q/a 组成一个默认索引。如果传入了默认索引,则不会额外创建。 + ### 为集合批量添加添加数据 注意,每次最多推送 200 组数据。 @@ -1298,8 +1300,7 @@ curl --location --request GET 'http://localhost:3000/api/core/dataset/data/detai "chunkIndex": 0, "indexes": [ { - "defaultIndex": true, - "type": "chunk", + "type": "default", "dataId": "3720083", "text": "N o . 2 0 2 2 1 2中 国 信 息 通 信 研 究 院京东探索研究院2022年 9月人工智能生成内容(AIGC)白皮书(2022 年)版权声明本白皮书版权属于中国信息通信研究院和京东探索研究院,并受法律保护。转载、摘编或利用其它方式使用本白皮书文字或者观点的,应注明“来源:中国信息通信研究院和京东探索研究院”。违反上述声明者,编者将追究其相关法律责任。前 言习近平总书记曾指出,“数字技术正以新理念、新业态、新模式全面融入人类经济、政治、文化、社会、生态文明建设各领域和全过程”。在当前数字世界和物理世界加速融合的大背景下,人工智能生成内容(Artificial Intelligence Generated Content,简称 AIGC)正在悄然引导着一场深刻的变革,重塑甚至颠覆数字内容的生产方式和消费模式,将极大地丰富人们的数字生活,是未来全面迈向数字文明新时代不可或缺的支撑力量。", "_id": "65abd4b29d1448617cba61dc" @@ -1335,12 +1336,18 @@ curl --location --request PUT 'http://localhost:3000/api/core/dataset/data/updat "a":"sss", "indexes":[ { - "dataId": "xxx", - "defaultIndex":false, - "text":"自定义索引1" + "dataId": "xxxx", + "type": "default", + "text": "默认索引" }, { - "text":"修改后的自定义索引2。(会删除原来的自定义索引2,并插入新的自定义索引2)" + "dataId": "xxx", + "type": "custom", + "text": "旧的自定义索引1" + }, + { + "type":"custom", + "text":"新增的自定义索引" } ] }' diff --git a/packages/global/common/string/markdown.ts b/packages/global/common/string/markdown.ts index 82232319e..43a6e895a 100644 --- a/packages/global/common/string/markdown.ts +++ b/packages/global/common/string/markdown.ts @@ -168,7 +168,7 @@ export const markdownProcess = async ({ return simpleMarkdownText(imageProcess); }; -export const matchMdImgTextAndUpload = (text: string) => { +export const matchMdImg = (text: string) => { const base64Regex = /!\[([^\]]*)\]\((data:image\/[^;]+;base64[^)]+)\)/g; const imageList: ImageType[] = []; diff --git a/packages/service/common/file/image/controller.ts b/packages/service/common/file/image/controller.ts index c2772bcde..0bf898337 100644 --- a/packages/service/common/file/image/controller.ts +++ b/packages/service/common/file/image/controller.ts @@ -6,6 +6,7 @@ import { guessBase64ImageType } from '../utils'; import { readFromSecondary } from '../../mongo/utils'; import { addHours } from 'date-fns'; import { imageFileType } from '@fastgpt/global/common/file/constants'; +import { retryFn } from '@fastgpt/global/common/system/utils'; export const maxImgSize = 1024 * 1024 * 12; const base64MimeRegex = /data:image\/([^\)]+);base64/; @@ -40,13 +41,15 @@ export async function uploadMongoImg({ return Promise.reject(`Invalid image file type: ${mime}`); } - const { _id } = await MongoImage.create({ - teamId, - binary, - metadata: Object.assign({ mime }, metadata), - shareId, - expiredTime: forever ? undefined : addHours(new Date(), 1) - }); + const { _id } = await retryFn(() => + MongoImage.create({ + teamId, + binary, + metadata: Object.assign({ mime }, metadata), + shareId, + expiredTime: forever ? undefined : addHours(new Date(), 1) + }) + ); return `${process.env.NEXT_PUBLIC_BASE_URL || ''}${imageBaseUrl}${String(_id)}.${extension}`; } diff --git a/packages/service/common/file/image/utils.ts b/packages/service/common/file/image/utils.ts index e31d6431e..57820879d 100644 --- a/packages/service/common/file/image/utils.ts +++ b/packages/service/common/file/image/utils.ts @@ -2,23 +2,30 @@ import axios from 'axios'; import { addLog } from '../../system/log'; import { serverRequestBaseUrl } from '../../api/serverRequest'; import { getFileContentTypeFromHeader, guessBase64ImageType } from '../utils'; +import { retryFn } from '@fastgpt/global/common/system/utils'; export const getImageBase64 = async (url: string) => { addLog.debug(`Load image to base64: ${url}`); try { - const response = await axios.get(url, { - baseURL: serverRequestBaseUrl, - responseType: 'arraybuffer', - proxy: false - }); + const response = await retryFn(() => + axios.get(url, { + baseURL: serverRequestBaseUrl, + responseType: 'arraybuffer', + proxy: false + }) + ); const base64 = Buffer.from(response.data, 'binary').toString('base64'); const imageType = getFileContentTypeFromHeader(response.headers['content-type']) || guessBase64ImageType(base64); - return `data:${imageType};base64,${base64}`; + return { + completeBase64: `data:${imageType};base64,${base64}`, + base64, + mime: imageType + }; } catch (error) { addLog.debug(`Load image to base64 failed: ${url}`); console.log(error); diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index f8ec29283..7575b3675 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -6,11 +6,12 @@ import type { ImageType, ReadFileResponse } from '../../../worker/readFile/type' import axios from 'axios'; import { addLog } from '../../system/log'; import { batchRun } from '@fastgpt/global/common/system/utils'; -import { htmlTable2Md, matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown'; +import { htmlTable2Md, matchMdImg } from '@fastgpt/global/common/string/markdown'; import { createPdfParseUsage } from '../../../support/wallet/usage/controller'; import { getErrText } from '@fastgpt/global/common/error/utils'; import { delay } from '@fastgpt/global/common/system/utils'; import { getNanoid } from '@fastgpt/global/common/string/tools'; +import { getImageBase64 } from '../image/utils'; export type readRawTextByLocalFileParams = { teamId: string; @@ -99,7 +100,7 @@ export const readRawContentByFileBuffer = async ({ addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`); const rawText = response.markdown; - const { text, imageList } = matchMdImgTextAndUpload(rawText); + const { text, imageList } = matchMdImg(rawText); createPdfParseUsage({ teamId, @@ -120,8 +121,8 @@ export const readRawContentByFileBuffer = async ({ const parseTextImage = async (text: string) => { // Extract image links and convert to base64 const imageList: { id: string; url: string }[] = []; - const processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => { - const id = getNanoid(); + let processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => { + const id = `IMAGE_${getNanoid()}_IMAGE`; imageList.push({ id, url @@ -129,22 +130,24 @@ export const readRawContentByFileBuffer = async ({ return `![](${id})`; }); + // Get base64 from image url let resultImageList: ImageType[] = []; - await Promise.all( - imageList.map(async (item) => { + await batchRun( + imageList, + async (item) => { try { - const response = await axios.get(item.url, { responseType: 'arraybuffer' }); - const mime = response.headers['content-type'] || 'image/jpeg'; - const base64 = response.data.toString('base64'); + const { base64, mime } = await getImageBase64(item.url); resultImageList.push({ uuid: item.id, mime, base64 }); } catch (error) { + processedText = processedText.replace(item.id, item.url); addLog.warn(`Failed to get image from ${item.url}: ${getErrText(error)}`); } - }) + }, + 5 ); return { @@ -312,14 +315,14 @@ export const readRawContentByFileBuffer = async ({ return await uploadMongoImg({ base64Img: `data:${item.mime};base64,${item.base64}`, teamId, - // expiredTime: addHours(new Date(), 1), metadata: { ...metadata, mime: item.mime } }); } catch (error) { - return ''; + addLog.warn('Upload file image error', { error }); + return 'Upload load image error'; } })(); rawText = rawText.replace(item.uuid, src); diff --git a/packages/service/core/chat/utils.ts b/packages/service/core/chat/utils.ts index 52b4c201f..b5a70ace2 100644 --- a/packages/service/core/chat/utils.ts +++ b/packages/service/core/chat/utils.ts @@ -165,7 +165,7 @@ export const loadRequestMessages = async ({ try { // If imgUrl is a local path, load image from local, and set url to base64 if (imgUrl.startsWith('/') || process.env.MULTIPLE_DATA_TO_BASE64 === 'true') { - const base64 = await getImageBase64(imgUrl); + const { completeBase64: base64 } = await getImageBase64(imgUrl); return { ...item, diff --git a/packages/service/worker/htmlStr2Md/utils.ts b/packages/service/worker/htmlStr2Md/utils.ts index 8384d005a..0602fc818 100644 --- a/packages/service/worker/htmlStr2Md/utils.ts +++ b/packages/service/worker/htmlStr2Md/utils.ts @@ -1,6 +1,6 @@ import TurndownService from 'turndown'; import { ImageType } from '../readFile/type'; -import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown'; +import { matchMdImg } from '@fastgpt/global/common/string/markdown'; import { getNanoid } from '@fastgpt/global/common/string/tools'; // @ts-ignore const turndownPluginGfm = require('joplin-turndown-plugin-gfm'); @@ -46,7 +46,7 @@ export const html2md = ( // Base64 img to id, otherwise it will occupy memory when going to md const { processedHtml, images } = processBase64Images(html); const md = turndownService.turndown(processedHtml); - const { text, imageList } = matchMdImgTextAndUpload(md); + const { text, imageList } = matchMdImg(md); return { rawText: text, diff --git a/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx b/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx index 2286a3507..ce69dea01 100644 --- a/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx +++ b/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx @@ -1,5 +1,5 @@ import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'; -import { Box, Flex, Button, Textarea, useTheme } from '@chakra-ui/react'; +import { Box, Flex, Button, Textarea } from '@chakra-ui/react'; import { FieldArrayWithId, UseFieldArrayRemove, @@ -19,8 +19,7 @@ import MyModal from '@fastgpt/web/components/common/MyModal'; import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; import { useQuery } from '@tanstack/react-query'; import { useTranslation } from 'next-i18next'; -import { useRequest, useRequest2 } from '@fastgpt/web/hooks/useRequest'; -import { useConfirm } from '@fastgpt/web/hooks/useConfirm'; +import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; import { getSourceNameIcon } from '@fastgpt/global/core/dataset/utils'; import { DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type'; import DeleteIcon from '@fastgpt/web/components/common/Icon/delete'; @@ -30,10 +29,12 @@ import MyBox from '@fastgpt/web/components/common/MyBox'; import { getErrText } from '@fastgpt/global/common/error/utils'; import { useSystemStore } from '@/web/common/system/useSystemStore'; import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip'; -import { useSystem } from '@fastgpt/web/hooks/useSystem'; import LightRowTabs from '@fastgpt/web/components/common/Tabs/LightRowTabs'; import styles from './styles.module.scss'; -import { getDatasetIndexMapData } from '@fastgpt/global/core/dataset/data/constants'; +import { + DatasetDataIndexTypeEnum, + getDatasetIndexMapData +} from '@fastgpt/global/core/dataset/data/constants'; export type InputDataType = { q: string; @@ -62,11 +63,10 @@ const InputDataModal = ({ onSuccess: (data: InputDataType & { dataId: string }) => void; }) => { const { t } = useTranslation(); - const theme = useTheme(); const { toast } = useToast(); const [currentTab, setCurrentTab] = useState(TabEnum.content); const { embeddingModelList, defaultModels } = useSystemStore(); - const { isPc } = useSystem(); + const { register, handleSubmit, reset, control } = useForm(); const { fields: indexes, @@ -112,11 +112,6 @@ const InputDataModal = ({ } ]; - const { ConfirmModal, openConfirm } = useConfirm({ - content: t('common:dataset.data.Delete Tip'), - type: 'delete' - }); - const { data: collection = defaultCollectionDetail } = useQuery( ['loadCollectionId', collectionId], () => { @@ -163,8 +158,8 @@ const InputDataModal = ({ }, [collection.dataset.vectorModel, defaultModels.embedding, embeddingModelList]); // import new data - const { mutate: sureImportData, isLoading: isImporting } = useRequest({ - mutationFn: async (e: InputDataType) => { + const { runAsync: sureImportData, loading: isImporting } = useRequest2( + async (e: InputDataType) => { if (!e.q) { setCurrentTab(TabEnum.content); return Promise.reject(t('common:dataset.data.input is empty')); @@ -181,12 +176,8 @@ const InputDataModal = ({ collectionId: collection._id, q: e.q, a: e.a, - // remove dataId - indexes: - e.indexes?.map((index) => ({ - ...index, - dataId: undefined - })) || [] + // Contains no default index + indexes: e.indexes }); return { @@ -194,18 +185,20 @@ const InputDataModal = ({ dataId }; }, - successToast: t('common:dataset.data.Input Success Tip'), - onSuccess(e) { - reset({ - ...e, - q: '', - a: '', - indexes: [] - }); - onSuccess(e); - }, - errorToast: t('common:common.error.unKnow') - }); + { + successToast: t('common:dataset.data.Input Success Tip'), + onSuccess(e) { + reset({ + ...e, + q: '', + a: '', + indexes: [] + }); + onSuccess(e); + }, + errorToast: t('common:common.error.unKnow') + } + ); // update const { runAsync: onUpdateData, loading: isUpdating } = useRequest2( @@ -239,6 +232,7 @@ const InputDataModal = ({ () => getSourceNameIcon({ sourceName: collection.sourceName, sourceId: collection.sourceId }), [collection] ); + return ( appendIndexes({ - type: 'custom', - text: '', - dataId: `${Date.now()}` + type: DatasetDataIndexTypeEnum.custom, + text: '' }) } > @@ -331,7 +324,6 @@ const InputDataModal = ({ - ); }; diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts index b298c617b..ae77ce77e 100644 --- a/projects/app/src/service/core/dataset/data/controller.ts +++ b/projects/app/src/service/core/dataset/data/controller.ts @@ -25,16 +25,35 @@ const formatIndexes = ({ a?: string; }) => { indexes = indexes || []; - const defaultIndex = getDefaultIndex({ q, a }); + // If index not type, set it to custom + indexes = indexes + .map((item) => ({ + text: typeof item.text === 'string' ? item.text : String(item.text), + type: item.type || DatasetDataIndexTypeEnum.custom, + dataId: item.dataId + })) + .filter((item) => !!item.text.trim()); - // 1. Reset default index + // Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds + const defaultIndexes = getDefaultIndex({ q, a }); + const concatDefaultIndexes = defaultIndexes.map((item) => { + const oldIndex = indexes!.find((index) => index.text === item.text); + if (oldIndex) { + return { + type: DatasetDataIndexTypeEnum.default, + text: item.text, + dataId: oldIndex.dataId + }; + } else { + return item; + } + }); indexes = indexes.filter((item) => item.type !== DatasetDataIndexTypeEnum.default); - // 2. Add default index - indexes.unshift(...defaultIndex); - // 3. Filter same text + indexes.push(...concatDefaultIndexes); + + // Filter same text indexes = indexes.filter( - (item, index, self) => - !!item.text.trim() && index === self.findIndex((t) => t.text === item.text) + (item, index, self) => index === self.findIndex((t) => t.text === item.text) ); return indexes.map((index) => ({ @@ -229,7 +248,7 @@ export async function updateData2Dataset({ const newIndexes = patchResult .filter((item) => item.type !== 'delete') .map((item) => item.index) as DatasetDataIndexItemType[]; - console.log(newIndexes, '---'); + // console.log(clonePatchResult2Insert); await mongoSessionRun(async (session) => { // Update MongoData