From e98d6f1d306a8954023509a4058044eb6094c22d Mon Sep 17 00:00:00 2001 From: Archer <545436317@qq.com> Date: Wed, 5 Mar 2025 10:14:33 +0800 Subject: [PATCH] Add markdown format; Update doc (#3969) * update doc * markdown --- .../zh-cn/docs/development/upgrading/490.md | 28 +++++++++- packages/service/common/file/read/utils.ts | 5 +- projects/app/src/components/Markdown/utils.ts | 52 +++++++++---------- 3 files changed, 54 insertions(+), 31 deletions(-) diff --git a/docSite/content/zh-cn/docs/development/upgrading/490.md b/docSite/content/zh-cn/docs/development/upgrading/490.md index 9992676f3..7836521d5 100644 --- a/docSite/content/zh-cn/docs/development/upgrading/490.md +++ b/docSite/content/zh-cn/docs/development/upgrading/490.md @@ -8,13 +8,34 @@ weight: 803 --- -## 重要更新 +## 更新指南 + +### 1. 做好数据库备份 + +### 2. 更新镜像 + +### 3. 运行升级脚本 + +从任意终端,发起 1 个 HTTP 请求。其中 {{rootkey}} 替换成环境变量里的 `rootkey`;{{host}} 替换成**FastGPT 域名**。 + +```bash +curl --location --request POST 'https://{{host}}/api/admin/initv490' \ +--header 'rootkey: {{rootkey}}' \ +--header 'Content-Type: application/json' +``` + +**脚本功能** + +1. 升级 PG Vector 插件版本 +2. 全量更新知识库集合字段。 +3. 全量更新知识库数据中,index 的 type 类型。(时间较长) + +## 兼容 & 弃用 1. 弃用 - 弃用旧版本地文件上传 API:/api/core/dataset/collection/create/file(以前仅商业版可用的 API,该接口已放切换成:/api/core/dataset/collection/create/localFile) 2. 停止维护,即将弃用 - 外部文件库相关 API,可通过 API 文件库替代。 3. API更新 - 上传文件至知识库、创建连接集合、API 文件库、推送分块数据等接口,`trainingType`字段未来仅支持`chunk`和`QA`两种模式。增强索引模式将设置单独字段:`autoIndexes`,目前仍有适配旧版`trainingType=auto`代码,但请尽快变更成新接口类型。具体可见:[知识库 OpenAPI 文档](/docs/development/openapi/dataset.md) - ## 🚀 新增内容 1. PDF增强解析交互添加到页面上。同时内嵌 Doc2x 服务,可直接使用 Doc2x 服务解析 PDF 文件。 @@ -24,5 +45,8 @@ weight: 803 ## ⚙️ 优化 1. 知识库数据不再限制索引数量,可无限自定义。同时可自动更新输入文本的索引,不影响自定义索引。 +2. Markdown 解析,增加链接后中文标点符号检测,增加空格。 ## 🐛 修复 + +1. 增加网页抓取安全链接校验。 \ No newline at end of file diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index 109f566bc..406c9f32f 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -84,7 +84,8 @@ export const readRawContentByFileBuffer = async ({ success: boolean; message: string; data: { - page: number; + page?: number; // abandon + pages: number; markdown: string; }; }>(url, data, { @@ -103,7 +104,7 @@ export const readRawContentByFileBuffer = async ({ createPdfParseUsage({ teamId, tmbId, - pages: response.data.page + pages: response.data.page || response.data.pages }); return { diff --git a/projects/app/src/components/Markdown/utils.ts b/projects/app/src/components/Markdown/utils.ts index d20f60848..6b4d1e5c5 100644 --- a/projects/app/src/components/Markdown/utils.ts +++ b/projects/app/src/components/Markdown/utils.ts @@ -14,32 +14,30 @@ export enum CodeClassNameEnum { export const mdTextFormat = (text: string) => { // NextChat function - Format latex to $$ - const escapeBrackets = (text: string) => { - const pattern = /(```[\s\S]*?```|`.*?`)|\\\[([\s\S]*?[^\\])\\\]|\\\((.*?)\\\)/g; - return text.replace(pattern, (match, codeBlock, squareBracket, roundBracket) => { - if (codeBlock) { - return codeBlock; - } else if (squareBracket) { - return `$$${squareBracket}$$`; - } else if (roundBracket) { - return `$${roundBracket}$`; - } - return match; - }); - }; - // 处理 [quote:id] 格式引用,将 [quote:675934a198f46329dfc6d05a] 转换为 [675934a198f46329dfc6d05a](QUOTE) - const formatQuote = (text: string) => { - return ( - text - // .replace( - // /([\u4e00-\u9fa5\u3000-\u303f])([a-zA-Z0-9])|([a-zA-Z0-9])([\u4e00-\u9fa5\u3000-\u303f])/g, - // '$1$3 $2$4' - // ) - // 处理 [quote:id] 格式引用,将 [quote:675934a198f46329dfc6d05a] 转换为 [675934a198f46329dfc6d05a](QUOTE) - .replace(/\[quote:?\s*([a-f0-9]{24})\](?!\()/gi, '[$1](QUOTE)') - .replace(/\[([a-f0-9]{24})\](?!\()/g, '[$1](QUOTE)') - ); - }; + const pattern = /(```[\s\S]*?```|`.*?`)|\\\[([\s\S]*?[^\\])\\\]|\\\((.*?)\\\)/g; + text = text.replace(pattern, (match, codeBlock, squareBracket, roundBracket) => { + if (codeBlock) { + return codeBlock; + } else if (squareBracket) { + return `$$${squareBracket}$$`; + } else if (roundBracket) { + return `$${roundBracket}$`; + } + return match; + }); - return formatQuote(escapeBrackets(text)); + // 处理 [quote:id] 格式引用,将 [quote:675934a198f46329dfc6d05a] 转换为 [675934a198f46329dfc6d05a](QUOTE) + text = text + // .replace( + // /([\u4e00-\u9fa5\u3000-\u303f])([a-zA-Z0-9])|([a-zA-Z0-9])([\u4e00-\u9fa5\u3000-\u303f])/g, + // '$1$3 $2$4' + // ) + // 处理 [quote:id] 格式引用,将 [quote:675934a198f46329dfc6d05a] 转换为 [675934a198f46329dfc6d05a](QUOTE) + .replace(/\[quote:?\s*([a-f0-9]{24})\](?!\()/gi, '[$1](QUOTE)') + .replace(/\[([a-f0-9]{24})\](?!\()/g, '[$1](QUOTE)'); + + // 处理链接后的中文标点符号,增加空格 + text = text.replace(/(https?:\/\/[^\s,。!?;:、]+)([,。!?;:、])/g, '$1 $2'); + + return text; };