diff --git a/packages/service/core/dataset/utils.ts b/packages/service/core/dataset/utils.ts index 47d81e4e4..7c8d98e6e 100644 --- a/packages/service/core/dataset/utils.ts +++ b/packages/service/core/dataset/utils.ts @@ -53,10 +53,9 @@ export function replaceS3KeyToPreviewUrl(documentQuoteText: string, expiredTime: if (!documentQuoteText || typeof documentQuoteText !== 'string') return documentQuoteText as string; - const prefixPattern = Object.values(S3Sources) - .map((pattern) => `${pattern}\\/[^\\s)]+`) - .join('|'); - const regex = new RegExp(String.raw`(!?)\[([^\]]*)\]\((?!https?:\/\/)(${prefixPattern})\)`, 'g'); + const prefixes = Object.values(S3Sources); + const pattern = prefixes.map((p) => `${p}\\/[^)]+`).join('|'); + const regex = new RegExp(String.raw`(!?)\[([^\]]*)\]\(\s*(?!https?:\/\/)(${pattern})\s*\)`, 'g'); const matches = Array.from(documentQuoteText.matchAll(regex)); let content = documentQuoteText; diff --git a/test/cases/service/core/dataset/utils.test.ts b/test/cases/service/core/dataset/utils.test.ts new file mode 100644 index 000000000..e69a1a54e --- /dev/null +++ b/test/cases/service/core/dataset/utils.test.ts @@ -0,0 +1,433 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { replaceS3KeyToPreviewUrl } from '@fastgpt/service/core/dataset/utils'; + +vi.mock('@fastgpt/service/common/s3/utils', () => ({ + jwtSignS3ObjectKey: vi.fn( + (objectKey: string) => `https://example.com/api/system/file/mock-jwt-token-${objectKey}` + ), + isS3ObjectKey: vi.fn((key: string, source: string) => { + if (!key) return false; + return key.startsWith(`${source}/`); + }) +})); + +vi.mock('@fastgpt/service/common/s3/type', () => ({ + S3Sources: { + avatar: 'avatar', + chat: 'chat', + dataset: 'dataset', + temp: 'temp', + rawText: 'rawText' + } +})); + +describe('replaceS3KeyToPreviewUrl', () => { + const expiredTime = new Date('2025-12-31'); + + beforeEach(() => { + vi.clearAllMocks(); + }); + + describe('边界情况处理', () => { + it('空字符串应返回空字符串', () => { + const result = replaceS3KeyToPreviewUrl('', expiredTime); + expect(result).toBe(''); + }); + + it('null 应返回 null', () => { + const result = replaceS3KeyToPreviewUrl(null as unknown as string, expiredTime); + expect(result).toBe(null); + }); + + it('undefined 应返回 undefined', () => { + const result = replaceS3KeyToPreviewUrl(undefined as unknown as string, expiredTime); + expect(result).toBe(undefined); + }); + + it('非字符串类型应原样返回', () => { + const result = replaceS3KeyToPreviewUrl(123 as unknown as string, expiredTime); + expect(result).toBe(123); + }); + }); + + // 测试不包含 S3 链接的普通文本 + describe('普通文本处理', () => { + it('纯文本不做任何替换', () => { + const text = '这是一段普通文本,不包含任何图片链接'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toBe(text); + }); + + it('普通 HTTP 链接不做替换', () => { + const text = '![image](https://example.com/image.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toBe(text); + }); + + it('普通 markdown 链接不做替换', () => { + const text = '[链接文本](https://example.com/page)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toBe(text); + }); + }); + + // 测试 dataset 前缀的 S3 链接替换 + describe('dataset S3 链接替换', () => { + it('应替换 dataset 图片链接', () => { + const text = + '![image.png](dataset/68fee42e1d416bb5ddc85b19/6901c3071ba2bea567e8d8db/aZos7D-214afce5-4d42-4356-9e05-8164d51c59ae.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + + expect(result).toContain('https://example.com/api/system/file/mock-jwt-token-'); + expect(result).toContain('dataset/68fee42e1d416bb5ddc85b19'); + expect(result).toMatch(/!\[image\.png\]\(https:\/\/example\.com/); + }); + + it('应替换 dataset 普通链接(非图片)', () => { + const text = '[文档](dataset/68fee42e1d416bb5ddc85b19/6901c3071ba2bea567e8d8db/document.pdf)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + + expect(result).toContain('https://example.com/api/system/file/mock-jwt-token-'); + expect(result).toMatch(/\[文档\]\(https:\/\/example\.com/); + }); + }); + + // 测试 chat 前缀的 S3 链接替换 + describe('chat S3 链接替换', () => { + it('应替换 chat 图片链接', () => { + const text = + '![screenshot.png](chat/691ae29d404d0468717dd747/68ad85a7463006c96379a07/jXfXy8yfGAFs9WJpcWRbAhV2/parsed/9a0f4fed-4edf-4613-a8d6-533af5ae51dc.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + + expect(result).toContain('https://example.com/api/system/file/mock-jwt-token-'); + expect(result).toContain('chat/691ae29d404d0468717dd747'); + }); + }); + + // 测试多个链接替换 + describe('多个链接替换', () => { + it('应正确替换多个 S3 链接', () => { + const text = `这是一段包含多个图片的文本: +![图片1](dataset/team1/collection1/image1.png) +一些中间文字 +![图片2](chat/app1/user1/chat1/image2.jpg) +更多文字 +![外部图片](https://external.com/image3.png)`; + + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + + // dataset 和 chat 链接应被替换 + expect(result).toContain('mock-jwt-token-dataset/team1/collection1/image1.png'); + expect(result).toContain('mock-jwt-token-chat/app1/user1/chat1/image2.jpg'); + // 外部链接不应被替换 + expect(result).toContain('https://external.com/image3.png'); + }); + }); + + // 测试不支持的 S3 前缀 + describe('不支持的 S3 前缀', () => { + it('avatar 前缀不应被替换(只支持 dataset 和 chat)', () => { + const text = '![头像](avatar/team1/user-avatar.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + // avatar 的 isS3ObjectKey 返回 false(因为只检查 dataset 和 chat) + expect(result).toBe(text); + }); + + it('temp 前缀不应被替换', () => { + const text = '![临时文件](temp/team1/temp-file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toBe(text); + }); + }); + + // 测试特殊字符处理 + describe('特殊字符处理', () => { + // 中文字符 + it('文件名包含中文应正常处理', () => { + const text = '![中文图片名.png](dataset/team1/collection1/中文文件名.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('https://example.com/api/system/file/mock-jwt-token-'); + }); + + it('alt 文本为空应正常处理', () => { + const text = '![](dataset/team1/collection1/no-alt.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toMatch(/!\[\]\(https:\/\/example\.com/); + }); + + // 日韩文字符 + it('文件名包含日文应正常处理', () => { + const text = '![日本語ファイル](dataset/team1/日本語テスト.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/日本語テスト.png'); + }); + + it('文件名包含韩文应正常处理', () => { + const text = '![한국어](dataset/team1/한국어파일.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/한국어파일.png'); + }); + + // Emoji 表情符号 + it('文件名包含 emoji 应正常处理', () => { + const text = '![🎉 celebration](dataset/team1/🎉emoji🚀test.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/🎉emoji🚀test.png'); + }); + + it('alt 文本包含多个 emoji 应正常处理', () => { + const text = '![🔥💯🎯](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toMatch(/!\[🔥💯🎯\]\(https:\/\/example\.com/); + }); + + // 特殊符号 + it('文件名包含下划线和连字符应正常处理', () => { + const text = '![image](dataset/team1/my_file-name_v2.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/my_file-name_v2.png'); + }); + + it('文件名包含 @ 符号应正常处理', () => { + const text = '![email](dataset/team1/user@example.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/user@example.png'); + }); + + it('文件名包含 # 符号应正常处理', () => { + const text = '![hash](dataset/team1/file#1.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/file#1.png'); + }); + + it('文件名包含 $ 符号应正常处理', () => { + const text = '![dollar](dataset/team1/price$100.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/price$100.png'); + }); + + it('文件名包含 % 符号应正常处理', () => { + const text = '![percent](dataset/team1/50%off.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/50%off.png'); + }); + + it('文件名包含 + 符号应正常处理', () => { + const text = '![plus](dataset/team1/a+b.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/a+b.png'); + }); + + it('文件名包含 = 符号应正常处理', () => { + const text = '![equals](dataset/team1/x=1.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/x=1.png'); + }); + + // 多个点号 + it('文件名包含多个点号应正常处理', () => { + const text = '![dots](dataset/team1/file.name.v1.2.3.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/file.name.v1.2.3.png'); + }); + + // 空格相关 + it('alt 文本包含空格应正常处理', () => { + const text = '![image with spaces](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toMatch(/!\[image with spaces\]\(https:\/\/example\.com/); + }); + + it('文件名包含 URL 编码的空格 %20 应正常处理', () => { + const text = '![encoded](dataset/team1/file%20name.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/file%20name.png'); + }); + + // 括号类字符 + it('alt 文本包含转义方括号不匹配正则,不做替换', () => { + // 由于 markdown 正则 [^\]]* 不匹配包含 ] 的 alt 文本,这种情况不会被替换 + const text = '![image \\[1\\]](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + // 预期不做替换 + expect(result).toBe(text); + }); + + it('alt 文本包含圆括号应正常处理', () => { + const text = '![image (1)](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('https://example.com/api/system/file/mock-jwt-token-'); + }); + + it('文件名包含花括号应正常处理', () => { + const text = '![braces](dataset/team1/file{1}.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/file{1}.png'); + }); + + // 引号 + it('alt 文本包含单引号应正常处理', () => { + const text = "![it's a test](dataset/team1/file.png)"; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toMatch(/!\[it's a test\]\(https:\/\/example\.com/); + }); + + it('alt 文本包含双引号应正常处理', () => { + const text = '![say "hello"](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('https://example.com/api/system/file/mock-jwt-token-'); + }); + + // 反斜杠 + it('alt 文本包含反斜杠应正常处理', () => { + const text = '![path\\to\\file](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('https://example.com/api/system/file/mock-jwt-token-'); + }); + + // 特殊 markdown 字符 + it('alt 文本包含星号应正常处理', () => { + const text = '![*important*](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toMatch(/!\[\*important\*\]\(https:\/\/example\.com/); + }); + + it('alt 文本包含下划线强调应正常处理', () => { + const text = '![_emphasis_](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toMatch(/!\[_emphasis_\]\(https:\/\/example\.com/); + }); + + it('alt 文本包含反引号应正常处理', () => { + const text = '![`code`](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toMatch(/!\[`code`\]\(https:\/\/example\.com/); + }); + + // 数字和字母混合 + it('文件名是纯 UUID 格式应正常处理', () => { + const text = '![uuid](dataset/team1/550e8400-e29b-41d4-a716-446655440000.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain( + 'mock-jwt-token-dataset/team1/550e8400-e29b-41d4-a716-446655440000.png' + ); + }); + + it('文件名是纯数字应正常处理', () => { + const text = '![numbers](dataset/team1/123456789.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/123456789.png'); + }); + + // 超长文件名 + it('超长文件名应正常处理', () => { + const longName = 'a'.repeat(200); + const text = `![long](dataset/team1/${longName}.png)`; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain(`mock-jwt-token-dataset/team1/${longName}.png`); + }); + + // 阿拉伯文和希伯来文(RTL 文字) + it('文件名包含阿拉伯文应正常处理', () => { + const text = '![عربي](dataset/team1/ملف.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/ملف.png'); + }); + + // 俄文 + it('文件名包含俄文应正常处理', () => { + const text = '![русский](dataset/team1/файл.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/файл.png'); + }); + + // 泰文 + it('文件名包含泰文应正常处理', () => { + const text = '![ไทย](dataset/team1/ไฟล์.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/ไฟล์.png'); + }); + + // 特殊扩展名 + it('无扩展名的文件应正常处理', () => { + const text = '![noext](dataset/team1/README)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/README'); + }); + + it('双扩展名的文件应正常处理', () => { + const text = '![tarball](dataset/team1/archive.tar.gz)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/archive.tar.gz'); + }); + + // 管道符和其他 shell 特殊字符 + it('文件名包含管道符应正常处理', () => { + const text = '![pipe](dataset/team1/a|b.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/a|b.png'); + }); + + it('文件名包含波浪号应正常处理', () => { + const text = '![tilde](dataset/team1/~user.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/~user.png'); + }); + + it('文件名包含 & 符号应正常处理', () => { + const text = '![ampersand](dataset/team1/a&b.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/a&b.png'); + }); + + // 换行符 + it('alt 文本不包含换行符时应正常处理', () => { + const text = '![single line](dataset/team1/file.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('https://example.com/api/system/file/mock-jwt-token-'); + }); + + // 特殊组合 + it('文件名包含多种特殊字符组合应正常处理', () => { + const text = '![complex](dataset/team1/file_v1.2-beta@test#1$100%off.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/file_v1.2-beta@test#1$100%off.png'); + }); + + it('中英文混合 alt 和文件名应正常处理', () => { + const text = '![测试image图片](dataset/team1/test测试file文件.png)'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + expect(result).toContain('mock-jwt-token-dataset/team1/test测试file文件.png'); + }); + }); + + // 测试链接格式边界情况 + describe('链接格式边界情况', () => { + it('链接中有空格应正常处理', () => { + const text = '![image]( dataset/team1/collection1/image.png )'; + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + + expect(result).toContain('https://example.com/api/system/file/mock-jwt-token-'); + }); + + it('混合文本和链接应只替换 S3 链接', () => { + const text = `# 标题 + +普通段落文字 ![S3图片](dataset/team1/file.png) 后续文字 + +[普通链接](https://google.com) + +\`\`\`code +代码块 +\`\`\``; + + const result = replaceS3KeyToPreviewUrl(text, expiredTime); + + expect(result).toContain( + 'https://example.com/api/system/file/mock-jwt-token-dataset/team1/file.png' + ); + expect(result).toContain('https://google.com'); + expect(result).toContain('# 标题'); + }); + }); +});