diff --git a/.gitignore b/.gitignore index 844017d54..0275e3f35 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,4 @@ files/helm/fastgpt/charts/*.tgz tmp/ coverage -document/.source - -bun.lock \ No newline at end of file +document/.source \ No newline at end of file diff --git a/document/content/docs/toc.mdx b/document/content/docs/toc.mdx index 5f6e4505d..5e1659ad9 100644 --- a/document/content/docs/toc.mdx +++ b/document/content/docs/toc.mdx @@ -101,6 +101,7 @@ description: FastGPT 文档目录 - [/docs/upgrading/4-12/4120](/docs/upgrading/4-12/4120) - [/docs/upgrading/4-12/4121](/docs/upgrading/4-12/4121) - [/docs/upgrading/4-12/4122](/docs/upgrading/4-12/4122) +- [/docs/upgrading/4-12/4123](/docs/upgrading/4-12/4123) - [/docs/upgrading/4-8/40](/docs/upgrading/4-8/40) - [/docs/upgrading/4-8/41](/docs/upgrading/4-8/41) - [/docs/upgrading/4-8/42](/docs/upgrading/4-8/42) diff --git a/document/content/docs/upgrading/4-12/4123.mdx b/document/content/docs/upgrading/4-12/4123.mdx new file mode 100644 index 000000000..60f53c947 --- /dev/null +++ b/document/content/docs/upgrading/4-12/4123.mdx @@ -0,0 +1,17 @@ +--- +title: 'V4.12.3(进行中)' +description: 'FastGPT V4.12.3 更新说明' +--- + + +## 🚀 新增内容 + + +## ⚙️ 优化 + +1. 
纠正 RRF 权重合并算法,使用标准 RRF 权重公式。 + +## 🐛 修复 + + +## 🔨 工具更新 diff --git a/document/content/docs/upgrading/4-12/meta.json b/document/content/docs/upgrading/4-12/meta.json index fb29138a8..72d8005d2 100644 --- a/document/content/docs/upgrading/4-12/meta.json +++ b/document/content/docs/upgrading/4-12/meta.json @@ -1,5 +1,5 @@ { "title": "4.12.x", "description": "", - "pages": ["4122", "4121", "4120"] + "pages": ["4123", "4122", "4121", "4120"] } diff --git a/document/data/doc-last-modified.json b/document/data/doc-last-modified.json index 02d7a9446..989c6aa02 100644 --- a/document/data/doc-last-modified.json +++ b/document/data/doc-last-modified.json @@ -41,7 +41,7 @@ "document/content/docs/introduction/guide/DialogBoxes/htmlRendering.mdx": "2025-07-23T21:35:03+08:00", "document/content/docs/introduction/guide/DialogBoxes/quoteList.mdx": "2025-07-23T21:35:03+08:00", "document/content/docs/introduction/guide/admin/sso.mdx": "2025-07-24T13:00:27+08:00", - "document/content/docs/introduction/guide/admin/teamMode.mdx": "2025-07-24T13:00:27+08:00", + "document/content/docs/introduction/guide/admin/teamMode.mdx": "2025-08-27T16:59:57+08:00", "document/content/docs/introduction/guide/course/ai_settings.mdx": "2025-07-24T13:00:27+08:00", "document/content/docs/introduction/guide/course/chat_input_guide.mdx": "2025-07-23T21:35:03+08:00", "document/content/docs/introduction/guide/course/fileInput.mdx": "2025-07-23T21:35:03+08:00", @@ -104,7 +104,7 @@ "document/content/docs/upgrading/4-11/4111.mdx": "2025-08-07T22:49:09+08:00", "document/content/docs/upgrading/4-12/4120.mdx": "2025-08-12T22:45:19+08:00", "document/content/docs/upgrading/4-12/4121.mdx": "2025-08-15T22:53:06+08:00", - "document/content/docs/upgrading/4-12/4122.mdx": "2025-08-26T23:51:54+08:00", + "document/content/docs/upgrading/4-12/4122.mdx": "2025-08-27T00:31:33+08:00", "document/content/docs/upgrading/4-8/40.mdx": "2025-08-02T19:38:37+08:00", "document/content/docs/upgrading/4-8/41.mdx": 
"2025-08-02T19:38:37+08:00", "document/content/docs/upgrading/4-8/42.mdx": "2025-08-02T19:38:37+08:00", diff --git a/packages/global/core/dataset/search/utils.ts b/packages/global/core/dataset/search/utils.ts index 52318c837..2fb6383ea 100644 --- a/packages/global/core/dataset/search/utils.ts +++ b/packages/global/core/dataset/search/utils.ts @@ -18,7 +18,7 @@ export const datasetSearchResultConcat = ( item.list.forEach((data, index) => { const rank = index + 1; - const score = (weight * 1) / (60 + rank); + const score = weight * (1 / (60 + rank)); const record = map.get(data.id); if (record) { // 合并两个score,有相同type的score,取最大值 @@ -64,8 +64,9 @@ export const datasetSearchResultConcat = ( }); } - // @ts-ignore - delete item.rrfScore; - return item; + return { + ...item, + rrfScore: undefined + }; }); }; diff --git a/packages/service/core/dataset/search/controller.ts b/packages/service/core/dataset/search/controller.ts index 8273bafb2..ad29c4de9 100644 --- a/packages/service/core/dataset/search/controller.ts +++ b/packages/service/core/dataset/search/controller.ts @@ -849,22 +849,16 @@ export async function searchDatasetData( } })(); - // embedding recall and fullText recall rrf concat - const embWeight = embeddingWeight; // 向量索引的 weight 大小 - const fullTextWeight = 1 - embeddingWeight; // 全文索引的 weight 大小 - const rrfSearchResult = datasetSearchResultConcat([ - { weight: embWeight, list: embeddingRecallResults }, - { weight: fullTextWeight, list: fullTextRecallResults } + { weight: embeddingWeight, list: embeddingRecallResults }, + { weight: 1 - embeddingWeight, list: fullTextRecallResults } ]); const rrfConcatResults = (() => { if (reRankResults.length === 0) return rrfSearchResult; if (rerankWeight === 1) return reRankResults; - const searchWeight = 1 - rerankWeight; // 搜索结果的 weight 大小 - return datasetSearchResultConcat([ - { weight: searchWeight, list: rrfSearchResult }, + { weight: 1 - rerankWeight, list: rrfSearchResult }, { weight: rerankWeight, list: reRankResults 
} ]); })(); diff --git a/test/cases/function/packages/global/core/dataset/search/utils.test.ts b/test/cases/function/packages/global/core/dataset/search/utils.test.ts new file mode 100644 index 000000000..86743d919 --- /dev/null +++ b/test/cases/function/packages/global/core/dataset/search/utils.test.ts @@ -0,0 +1,474 @@ +import { describe, it, expect } from 'vitest'; +import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils'; +import { SearchScoreTypeEnum } from '@fastgpt/global/core/dataset/constants'; +import type { SearchDataResponseItemType } from '@fastgpt/global/core/dataset/type'; + +describe('datasetSearchResultConcat', () => { + // Helper function to create test data + const createSearchItem = ( + id: string, + q: string, + scores: { type: `${SearchScoreTypeEnum}`; value: number; index: number }[] = [] + ): SearchDataResponseItemType => ({ + id, + datasetId: 'dataset1', + collectionId: 'collection1', + sourceName: 'source1', + sourceId: 'source1', + q, + a: `Answer for ${q}`, + chunkIndex: 0, + updateTime: new Date(), + score: scores + }); + + describe('Edge cases', () => { + it('should handle empty array', () => { + const result = datasetSearchResultConcat([]); + expect(result).toEqual([]); + }); + + it('should handle all empty lists', () => { + const input = [ + { weight: 1.0, list: [] }, + { weight: 0.5, list: [] } + ]; + const result = datasetSearchResultConcat(input); + expect(result).toEqual([]); + }); + + it('should handle only one non-empty list', () => { + const items = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + ]), + createSearchItem('2', 'Question 2', [ + { type: SearchScoreTypeEnum.embedding, value: 0.8, index: 1 } + ]) + ]; + + const input = [ + { weight: 1.0, list: items }, + { weight: 0.5, list: [] } + ]; + + const result = datasetSearchResultConcat(input); + expect(result).toEqual(items); + }); + }); + + describe('RRF algorithm tests', () => { + 
it('should calculate RRF scores correctly', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + ]), + createSearchItem('2', 'Question 2', [ + { type: SearchScoreTypeEnum.embedding, value: 0.8, index: 1 } + ]) + ]; + + const items2 = [ + createSearchItem('2', 'Question 2', [ + { type: SearchScoreTypeEnum.fullText, value: 0.7, index: 0 } + ]), + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.fullText, value: 0.6, index: 1 } + ]) + ]; + + const input = [ + { weight: 1.0, list: items1 }, + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + // Verify RRF score calculation + // item1: 1.0 * (1/(60+1)) + 1.0 * (1/(60+2)) = 1/61 + 1/62 ≈ 0.0163934 + 0.0161290 ≈ 0.0325224 + // item2: 1.0 * (1/(60+2)) + 1.0 * (1/(60+1)) = 1/62 + 1/61 ≈ 0.0161290 + 0.0163934 ≈ 0.0325224 + + expect(result).toHaveLength(2); + + // Verify RRF scores are added + result.forEach((item) => { + const rrfScore = item.score.find((s) => s.type === SearchScoreTypeEnum.rrf); + expect(rrfScore).toBeDefined(); + expect(rrfScore!.value).toBeCloseTo(0.0325224, 6); + }); + }); + + it('should weight RRF scores correctly', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + ]) + ]; + + const items2 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.fullText, value: 0.7, index: 0 } + ]) + ]; + + const input = [ + { weight: 2.0, list: items1 }, // Higher weight + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(1); + + const rrfScore = result[0].score.find((s) => s.type === SearchScoreTypeEnum.rrf); + expect(rrfScore).toBeDefined(); + + // Should be: 2.0 * (1/61) + 1.0 * (1/61) = 3.0 * (1/61) ≈ 0.0491803 + expect(rrfScore!.value).toBeCloseTo(3.0 / 61, 6); + }); + }); + + describe('Score merging tests', 
() => { + it('should merge different score types correctly', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 }, + { type: SearchScoreTypeEnum.reRank, value: 0.8, index: 0 } + ]) + ]; + + const items2 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.fullText, value: 0.7, index: 0 } + ]) + ]; + + const input = [ + { weight: 1.0, list: items1 }, + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(1); + expect(result[0].score).toHaveLength(4); // embedding, reRank, fullText, rrf + + // Verify all score types exist + const scoreTypes = result[0].score.map((s) => s.type); + expect(scoreTypes).toContain(SearchScoreTypeEnum.embedding); + expect(scoreTypes).toContain(SearchScoreTypeEnum.fullText); + expect(scoreTypes).toContain(SearchScoreTypeEnum.reRank); + expect(scoreTypes).toContain(SearchScoreTypeEnum.rrf); + }); + + it('should take max value for same score types', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + ]) + ]; + + const items2 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.7, index: 0 } // Lower score + ]) + ]; + + const input = [ + { weight: 1.0, list: items1 }, + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(1); + + const embeddingScore = result[0].score.find((s) => s.type === SearchScoreTypeEnum.embedding); + expect(embeddingScore).toBeDefined(); + expect(embeddingScore!.value).toBe(0.9); // Should take higher value + }); + }); + + describe('Sorting tests', () => { + it('should sort by RRF score descending', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + ]), + createSearchItem('2', 'Question 2', [ 
+ { type: SearchScoreTypeEnum.embedding, value: 0.8, index: 1 } + ]), + createSearchItem('3', 'Question 3', [ + { type: SearchScoreTypeEnum.embedding, value: 0.7, index: 2 } + ]) + ]; + + const items2 = [ + createSearchItem('3', 'Question 3', [ + { type: SearchScoreTypeEnum.fullText, value: 0.9, index: 0 } + ]), // First position, higher RRF + createSearchItem('2', 'Question 2', [ + { type: SearchScoreTypeEnum.fullText, value: 0.8, index: 1 } + ]), + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.fullText, value: 0.7, index: 2 } + ]) // Third position, lower RRF + ]; + + const input = [ + { weight: 1.0, list: items1 }, + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(3); + + // Verify descending RRF score order + for (let i = 0; i < result.length - 1; i++) { + const currentRrf = result[i].score.find((s) => s.type === SearchScoreTypeEnum.rrf)!.value; + const nextRrf = result[i + 1].score.find((s) => s.type === SearchScoreTypeEnum.rrf)!.value; + expect(currentRrf).toBeGreaterThanOrEqual(nextRrf); + } + + // item1 and item3 tie on RRF score (1/61 + 1/63 each), so either one may rank first + expect(['1', '3']).toContain(result[0].id); + }); + }); + + describe('RRF score update tests', () => { + it('should update existing RRF scores when multiple lists', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 }, + { type: SearchScoreTypeEnum.rrf, value: 0.5, index: 0 } // Existing RRF score + ]) + ]; + + const items2 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.fullText, value: 0.7, index: 0 } + ]) + ]; + + const input = [ + { weight: 1.0, list: items1 }, + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(1); + + const rrfScores = result[0].score.filter((s) => s.type === SearchScoreTypeEnum.rrf); +
expect(rrfScores).toHaveLength(1); // Should only have one RRF score + + // RRF score should be updated to calculated value, not the original 0.5 + expect(rrfScores[0].value).not.toBe(0.5); + expect(rrfScores[0].value).toBeCloseTo(1.0 / 61 + 1.0 / 61, 6); + expect(rrfScores[0].index).toBe(0); // Index after sorting + }); + + it('should add RRF score for items without one when multiple lists', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + // No RRF score + ]) + ]; + + const items2 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.fullText, value: 0.7, index: 0 } + ]) + ]; + + const input = [ + { weight: 1.0, list: items1 }, + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(1); + + const rrfScore = result[0].score.find((s) => s.type === SearchScoreTypeEnum.rrf); + expect(rrfScore).toBeDefined(); + expect(rrfScore!.value).toBeCloseTo(1.0 / 61 + 1.0 / 61, 6); + expect(rrfScore!.index).toBe(0); + }); + + it('should not modify single list (direct return)', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + ]) + ]; + + const input = [{ weight: 1.0, list: items1 }]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(1); + expect(result).toEqual(items1); // Should be exactly the same as input + + // Should not have RRF score because single list is returned directly + const rrfScore = result[0].score.find((s) => s.type === SearchScoreTypeEnum.rrf); + expect(rrfScore).toBeUndefined(); + }); + }); + + describe('Complex scenario tests', () => { + it('should handle complex multi-source merging', () => { + const embeddingResults = [ + createSearchItem('doc1', 'AI Introduction', [ + { type: SearchScoreTypeEnum.embedding, value: 0.95, index: 0 } + ]), + createSearchItem('doc2', 'Machine 
Learning Basics', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 1 } + ]), + createSearchItem('doc3', 'Deep Learning Principles', [ + { type: SearchScoreTypeEnum.embedding, value: 0.85, index: 2 } + ]) + ]; + + const fullTextResults = [ + createSearchItem('doc2', 'Machine Learning Basics', [ + { type: SearchScoreTypeEnum.fullText, value: 0.88, index: 0 } + ]), + createSearchItem('doc4', 'Neural Network Applications', [ + { type: SearchScoreTypeEnum.fullText, value: 0.82, index: 1 } + ]), + createSearchItem('doc1', 'AI Introduction', [ + { type: SearchScoreTypeEnum.fullText, value: 0.78, index: 2 } + ]) + ]; + + const reRankResults = [ + createSearchItem('doc3', 'Deep Learning Principles', [ + { type: SearchScoreTypeEnum.reRank, value: 0.92, index: 0 } + ]), + createSearchItem('doc1', 'AI Introduction', [ + { type: SearchScoreTypeEnum.reRank, value: 0.89, index: 1 } + ]) + ]; + + const input = [ + { weight: 1.0, list: embeddingResults }, + { weight: 0.8, list: fullTextResults }, + { weight: 1.2, list: reRankResults } + ]; + + const result = datasetSearchResultConcat(input); + + // Should have 4 unique documents + expect(result).toHaveLength(4); + + // Verify all documents have RRF scores + result.forEach((item) => { + const rrfScore = item.score.find((s) => s.type === SearchScoreTypeEnum.rrf); + expect(rrfScore).toBeDefined(); + expect(rrfScore!.value).toBeGreaterThan(0); + }); + + // Verify merged scores + const doc1 = result.find((item) => item.id === 'doc1')!; + const doc1ScoreTypes = doc1.score.map((s) => s.type); + expect(doc1ScoreTypes).toContain(SearchScoreTypeEnum.embedding); + expect(doc1ScoreTypes).toContain(SearchScoreTypeEnum.fullText); + expect(doc1ScoreTypes).toContain(SearchScoreTypeEnum.reRank); + expect(doc1ScoreTypes).toContain(SearchScoreTypeEnum.rrf); + + // Verify sorting by RRF score descending + for (let i = 0; i < result.length - 1; i++) { + const currentRrf = result[i].score.find((s) => s.type === 
SearchScoreTypeEnum.rrf)!.value; + const nextRrf = result[i + 1].score.find((s) => s.type === SearchScoreTypeEnum.rrf)!.value; + expect(currentRrf).toBeGreaterThanOrEqual(nextRrf); + } + }); + }); + + describe('Edge weight tests', () => { + it('should handle zero weight', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + ]) + ]; + + const items2 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.fullText, value: 0.7, index: 0 } + ]) + ]; + + const input = [ + { weight: 0, list: items1 }, + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(1); + + const rrfScore = result[0].score.find((s) => s.type === SearchScoreTypeEnum.rrf); + expect(rrfScore).toBeDefined(); + expect(rrfScore!.value).toBeCloseTo(1.0 / 61, 6); // Only from second list + }); + + it('should handle negative weight', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + ]) + ]; + + const items2 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.fullText, value: 0.7, index: 0 } + ]) + ]; + + const input = [ + { weight: -1.0, list: items1 }, + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(1); + + const rrfScore = result[0].score.find((s) => s.type === SearchScoreTypeEnum.rrf); + expect(rrfScore).toBeDefined(); + // Should be: -1.0 * (1/61) + 1.0 * (1/61) = 0 + expect(rrfScore!.value).toBeCloseTo(0, 6); + }); + + it('should handle very small weight', () => { + const items1 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.embedding, value: 0.9, index: 0 } + ]) + ]; + + const items2 = [ + createSearchItem('1', 'Question 1', [ + { type: SearchScoreTypeEnum.fullText, value: 0.7, index: 0 } + ]) + ]; + + const input = [ + { weight: 0.001, 
list: items1 }, + { weight: 1.0, list: items2 } + ]; + + const result = datasetSearchResultConcat(input); + + expect(result).toHaveLength(1); + + const rrfScore = result[0].score.find((s) => s.type === SearchScoreTypeEnum.rrf); + expect(rrfScore).toBeDefined(); + expect(rrfScore!.value).toBeCloseTo(0.001 / 61 + 1.0 / 61, 6); + }); + }); +});