diff --git a/packages/service/core/dataset/search/controller.ts b/packages/service/core/dataset/search/controller.ts index 71428b4aa0..297f9c6f53 100644 --- a/packages/service/core/dataset/search/controller.ts +++ b/packages/service/core/dataset/search/controller.ts @@ -25,6 +25,7 @@ import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection import { Types } from '../../../common/mongo'; import json5 from 'json5'; import { MongoDatasetCollectionTags } from '../tag/schema'; +import { computeFilterIntersection } from './utils'; import { readFromSecondary } from '../../../common/mongo/utils'; import { MongoDatasetDataText } from '../data/dataTextSchema'; import { type ChatItemType } from '@fastgpt/global/core/chat/type'; @@ -302,6 +303,7 @@ export async function searchDatasetData( let tagCollectionIdList: string[] | undefined = undefined; let createTimeCollectionIdList: string[] | undefined = undefined; + let inputCollectionIdList: string[] | undefined = undefined; try { const jsonMatch = @@ -428,16 +430,23 @@ export async function searchDatasetData( createTimeCollectionIdList = collections.map((item) => String(item._id)); } - // Concat tag and time - const collectionIds = (() => { - if (tagCollectionIdList && createTimeCollectionIdList) { - return tagCollectionIdList.filter((id) => - (createTimeCollectionIdList as string[]).includes(id) - ); + // collectionIds + const inputCollectionIds = jsonMatch?.collectionIds as string[] | undefined; + if (Array.isArray(inputCollectionIds) && inputCollectionIds.length > 0) { + inputCollectionIdList = await getAllCollectionIds({ + parentCollectionIds: inputCollectionIds + }); + if (inputCollectionIdList && inputCollectionIdList.length === 0) { + return []; } + } - return tagCollectionIdList || createTimeCollectionIdList; - })(); + // Concat tag, time and collectionIds + const collectionIds = computeFilterIntersection([ + tagCollectionIdList, + createTimeCollectionIdList, + inputCollectionIdList + ]); return 
/**
 * Intersects the collection-id lists produced by the individual search filters
 * (for example tags, creation time and explicit collection ids).
 *
 * A `null`/`undefined` entry means "this filter was not supplied" and is ignored.
 * An empty array is a real filter that matched nothing, so it forces an empty result.
 *
 * @param lists - One id list per filter; entries for unused filters are `undefined` or `null`.
 * @returns `undefined` when no filter is active; otherwise a newly allocated array holding
 *   the ids present in every active list, in the order (and with the duplicates) of the
 *   first active list.
 */
export const computeFilterIntersection = (
  lists: readonly (readonly string[] | null | undefined)[]
): string[] | undefined => {
  const activeLists = lists.filter(
    (list): list is readonly string[] => list !== undefined && list !== null
  );

  const [first, ...rest] = activeLists;
  if (first === undefined) return undefined;

  // Copy the first list so the result never aliases an array owned by the caller,
  // even when only a single filter is active.
  let result = [...first];

  for (const list of rest) {
    // Once nothing is left, the remaining lists cannot add anything back.
    if (result.length === 0) break;

    // Set lookup keeps each intersection step linear instead of quadratic.
    const allowed = new Set(list);
    result = result.filter((id) => allowed.has(id));
  }

  return result;
};
Fill in the format as follows:\n{\n \"tags\": {\n \"$and\": [\"Tag 1\",\"Tag 2\"],\n \"$or\": [\"When there are $and tags, and is effective, or is not effective\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm format, collection creation time greater than this time\",\n \"$lte\": \"YYYY-MM-DD HH:mm format, collection creation time less than this time, can be used with $gte\"\n },\n \"collectionIds\": [\"collectionId1\", \"collectionId2\", \"Folder IDs are supported and will automatically expand to get all sub-collections\"]\n}", "find_tip": "Find node ctrl f", "find_tip_mac": "Find node ⌘ f", "foldAll": "Collapse all", diff --git a/packages/web/i18n/zh-CN/workflow.json b/packages/web/i18n/zh-CN/workflow.json index 2c0fc31385..3c815f06af 100644 --- a/packages/web/i18n/zh-CN/workflow.json +++ b/packages/web/i18n/zh-CN/workflow.json @@ -72,7 +72,7 @@ "field_name_already_exists": "字段名已经存在", "field_required": "必填", "field_used_as_tool_input": "作为工具调用参数", - "filter_description": "目前支持标签和创建时间过滤,需按照以下格式填写:\n{\n \"tags\": {\n \"$and\": [\"标签 1\",\"标签 2\"],\n \"$or\": [\"有 $and 标签时,and 生效,or 不生效\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm 格式即可,集合的创建时间大于该时间\",\n \"$lte\": \"YYYY-MM-DD HH:mm 格式即可,集合的创建时间小于该时间,可和 $gte 共同使用\"\n }\n}", + "filter_description": "目前支持标签、创建时间和集合 ID 过滤,需按照以下格式填写:\n{\n \"tags\": {\n \"$and\": [\"标签 1\",\"标签 2\"],\n \"$or\": [\"有 $and 标签时,and 生效,or 不生效\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm 格式即可,集合的创建时间大于该时间\",\n \"$lte\": \"YYYY-MM-DD HH:mm 格式即可,集合的创建时间小于该时间,可和 $gte 共同使用\"\n },\n \"collectionIds\": [\"集合ID1\", \"集合ID2\", \"支持文件夹ID,会自动展开获取所有子集合\"]\n}", "find_tip": "查找节点 ctrl f", "find_tip_mac": "查找节点 ⌘ f", "foldAll": "全部折叠", diff --git a/packages/web/i18n/zh-Hant/workflow.json b/packages/web/i18n/zh-Hant/workflow.json index df492e67d2..513c5f4f59 100644 --- a/packages/web/i18n/zh-Hant/workflow.json +++ b/packages/web/i18n/zh-Hant/workflow.json @@ -72,7 +72,7 @@ "field_name_already_exists": "欄位名稱已存在", 
"field_required": "必填", "field_used_as_tool_input": "作為工具呼叫參數", - "filter_description": "目前支援標籤和建立時間篩選,需按照以下格式填寫:\n{\n \"tags\": {\n \"$and\": [\"標籤 1\",\"標籤 2\"],\n \"$or\": [\"當有 $and 標籤時,$and 才會生效,$or 不會生效\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm 格式,資料集的建立時間大於這個時間\",\n \"$lte\": \"YYYY-MM-DD HH:mm 格式,資料集的建立時間小於這個時間,可以和 $gte 一起使用\"\n }\n}", + "filter_description": "目前支援標籤、建立時間和集合 ID 篩選,需按照以下格式填寫:\n{\n \"tags\": {\n \"$and\": [\"標籤 1\",\"標籤 2\"],\n \"$or\": [\"當有 $and 標籤時,$and 才會生效,$or 不會生效\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm 格式,資料集的建立時間大於這個時間\",\n \"$lte\": \"YYYY-MM-DD HH:mm 格式,資料集的建立時間小於這個時間,可以和 $gte 一起使用\"\n },\n \"collectionIds\": [\"集合ID1\", \"集合ID2\", \"支援資料夾 ID,會自動展開獲取所有子集合\"]\n}", "find_tip": "查找節點 ctrl f", "find_tip_mac": "查找節點 ⌘ f", "foldAll": "全部折疊", diff --git a/test/cases/service/core/dataset/search/utils.test.ts b/test/cases/service/core/dataset/search/utils.test.ts new file mode 100644 index 0000000000..0c23d42ca2 --- /dev/null +++ b/test/cases/service/core/dataset/search/utils.test.ts @@ -0,0 +1,109 @@ +import { describe, it, expect } from 'vitest'; +import { computeFilterIntersection } from '@fastgpt/service/core/dataset/search/utils'; + +describe('computeFilterIntersection', () => { + describe('edge cases', () => { + it('should return undefined for empty array', () => { + const result = computeFilterIntersection([]); + expect(result).toBeUndefined(); + }); + + it('should return undefined for all undefined arrays', () => { + const result = computeFilterIntersection([undefined, undefined, undefined]); + expect(result).toBeUndefined(); + }); + + it('should return single array as-is', () => { + const result = computeFilterIntersection([['a', 'b', 'c']]); + expect(result).toEqual(['a', 'b', 'c']); + }); + + it('should filter out undefined arrays', () => { + const result = computeFilterIntersection([undefined, ['a', 'b'], undefined]); + expect(result).toEqual(['a', 'b']); + }); + + it('should return empty array 
when intersection is empty', () => { + const result = computeFilterIntersection([ + ['a', 'b'], + ['c', 'd'] + ]); + expect(result).toEqual([]); + }); + }); + + describe('two arrays', () => { + it('should compute intersection of two arrays', () => { + const result = computeFilterIntersection([ + ['a', 'b', 'c'], + ['b', 'c', 'd'] + ]); + expect(result).toEqual(['b', 'c']); + }); + + it('should handle duplicate elements', () => { + const result = computeFilterIntersection([ + ['a', 'a', 'b', 'b'], + ['a', 'b', 'c'] + ]); + expect(result).toEqual(['a', 'a', 'b', 'b']); + }); + + it('should preserve order from first array', () => { + const result = computeFilterIntersection([ + ['c', 'b', 'a'], + ['a', 'b', 'c'] + ]); + expect(result).toEqual(['c', 'b', 'a']); + }); + }); + + describe('three arrays (tags, createTime, collectionIds)', () => { + it('should compute intersection of three arrays', () => { + const tagIds = ['id1', 'id2', 'id3']; + const timeIds = ['id2', 'id3', 'id4']; + const collectionIds = ['id3', 'id4', 'id5']; + + const result = computeFilterIntersection([tagIds, timeIds, collectionIds]); + expect(result).toEqual(['id3']); + }); + + it('should return empty when no common elements', () => { + const tagIds = ['id1', 'id2']; + const timeIds = ['id3', 'id4']; + const collectionIds = ['id5', 'id6']; + + const result = computeFilterIntersection([tagIds, timeIds, collectionIds]); + expect(result).toEqual([]); + }); + + it('should handle partial undefined', () => { + const tagIds = ['id1', 'id2', 'id3']; + const collectionIds = ['id2', 'id3', 'id4']; + + const result = computeFilterIntersection([tagIds, undefined, collectionIds]); + expect(result).toEqual(['id2', 'id3']); + }); + + it('should handle all same elements', () => { + const ids = ['id1', 'id2', 'id3']; + const result = computeFilterIntersection([ids, ids, ids]); + expect(result).toEqual(['id1', 'id2', 'id3']); + }); + }); + + describe('performance with Set optimization', () => { + it('should handle 
large arrays efficiently', () => { + const size = 10000; + const arr1 = Array.from({ length: size }, (_, i) => `id${i}`); + const arr2 = Array.from({ length: size }, (_, i) => `id${i + size / 2}`); + + const start = performance.now(); + const result = computeFilterIntersection([arr1, arr2]); + const duration = performance.now() - start; + + expect(result?.length).toBe(size / 2); + expect(duration).toBeLessThan(100); // Should complete within 100ms + }); + }); +});