add collectionIds filter for metadata search (#6379)

* add collectionIds filter for metadata search

* add test
This commit is contained in:
heheer
2026-02-04 20:42:59 +08:00
committed by GitHub
parent e6c7593d95
commit 214b3138ad
6 changed files with 141 additions and 11 deletions

View File

@@ -25,6 +25,7 @@ import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection
import { Types } from '../../../common/mongo';
import json5 from 'json5';
import { MongoDatasetCollectionTags } from '../tag/schema';
import { computeFilterIntersection } from './utils';
import { readFromSecondary } from '../../../common/mongo/utils';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { type ChatItemType } from '@fastgpt/global/core/chat/type';
@@ -302,6 +303,7 @@ export async function searchDatasetData(
let tagCollectionIdList: string[] | undefined = undefined;
let createTimeCollectionIdList: string[] | undefined = undefined;
let inputCollectionIdList: string[] | undefined = undefined;
try {
const jsonMatch =
@@ -428,16 +430,23 @@ export async function searchDatasetData(
createTimeCollectionIdList = collections.map((item) => String(item._id));
}
// Concat tag and time
const collectionIds = (() => {
if (tagCollectionIdList && createTimeCollectionIdList) {
return tagCollectionIdList.filter((id) =>
(createTimeCollectionIdList as string[]).includes(id)
);
// collectionIds
const inputCollectionIds = jsonMatch?.collectionIds as string[] | undefined;
if (Array.isArray(inputCollectionIds) && inputCollectionIds.length > 0) {
inputCollectionIdList = await getAllCollectionIds({
parentCollectionIds: inputCollectionIds
});
if (inputCollectionIdList && inputCollectionIdList.length === 0) {
return [];
}
}
return tagCollectionIdList || createTimeCollectionIdList;
})();
// Concat tag, time and collectionIds
const collectionIds = computeFilterIntersection([
tagCollectionIdList,
createTimeCollectionIdList,
inputCollectionIdList
]);
return await getAllCollectionIds({
parentCollectionIds: collectionIds

View File

@@ -3,6 +3,18 @@ import { type ChatItemType } from '@fastgpt/global/core/chat/type';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { addLog } from '../../../common/system/log';
/**
 * Intersects several optional id filters into a single id list.
 *
 * Each entry of `lists` is the id list produced by one filter; `undefined`
 * means that filter was not supplied and therefore imposes no restriction.
 *
 * @param lists - One id list per filter, or `undefined` for an absent filter.
 * @returns `undefined` when no filter is defined at all; otherwise a new array
 *   holding the ids of the first defined list that occur in every other
 *   defined list. Order and duplicates of that first list are preserved.
 *   The returned array is always a fresh copy, never one of the inputs.
 */
export const computeFilterIntersection = (
  lists: readonly (readonly string[] | undefined)[]
): string[] | undefined => {
  let result: string[] | undefined;

  for (const list of lists) {
    // An absent filter does not constrain the result.
    if (list === undefined) continue;

    if (result === undefined) {
      // Seed with a copy so callers can mutate the result without
      // corrupting the list they passed in.
      result = [...list];
      continue;
    }

    // Nothing can re-enter an empty intersection; skip the remaining lists.
    if (result.length === 0) break;

    // Set lookup keeps each intersection step linear in the list sizes.
    const allowed = new Set(list);
    result = result.filter((id) => allowed.has(id));
  }

  return result;
};
export const datasetSearchQueryExtension = async ({
query,
llmModel,

View File

@@ -72,7 +72,7 @@
"field_name_already_exists": "Field name already exists",
"field_required": "Required",
"field_used_as_tool_input": "Used as Tool Call Parameter",
"filter_description": "Currently supports filtering by tags and creation time. Fill in the format as follows:\n{\n \"tags\": {\n \"$and\": [\"Tag 1\",\"Tag 2\"],\n \"$or\": [\"When there are $and tags, and is effective, or is not effective\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm format, collection creation time greater than this time\",\n \"$lte\": \"YYYY-MM-DD HH:mm format, collection creation time less than this time, can be used with $gte\"\n }\n}",
"filter_description": "Currently supports filtering by tags, creation time, and collection IDs. Fill in the format as follows:\n{\n \"tags\": {\n \"$and\": [\"Tag 1\",\"Tag 2\"],\n \"$or\": [\"When there are $and tags, and is effective, or is not effective\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm format, collection creation time greater than this time\",\n \"$lte\": \"YYYY-MM-DD HH:mm format, collection creation time less than this time, can be used with $gte\"\n },\n \"collectionIds\": [\"collectionId1\", \"collectionId2\", \"Folder IDs are supported and will automatically expand to get all sub-collections\"]\n}",
"find_tip": "Find node ctrl f",
"find_tip_mac": "Find node ⌘ f",
"foldAll": "Collapse all",

View File

@@ -72,7 +72,7 @@
"field_name_already_exists": "字段名已经存在",
"field_required": "必填",
"field_used_as_tool_input": "作为工具调用参数",
"filter_description": "目前支持标签创建时间过滤,需按照以下格式填写:\n{\n \"tags\": {\n \"$and\": [\"标签 1\",\"标签 2\"],\n \"$or\": [\"有 $and 标签时and 生效or 不生效\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm 格式即可,集合的创建时间大于该时间\",\n \"$lte\": \"YYYY-MM-DD HH:mm 格式即可,集合的创建时间小于该时间,可和 $gte 共同使用\"\n }\n}",
"filter_description": "目前支持标签、创建时间和集合 ID 过滤,需按照以下格式填写:\n{\n \"tags\": {\n \"$and\": [\"标签 1\",\"标签 2\"],\n \"$or\": [\"有 $and 标签时,$and 生效,$or 不生效\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm 格式即可,集合的创建时间大于该时间\",\n \"$lte\": \"YYYY-MM-DD HH:mm 格式即可,集合的创建时间小于该时间,可和 $gte 共同使用\"\n },\n \"collectionIds\": [\"集合ID1\", \"集合ID2\", \"支持文件夹 ID,会自动展开获取所有子集合\"]\n}",
"find_tip": "查找节点 ctrl f",
"find_tip_mac": "查找节点 ⌘ f",
"foldAll": "全部折叠",

View File

@@ -72,7 +72,7 @@
"field_name_already_exists": "欄位名稱已存在",
"field_required": "必填",
"field_used_as_tool_input": "作為工具呼叫參數",
"filter_description": "目前支援標籤建立時間篩選,需按照以下格式填寫:\n{\n \"tags\": {\n \"$and\": [\"標籤 1\",\"標籤 2\"],\n \"$or\": [\"當有 $and 標籤時,$and 才會生效,$or 不會生效\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm 格式,資料集的建立時間大於這個時間\",\n \"$lte\": \"YYYY-MM-DD HH:mm 格式,資料集的建立時間小於這個時間,可以和 $gte 一起使用\"\n }\n}",
"filter_description": "目前支援標籤、建立時間和集合 ID 篩選,需按照以下格式填寫:\n{\n \"tags\": {\n \"$and\": [\"標籤 1\",\"標籤 2\"],\n \"$or\": [\"當有 $and 標籤時,$and 才會生效,$or 不會生效\"]\n },\n \"createTime\": {\n \"$gte\": \"YYYY-MM-DD HH:mm 格式,資料集的建立時間大於這個時間\",\n \"$lte\": \"YYYY-MM-DD HH:mm 格式,資料集的建立時間小於這個時間,可以和 $gte 一起使用\"\n },\n \"collectionIds\": [\"集合ID1\", \"集合ID2\", \"支援資料夾 ID,會自動展開獲取所有子集合\"]\n}",
"find_tip": "查找節點 ctrl f",
"find_tip_mac": "查找節點 ⌘ f",
"foldAll": "全部折疊",

View File

@@ -0,0 +1,109 @@
import { describe, it, expect } from 'vitest';
import { computeFilterIntersection } from '@fastgpt/service/core/dataset/search/utils';
/**
 * Specification for computeFilterIntersection.
 *
 * The cases below pin the contract the search code relies on:
 * - `undefined` entries stand for "filter not supplied" and are ignored;
 * - with no defined list the result is `undefined`;
 * - otherwise the result is the first defined list restricted to the ids
 *   present in every other defined list, keeping its order and duplicates.
 */
describe('computeFilterIntersection', () => {
  describe('edge cases', () => {
    it('should return undefined for empty array', () => {
      const result = computeFilterIntersection([]);
      expect(result).toBeUndefined();
    });

    it('should return undefined for all undefined arrays', () => {
      const result = computeFilterIntersection([undefined, undefined, undefined]);
      expect(result).toBeUndefined();
    });

    it('should return single array as-is', () => {
      const result = computeFilterIntersection([['a', 'b', 'c']]);
      expect(result).toEqual(['a', 'b', 'c']);
    });

    it('should filter out undefined arrays', () => {
      const result = computeFilterIntersection([undefined, ['a', 'b'], undefined]);
      expect(result).toEqual(['a', 'b']);
    });

    it('should return empty array when intersection is empty', () => {
      const result = computeFilterIntersection([
        ['a', 'b'],
        ['c', 'd']
      ]);
      expect(result).toEqual([]);
    });

    it('should not mutate the input arrays', () => {
      const first = ['a', 'b', 'c'];
      const second = ['b', 'c', 'd'];

      computeFilterIntersection([first, second]);

      expect(first).toEqual(['a', 'b', 'c']);
      expect(second).toEqual(['b', 'c', 'd']);
    });
  });

  describe('two arrays', () => {
    it('should compute intersection of two arrays', () => {
      const result = computeFilterIntersection([
        ['a', 'b', 'c'],
        ['b', 'c', 'd']
      ]);
      expect(result).toEqual(['b', 'c']);
    });

    // Duplicates in the first list are kept; the function filters, it does not dedupe.
    it('should handle duplicate elements', () => {
      const result = computeFilterIntersection([
        ['a', 'a', 'b', 'b'],
        ['a', 'b', 'c']
      ]);
      expect(result).toEqual(['a', 'a', 'b', 'b']);
    });

    it('should preserve order from first array', () => {
      const result = computeFilterIntersection([
        ['c', 'b', 'a'],
        ['a', 'b', 'c']
      ]);
      expect(result).toEqual(['c', 'b', 'a']);
    });
  });

  describe('three arrays (tags, createTime, collectionIds)', () => {
    it('should compute intersection of three arrays', () => {
      const tagIds = ['id1', 'id2', 'id3'];
      const timeIds = ['id2', 'id3', 'id4'];
      const collectionIds = ['id3', 'id4', 'id5'];
      const result = computeFilterIntersection([tagIds, timeIds, collectionIds]);
      expect(result).toEqual(['id3']);
    });

    it('should return empty when no common elements', () => {
      const tagIds = ['id1', 'id2'];
      const timeIds = ['id3', 'id4'];
      const collectionIds = ['id5', 'id6'];
      const result = computeFilterIntersection([tagIds, timeIds, collectionIds]);
      expect(result).toEqual([]);
    });

    it('should handle partial undefined', () => {
      const tagIds = ['id1', 'id2', 'id3'];
      const collectionIds = ['id2', 'id3', 'id4'];
      const result = computeFilterIntersection([tagIds, undefined, collectionIds]);
      expect(result).toEqual(['id2', 'id3']);
    });

    it('should handle all same elements', () => {
      const ids = ['id1', 'id2', 'id3'];
      const result = computeFilterIntersection([ids, ids, ids]);
      expect(result).toEqual(['id1', 'id2', 'id3']);
    });
  });

  describe('large inputs', () => {
    it('should intersect large arrays correctly', () => {
      const size = 10000;
      const half = size / 2;
      // arr1 holds id0..id9999, arr2 holds id5000..id14999; they overlap on the upper half of arr1.
      const arr1 = Array.from({ length: size }, (_, i) => `id${i}`);
      const arr2 = Array.from({ length: size }, (_, i) => `id${i + half}`);

      const result = computeFilterIntersection([arr1, arr2]);

      // Deliberately no wall-clock assertion: elapsed time depends on machine
      // load and makes the suite flaky. Check the full result instead.
      expect(result).toEqual(arr1.slice(half));
    });
  });
});