Collection tag (#2266)

* feat: collection metadata filter (#2211) * feat: add dataset collection tags (#2231) * dataset page * workflow page * move * fix * add plus filter * fix * fix * fix * perf: collection tag code * fix: collection tags (#2249) * fix * fix * fix tags of dataset page * fix tags of workflow page * doc * add comments * fix: collection tags (#2264) * fix: metadata filter * feat: search filter --------- Co-authored-by: heheer <1239331448@qq.com> Co-authored-by: heheer <heheer@sealos.io>
2025-07-22 20:37:48 +00:00 · 2024-08-05 12:08:46 +08:00
parent 56f6e69bc7
commit fe71efbbd2
46 changed files with 1914 additions and 112 deletions
--- a/packages/service/core/dataset/search/controller.ts
+++ b/packages/service/core/dataset/search/controller.ts
@@ -20,6 +20,9 @@ import { hashStr } from '@fastgpt/global/common/string/tools';
 import { jiebaSplit } from '../../../common/string/jieba';
 import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
 import { Types } from '../../../common/mongo';
+import json5 from 'json5';
+import { MongoDatasetCollectionTags } from '../tag/schema';
+import { readFromSecondary } from '../../../common/mongo/utils';

 type SearchDatasetDataProps = {
  teamId: string;
@@ -31,6 +34,20 @@ type SearchDatasetDataProps = {
  usingReRank?: boolean;
  reRankQuery: string;
  queries: string[];
+
+  /* 
+    {
+      tags: {
+        $and: ["str1","str2"],
+        $or: ["str1","str2",null] // null matches collections that have no tags
+      },
+      createTime: {
+        $gte: 'xx',
+        $lte: 'xxx'
+      }
+    }
+  */
+  collectionFilterMatch?: string;
 };

 export async function searchDatasetData(props: SearchDatasetDataProps) {
@@ -43,7 +60,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
    limit: maxTokens,
    searchMode = DatasetSearchModeEnum.embedding,
    usingReRank = false,
-    datasetIds = []
+    datasetIds = [],
+    collectionFilterMatch
  } = props;

  /* init params */
@@ -87,14 +105,148 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
      forbidCollectionIdList: collections.map((item) => String(item._id))
    };
  };
+  /*
+    Collection metadata filter.
+    Parses `collectionFilterMatch` (JSON5) and resolves it to the ids of the matching collections.
+
+    Tag rules:
+    1. `$and` takes precedence: when `tags.$and` is non-empty, `tags.$or` is not evaluated.
+    2. `$and` cannot mix tag strings with null ("no tags"); such a filter matches nothing.
+
+    Returns:
+    - undefined: no filter applies (no filter string, non-plus deployment, no usable condition,
+      or the filter could not be parsed/queried).
+    - string[]: ids of the matching collections; an empty array means nothing matched.
+  */
+  const filterCollectionByMetadata = async (): Promise<string[] | undefined> => {
+    if (!collectionFilterMatch || !global.feConfigs.isPlus) return;
+
+    let tagCollectionIdList: string[] | undefined = undefined;
+    let createTimeCollectionIdList: string[] | undefined = undefined;
+
+    try {
+      const jsonMatch = json5.parse(collectionFilterMatch);
+
+      // Tag
+      let andTags = jsonMatch?.tags?.$and as (string | null)[] | undefined;
+      const orTags = jsonMatch?.tags?.$or as (string | null)[] | undefined;
+
+      // get andTagIds
+      if (andTags && andTags.length > 0) {
+        // Deduplicate tags, so the length check against the found tag ids below is meaningful
+        andTags = Array.from(new Set(andTags));
+
+        // "has tag X" and "has no tags" can never both hold
+        if (andTags.includes(null) && andTags.some((tag) => typeof tag === 'string')) {
+          return [];
+        }
+
+        if (andTags.every((tag) => typeof tag === 'string')) {
+          // Get tagId by tag string
+          const andTagIdList = await MongoDatasetCollectionTags.find(
+            {
+              teamId,
+              datasetId: { $in: datasetIds },
+              tag: { $in: andTags }
+            },
+            '_id',
+            {
+              ...readFromSecondary
+            }
+          ).lean();
+
+          // If you enter a tag that does not exist, none will be found
+          if (andTagIdList.length !== andTags.length) return [];
+
+          // Get collectionId by tagId
+          const collections = await MongoDatasetCollection.find(
+            {
+              teamId,
+              datasetId: { $in: datasetIds },
+              tags: { $all: andTagIdList.map((item) => String(item._id)) }
+            },
+            '_id',
+            {
+              ...readFromSecondary
+            }
+          ).lean();
+          tagCollectionIdList = collections.map((item) => String(item._id));
+        } else if (andTags.every((tag) => tag === null)) {
+          // Only null: collections without tags (empty array or missing field)
+          const collections = await MongoDatasetCollection.find(
+            {
+              teamId,
+              datasetId: { $in: datasetIds },
+              $or: [{ tags: { $size: 0 } }, { tags: { $exists: false } }]
+            },
+            '_id',
+            {
+              ...readFromSecondary
+            }
+          ).lean();
+          tagCollectionIdList = collections.map((item) => String(item._id));
+        }
+      } else if (orTags && orTags.length > 0) {
+        // Get tagId by tag string
+        const orTagArray = await MongoDatasetCollectionTags.find(
+          {
+            teamId,
+            datasetId: { $in: datasetIds },
+            tag: { $in: orTags.filter((tag) => tag !== null) }
+          },
+          '_id',
+          { ...readFromSecondary }
+        ).lean();
+        const orTagIds = orTagArray.map((item) => String(item._id));
+
+        // Get collections by tagId
+        const collections = await MongoDatasetCollection.find(
+          {
+            teamId,
+            datasetId: { $in: datasetIds },
+            $or: [
+              { tags: { $in: orTagIds } },
+              // null in $or: also accept collections without tags. A missing `tags` field counts
+              // as "no tags" too, consistent with the $and branch above.
+              ...(orTags.includes(null)
+                ? [{ tags: { $size: 0 } }, { tags: { $exists: false } }]
+                : [])
+            ]
+          },
+          '_id',
+          { ...readFromSecondary }
+        ).lean();
+
+        tagCollectionIdList = collections.map((item) => String(item._id));
+      }
+
+      // time
+      const gteCreateTime = jsonMatch?.createTime?.$gte as string | undefined;
+      const lteCreateTime = jsonMatch?.createTime?.$lte as string | undefined;
+      if (gteCreateTime || lteCreateTime) {
+        const collections = await MongoDatasetCollection.find(
+          {
+            teamId,
+            datasetId: { $in: datasetIds },
+            createTime: {
+              ...(gteCreateTime && { $gte: new Date(gteCreateTime) }),
+              ...(lteCreateTime && {
+                $lte: new Date(lteCreateTime)
+              })
+            }
+          },
+          '_id',
+          { ...readFromSecondary }
+        ).lean();
+        createTimeCollectionIdList = collections.map((item) => String(item._id));
+      }
+
+      // Combine tag and time: intersect when both conditions were given, otherwise use whichever exists
+      if (tagCollectionIdList && createTimeCollectionIdList) {
+        const createTimeIdSet = new Set(createTimeCollectionIdList);
+        return tagCollectionIdList.filter((id) => createTimeIdSet.has(id));
+      }
+      return tagCollectionIdList ?? createTimeCollectionIdList;
+    } catch (error) {
+      // Best effort: an invalid filter or a failed lookup falls back to "no filter" (undefined)
+      // instead of failing the whole search, but the failure is no longer silent.
+      console.error('Failed to resolve collectionFilterMatch', error);
+    }
+  };
  const embeddingRecall = async ({
    query,
    limit,
-    forbidCollectionIdList
+    forbidCollectionIdList,
+    filterCollectionIdList
  }: {
    query: string;
    limit: number;
    forbidCollectionIdList: string[];
+    filterCollectionIdList?: string[];
  }) => {
    const { vectors, tokens } = await getVectorsByText({
      model: getVectorModel(model),
@@ -107,7 +259,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
      datasetIds,
      vector: vectors[0],
      limit,
-      forbidCollectionIdList
+      forbidCollectionIdList,
+      filterCollectionIdList
    });

    // get q and a
@@ -165,10 +318,12 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
  };
  const fullTextRecall = async ({
    query,
-    limit
+    limit,
+    filterCollectionIdList
  }: {
    query: string;
    limit: number;
+    filterCollectionIdList?: string[];
  }): Promise<{
    fullTextRecallResults: SearchDataResponseItemType[];
    tokenLen: number;
@@ -188,7 +343,14 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
              $match: {
                teamId: new Types.ObjectId(teamId),
                datasetId: new Types.ObjectId(id),
-                $text: { $search: jiebaSplit({ text: query }) }
+                $text: { $search: jiebaSplit({ text: query }) },
+                ...(filterCollectionIdList && filterCollectionIdList.length > 0
+                  ? {
+                      collectionId: {
+                        $in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
+                      }
+                    }
+                  : {})
              }
            },
            {
@@ -327,19 +489,24 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
    const fullTextRecallResList: SearchDataResponseItemType[][] = [];
    let totalTokens = 0;

-    const { forbidCollectionIdList } = await getForbidData();
-
+    const [{ forbidCollectionIdList }, filterCollectionIdList] = await Promise.all([
+      getForbidData(),
+      filterCollectionByMetadata()
+    ]);
+    console.log(filterCollectionIdList, '===');
    await Promise.all(
      queries.map(async (query) => {
        const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
          embeddingRecall({
            query,
            limit: embeddingLimit,
-            forbidCollectionIdList
+            forbidCollectionIdList,
+            filterCollectionIdList
          }),
          fullTextRecall({
            query,
-            limit: fullTextLimit
+            limit: fullTextLimit,
+            filterCollectionIdList
          })
        ]);
        totalTokens += tokens;