Collection tag (#2266)

* feat: collection metadata filter (#2211)

* feat: add dataset collection tags (#2231)

* dataset page

* workflow page

* move

* fix

* add plus filter

* fix

* fix

* fix

* perf: collection tag code

* fix: collection tags (#2249)

* fix

* fix

* fix tags of dataset page

* fix tags of workflow page

* doc

* add comments

* fix: collection tags (#2264)

* fix: metadata filter

* feat: search filter

---------

Co-authored-by: heheer <1239331448@qq.com>
Co-authored-by: heheer <heheer@sealos.io>
This commit is contained in:
Archer
2024-08-05 12:08:46 +08:00
committed by GitHub
parent 56f6e69bc7
commit fe71efbbd2
46 changed files with 1914 additions and 112 deletions

View File

@@ -20,6 +20,9 @@ import { hashStr } from '@fastgpt/global/common/string/tools';
import { jiebaSplit } from '../../../common/string/jieba';
import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
import { Types } from '../../../common/mongo';
import json5 from 'json5';
import { MongoDatasetCollectionTags } from '../tag/schema';
import { readFromSecondary } from '../../../common/mongo/utils';
type SearchDatasetDataProps = {
teamId: string;
@@ -31,6 +34,20 @@ type SearchDatasetDataProps = {
usingReRank?: boolean;
reRankQuery: string;
queries: string[];
/*
{
tags: {
$and: ["str1","str2"],
$or: ["str1","str2",null] null means no tags
},
createTime: {
$gte: 'xx',
$lte: 'xxx'
}
}
*/
collectionFilterMatch?: string;
};
export async function searchDatasetData(props: SearchDatasetDataProps) {
@@ -43,7 +60,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
limit: maxTokens,
searchMode = DatasetSearchModeEnum.embedding,
usingReRank = false,
datasetIds = []
datasetIds = [],
collectionFilterMatch
} = props;
/* init params */
@@ -87,14 +105,148 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
forbidCollectionIdList: collections.map((item) => String(item._id))
};
};
/*
Collection metadata filter
标签过滤:
1. and 先生效
2. and 标签和 null 不能共存,否则返回空数组
*/
const filterCollectionByMetadata = async (): Promise<string[] | undefined> => {
if (!collectionFilterMatch || !global.feConfigs.isPlus) return;
let tagCollectionIdList: string[] | undefined = undefined;
let createTimeCollectionIdList: string[] | undefined = undefined;
try {
const jsonMatch = json5.parse(collectionFilterMatch);
// Tag
let andTags = jsonMatch?.tags?.$and as (string | null)[] | undefined;
let orTags = jsonMatch?.tags?.$or as (string | null)[] | undefined;
// get andTagIds
if (andTags && andTags.length > 0) {
// tag 去重
andTags = Array.from(new Set(andTags));
if (andTags.includes(null) && andTags.some((tag) => typeof tag === 'string')) {
return [];
}
if (andTags.every((tag) => typeof tag === 'string')) {
// Get tagId by tag string
const andTagIdList = await MongoDatasetCollectionTags.find(
{
teamId,
datasetId: { $in: datasetIds },
tag: { $in: andTags }
},
'_id',
{
...readFromSecondary
}
).lean();
// If you enter a tag that does not exist, none will be found
if (andTagIdList.length !== andTags.length) return [];
// Get collectionId by tagId
const collections = await MongoDatasetCollection.find(
{
teamId,
datasetId: { $in: datasetIds },
tags: { $all: andTagIdList.map((item) => String(item._id)) }
},
'_id',
{
...readFromSecondary
}
).lean();
tagCollectionIdList = collections.map((item) => String(item._id));
} else if (andTags.every((tag) => tag === null)) {
const collections = await MongoDatasetCollection.find(
{
teamId,
datasetId: { $in: datasetIds },
$or: [{ tags: { $size: 0 } }, { tags: { $exists: false } }]
},
'_id',
{
...readFromSecondary
}
).lean();
tagCollectionIdList = collections.map((item) => String(item._id));
}
} else if (orTags && orTags.length > 0) {
// Get tagId by tag string
const orTagArray = await MongoDatasetCollectionTags.find(
{
teamId,
datasetId: { $in: datasetIds },
tag: { $in: orTags.filter((tag) => tag !== null) }
},
'_id',
{ ...readFromSecondary }
).lean();
const orTagIds = orTagArray.map((item) => String(item._id));
// Get collections by tagId
const collections = await MongoDatasetCollection.find(
{
teamId,
datasetId: { $in: datasetIds },
$or: [
{ tags: { $in: orTagIds } },
...(orTags.includes(null) ? [{ tags: { $size: 0 } }] : [])
]
},
'_id',
{ ...readFromSecondary }
).lean();
tagCollectionIdList = collections.map((item) => String(item._id));
}
// time
const getCreateTime = jsonMatch?.createTime?.$gte as string | undefined;
const lteCreateTime = jsonMatch?.createTime?.$lte as string | undefined;
if (getCreateTime || lteCreateTime) {
const collections = await MongoDatasetCollection.find(
{
teamId,
datasetId: { $in: datasetIds },
createTime: {
...(getCreateTime && { $gte: new Date(getCreateTime) }),
...(lteCreateTime && {
$lte: new Date(lteCreateTime)
})
}
},
'_id'
);
createTimeCollectionIdList = collections.map((item) => String(item._id));
}
// Concat tag and time
if (tagCollectionIdList && createTimeCollectionIdList) {
return tagCollectionIdList.filter((id) => createTimeCollectionIdList!.includes(id));
} else if (tagCollectionIdList) {
return tagCollectionIdList;
} else if (createTimeCollectionIdList) {
return createTimeCollectionIdList;
}
} catch (error) {}
};
const embeddingRecall = async ({
query,
limit,
forbidCollectionIdList
forbidCollectionIdList,
filterCollectionIdList
}: {
query: string;
limit: number;
forbidCollectionIdList: string[];
filterCollectionIdList?: string[];
}) => {
const { vectors, tokens } = await getVectorsByText({
model: getVectorModel(model),
@@ -107,7 +259,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
datasetIds,
vector: vectors[0],
limit,
forbidCollectionIdList
forbidCollectionIdList,
filterCollectionIdList
});
// get q and a
@@ -165,10 +318,12 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
};
const fullTextRecall = async ({
query,
limit
limit,
filterCollectionIdList
}: {
query: string;
limit: number;
filterCollectionIdList?: string[];
}): Promise<{
fullTextRecallResults: SearchDataResponseItemType[];
tokenLen: number;
@@ -188,7 +343,14 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
$match: {
teamId: new Types.ObjectId(teamId),
datasetId: new Types.ObjectId(id),
$text: { $search: jiebaSplit({ text: query }) }
$text: { $search: jiebaSplit({ text: query }) },
...(filterCollectionIdList && filterCollectionIdList.length > 0
? {
collectionId: {
$in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
}
}
: {})
}
},
{
@@ -327,19 +489,24 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
const fullTextRecallResList: SearchDataResponseItemType[][] = [];
let totalTokens = 0;
const { forbidCollectionIdList } = await getForbidData();
const [{ forbidCollectionIdList }, filterCollectionIdList] = await Promise.all([
getForbidData(),
filterCollectionByMetadata()
]);
console.log(filterCollectionIdList, '===');
await Promise.all(
queries.map(async (query) => {
const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
embeddingRecall({
query,
limit: embeddingLimit,
forbidCollectionIdList
forbidCollectionIdList,
filterCollectionIdList
}),
fullTextRecall({
query,
limit: fullTextLimit
limit: fullTextLimit,
filterCollectionIdList
})
]);
totalTokens += tokens;