mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-22 20:37:48 +00:00
Collection tag (#2266)
* feat: collection metadata filter (#2211) * feat: add dataset collection tags (#2231) * dataset page * workflow page * move * fix * add plus filter * fix * fix * fix * perf: collection tag code * fix: collection tags (#2249) * fix * fix * fix tags of dataset page * fix tags of workflow page * doc * add comments * fix: collection tags (#2264) * fix: metadata filter * feat: search filter --------- Co-authored-by: heheer <1239331448@qq.com> Co-authored-by: heheer <heheer@sealos.io>
This commit is contained in:
@@ -20,6 +20,9 @@ import { hashStr } from '@fastgpt/global/common/string/tools';
|
||||
import { jiebaSplit } from '../../../common/string/jieba';
|
||||
import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
|
||||
import { Types } from '../../../common/mongo';
|
||||
import json5 from 'json5';
|
||||
import { MongoDatasetCollectionTags } from '../tag/schema';
|
||||
import { readFromSecondary } from '../../../common/mongo/utils';
|
||||
|
||||
type SearchDatasetDataProps = {
|
||||
teamId: string;
|
||||
@@ -31,6 +34,20 @@ type SearchDatasetDataProps = {
|
||||
usingReRank?: boolean;
|
||||
reRankQuery: string;
|
||||
queries: string[];
|
||||
|
||||
/*
|
||||
{
|
||||
tags: {
|
||||
$and: ["str1","str2"],
|
||||
$or: ["str1","str2",null] null means no tags
|
||||
},
|
||||
createTime: {
|
||||
$gte: 'xx',
|
||||
$lte: 'xxx'
|
||||
}
|
||||
}
|
||||
*/
|
||||
collectionFilterMatch?: string;
|
||||
};
|
||||
|
||||
export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
@@ -43,7 +60,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
limit: maxTokens,
|
||||
searchMode = DatasetSearchModeEnum.embedding,
|
||||
usingReRank = false,
|
||||
datasetIds = []
|
||||
datasetIds = [],
|
||||
collectionFilterMatch
|
||||
} = props;
|
||||
|
||||
/* init params */
|
||||
@@ -87,14 +105,148 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
forbidCollectionIdList: collections.map((item) => String(item._id))
|
||||
};
|
||||
};
|
||||
/*
|
||||
Collection metadata filter
|
||||
标签过滤:
|
||||
1. and 先生效
|
||||
2. and 标签和 null 不能共存,否则返回空数组
|
||||
*/
|
||||
const filterCollectionByMetadata = async (): Promise<string[] | undefined> => {
|
||||
if (!collectionFilterMatch || !global.feConfigs.isPlus) return;
|
||||
|
||||
let tagCollectionIdList: string[] | undefined = undefined;
|
||||
let createTimeCollectionIdList: string[] | undefined = undefined;
|
||||
|
||||
try {
|
||||
const jsonMatch = json5.parse(collectionFilterMatch);
|
||||
|
||||
// Tag
|
||||
let andTags = jsonMatch?.tags?.$and as (string | null)[] | undefined;
|
||||
let orTags = jsonMatch?.tags?.$or as (string | null)[] | undefined;
|
||||
|
||||
// get andTagIds
|
||||
if (andTags && andTags.length > 0) {
|
||||
// tag 去重
|
||||
andTags = Array.from(new Set(andTags));
|
||||
|
||||
if (andTags.includes(null) && andTags.some((tag) => typeof tag === 'string')) {
|
||||
return [];
|
||||
}
|
||||
|
||||
if (andTags.every((tag) => typeof tag === 'string')) {
|
||||
// Get tagId by tag string
|
||||
const andTagIdList = await MongoDatasetCollectionTags.find(
|
||||
{
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
tag: { $in: andTags }
|
||||
},
|
||||
'_id',
|
||||
{
|
||||
...readFromSecondary
|
||||
}
|
||||
).lean();
|
||||
|
||||
// If you enter a tag that does not exist, none will be found
|
||||
if (andTagIdList.length !== andTags.length) return [];
|
||||
|
||||
// Get collectionId by tagId
|
||||
const collections = await MongoDatasetCollection.find(
|
||||
{
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
tags: { $all: andTagIdList.map((item) => String(item._id)) }
|
||||
},
|
||||
'_id',
|
||||
{
|
||||
...readFromSecondary
|
||||
}
|
||||
).lean();
|
||||
tagCollectionIdList = collections.map((item) => String(item._id));
|
||||
} else if (andTags.every((tag) => tag === null)) {
|
||||
const collections = await MongoDatasetCollection.find(
|
||||
{
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
$or: [{ tags: { $size: 0 } }, { tags: { $exists: false } }]
|
||||
},
|
||||
'_id',
|
||||
{
|
||||
...readFromSecondary
|
||||
}
|
||||
).lean();
|
||||
tagCollectionIdList = collections.map((item) => String(item._id));
|
||||
}
|
||||
} else if (orTags && orTags.length > 0) {
|
||||
// Get tagId by tag string
|
||||
const orTagArray = await MongoDatasetCollectionTags.find(
|
||||
{
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
tag: { $in: orTags.filter((tag) => tag !== null) }
|
||||
},
|
||||
'_id',
|
||||
{ ...readFromSecondary }
|
||||
).lean();
|
||||
const orTagIds = orTagArray.map((item) => String(item._id));
|
||||
|
||||
// Get collections by tagId
|
||||
const collections = await MongoDatasetCollection.find(
|
||||
{
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
$or: [
|
||||
{ tags: { $in: orTagIds } },
|
||||
...(orTags.includes(null) ? [{ tags: { $size: 0 } }] : [])
|
||||
]
|
||||
},
|
||||
'_id',
|
||||
{ ...readFromSecondary }
|
||||
).lean();
|
||||
|
||||
tagCollectionIdList = collections.map((item) => String(item._id));
|
||||
}
|
||||
|
||||
// time
|
||||
const getCreateTime = jsonMatch?.createTime?.$gte as string | undefined;
|
||||
const lteCreateTime = jsonMatch?.createTime?.$lte as string | undefined;
|
||||
if (getCreateTime || lteCreateTime) {
|
||||
const collections = await MongoDatasetCollection.find(
|
||||
{
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
createTime: {
|
||||
...(getCreateTime && { $gte: new Date(getCreateTime) }),
|
||||
...(lteCreateTime && {
|
||||
$lte: new Date(lteCreateTime)
|
||||
})
|
||||
}
|
||||
},
|
||||
'_id'
|
||||
);
|
||||
createTimeCollectionIdList = collections.map((item) => String(item._id));
|
||||
}
|
||||
|
||||
// Concat tag and time
|
||||
if (tagCollectionIdList && createTimeCollectionIdList) {
|
||||
return tagCollectionIdList.filter((id) => createTimeCollectionIdList!.includes(id));
|
||||
} else if (tagCollectionIdList) {
|
||||
return tagCollectionIdList;
|
||||
} else if (createTimeCollectionIdList) {
|
||||
return createTimeCollectionIdList;
|
||||
}
|
||||
} catch (error) {}
|
||||
};
|
||||
const embeddingRecall = async ({
|
||||
query,
|
||||
limit,
|
||||
forbidCollectionIdList
|
||||
forbidCollectionIdList,
|
||||
filterCollectionIdList
|
||||
}: {
|
||||
query: string;
|
||||
limit: number;
|
||||
forbidCollectionIdList: string[];
|
||||
filterCollectionIdList?: string[];
|
||||
}) => {
|
||||
const { vectors, tokens } = await getVectorsByText({
|
||||
model: getVectorModel(model),
|
||||
@@ -107,7 +259,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
datasetIds,
|
||||
vector: vectors[0],
|
||||
limit,
|
||||
forbidCollectionIdList
|
||||
forbidCollectionIdList,
|
||||
filterCollectionIdList
|
||||
});
|
||||
|
||||
// get q and a
|
||||
@@ -165,10 +318,12 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
};
|
||||
const fullTextRecall = async ({
|
||||
query,
|
||||
limit
|
||||
limit,
|
||||
filterCollectionIdList
|
||||
}: {
|
||||
query: string;
|
||||
limit: number;
|
||||
filterCollectionIdList?: string[];
|
||||
}): Promise<{
|
||||
fullTextRecallResults: SearchDataResponseItemType[];
|
||||
tokenLen: number;
|
||||
@@ -188,7 +343,14 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
$match: {
|
||||
teamId: new Types.ObjectId(teamId),
|
||||
datasetId: new Types.ObjectId(id),
|
||||
$text: { $search: jiebaSplit({ text: query }) }
|
||||
$text: { $search: jiebaSplit({ text: query }) },
|
||||
...(filterCollectionIdList && filterCollectionIdList.length > 0
|
||||
? {
|
||||
collectionId: {
|
||||
$in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
|
||||
}
|
||||
}
|
||||
: {})
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -327,19 +489,24 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
const fullTextRecallResList: SearchDataResponseItemType[][] = [];
|
||||
let totalTokens = 0;
|
||||
|
||||
const { forbidCollectionIdList } = await getForbidData();
|
||||
|
||||
const [{ forbidCollectionIdList }, filterCollectionIdList] = await Promise.all([
|
||||
getForbidData(),
|
||||
filterCollectionByMetadata()
|
||||
]);
|
||||
console.log(filterCollectionIdList, '===');
|
||||
await Promise.all(
|
||||
queries.map(async (query) => {
|
||||
const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
|
||||
embeddingRecall({
|
||||
query,
|
||||
limit: embeddingLimit,
|
||||
forbidCollectionIdList
|
||||
forbidCollectionIdList,
|
||||
filterCollectionIdList
|
||||
}),
|
||||
fullTextRecall({
|
||||
query,
|
||||
limit: fullTextLimit
|
||||
limit: fullTextLimit,
|
||||
filterCollectionIdList
|
||||
})
|
||||
]);
|
||||
totalTokens += tokens;
|
||||
|
Reference in New Issue
Block a user