mirror of
https://github.com/labring/FastGPT.git
synced 2026-05-07 01:02:55 +08:00
76d6234de6
* Agent features (#6345) * Test agent (#6220) * squash: compress all commits into one * feat: plan response in ui * response ui * perf: agent config * merge * tool select ux * perf: chat ui * perf: agent editform * tmp code * feat: save chat * Complete agent parent (#6049) * add role and tools filling * add: file-upload --------- Co-authored-by: xxyyh <2289112474@qq> * perf: top agent code * top agent (#6062) Co-authored-by: xxyyh <2289112474@qq> * fix: ts * skill editor ui * ui * perf: rewrite type with zod * skill edit ui * skill agent (#6089) * cp skill chat * rebasefdf933dand add skill chat * 1. skill 的 CRUD 2. skill 的信息渲染到前端界面 * solve comment * remove chatid and chatItemId * skill match * perf: skill manage * fix: ts --------- Co-authored-by: xxyyh <2289112474@qq> Co-authored-by: archer <545436317@qq.com> * fix: ts * fix: loop import * skill tool config (#6114) Co-authored-by: xxyyh <2289112474@qq> * feat: load tool in agent * skill memory (#6126) Co-authored-by: xxyyh <2289112474@qq> * perf: agent skill editor * perf: helperbot ui * agent code * perf: context * fix: request context * agent usage * perf: agent context and pause * perf: plan response * Test agent sigle skill (#6184) * feat:top box fill * prompt fix --------- Co-authored-by: xxyyh <2289112474@qq> * perf: agent chat ui * Test agent new (#6219) * have-replan * agent --------- Co-authored-by: xxyyh <2289112474@qq> * fix: ts --------- Co-authored-by: YeYuheng <57035043+YYH211@users.noreply.github.com> Co-authored-by: xxyyh <2289112474@qq> * feat: consolidate agent and MCP improvements This commit consolidates 17 commits including: - MCP tools enhancements and fixes - Agent system improvements and optimizations - Auth limit and prompt updates - Tool response compression and error tracking - Simple app adaptation - Code quality improvements (TypeScript, ESLint, Zod) - Version type migration to schema - Remove deprecated useRequest2 - Add LLM error tracking - Toolset ID validation fixes --------- 
Co-authored-by: YeYuheng <57035043+YYH211@users.noreply.github.com> Co-authored-by: xxyyh <2289112474@qq> * fix: transform avatar copy;perf: filter invalid tool * update llm response storage time * fix: openapi schema * update skill desc * feat: cache hit data * i18n * lock * chat logs support error filter & user search (#6373) * chat log support searching by user name * support error filter * fix * fix overflow * optimize * fix init script * fix * perf: get log users * updat ecomment * fix: ts * fix: test --------- Co-authored-by: archer <545436317@qq.com> * Fix: agent (#6376) * Agent features (#6345) * Test agent (#6220) * squash: compress all commits into one * feat: plan response in ui * response ui * perf: agent config * merge * tool select ux * perf: chat ui * perf: agent editform * tmp code * feat: save chat * Complete agent parent (#6049) * add role and tools filling * add: file-upload --------- Co-authored-by: xxyyh <2289112474@qq> * perf: top agent code * top agent (#6062) Co-authored-by: xxyyh <2289112474@qq> * fix: ts * skill editor ui * ui * perf: rewrite type with zod * skill edit ui * skill agent (#6089) * cp skill chat * rebasefdf933dand add skill chat * 1. skill 的 CRUD 2. 
skill 的信息渲染到前端界面 * solve comment * remove chatid and chatItemId * skill match * perf: skill manage * fix: ts --------- Co-authored-by: xxyyh <2289112474@qq> Co-authored-by: archer <545436317@qq.com> * fix: ts * fix: loop import * skill tool config (#6114) Co-authored-by: xxyyh <2289112474@qq> * feat: load tool in agent * skill memory (#6126) Co-authored-by: xxyyh <2289112474@qq> * perf: agent skill editor * perf: helperbot ui * agent code * perf: context * fix: request context * agent usage * perf: agent context and pause * perf: plan response * Test agent sigle skill (#6184) * feat:top box fill * prompt fix --------- Co-authored-by: xxyyh <2289112474@qq> * perf: agent chat ui * Test agent new (#6219) * have-replan * agent --------- Co-authored-by: xxyyh <2289112474@qq> * fix: ts --------- Co-authored-by: YeYuheng <57035043+YYH211@users.noreply.github.com> Co-authored-by: xxyyh <2289112474@qq> * feat: consolidate agent and MCP improvements This commit consolidates 17 commits including: - MCP tools enhancements and fixes - Agent system improvements and optimizations - Auth limit and prompt updates - Tool response compression and error tracking - Simple app adaptation - Code quality improvements (TypeScript, ESLint, Zod) - Version type migration to schema - Remove deprecated useRequest2 - Add LLM error tracking - Toolset ID validation fixes --------- Co-authored-by: YeYuheng <57035043+YYH211@users.noreply.github.com> Co-authored-by: xxyyh <2289112474@qq> * 1. 把辅助生成前端上的 system prompt 加入到上下文中 2. mcp工具的前端渲染(图标) 3. 文件读取工具和文件上传进行关联 4. 添加了辅助生成返回格式出错的重试方案 5. ask 不出现在 plan 步骤中 6. 
添加了辅助生成的头像和交互 UI * fix:read_file * helperbot ui * ts error * helper ui * delete Unused import * perf: helper bot * lock --------- Co-authored-by: Archer <545436317@qq.com> Co-authored-by: xxyyh <2289112474@qq> * fix date variable required & model auth (#6386) * fix date variable required & model auth * doc * feat: add chat id to finish callback * fix: iphone safari shareId (#6387) * fix: iphone safari shareId * fix: mcp file list can't setting * fix: reason output field * fix: skip JSON validation for HTTP tool body with variable (#6392) * fix: skip JSON validation for HTTP tool body with variable * doc * workflow fitview * perf: selecting memory * perf: cp api * ui * perf: toolcall auto adapt * fix: catch workflow error * fix: ts * perf: pagination type * remove * ignore * update doc * fix: simple app tool select * add default avatar to logs user * perf: loading user * select dataset ui * rename version * feat: add global/common test * perf: packages/global/common test * feat: package/global/ai,app test * add global/chat test * global/core test * global/core test * feat: packages/global all test * perf: test * add server api test * perf: init shell * perf: init4150 shell * remove invalid code * update doc * remove log * fix: chat effect * fix: plan fake tool (#6398) * 1. 提示词防注入功能 2. 
无工具不进入 plan,防止虚拟工具生成 * Agent-dataset * dataset * dataset presetInfo * prefix * perf: prompt --------- Co-authored-by: xxyyh <2289112474@qq> Co-authored-by: archer <545436317@qq.com> * fix: review * adapt kimi2.5 think toolcall * feat: invoke fastgpt user info (#6403) feat: invoke fastgpt user info * fix: invoke fastgpt user info return orgs (#6404) * skill and version * retry helperbot (#6405) Co-authored-by: xxyyh <2289112474@qq> * update template * remove log * doc * update doc * doc * perf: internal ip check * adapt get paginationRecords * tool call adapt * fix: test * doc * fix: agent initial version * adapt completions v1 * feat: instrumentation check * rename skill * add workflow demo mode tracks (#6407) * chore: 统一 skills 目录命名为小写 将 .claude/Skills/ 重命名为 .claude/skills/ 以保持命名一致性。 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * add workflow demo mode tracks * code * optimize * fix: improve workflowDemoTrack based on PR review - Add comment to empty catch block for maintainability - Add @param docs to onDemoChange clarifying nodeCount usage - Replace silent .catch with console.debug for dev debugging - Handle appId changes by reporting old data before re-init Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: archer <545436317@qq.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> * remove repeat skill * fix(workflow): filter out orphan edges to prevent runtime errors (#6399) * fix(workflow): filter out orphan edges to prevent runtime errors Runtime edges that reference non-existent nodes (orphan edges) can cause unexpected behavior or crashes during workflow dispatch. This change adds a pre-check to filter out such edges before execution begins, ensuring system stability even with inconsistent graph data. 
* fix(workflow): enhance orphan edge filtering with logging and tests - Refactor: Extract logic to 'filterOrphanEdges' in utils.ts for better reusability - Feat: Add performance monitoring (warn if >100ms) and comprehensive logging - Feat: Support detailed edge inspection in debug mode - Docs: Add JSDoc explaining causes of orphan edges (migration, manual edits) - Test: Add unit tests covering edge cases and performance (1000 edges) Addresses PR review feedback regarding logging, variable naming, and testing." * move code * move code * add more unit test --------- Co-authored-by: archer <545436317@qq.com> * test * perf: test * add server/common/string test * fix: resolve $ref references in MCP tool input schemas (#6395) (#6409) * fix: resolve $ref references in MCP tool input schemas (#6395) * add test code --------- Co-authored-by: archer <545436317@qq.com> * chore(docs): add fastgpt, fastgpt-plugin version choice guide (#6411) * chore(doc): add fastgpt version description * doc * doc --------- Co-authored-by: archer <545436317@qq.com> * fix:dataset cite and description info (#6410) * 1. 添加知识库引用(plan 步骤和直接知识库调用) 2. 提示词框中的@知识库工具 3. 
plan 中 step 的 description dataset_search 改为中文 * fix: i18n * prompt * prompt --------- Co-authored-by: xxyyh <2289112474@qq> * fix: tool call * perf: workflow props * fix: merge ECharts toolbox options instead of overwriting (#6269) (#6412) * feat: integrate logtape and otel (#6400) * fix: deps * feat(logger): integrate logtape and otel * wip(log): add basic infras logs * wip(log): add request id and inject it into context * wip(log): add basic tx logs * wip(log): migrate * wip(log): category * wip(log): more sub category * fix: type * fix: sessionRun * fix: export getLogger from client.ts * chore: improve logs * docs: update signoz and changelog * change type * fix: ts * remove skill.md * fix: lockfile specifier * fix: test --------- Co-authored-by: archer <545436317@qq.com> * init log * doc * remove invalid log * fix: review * template * replace new log * fix: ts * remove log * chore: migrate all addLog to logtape * move skill * chore: migrate all addLog to logtape (#6417) * update skill * remove log * fix: tool check --------- Co-authored-by: YeYuheng <57035043+YYH211@users.noreply.github.com> Co-authored-by: xxyyh <2289112474@qq> Co-authored-by: heheer <heheer@sealos.io> Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: xuyafei1996 <54217479+xuyafei1996@users.noreply.github.com> Co-authored-by: ToukoYui <2331631097@qq.com> Co-authored-by: roy <whoeverimf5@gmail.com>
1000 lines
29 KiB
TypeScript
1000 lines
29 KiB
TypeScript
import {
|
|
DatasetSearchModeEnum,
|
|
DatasetSearchModeMap,
|
|
SearchScoreTypeEnum
|
|
} from '@fastgpt/global/core/dataset/constants';
|
|
import { recallFromVectorStore } from '../../../common/vectorDB/controller';
|
|
import { getVectorsByText } from '../../ai/embedding';
|
|
import { getEmbeddingModel, getDefaultRerankModel, getLLMModel } from '../../ai/model';
|
|
import { MongoDatasetData } from '../data/schema';
|
|
import type {
|
|
DatasetCollectionSchemaType,
|
|
DatasetDataSchemaType
|
|
} from '@fastgpt/global/core/dataset/type';
|
|
import {
|
|
type DatasetDataTextSchemaType,
|
|
type SearchDataResponseItemType
|
|
} from '@fastgpt/global/core/dataset/type';
|
|
import { MongoDatasetCollection } from '../collection/schema';
|
|
import { reRankRecall } from '../../../core/ai/rerank';
|
|
import { countPromptTokens } from '../../../common/string/tiktoken/index';
|
|
import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
|
|
import { hashStr } from '@fastgpt/global/common/string/tools';
|
|
import { jiebaSplit } from '../../../common/string/jieba/index';
|
|
import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
|
|
import { Types } from '../../../common/mongo';
|
|
import json5 from 'json5';
|
|
import { MongoDatasetCollectionTags } from '../tag/schema';
|
|
import { computeFilterIntersection } from './utils';
|
|
import { readFromSecondary } from '../../../common/mongo/utils';
|
|
import { MongoDatasetDataText } from '../data/dataTextSchema';
|
|
import { type ChatItemType } from '@fastgpt/global/core/chat/type';
|
|
import type { NodeInputKeyEnum } from '@fastgpt/global/core/workflow/constants';
|
|
import { datasetSearchQueryExtension } from './utils';
|
|
import type { RerankModelItemType } from '@fastgpt/global/core/ai/model.schema';
|
|
import { formatDatasetDataValue } from '../data/controller';
|
|
import { pushTrack } from '../../../common/middle/tracks/utils';
|
|
import { replaceS3KeyToPreviewUrl } from '../../../core/dataset/utils';
|
|
import { addDays, addHours } from 'date-fns';
|
|
import { getLogger, LogCategories } from '../../../common/logger';
|
|
|
|
const logger = getLogger(LogCategories.MODULE.DATASET.DATA);
|
|
|
|
// Input parameters for searchDatasetData (dataset recall + rerank pipeline).
export type SearchDatasetDataProps = {
  histories: ChatItemType[]; // chat history, used by query-extension upstream
  teamId: string;
  uid?: string;
  tmbId?: string;
  model: string; // embedding model name, resolved via getEmbeddingModel
  datasetIds: string[]; // datasets to search across
  reRankQuery: string; // single query string fed to the rerank model
  queries: string[]; // one or more recall queries (multi-query recall)

  [NodeInputKeyEnum.datasetSimilarity]?: number; // min distance
  [NodeInputKeyEnum.datasetMaxTokens]: number; // max Token limit
  [NodeInputKeyEnum.datasetSearchMode]?: `${DatasetSearchModeEnum}`;
  [NodeInputKeyEnum.datasetSearchEmbeddingWeight]?: number;

  [NodeInputKeyEnum.datasetSearchUsingReRank]?: boolean;
  [NodeInputKeyEnum.datasetSearchRerankModel]?: RerankModelItemType;
  [NodeInputKeyEnum.datasetSearchRerankWeight]?: number;

  /*
    Collection metadata filter, as a JSON5 string (or object). Shape:
    {
      tags: {
        $and: ["str1","str2"],
        $or: ["str1","str2",null] null means no tags
      },
      createTime: {
        $gte: 'xx',
        $lte: 'xxx'
      }
    }
  */
  collectionFilterMatch?: string;
};
|
|
|
|
// Result of searchDatasetData.
export type SearchDatasetDataResponse = {
  searchRes: SearchDataResponseItemType[]; // final deduped, filtered results
  embeddingTokens: number; // tokens consumed embedding the recall queries
  reRankInputTokens: number; // tokens consumed by the rerank model input
  searchMode: `${DatasetSearchModeEnum}`; // effective mode after validation
  limit: number; // effective max-token limit used
  similarity: number; // effective similarity threshold used
  usingReRank: boolean; // whether rerank actually ran (may be disabled on error)
  usingSimilarityFilter: boolean; // whether the similarity threshold filtered results

  // Present when LLM query extension ran; token/model accounting for billing
  queryExtensionResult?: {
    llmModel: string;
    embeddingModel: string;
    inputTokens: number;
    outputTokens: number;
    embeddingTokens: number;
    query: string;
  };
  // Present when deep-search ran; token/model accounting for billing
  deepSearchResult?: { model: string; inputTokens: number; outputTokens: number };
};
|
|
|
|
export const datasetDataReRank = async ({
|
|
rerankModel,
|
|
data,
|
|
query
|
|
}: {
|
|
rerankModel?: RerankModelItemType;
|
|
data: SearchDataResponseItemType[];
|
|
query: string;
|
|
}): Promise<{
|
|
results: SearchDataResponseItemType[];
|
|
inputTokens: number;
|
|
}> => {
|
|
const { results, inputTokens } = await reRankRecall({
|
|
model: rerankModel,
|
|
query,
|
|
documents: data.map((item) => ({
|
|
id: item.id,
|
|
text: `${item.q}\n${item.a}`
|
|
}))
|
|
});
|
|
|
|
if (results.length === 0) {
|
|
return Promise.reject('Rerank error');
|
|
}
|
|
|
|
// add new score to data
|
|
const mergeResult = results
|
|
.map((item, index) => {
|
|
const target = data.find((dataItem) => dataItem.id === item.id);
|
|
if (!target) return null;
|
|
const score = item.score || 0;
|
|
|
|
return {
|
|
...target,
|
|
score: [{ type: SearchScoreTypeEnum.reRank, value: score, index }]
|
|
};
|
|
})
|
|
.filter(Boolean) as SearchDataResponseItemType[];
|
|
|
|
return {
|
|
results: mergeResult,
|
|
inputTokens
|
|
};
|
|
};
|
|
export const filterDatasetDataByMaxTokens = async (
|
|
data: SearchDataResponseItemType[],
|
|
maxTokens: number
|
|
) => {
|
|
const filterMaxTokensResult = await (async () => {
|
|
// Count tokens
|
|
const tokensScoreFilter = await Promise.all(
|
|
data.map(async (item) => ({
|
|
...item,
|
|
tokens: await countPromptTokens(item.q + item.a)
|
|
}))
|
|
);
|
|
|
|
const results: SearchDataResponseItemType[] = [];
|
|
let totalTokens = 0;
|
|
|
|
for await (const item of tokensScoreFilter) {
|
|
results.push(item);
|
|
|
|
totalTokens += item.tokens;
|
|
|
|
if (totalTokens > maxTokens) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
return results.length === 0 ? data.slice(0, 1) : results;
|
|
})();
|
|
|
|
return filterMaxTokensResult;
|
|
};
|
|
|
|
export async function searchDatasetData(
|
|
props: SearchDatasetDataProps
|
|
): Promise<SearchDatasetDataResponse> {
|
|
let {
|
|
teamId,
|
|
reRankQuery,
|
|
queries,
|
|
model,
|
|
similarity = 0,
|
|
limit: maxTokens,
|
|
searchMode = DatasetSearchModeEnum.embedding,
|
|
embeddingWeight = 0.5,
|
|
usingReRank = false,
|
|
rerankModel,
|
|
rerankWeight = 0.5,
|
|
datasetIds = [],
|
|
collectionFilterMatch
|
|
} = props;
|
|
|
|
// Constants data
|
|
const datasetDataSelectField =
|
|
'_id datasetId collectionId updateTime q a imageId imageDescMap chunkIndex indexes';
|
|
const datsaetCollectionSelectField =
|
|
'_id name fileId rawLink apiFileId externalFileId externalFileUrl';
|
|
|
|
/* init params */
|
|
searchMode = DatasetSearchModeMap[searchMode] ? searchMode : DatasetSearchModeEnum.embedding;
|
|
usingReRank = usingReRank && !!getDefaultRerankModel();
|
|
|
|
// Compatible with topk limit
|
|
let set = new Set<string>();
|
|
let usingSimilarityFilter = false;
|
|
|
|
/* function */
|
|
const countRecallLimit = () => {
|
|
if (searchMode === DatasetSearchModeEnum.embedding) {
|
|
return {
|
|
embeddingLimit: 100,
|
|
fullTextLimit: 0
|
|
};
|
|
}
|
|
if (searchMode === DatasetSearchModeEnum.fullTextRecall) {
|
|
return {
|
|
embeddingLimit: 0,
|
|
fullTextLimit: 100
|
|
};
|
|
}
|
|
return {
|
|
embeddingLimit: 80,
|
|
fullTextLimit: 60
|
|
};
|
|
};
|
|
const getForbidData = async () => {
|
|
const collections = await MongoDatasetCollection.find(
|
|
{
|
|
teamId,
|
|
datasetId: { $in: datasetIds },
|
|
forbid: true
|
|
},
|
|
'_id'
|
|
);
|
|
|
|
return {
|
|
forbidCollectionIdList: collections.map((item) => String(item._id))
|
|
};
|
|
};
|
|
|
|
/*
|
|
Collection metadata filter
|
|
标签过滤:
|
|
1. and 先生效
|
|
2. and 标签和 null 不能共存,否则返回空数组
|
|
*/
|
|
const filterCollectionByMetadata = async (): Promise<string[] | undefined> => {
|
|
const getAllCollectionIds = async ({
|
|
parentCollectionIds
|
|
}: {
|
|
parentCollectionIds?: string[];
|
|
}): Promise<string[] | undefined> => {
|
|
if (!parentCollectionIds) return;
|
|
if (parentCollectionIds.length === 0) {
|
|
return [];
|
|
}
|
|
|
|
const collections = await MongoDatasetCollection.find(
|
|
{
|
|
teamId,
|
|
datasetId: { $in: datasetIds },
|
|
_id: { $in: parentCollectionIds }
|
|
},
|
|
'_id type',
|
|
{
|
|
...readFromSecondary
|
|
}
|
|
).lean();
|
|
|
|
const resultIds = new Set<string>();
|
|
collections.forEach((item) => {
|
|
if (item.type !== 'folder') {
|
|
resultIds.add(String(item._id));
|
|
}
|
|
});
|
|
|
|
const folderIds = collections
|
|
.filter((item) => item.type === 'folder')
|
|
.map((item) => String(item._id));
|
|
|
|
// Get all child collection ids
|
|
if (folderIds.length) {
|
|
const childCollections = await MongoDatasetCollection.find(
|
|
{
|
|
teamId,
|
|
datasetId: { $in: datasetIds },
|
|
parentId: { $in: folderIds }
|
|
},
|
|
'_id type',
|
|
{
|
|
...readFromSecondary
|
|
}
|
|
).lean();
|
|
|
|
const childIds = await getAllCollectionIds({
|
|
parentCollectionIds: childCollections.map((item) => String(item._id))
|
|
});
|
|
|
|
childIds?.forEach((id) => resultIds.add(id));
|
|
}
|
|
|
|
return Array.from(resultIds);
|
|
};
|
|
|
|
if (!collectionFilterMatch || !global.feConfigs.isPlus) return;
|
|
|
|
let tagCollectionIdList: string[] | undefined = undefined;
|
|
let createTimeCollectionIdList: string[] | undefined = undefined;
|
|
let inputCollectionIdList: string[] | undefined = undefined;
|
|
|
|
try {
|
|
const jsonMatch =
|
|
typeof collectionFilterMatch === 'object'
|
|
? collectionFilterMatch
|
|
: json5.parse(collectionFilterMatch);
|
|
|
|
const andTags = jsonMatch?.tags?.$and as (string | null)[] | undefined;
|
|
const orTags = jsonMatch?.tags?.$or as (string | null)[] | undefined;
|
|
|
|
if (andTags && andTags.length > 0) {
|
|
const uniqueAndTags = Array.from(new Set(andTags));
|
|
if (uniqueAndTags.includes(null) && uniqueAndTags.some((tag) => typeof tag === 'string')) {
|
|
return [];
|
|
}
|
|
if (uniqueAndTags.every((tag) => typeof tag === 'string')) {
|
|
const matchedTags = await MongoDatasetCollectionTags.find(
|
|
{
|
|
teamId,
|
|
datasetId: { $in: datasetIds },
|
|
tag: { $in: uniqueAndTags as string[] }
|
|
},
|
|
'_id datasetId tag',
|
|
{ ...readFromSecondary }
|
|
).lean();
|
|
|
|
// Group tags by dataset
|
|
const datasetTagMap = new Map<string, { tagIds: string[]; tagNames: Set<string> }>();
|
|
|
|
matchedTags.forEach((tag) => {
|
|
const datasetId = String(tag.datasetId);
|
|
if (!datasetTagMap.has(datasetId)) {
|
|
datasetTagMap.set(datasetId, {
|
|
tagIds: [],
|
|
tagNames: new Set()
|
|
});
|
|
}
|
|
|
|
const datasetData = datasetTagMap.get(datasetId)!;
|
|
datasetData.tagIds.push(String(tag._id));
|
|
datasetData.tagNames.add(tag.tag);
|
|
});
|
|
|
|
const validDatasetIds = Array.from(datasetTagMap.entries())
|
|
.filter(([_, data]) => uniqueAndTags.every((tag) => data.tagNames.has(tag as string)))
|
|
.map(([datasetId]) => datasetId);
|
|
|
|
if (validDatasetIds.length === 0) return [];
|
|
|
|
const collectionsPromises = validDatasetIds.map((datasetId) => {
|
|
const { tagIds } = datasetTagMap.get(datasetId)!;
|
|
return MongoDatasetCollection.find(
|
|
{
|
|
teamId,
|
|
datasetId,
|
|
tags: { $all: tagIds }
|
|
},
|
|
'_id',
|
|
{ ...readFromSecondary }
|
|
).lean();
|
|
});
|
|
|
|
const collectionsResults = await Promise.all(collectionsPromises);
|
|
tagCollectionIdList = collectionsResults.flat().map((item) => String(item._id));
|
|
} else if (uniqueAndTags.every((tag) => tag === null)) {
|
|
const collections = await MongoDatasetCollection.find(
|
|
{
|
|
teamId,
|
|
datasetId: { $in: datasetIds },
|
|
$or: [{ tags: { $size: 0 } }, { tags: { $exists: false } }]
|
|
},
|
|
'_id',
|
|
{ ...readFromSecondary }
|
|
).lean();
|
|
tagCollectionIdList = collections.map((item) => String(item._id));
|
|
}
|
|
} else if (orTags && orTags.length > 0) {
|
|
// Get tagId by tag string
|
|
const orTagArray = await MongoDatasetCollectionTags.find(
|
|
{
|
|
teamId,
|
|
datasetId: { $in: datasetIds },
|
|
tag: { $in: orTags.filter((tag) => tag !== null) }
|
|
},
|
|
'_id',
|
|
{ ...readFromSecondary }
|
|
).lean();
|
|
const orTagIds = orTagArray.map((item) => String(item._id));
|
|
|
|
// Get collections by tagId
|
|
const collections = await MongoDatasetCollection.find(
|
|
{
|
|
teamId,
|
|
datasetId: { $in: datasetIds },
|
|
$or: [
|
|
{ tags: { $in: orTagIds } },
|
|
...(orTags.includes(null) ? [{ tags: { $size: 0 } }] : [])
|
|
]
|
|
},
|
|
'_id',
|
|
{ ...readFromSecondary }
|
|
).lean();
|
|
|
|
tagCollectionIdList = collections.map((item) => String(item._id));
|
|
}
|
|
|
|
// time
|
|
const getCreateTime = jsonMatch?.createTime?.$gte as string | undefined;
|
|
const lteCreateTime = jsonMatch?.createTime?.$lte as string | undefined;
|
|
if (getCreateTime || lteCreateTime) {
|
|
const collections = await MongoDatasetCollection.find(
|
|
{
|
|
teamId,
|
|
datasetId: { $in: datasetIds },
|
|
createTime: {
|
|
...(getCreateTime && { $gte: new Date(getCreateTime) }),
|
|
...(lteCreateTime && {
|
|
$lte: new Date(lteCreateTime)
|
|
})
|
|
}
|
|
},
|
|
'_id'
|
|
);
|
|
createTimeCollectionIdList = collections.map((item) => String(item._id));
|
|
}
|
|
|
|
// collectionIds
|
|
const inputCollectionIds = jsonMatch?.collectionIds as string[] | undefined;
|
|
if (Array.isArray(inputCollectionIds) && inputCollectionIds.length > 0) {
|
|
inputCollectionIdList = await getAllCollectionIds({
|
|
parentCollectionIds: inputCollectionIds
|
|
});
|
|
if (inputCollectionIdList && inputCollectionIdList.length === 0) {
|
|
return [];
|
|
}
|
|
}
|
|
|
|
// Concat tag, time and collectionIds
|
|
const collectionIds = computeFilterIntersection([
|
|
tagCollectionIdList,
|
|
createTimeCollectionIdList,
|
|
inputCollectionIdList
|
|
]);
|
|
|
|
return await getAllCollectionIds({
|
|
parentCollectionIds: collectionIds
|
|
});
|
|
} catch (error) {}
|
|
};
|
|
const embeddingRecall = async ({
|
|
queries,
|
|
limit,
|
|
forbidCollectionIdList,
|
|
filterCollectionIdList
|
|
}: {
|
|
queries: string[];
|
|
limit: number;
|
|
forbidCollectionIdList: string[];
|
|
filterCollectionIdList?: string[];
|
|
}): Promise<{
|
|
embeddingRecallResults: SearchDataResponseItemType[][];
|
|
tokens: number;
|
|
}> => {
|
|
if (limit === 0) {
|
|
return {
|
|
embeddingRecallResults: [],
|
|
tokens: 0
|
|
};
|
|
}
|
|
|
|
const { vectors, tokens } = await getVectorsByText({
|
|
model: getEmbeddingModel(model),
|
|
input: queries,
|
|
type: 'query'
|
|
});
|
|
|
|
const recallResults = await Promise.all(
|
|
vectors.map(async (vector) => {
|
|
return await recallFromVectorStore({
|
|
teamId,
|
|
datasetIds,
|
|
vector,
|
|
limit,
|
|
forbidCollectionIdList,
|
|
filterCollectionIdList
|
|
});
|
|
})
|
|
);
|
|
|
|
// Get data and collections
|
|
const collectionIdList = Array.from(
|
|
new Set(recallResults.map((item) => item.results.map((item) => item.collectionId)).flat())
|
|
);
|
|
const indexDataIds = Array.from(
|
|
new Set(recallResults.map((item) => item.results.map((item) => item.id?.trim())).flat())
|
|
);
|
|
|
|
const [dataMaps, collectionMaps] = await Promise.all([
|
|
MongoDatasetData.find(
|
|
{
|
|
teamId,
|
|
datasetId: { $in: datasetIds },
|
|
collectionId: { $in: collectionIdList },
|
|
'indexes.dataId': { $in: indexDataIds }
|
|
},
|
|
datasetDataSelectField,
|
|
{ ...readFromSecondary }
|
|
)
|
|
.lean()
|
|
.then((res) => {
|
|
const map = new Map<string, DatasetDataSchemaType>();
|
|
|
|
res.forEach((item) => {
|
|
item.indexes.forEach((index) => {
|
|
map.set(String(index.dataId), item);
|
|
});
|
|
});
|
|
|
|
return map;
|
|
}),
|
|
MongoDatasetCollection.find(
|
|
{
|
|
_id: { $in: collectionIdList }
|
|
},
|
|
datsaetCollectionSelectField,
|
|
{ ...readFromSecondary }
|
|
)
|
|
.lean()
|
|
.then((res) => {
|
|
const map = new Map<string, DatasetCollectionSchemaType>();
|
|
|
|
res.forEach((item) => {
|
|
map.set(String(item._id), item);
|
|
});
|
|
|
|
return map;
|
|
})
|
|
]);
|
|
|
|
const embeddingRecallResults = recallResults.map((item) => {
|
|
const set = new Set<string>();
|
|
return (
|
|
item.results
|
|
.map((item, index) => {
|
|
const collection = collectionMaps.get(String(item.collectionId));
|
|
if (!collection) {
|
|
logger.warn('Dataset collection not found during recall', {
|
|
collectionId: item.collectionId,
|
|
dataId: item.id
|
|
});
|
|
return;
|
|
}
|
|
|
|
const data = dataMaps.get(String(item.id));
|
|
if (!data) {
|
|
logger.warn('Dataset data not found during recall', {
|
|
dataId: item.id,
|
|
collectionId: item.collectionId
|
|
});
|
|
return;
|
|
}
|
|
|
|
const result: SearchDataResponseItemType = {
|
|
id: String(data._id),
|
|
updateTime: data.updateTime,
|
|
...formatDatasetDataValue({
|
|
q: data.q,
|
|
a: data.a,
|
|
imageId: data.imageId,
|
|
imageDescMap: data.imageDescMap
|
|
}),
|
|
chunkIndex: data.chunkIndex,
|
|
datasetId: String(data.datasetId),
|
|
collectionId: String(data.collectionId),
|
|
...getCollectionSourceData(collection),
|
|
score: [{ type: SearchScoreTypeEnum.embedding, value: item?.score || 0, index }]
|
|
};
|
|
|
|
return result;
|
|
})
|
|
// 多个向量对应一个数据,每一路召回,保障数据只有一份,并且取最高排名
|
|
.filter((item) => {
|
|
if (!item) return false;
|
|
if (set.has(item.id)) return false;
|
|
set.add(item.id);
|
|
return true;
|
|
})
|
|
.map((item, index) => {
|
|
return {
|
|
...item!,
|
|
score: item!.score.map((item) => ({ ...item, index }))
|
|
};
|
|
}) as SearchDataResponseItemType[]
|
|
);
|
|
});
|
|
|
|
return {
|
|
embeddingRecallResults,
|
|
tokens
|
|
};
|
|
};
|
|
const fullTextRecall = async ({
|
|
queries,
|
|
limit,
|
|
filterCollectionIdList,
|
|
forbidCollectionIdList
|
|
}: {
|
|
queries: string[];
|
|
limit: number;
|
|
filterCollectionIdList?: string[];
|
|
forbidCollectionIdList: string[];
|
|
}): Promise<{
|
|
fullTextRecallResults: SearchDataResponseItemType[][];
|
|
}> => {
|
|
if (limit === 0) {
|
|
return {
|
|
fullTextRecallResults: []
|
|
};
|
|
}
|
|
|
|
const recallResults = await Promise.all(
|
|
queries.map(async (query) => {
|
|
return (await MongoDatasetDataText.aggregate(
|
|
[
|
|
{
|
|
$match: {
|
|
teamId: new Types.ObjectId(teamId),
|
|
$text: { $search: await jiebaSplit({ text: query }) },
|
|
datasetId: { $in: datasetIds.map((id) => new Types.ObjectId(id)) },
|
|
...(filterCollectionIdList
|
|
? {
|
|
collectionId: {
|
|
$in: filterCollectionIdList
|
|
.filter((id) => !forbidCollectionIdList.includes(id))
|
|
.map((id) => new Types.ObjectId(id))
|
|
}
|
|
}
|
|
: forbidCollectionIdList?.length
|
|
? {
|
|
collectionId: {
|
|
$nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id))
|
|
}
|
|
}
|
|
: {})
|
|
}
|
|
},
|
|
{
|
|
$sort: {
|
|
score: { $meta: 'textScore' }
|
|
}
|
|
},
|
|
{
|
|
$limit: limit
|
|
},
|
|
{
|
|
$project: {
|
|
_id: 1,
|
|
collectionId: 1,
|
|
dataId: 1,
|
|
score: { $meta: 'textScore' }
|
|
}
|
|
}
|
|
],
|
|
{
|
|
...readFromSecondary
|
|
}
|
|
)) as (DatasetDataTextSchemaType & { score: number })[];
|
|
})
|
|
);
|
|
|
|
const dataIds = Array.from(
|
|
new Set(recallResults.map((item) => item.map((item) => item.dataId)).flat())
|
|
);
|
|
const collectionIds = Array.from(
|
|
new Set(recallResults.map((item) => item.map((item) => item.collectionId)).flat())
|
|
);
|
|
|
|
// Get data and collections
|
|
const [dataMaps, collectionMaps] = await Promise.all([
|
|
MongoDatasetData.find(
|
|
{
|
|
_id: { $in: dataIds }
|
|
},
|
|
datasetDataSelectField,
|
|
{ ...readFromSecondary }
|
|
)
|
|
.lean()
|
|
.then((res) => {
|
|
const map = new Map<string, DatasetDataSchemaType>();
|
|
|
|
res.forEach((item) => {
|
|
map.set(String(item._id), item);
|
|
});
|
|
|
|
return map;
|
|
}),
|
|
MongoDatasetCollection.find(
|
|
{
|
|
_id: { $in: collectionIds }
|
|
},
|
|
datsaetCollectionSelectField,
|
|
{ ...readFromSecondary }
|
|
)
|
|
.lean()
|
|
.then((res) => {
|
|
const map = new Map<string, DatasetCollectionSchemaType>();
|
|
|
|
res.forEach((item) => {
|
|
map.set(String(item._id), item);
|
|
});
|
|
|
|
return map;
|
|
})
|
|
]);
|
|
|
|
const fullTextRecallResults = recallResults.map((item) => {
|
|
return item
|
|
.map((item, index) => {
|
|
const collection = collectionMaps.get(String(item.collectionId));
|
|
if (!collection) {
|
|
logger.warn('Dataset collection not found during full-text recall', {
|
|
collectionId: item.collectionId,
|
|
dataId: item.dataId
|
|
});
|
|
return;
|
|
}
|
|
|
|
const data = dataMaps.get(String(item.dataId));
|
|
if (!data) {
|
|
logger.warn('Dataset data not found during full-text recall', {
|
|
dataId: item.dataId,
|
|
collectionId: item.collectionId
|
|
});
|
|
return;
|
|
}
|
|
|
|
return {
|
|
id: String(data._id),
|
|
datasetId: String(data.datasetId),
|
|
collectionId: String(data.collectionId),
|
|
updateTime: data.updateTime,
|
|
...formatDatasetDataValue({
|
|
q: data.q,
|
|
a: data.a,
|
|
imageId: data.imageId,
|
|
imageDescMap: data.imageDescMap
|
|
}),
|
|
chunkIndex: data.chunkIndex,
|
|
indexes: data.indexes,
|
|
...getCollectionSourceData(collection),
|
|
score: [
|
|
{
|
|
type: SearchScoreTypeEnum.fullText,
|
|
value: item.score || 0,
|
|
index
|
|
}
|
|
]
|
|
};
|
|
})
|
|
.filter((item) => {
|
|
if (!item) return false;
|
|
return true;
|
|
})
|
|
.map((item, index) => {
|
|
return {
|
|
...item,
|
|
score: item!.score.map((item) => ({ ...item, index }))
|
|
};
|
|
}) as SearchDataResponseItemType[];
|
|
});
|
|
|
|
return {
|
|
fullTextRecallResults
|
|
};
|
|
};
|
|
const multiQueryRecall = async ({
|
|
embeddingLimit,
|
|
fullTextLimit
|
|
}: {
|
|
embeddingLimit: number;
|
|
fullTextLimit: number;
|
|
}) => {
|
|
const [{ forbidCollectionIdList }, filterCollectionIdList] = await Promise.all([
|
|
getForbidData(),
|
|
filterCollectionByMetadata()
|
|
]);
|
|
|
|
const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
|
|
embeddingRecall({
|
|
queries,
|
|
limit: embeddingLimit,
|
|
forbidCollectionIdList,
|
|
filterCollectionIdList
|
|
}),
|
|
fullTextRecall({
|
|
queries,
|
|
limit: fullTextLimit,
|
|
filterCollectionIdList,
|
|
forbidCollectionIdList
|
|
})
|
|
]);
|
|
|
|
// rrf concat
|
|
const rrfEmbRecall = datasetSearchResultConcat(
|
|
embeddingRecallResults.map((list) => ({ weight: 1, list }))
|
|
).slice(0, embeddingLimit);
|
|
const rrfFTRecall = datasetSearchResultConcat(
|
|
fullTextRecallResults.map((list) => ({ weight: 1, list }))
|
|
).slice(0, fullTextLimit);
|
|
|
|
return {
|
|
tokens,
|
|
embeddingRecallResults: rrfEmbRecall,
|
|
fullTextRecallResults: rrfFTRecall
|
|
};
|
|
};
|
|
|
|
/* main step */
// Work out how many candidates each recall channel may return.
const { embeddingLimit, fullTextLimit } = countRecallLimit();

// recall: run both channels across all queries, RRF-merged per channel.
const {
  embeddingRecallResults,
  fullTextRecallResults,
  tokens: embeddingTokens
} = await multiQueryRecall({
  embeddingLimit,
  fullTextLimit
});

// ReRank results — only when rerank is enabled; failures fall back to RRF-only scoring.
const { results: reRankResults, inputTokens: reRankInputTokens } = await (async () => {
  if (!usingReRank) {
    return {
      results: [],
      inputTokens: 0
    };
  }

  // Union of both channels, de-duplicated by id (embedding results take precedence).
  set = new Set<string>(embeddingRecallResults.map((item) => item.id));
  const concatRecallResults = embeddingRecallResults.concat(
    fullTextRecallResults.filter((item) => !set.has(item.id))
  );

  // remove same q and a data
  set = new Set<string>();
  const filterSameDataResults = concatRecallResults.filter((item) => {
    // Strip all punctuation and whitespace, then compare only the text content.
    const str = hashStr(`${item.q}${item.a}`.replace(/[^\p{L}\p{N}]/gu, ''));
    if (set.has(str)) return false;
    set.add(str);
    return true;
  });
  try {
    return await datasetDataReRank({
      rerankModel,
      query: reRankQuery,
      data: filterSameDataResults
    });
  } catch (error) {
    // Rerank is best-effort: disable it for the rest of this search and continue.
    usingReRank = false;
    return {
      results: [],
      inputTokens: 0
    };
  }
})();

// rrf concat: blend the two recall channels by the embedding weight.
const rrfSearchResult = datasetSearchResultConcat([
  { weight: embeddingWeight, list: embeddingRecallResults },
  { weight: 1 - embeddingWeight, list: fullTextRecallResults }
]);
// Mix the rerank scores in according to rerankWeight (skip the merge at the extremes).
const rrfConcatResults = (() => {
  if (reRankResults.length === 0) return rrfSearchResult;
  if (rerankWeight === 1) return reRankResults;

  return datasetSearchResultConcat([
    { weight: 1 - rerankWeight, list: rrfSearchResult },
    { weight: rerankWeight, list: reRankResults }
  ]);
})();

// remove same q and a data
set = new Set<string>();
const filterSameDataResults = rrfConcatResults.filter((item) => {
  // Strip all punctuation and whitespace, then compare only the text content.
  const str = hashStr(`${item.q}${item.a}`.replace(/[^\p{L}\p{N}]/gu, ''));
  if (set.has(str)) return false;
  set.add(str);
  return true;
});

// score filter: drop items below the similarity threshold. Only applies when
// reranking ran or when the mode is pure embedding search (full-text scores
// are not comparable to a similarity threshold).
const scoreFilter = (() => {
  if (usingReRank) {
    usingSimilarityFilter = true;

    return filterSameDataResults.filter((item) => {
      const reRankScore = item.score.find((item) => item.type === SearchScoreTypeEnum.reRank);
      if (reRankScore && reRankScore.value < similarity) return false;
      return true;
    });
  }
  if (searchMode === DatasetSearchModeEnum.embedding) {
    usingSimilarityFilter = true;
    return filterSameDataResults.filter((item) => {
      const embeddingScore = item.score.find(
        (item) => item.type === SearchScoreTypeEnum.embedding
      );
      if (embeddingScore && embeddingScore.value < similarity) return false;
      return true;
    });
  }
  return filterSameDataResults;
})();

// token filter: trim the result list to fit within maxTokens.
const filterMaxTokensResult = await filterDatasetDataByMaxTokens(scoreFilter, maxTokens);

// Rewrite stored S3 keys in `q` into preview URLs valid for 90 days.
const finalResult = filterMaxTokensResult.map((item) => {
  item.q = replaceS3KeyToPreviewUrl(item.q, addDays(new Date(), 90));
  return item;
});

pushTrack.datasetSearch({ datasetIds, teamId });

return {
  searchRes: finalResult,
  embeddingTokens,
  reRankInputTokens,
  searchMode,
  limit: maxTokens,
  similarity,
  usingReRank,
  usingSimilarityFilter
};
}
|
|
|
|
/**
 * Props for `defaultSearchDatasetData`: the base search props plus the
 * optional LLM query-extension settings.
 */
export type DefaultSearchDatasetDataProps = SearchDatasetDataProps & {
  // When true, the query is expanded with an LLM before searching.
  [NodeInputKeyEnum.datasetSearchUsingExtensionQuery]?: boolean;
  // Model id resolved via getLLMModel() for the query-extension step.
  [NodeInputKeyEnum.datasetSearchExtensionModel]?: string;
  // Passed as `extensionBg` to the query-extension step (presumably background
  // context for the extension prompt — confirm against datasetSearchQueryExtension).
  [NodeInputKeyEnum.datasetSearchExtensionBg]?: string;
};
|
|
export const defaultSearchDatasetData = async ({
|
|
datasetSearchUsingExtensionQuery,
|
|
datasetSearchExtensionModel,
|
|
datasetSearchExtensionBg,
|
|
...props
|
|
}: DefaultSearchDatasetDataProps): Promise<SearchDatasetDataResponse> => {
|
|
const query = props.queries[0];
|
|
const histories = props.histories;
|
|
|
|
const { searchQueries, reRankQuery, aiExtensionResult } = await datasetSearchQueryExtension({
|
|
query,
|
|
llmModel: datasetSearchUsingExtensionQuery
|
|
? getLLMModel(datasetSearchExtensionModel).model
|
|
: undefined,
|
|
embeddingModel: props.model,
|
|
extensionBg: datasetSearchExtensionBg,
|
|
histories
|
|
});
|
|
|
|
const result = await searchDatasetData({
|
|
...props,
|
|
reRankQuery: reRankQuery,
|
|
queries: searchQueries
|
|
});
|
|
|
|
return {
|
|
...result,
|
|
queryExtensionResult: aiExtensionResult
|
|
? {
|
|
llmModel: aiExtensionResult.llmModel,
|
|
inputTokens: aiExtensionResult.inputTokens,
|
|
outputTokens: aiExtensionResult.outputTokens,
|
|
embeddingModel: aiExtensionResult.embeddingModel,
|
|
embeddingTokens: aiExtensionResult.embeddingTokens,
|
|
query: searchQueries.join('\n')
|
|
}
|
|
: undefined
|
|
};
|
|
};
|
|
|
|
/**
 * Props for `deepRagSearch`: the base search props plus the optional
 * deep-search configuration forwarded to `global.deepRagHandler`.
 */
export type DeepRagSearchProps = SearchDatasetDataProps & {
  // Model id used by the deep search (NOTE(review): semantics inferred from
  // the key name — confirm against the deepRagHandler implementation).
  [NodeInputKeyEnum.datasetDeepSearchModel]?: string;
  // Maximum number of deep-search iterations — TODO confirm exact meaning.
  [NodeInputKeyEnum.datasetDeepSearchMaxTimes]?: number;
  // Background/context string for the deep search — TODO confirm usage.
  [NodeInputKeyEnum.datasetDeepSearchBg]?: string;
};
|
|
// Delegates to a handler installed on `global` (NOTE(review): assumes
// `global.deepRagHandler` is registered before the first call — verify at startup).
export const deepRagSearch = (data: DeepRagSearchProps) => global.deepRagHandler(data);
|