mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00

* feat: stop toolCall and rename some field. (#46) * perf: node delete tip;pay tip * fix: toolCall cannot save child answer * feat: stop tool * fix: team modal * fix feckbackMoal auth bug (#47) * 简单的支持提示词运行tool。优化workflow模板 (#49) * remove templates * fix: request body undefined * feat: prompt tool run * feat: workflow tamplates modal * perf: plugin start * 4.7 (#50) * fix docker-compose download url (#994) original code is a bad url with '404 NOT FOUND' return. fix docker-compose download url, add 'v' before docker-compose version * Update ai_settings.md (#1000) * Update configuration.md * Update configuration.md * Fix history in classifyQuestion and extract modules (#1012) * Fix history in classifyQuestion and extract modules * Add chatValue2RuntimePrompt import and update text formatting * flow controller to packages * fix: rerank select * modal ui * perf: modal code path * point not sufficient * feat: http url support variable * fix http key * perf: prompt * perf: ai setting modal * simple edit ui --------- Co-authored-by: entorick <entorick11@qq.com> Co-authored-by: liujianglc <liujianglc@163.com> Co-authored-by: Fengrui Liu <liufengrui.work@bytedance.com> * fix team share redirect to login (#51) * feat: support openapi import plugins (#48) * feat: support openapi import plugins * feat: import from url * fix: add body params parse * fix build * fix * fix * fix * tool box ui (#52) * fix: training queue * feat: simple edit tool select * perf: simple edit dataset prompt * fix: chatbox tool ux * feat: quote prompt module * perf: plugin tools sign * perf: model avatar * tool selector ui * feat: max histories * perf: http plugin import (#53) * perf: plugin http import * chatBox ui * perf: name * fix: Node template card (#54) * fix: ts * setting modal * package * package * feat: add plugins search (#57) * feat: add plugins search * perf: change http plugin header input * Yjl (#56) * perf: prompt tool call * perf: chat box ux * doc * doc * price tip * perf: tool 
selector * ui' * fix: vector queue * fix: empty tool and empty response * fix: empty msg * perf: pg index * perf: ui tip * doc * tool tip --------- Co-authored-by: yst <77910600+yu-and-liu@users.noreply.github.com> Co-authored-by: entorick <entorick11@qq.com> Co-authored-by: liujianglc <liujianglc@163.com> Co-authored-by: Fengrui Liu <liufengrui.work@bytedance.com> Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
408 lines
12 KiB
TypeScript
408 lines
12 KiB
TypeScript
import {
|
|
DatasetSearchModeEnum,
|
|
DatasetSearchModeMap,
|
|
SearchScoreTypeEnum
|
|
} from '@fastgpt/global/core/dataset/constants';
|
|
import { recallFromVectorStore } from '../../../common/vectorStore/controller';
|
|
import { getVectorsByText } from '../../ai/embedding';
|
|
import { getVectorModel } from '../../ai/model';
|
|
import { MongoDatasetData } from '../data/schema';
|
|
import {
|
|
DatasetDataSchemaType,
|
|
DatasetDataWithCollectionType,
|
|
SearchDataResponseItemType
|
|
} from '@fastgpt/global/core/dataset/type';
|
|
import { MongoDatasetCollection } from '../collection/schema';
|
|
import { reRankRecall } from '../../../core/ai/rerank';
|
|
import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
|
|
import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
|
|
import { hashStr } from '@fastgpt/global/common/string/tools';
|
|
import { jiebaSplit } from '../../../common/string/jieba';
|
|
|
|
/**
 * Options accepted by `searchDatasetData`.
 */
type SearchDatasetDataProps = {
  /* ---- scope ---- */
  /** Team id; used as a filter in the dataset data queries. */
  teamId: string;
  /** Ids of the datasets to search in. */
  datasetIds: string[];

  /* ---- query ---- */
  /** Query strings; recall is executed once for every entry. */
  queries: string[];
  /** Query text handed to the rerank step. */
  reRankQuery: string;
  /** Name of the vector model used to embed the queries. */
  model: string;

  /* ---- tuning ---- */
  /** Search mode; one of the `DatasetSearchModeEnum` values. */
  searchMode?: `${DatasetSearchModeEnum}`;
  /** Whether the recalled results should be reranked. */
  usingReRank?: boolean;
  /** min distance */
  similarity?: number;
  /** max Token limit */
  limit: number;
};
|
|
|
|
/**
 * Recall dataset chunks for a set of queries.
 *
 * Steps:
 * 1. For every query run an embedding recall and/or a full-text recall
 *    (which ones depends on `searchMode`).
 * 2. Merge the per-query lists with `datasetSearchResultConcat` (rrf concat).
 * 3. Optionally rerank the merged candidates with `reRankRecall`.
 * 4. Merge embedding / full-text / rerank lists, drop items whose q+a text is
 *    identical, apply the `similarity` threshold and cut the list by token budget.
 *
 * @param props - see `SearchDatasetDataProps`. `limit` is a token budget; values
 *   below 50 are replaced by 1500 ("compatible with topk limit" - presumably old
 *   callers passed a top-k count here; confirm against callers).
 * @returns the filtered results (`searchRes`), the embedding tokens consumed
 *   (`tokens`) and the effective settings that were applied (`searchMode`,
 *   `limit`, `similarity`, `usingReRank`, `usingSimilarityFilter`).
 */
export async function searchDatasetData(props: SearchDatasetDataProps) {
  let {
    teamId,
    reRankQuery,
    queries,
    model,
    similarity = 0,
    limit: maxTokens,
    searchMode = DatasetSearchModeEnum.embedding,
    usingReRank = false,
    datasetIds = []
  } = props;

  /* init params */
  // Unknown search modes fall back to embedding search.
  searchMode = DatasetSearchModeMap[searchMode] ? searchMode : DatasetSearchModeEnum.embedding;
  // Rerank is only possible when at least one rerank model is configured.
  usingReRank = usingReRank && global.reRankModels.length > 0;

  // Compatible with topk limit
  if (maxTokens < 50) {
    maxTokens = 1500;
  }
  let usingSimilarityFilter = false;

  /* function */

  /** How many items each recall type may return for the current search mode. */
  const countRecallLimit = () => {
    if (searchMode === DatasetSearchModeEnum.embedding) {
      return {
        embeddingLimit: 150,
        fullTextLimit: 0
      };
    }
    if (searchMode === DatasetSearchModeEnum.fullTextRecall) {
      return {
        embeddingLimit: 0,
        fullTextLimit: 150
      };
    }
    return {
      embeddingLimit: 100,
      fullTextLimit: 80
    };
  };

  /**
   * Drop items whose q+a text was already seen.
   * All punctuation, whitespace, etc. is stripped first so that only the
   * text content itself is compared.
   */
  const removeSameQA = (list: SearchDataResponseItemType[]) => {
    const seen = new Set<string>();

    return list.filter((item) => {
      const key = hashStr(`${item.q}${item.a}`.replace(/[^\p{L}\p{N}]/gu, ''));
      if (seen.has(key)) return false;
      seen.add(key);
      return true;
    });
  };

  /** Vector recall for a single query. */
  const embeddingRecall = async ({
    query,
    limit
  }: {
    query: string;
    limit: number;
  }): Promise<{
    embeddingRecallResults: SearchDataResponseItemType[];
    tokens: number;
  }> => {
    // Same guard as fullTextRecall: nothing is requested, so do not call the
    // embedding model or the vector store at all.
    if (limit === 0) {
      return {
        embeddingRecallResults: [],
        tokens: 0
      };
    }

    const { vectors, tokens } = await getVectorsByText({
      model: getVectorModel(model),
      input: query
    });

    const { results } = await recallFromVectorStore({
      vectors,
      limit,
      datasetIds,
      efSearch: global.systemEnv?.pgHNSWEfSearch
    });

    // Normalize the ids once, so the Mongo lookup and the score lookup below
    // compare exactly the same strings.
    const vectorResults = results.map((item) => ({
      id: item.id?.trim(),
      score: item.score
    }));

    // get q and a
    const dataList = (await MongoDatasetData.find(
      {
        teamId,
        datasetId: { $in: datasetIds },
        'indexes.dataId': { $in: vectorResults.map((item) => item.id) }
      },
      'datasetId collectionId q a chunkIndex indexes'
    )
      .populate('collectionId', 'name fileId rawLink')
      .lean()) as DatasetDataWithCollectionType[];

    // add score to data(It's already sorted. The first one is the one with the most points)
    const concatResults = dataList.map((data) => {
      const dataIdList = data.indexes.map((item) => item.dataId);

      const maxScoreResult = vectorResults.find(
        (item) => item.id !== undefined && dataIdList.includes(item.id)
      );

      return {
        ...data,
        score: maxScoreResult?.score || 0
      };
    });

    concatResults.sort((a, b) => b.score - a.score);

    const formatResult = concatResults.map((data, index): SearchDataResponseItemType => {
      if (!data.collectionId) {
        console.log('Collection is not found', data);
      }

      return {
        id: String(data._id),
        q: data.q,
        a: data.a,
        chunkIndex: data.chunkIndex,
        datasetId: String(data.datasetId),
        collectionId: String(data.collectionId?._id),
        sourceName: data.collectionId?.name || '',
        sourceId: data.collectionId?.fileId || data.collectionId?.rawLink,
        score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
      };
    });

    return {
      embeddingRecallResults: formatResult,
      tokens
    };
  };

  /** Mongo `$text` recall for a single query. */
  const fullTextRecall = async ({
    query,
    limit
  }: {
    query: string;
    limit: number;
  }): Promise<{
    fullTextRecallResults: SearchDataResponseItemType[];
    tokenLen: number;
  }> => {
    if (limit === 0) {
      return {
        fullTextRecallResults: [],
        tokenLen: 0
      };
    }

    // The segmented search string only depends on the query: compute it once
    // instead of once per dataset.
    const searchText = jiebaSplit({ text: query });

    const allResults = (
      await Promise.all(
        datasetIds.map((id) =>
          MongoDatasetData.find(
            {
              teamId,
              datasetId: id,
              $text: { $search: searchText }
            },
            {
              score: { $meta: 'textScore' },
              _id: 1,
              datasetId: 1,
              collectionId: 1,
              q: 1,
              a: 1,
              chunkIndex: 1
            }
          )
            .sort({ score: { $meta: 'textScore' } })
            .limit(limit)
            .lean()
        )
      )
    ).flat() as (DatasetDataSchemaType & { score: number })[];

    // resort: every dataset contributed up to `limit` rows, so order the merged
    // list by score and keep only the best `limit` of them.
    // (`slice` returns a new array - its result has to be used.)
    allResults.sort((a, b) => b.score - a.score);
    const searchResults = allResults.slice(0, limit);

    const collections = await MongoDatasetCollection.find(
      {
        _id: { $in: searchResults.map((item) => item.collectionId) }
      },
      '_id name fileId rawLink'
    );
    // Index collections by id for constant-time lookup per result.
    const collectionMap = new Map(collections.map((col) => [String(col._id), col] as const));

    return {
      fullTextRecallResults: searchResults.map((item, index) => {
        const collection = collectionMap.get(String(item.collectionId));

        return {
          id: String(item._id),
          datasetId: String(item.datasetId),
          collectionId: String(item.collectionId),
          sourceName: collection?.name || '',
          sourceId: collection?.fileId || collection?.rawLink,
          q: item.q,
          a: item.a,
          chunkIndex: item.chunkIndex,
          indexes: item.indexes,
          score: [{ type: SearchScoreTypeEnum.fullText, value: item.score, index }]
        };
      }),
      tokenLen: 0
    };
  };

  /**
   * Rerank `data` against `query`.
   * Best effort: when the rerank call fails or returns nothing, rerank is
   * switched off for the rest of this search and an empty list is returned.
   */
  const reRankSearchResult = async ({
    data,
    query
  }: {
    data: SearchDataResponseItemType[];
    query: string;
  }): Promise<SearchDataResponseItemType[]> => {
    try {
      const results = await reRankRecall({
        query,
        inputs: data.map((item) => ({
          id: item.id,
          text: `${item.q}\n${item.a}`
        }))
      });

      if (results.length === 0) {
        usingReRank = false;
        return [];
      }

      // add new score to data
      const mergeResult = results
        .map((item, index) => {
          const target = data.find((dataItem) => dataItem.id === item.id);
          if (!target) return null;
          const score = item.score || 0;

          return {
            ...target,
            score: [{ type: SearchScoreTypeEnum.reRank, value: score, index }]
          };
        })
        .filter(Boolean) as SearchDataResponseItemType[];

      return mergeResult;
    } catch (error) {
      // Deliberate fallback to the plain recall results, but leave a trace.
      console.log('ReRank failed, fallback to recall results', error);
      usingReRank = false;
      return [];
    }
  };

  /**
   * Keep items until the token budget is used up.
   * The item that crosses the budget is still kept unless it overshoots the
   * budget by more than 500 tokens. A non-empty input always yields at least
   * one item.
   */
  const filterResultsByMaxTokens = (list: SearchDataResponseItemType[], tokenLimit: number) => {
    const results: SearchDataResponseItemType[] = [];
    let totalTokens = 0;

    for (const item of list) {
      totalTokens += countPromptTokens(item.q + item.a);
      if (totalTokens > tokenLimit + 500) {
        break;
      }
      results.push(item);
      if (totalTokens > tokenLimit) {
        break;
      }
    }

    return results.length === 0 ? list.slice(0, 1) : results;
  };

  /** Run both recalls for every query and merge the per-query lists. */
  const multiQueryRecall = async ({
    embeddingLimit,
    fullTextLimit
  }: {
    embeddingLimit: number;
    fullTextLimit: number;
  }) => {
    // multi query recall
    // Use the values returned by Promise.all (query order) instead of pushing
    // from inside the callbacks (completion order), so the merge input is
    // deterministic.
    const recallList = await Promise.all(
      queries.map(async (query) => {
        const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
          embeddingRecall({
            query,
            limit: embeddingLimit
          }),
          fullTextRecall({
            query,
            limit: fullTextLimit
          })
        ]);

        return { tokens, embeddingRecallResults, fullTextRecallResults };
      })
    );

    const totalTokens = recallList.reduce((sum, item) => sum + item.tokens, 0);

    // rrf concat
    const rrfEmbRecall = datasetSearchResultConcat(
      recallList.map((item) => ({ k: 60, list: item.embeddingRecallResults }))
    ).slice(0, embeddingLimit);
    const rrfFTRecall = datasetSearchResultConcat(
      recallList.map((item) => ({ k: 60, list: item.fullTextRecallResults }))
    ).slice(0, fullTextLimit);

    return {
      tokens: totalTokens,
      embeddingRecallResults: rrfEmbRecall,
      fullTextRecallResults: rrfFTRecall
    };
  };

  /* main step */
  // count limit
  const { embeddingLimit, fullTextLimit } = countRecallLimit();

  // recall
  const { embeddingRecallResults, fullTextRecallResults, tokens } = await multiQueryRecall({
    embeddingLimit,
    fullTextLimit
  });

  // ReRank results
  const reRankResults = await (async () => {
    if (!usingReRank) return [];

    // Union of both recall lists; an id found by both is taken from the embedding list.
    const embeddingIds = new Set<string>(embeddingRecallResults.map((item) => item.id));
    const concatRecallResults = embeddingRecallResults.concat(
      fullTextRecallResults.filter((item) => !embeddingIds.has(item.id))
    );

    return reRankSearchResult({
      query: reRankQuery,
      // remove same q and a data
      data: removeSameQA(concatRecallResults)
    });
  })();

  // embedding recall and fullText recall rrf concat
  const rrfConcatResults = datasetSearchResultConcat([
    { k: 60, list: embeddingRecallResults },
    { k: 60, list: fullTextRecallResults },
    { k: 58, list: reRankResults }
  ]);

  // remove same q and a data
  const filterSameDataResults = removeSameQA(rrfConcatResults);

  // score filter
  const scoreFilter = (() => {
    if (usingReRank) {
      usingSimilarityFilter = true;

      // Items without a rerank score are kept; only scored items can be rejected.
      return filterSameDataResults.filter((item) => {
        const reRankScore = item.score.find((score) => score.type === SearchScoreTypeEnum.reRank);
        if (reRankScore && reRankScore.value < similarity) return false;
        return true;
      });
    }
    if (searchMode === DatasetSearchModeEnum.embedding) {
      usingSimilarityFilter = true;

      return filterSameDataResults.filter((item) => {
        const embeddingScore = item.score.find(
          (score) => score.type === SearchScoreTypeEnum.embedding
        );
        if (embeddingScore && embeddingScore.value < similarity) return false;
        return true;
      });
    }
    // No similarity threshold is applied in the remaining modes when rerank is off.
    return filterSameDataResults;
  })();

  return {
    searchRes: filterResultsByMaxTokens(scoreFilter, maxTokens),
    tokens,
    searchMode,
    limit: maxTokens,
    similarity,
    usingReRank,
    usingSimilarityFilter
  };
}
|