mirror of https://github.com/labring/FastGPT.git
4.8.6 merge (#1943)
* Dataset collection forbid (#1885)
* perf: tool call support same id
* feat: collection forbid
* feat: collection forbid
* Inheritance Permission for apps (#1897)
* feat: app schema define; chore: references of authapp
* feat: authApp method inheritance
* feat: create and update api
* feat: update
* feat: inheritance Permission controller for app
* feat: abstract version of inheritPermission
* feat: ancestorId for apps
* chore: update app
* fix: inheritPermission abstract version
* feat: update folder defaultPermission
* feat: app update api
* chore: inheritance frontend
* chore: app list api
* feat: update defaultPermission in app detail
* feat: backend api finished
* feat: app inheritance permission fe
* fix: app update defaultPermission causes collaborator miss
* fix: ts error
* chore: adjust the codes
* chore: i18n
* chore: fe adjust and i18n
* chore: adjust the code
* feat: resume api; chore: rewrite update api and inheritPermission methods
* chore: something
* chore: fe code adjusting
* feat: frontend adjusting
* chore: fe code adjusting
* chore: adjusting the code
* perf: fe loading
* format
* Inheritance fix (#1908)
* fix: SlideCard
* fix: authapp did not return parent app for inheritance app
* fix: fe adjusting
* feat: fe adjusting
* perf: inherit per ux
* doc
* fix: ts errors (#1916)
* perf: inherit permission
* fix: permission inherit
* Workflow type (#1938)
* perf: workflow type; tmp workflow; feat: custom field config
* perf: dynamic input
* perf: node classify
* perf: node classify
* perf: node classify
* perf: node classify
* fix: workflow custom input
* feat: text editor and customFeedback move to basic nodes
* feat: community system plugin
* fix: ts
* feat: exprEval plugin
* perf: workflow type
* perf: plugin important
* fix: default templates
* perf: markdown hr css
* lock
* perf: fetch url
* perf: new plugin version
* fix: chat histories update
* fix: collection paths invalid
* perf: app card ui

---------

Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
@@ -48,12 +48,15 @@ const DatasetCollectionSchema = new Schema({
     type: Date,
     default: () => new Date()
   },
+  forbid: {
+    type: Boolean,
+    default: false
+  },
 
   // chunk filed
   trainingType: {
     type: String,
-    enum: Object.keys(TrainingTypeMap),
-    required: true
+    enum: Object.keys(TrainingTypeMap)
   },
   chunkSize: {
     type: Number,
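The new forbid flag defaults to false, so existing collections keep working unchanged until something flips it. A minimal sketch of such a toggle (hypothetical helper, not part of this commit):

    // Hypothetical helper, not in this diff: flip the forbid flag on one collection.
    import { MongoDatasetCollection } from './schema';

    export async function setCollectionForbid(collectionId: string, forbid: boolean) {
      // Only the flag changes; search-time filtering reads it via getForbidData() below.
      await MongoDatasetCollection.updateOne({ _id: collectionId }, { $set: { forbid } });
    }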
@@ -91,23 +94,25 @@ const DatasetCollectionSchema = new Schema({
   }
 });
 
+export const MongoDatasetCollection: Model<DatasetCollectionSchemaType> =
+  models[DatasetColCollectionName] || model(DatasetColCollectionName, DatasetCollectionSchema);
+
 try {
   // auth file
-  DatasetCollectionSchema.index({ teamId: 1, fileId: 1 }, { background: true });
+  DatasetCollectionSchema.index({ teamId: 1, fileId: 1 });
 
   // list collection; deep find collections
-  DatasetCollectionSchema.index(
-    {
-      teamId: 1,
-      datasetId: 1,
-      parentId: 1,
-      updateTime: -1
-    },
-    { background: true }
-  );
+  DatasetCollectionSchema.index({
+    teamId: 1,
+    datasetId: 1,
+    parentId: 1,
+    updateTime: -1
+  });
+
+  // get forbid
+  // DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, forbid: 1 });
+
+  MongoDatasetCollection.syncIndexes({ background: true });
 } catch (error) {
   console.log(error);
 }
-
-export const MongoDatasetCollection: Model<DatasetCollectionSchemaType> =
-  models[DatasetColCollectionName] || model(DatasetColCollectionName, DatasetCollectionSchema);
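Two changes here: the model is registered before the index block so syncIndexes can run inside the try/catch, and the per-index { background: true } options move into the single syncIndexes call. syncIndexes diffs the indexes declared on the schema against the live collection, building missing ones and dropping undeclared ones. A standalone sketch of the same pattern (hypothetical model name):

    import { Schema, model } from 'mongoose';

    const DemoSchema = new Schema({ teamId: Schema.Types.ObjectId, updateTime: Date });
    DemoSchema.index({ teamId: 1, updateTime: -1 }); // declared with no per-index options

    const Demo = model('demo', DemoSchema);
    // Build missing indexes (in the background) and drop ones no longer declared.
    Demo.syncIndexes({ background: true });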
@@ -53,29 +53,6 @@ export async function findCollectionAndChild({
   return [collection, ...childCollections];
 }
 
-export async function getDatasetCollectionPaths({
-  parentId = ''
-}: {
-  parentId?: string;
-}): Promise<ParentTreePathItemType[]> {
-  async function find(parentId?: string): Promise<ParentTreePathItemType[]> {
-    if (!parentId) {
-      return [];
-    }
-
-    const parent = await MongoDatasetCollection.findOne({ _id: parentId }, 'name parentId');
-
-    if (!parent) return [];
-
-    const paths = await find(parent.parentId);
-    paths.push({ parentId, parentName: parent.name });
-
-    return paths;
-  }
-
-  return await find(parentId);
-}
-
 export function getCollectionUpdateTime({ name, time }: { time?: Date; name: string }) {
   if (time) return time;
   if (name.startsWith('手动') || ['manual', 'mark'].includes(name)) return new Date('2999/9/9');
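The removed helper resolved ancestors before pushing the current node, so the breadcrumb path came back root-first; per the commit message ("fix: collection paths invalid") the lookup was reworked elsewhere. A self-contained re-creation of the recursion over in-memory data (hypothetical values):

    // In-memory re-creation of the removed recursion (hypothetical data, no DB):
    type PathItem = { parentId: string; parentName: string };
    type Node = { _id: string; name: string; parentId?: string };

    const nodes: Record<string, Node> = {
      root: { _id: 'root', name: 'root' },
      a: { _id: 'a', name: 'folderA', parentId: 'root' },
      b: { _id: 'b', name: 'folderB', parentId: 'a' }
    };

    function findPath(parentId?: string): PathItem[] {
      if (!parentId) return [];
      const parent = nodes[parentId];
      if (!parent) return [];
      // Ancestors resolve first, then the current node is appended: root comes out first.
      return [...findPath(parent.parentId), { parentId, parentName: parent.name }];
    }

    console.log(findPath('b')); // [root, folderA, folderB] — root-first breadcrumb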
@@ -37,7 +37,7 @@ export async function findDatasetAndAllChildren({
     return datasets;
   };
   const [dataset, childDatasets] = await Promise.all([
-    MongoDataset.findById(datasetId),
+    MongoDataset.findById(datasetId).lean(),
     find(datasetId)
   ]);
 
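.lean() makes the query return a plain JavaScript object instead of a hydrated Mongoose document — cheaper, and sufficient here since the dataset is only read. Illustration (sketch only, inside an async function, identifiers from the diff):

    const doc = await MongoDataset.findById(datasetId);          // hydrated document, has .save()
    const plain = await MongoDataset.findById(datasetId).lean(); // plain object, no document methods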
@@ -77,27 +77,27 @@ const DatasetDataSchema = new Schema({
   rebuilding: Boolean
 });
 
-try {
-  // list collection and count data; list data; delete collection(relate data)
-  DatasetDataSchema.index(
-    { teamId: 1, datasetId: 1, collectionId: 1, chunkIndex: 1, updateTime: -1 },
-    { background: true }
-  );
-  // full text index
-  DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' }, { background: true });
-  // Recall vectors after data matching
-  DatasetDataSchema.index(
-    { teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 },
-    { background: true }
-  );
-  DatasetDataSchema.index({ updateTime: 1 }, { background: true });
-  // rebuild data
-  DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 }, { background: true });
-} catch (error) {
-  console.log(error);
-}
-
 export const MongoDatasetData: Model<DatasetDataSchemaType> =
   models[DatasetDataCollectionName] || model(DatasetDataCollectionName, DatasetDataSchema);
 
-MongoDatasetData.syncIndexes();
+try {
+  // list collection and count data; list data; delete collection(relate data)
+  DatasetDataSchema.index({
+    teamId: 1,
+    datasetId: 1,
+    collectionId: 1,
+    chunkIndex: 1,
+    updateTime: -1
+  });
+  // full text index
+  DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
+  // Recall vectors after data matching
+  DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
+  DatasetDataSchema.index({ updateTime: 1 });
+  // rebuild data
+  DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
+
+  MongoDatasetData.syncIndexes({ background: true });
+} catch (error) {
+  console.log(error);
+}
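MongoDB allows only one text index per collection, so fullTextToken shares its index with the teamId/datasetId prefix; the $text stage in the search controller below resolves against it. A small sanity-check sketch (hypothetical script, inside an async function):

    // List the indexes actually present once syncIndexes has run.
    const indexes = await MongoDatasetData.collection.indexes();
    console.log(indexes.map((idx) => idx.name));
    // Expect one *_text entry: the compound teamId/datasetId/fullTextToken index.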
@@ -74,11 +74,6 @@ const DatasetSchema = new Schema({
     type: String,
     default: ''
   },
-  permission: {
-    type: String,
-    enum: Object.keys(PermissionTypeMap),
-    default: PermissionTypeEnum.private
-  },
   websiteConfig: {
     type: {
       url: {
@@ -12,13 +12,14 @@ import {
   DatasetDataWithCollectionType,
   SearchDataResponseItemType
 } from '@fastgpt/global/core/dataset/type';
-import { MongoDatasetCollection } from '../collection/schema';
+import { DatasetColCollectionName, MongoDatasetCollection } from '../collection/schema';
 import { reRankRecall } from '../../../core/ai/rerank';
 import { countPromptTokens } from '../../../common/string/tiktoken/index';
 import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { jiebaSplit } from '../../../common/string/jieba';
 import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
+import { Types } from '../../../common/mongo';
 
 type SearchDatasetDataProps = {
   teamId: string;
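The new Types import matters because, unlike find(), aggregate() bypasses Mongoose's schema casting: a string id inside a $match stage is compared as a raw string against stored ObjectIds and silently matches nothing. Sketch of the pitfall (hypothetical id value):

    import { Types } from 'mongoose';

    const teamId = '0123456789abcdef01234567'; // hypothetical 24-hex id string
    // Inside aggregate(), { $match: { teamId } } would compare the raw string and miss.
    const match = { $match: { teamId: new Types.ObjectId(teamId) } }; // correct: cast explicitly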
@@ -50,9 +51,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
   usingReRank = usingReRank && global.reRankModels.length > 0;
 
-  // Compatible with topk limit
-  if (maxTokens < 50) {
-    maxTokens = 1500;
-  }
   let set = new Set<string>();
   let usingSimilarityFilter = false;
 
@@ -75,7 +73,29 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       fullTextLimit: 60
     };
   };
-  const embeddingRecall = async ({ query, limit }: { query: string; limit: number }) => {
+  const getForbidData = async () => {
+    const collections = await MongoDatasetCollection.find(
+      {
+        teamId,
+        datasetId: { $in: datasetIds },
+        forbid: true
+      },
+      '_id'
+    );
+
+    return {
+      forbidCollectionIdList: collections.map((item) => String(item._id))
+    };
+  };
+
+  const embeddingRecall = async ({
+    query,
+    limit,
+    forbidCollectionIdList
+  }: {
+    query: string;
+    limit: number;
+    forbidCollectionIdList: string[];
+  }) => {
     const { vectors, tokens } = await getVectorsByText({
       model: getVectorModel(model),
       input: query,
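getForbidData runs a single query covering every dataset being searched and reduces the result to a list of collection-id strings, which the recall paths then use to exclude forbidden collections. A self-contained sketch of that consumption (assumed shapes, not FastGPT's actual vector-store code):

    type RecallItem = { collectionId: string; score: number };

    function dropForbidden(items: RecallItem[], forbidCollectionIdList: string[]) {
      const forbidden = new Set(forbidCollectionIdList); // O(1) membership tests
      return items.filter((item) => !forbidden.has(item.collectionId));
    }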
@@ -86,7 +106,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       teamId,
       datasetIds,
       vector: vectors[0],
-      limit
+      limit,
+      forbidCollectionIdList
     });
 
     // get q and a
@@ -161,27 +182,66 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
 
     let searchResults = (
       await Promise.all(
-        datasetIds.map((id) =>
-          MongoDatasetData.find(
-            {
-              teamId,
-              datasetId: id,
-              $text: { $search: jiebaSplit({ text: query }) }
-            },
-            {
-              score: { $meta: 'textScore' },
-              _id: 1,
-              datasetId: 1,
-              collectionId: 1,
-              q: 1,
-              a: 1,
-              chunkIndex: 1
-            }
-          )
-            .sort({ score: { $meta: 'textScore' } })
-            .limit(limit)
-            .lean()
-        )
+        datasetIds.map(async (id) => {
+          return MongoDatasetData.aggregate([
+            {
+              $match: {
+                teamId: new Types.ObjectId(teamId),
+                datasetId: new Types.ObjectId(id),
+                $text: { $search: jiebaSplit({ text: query }) }
+              }
+            },
+            {
+              $addFields: {
+                score: { $meta: 'textScore' }
+              }
+            },
+            {
+              $sort: {
+                score: { $meta: 'textScore' }
+              }
+            },
+            {
+              $limit: limit
+            },
+            {
+              $lookup: {
+                from: DatasetColCollectionName,
+                let: { collectionId: '$collectionId' },
+                pipeline: [
+                  {
+                    $match: {
+                      $expr: { $eq: ['$_id', '$$collectionId'] },
+                      forbid: { $eq: false } // filter directly in the $lookup stage
+                    }
+                  },
+                  {
+                    $project: {
+                      _id: 1 // only _id is needed to confirm the match
+                    }
+                  }
+                ],
+                as: 'collection'
+              }
+            },
+            {
+              $match: {
+                collection: { $ne: [] }
+              }
+            },
+            {
+              $project: {
+                _id: 1,
+                datasetId: 1,
+                collectionId: 1,
+                q: 1,
+                a: 1,
+                chunkIndex: 1,
+                score: 1
+              }
+            }
+          ]);
+        })
       )
     ).flat() as (DatasetDataSchemaType & { score: number })[];
 
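The full-text recall switches from find() to an aggregation so the forbid check can ride along: $lookup emits an array per row, and the following $match: { collection: { $ne: [] } } keeps only rows whose parent collection exists with forbid set to false — effectively an inner join. Note the $match containing $text stays first in the pipeline, as MongoDB requires. A stripped-down sketch of the join trick (physical collection name assumed):

    const rows = await MongoDatasetData.aggregate([
      { $match: { $text: { $search: 'example' } } }, // $text must lead the pipeline
      {
        $lookup: {
          from: 'dataset_collections', // assumed physical collection name
          localField: 'collectionId',
          foreignField: '_id',
          as: 'collection'
        }
      },
      // Rows whose lookup came back empty have no surviving parent — drop them (inner join).
      { $match: { collection: { $ne: [] } } }
    ]);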
@@ -255,27 +315,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       return [];
     }
   };
-  const filterResultsByMaxTokens = async (
-    list: SearchDataResponseItemType[],
-    maxTokens: number
-  ) => {
-    const results: SearchDataResponseItemType[] = [];
-    let totalTokens = 0;
-
-    for await (const item of list) {
-      totalTokens += await countPromptTokens(item.q + item.a);
-
-      if (totalTokens > maxTokens + 500) {
-        break;
-      }
-      results.push(item);
-      if (totalTokens > maxTokens) {
-        break;
-      }
-    }
-
-    return results.length === 0 ? list.slice(0, 1) : results;
-  };
   const multiQueryRecall = async ({
     embeddingLimit,
     fullTextLimit
@@ -288,12 +327,15 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
     const fullTextRecallResList: SearchDataResponseItemType[][] = [];
     let totalTokens = 0;
 
+    const { forbidCollectionIdList } = await getForbidData();
+
     await Promise.all(
       queries.map(async (query) => {
         const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
          embeddingRecall({
            query,
-            limit: embeddingLimit
+            limit: embeddingLimit,
+            forbidCollectionIdList
          }),
          fullTextRecall({
            query,
@@ -397,8 +439,28 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
     return filterSameDataResults;
   })();
 
+  // token filter
+  const filterMaxTokensResult = await (async () => {
+    const results: SearchDataResponseItemType[] = [];
+    let totalTokens = 0;
+
+    for await (const item of scoreFilter) {
+      totalTokens += await countPromptTokens(item.q + item.a);
+
+      if (totalTokens > maxTokens + 500) {
+        break;
+      }
+      results.push(item);
+      if (totalTokens > maxTokens) {
+        break;
+      }
+    }
+
+    return results.length === 0 ? scoreFilter.slice(0, 1) : results;
+  })();
+
   return {
-    searchRes: await filterResultsByMaxTokens(scoreFilter, maxTokens),
+    searchRes: filterMaxTokensResult,
     tokens,
     searchMode,
     limit: maxTokens,
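The filterResultsByMaxTokens helper removed above reappears here inlined as filterMaxTokensResult, with the same budget rule: accumulate items, keep the item that first crosses maxTokens unless the running total overshoots by more than 500 tokens, and always return at least one item. A standalone restatement of that rule (stand-in token counter):

    async function takeWithinBudget<T extends { q: string; a: string }>(
      list: T[],
      maxTokens: number,
      countTokens: (text: string) => Promise<number> // stand-in for countPromptTokens
    ): Promise<T[]> {
      const results: T[] = [];
      let total = 0;
      for (const item of list) {
        total += await countTokens(item.q + item.a);
        if (total > maxTokens + 500) break; // far over budget: stop before keeping it
        results.push(item);
        if (total > maxTokens) break; // slightly over: keep this item, then stop
      }
      return results.length === 0 ? list.slice(0, 1) : results;
    }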