4.8.6 merge (#1943)

* Dataset collection forbid (#1885)

* perf: tool call support same id

* feat: collection forbid

* feat: collection forbid

* Inheritance Permission for apps (#1897)

* feat: app schema define

chore: references of authapp

* feat: authApp method inheritance

* feat: create and update api

* feat: update

* feat: inheritance Permission controller for app.

* feat: abstract version of inheritPermission

* feat: ancestorId for apps

* chore: update app

* fix: inheritPermission abstract version

* feat: update folder defaultPermission

* feat: app update api

* chore: inheritance frontend

* chore: app list api

* feat: update defaultPermission in app detail

* feat: backend api finished

* feat: app inheritance permission fe

* fix: app update defaultpermission causes collaborator miss

* fix: ts error

* chore: adjust the codes

* chore: i18n

chore: i18n

* chore: fe adjust and i18n

* chore: adjust the code

* feat: resume api;
chore: rewrite update api and inheritPermission methods

* chore: something

* chore: fe code adjusting

* feat: frontend adjusting

* chore: fe code adjusting

* chore: adjusting the code

* perf: fe loading

* format

* Inheritance fix (#1908)

* fix: SlideCard

* fix: authapp did not return parent app for inheritance app

* fix: fe adjusting

* feat: fe adjusting

* perf: inherit per ux

* doc

* fix: ts errors (#1916)

* perf: inherit permission

* fix: permission inherit

* Workflow type (#1938)

* perf: workflow type

tmp workflow

perf: workflow type

feat: custom field config

* perf: dynamic input

* perf: node classify

* perf: node classify

* perf: node classify

* perf: node classify

* fix: workflow custom input

* feat: text editor and customFeedback move to basic nodes

* feat: community system plugin

* fix: ts

* feat: exprEval plugin

* perf: workflow type

* perf: plugin important

* fix: default templates

* perf: markdown hr css

* lock

* perf: fetch url

* perf: new plugin version

* fix: chat histories update

* fix: collection paths invalid

* perf: app card ui

---------

Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Authored by Archer on 2024-07-04 17:42:09 +08:00, committed by GitHub
parent babf03c218
commit a9cdece341
303 changed files with 18883 additions and 13149 deletions

View File

@@ -48,12 +48,15 @@ const DatasetCollectionSchema = new Schema({
     type: Date,
     default: () => new Date()
   },
+  forbid: {
+    type: Boolean,
+    default: false
+  },
   // chunk filed
   trainingType: {
     type: String,
-    enum: Object.keys(TrainingTypeMap),
-    required: true
+    enum: Object.keys(TrainingTypeMap)
   },
   chunkSize: {
     type: Number,
@@ -91,23 +94,25 @@ const DatasetCollectionSchema = new Schema({
   }
 });
+export const MongoDatasetCollection: Model<DatasetCollectionSchemaType> =
+  models[DatasetColCollectionName] || model(DatasetColCollectionName, DatasetCollectionSchema);
 try {
   // auth file
-  DatasetCollectionSchema.index({ teamId: 1, fileId: 1 }, { background: true });
+  DatasetCollectionSchema.index({ teamId: 1, fileId: 1 });
   // list collection; deep find collections
-  DatasetCollectionSchema.index(
-    {
-      teamId: 1,
-      datasetId: 1,
-      parentId: 1,
-      updateTime: -1
-    },
-    { background: true }
-  );
+  DatasetCollectionSchema.index({
+    teamId: 1,
+    datasetId: 1,
+    parentId: 1,
+    updateTime: -1
+  });
+  // get forbid
+  // DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, forbid: 1 });
+  MongoDatasetCollection.syncIndexes({ background: true });
 } catch (error) {
   console.log(error);
 }
-export const MongoDatasetCollection: Model<DatasetCollectionSchemaType> =
-  models[DatasetColCollectionName] || model(DatasetColCollectionName, DatasetCollectionSchema);
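The reshuffle above is the point of this hunk: MongoDB 4.2 and later ignore the per-index { background: true } option, so it is dropped from the individual declarations and a single Model.syncIndexes({ background: true }) call reconciles the schema's declared indexes with whatever exists in the database. A minimal standalone sketch of the same pattern, using a hypothetical DemoSchema rather than anything from this commit:

import { Schema, model, models } from 'mongoose';

// Hypothetical schema: a `forbid` flag soft-disables a document without deleting it.
const DemoSchema = new Schema({
  name: String,
  forbid: {
    type: Boolean,
    default: false
  }
});

export const DemoModel = models['demo'] || model('demo', DemoSchema);

try {
  DemoSchema.index({ forbid: 1 });
  // syncIndexes compares the indexes declared on the schema with the ones in
  // MongoDB, drops the stale ones, and builds the missing ones.
  DemoModel.syncIndexes({ background: true });
} catch (error) {
  console.log(error);
}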

View File

@@ -53,29 +53,6 @@ export async function findCollectionAndChild({
   return [collection, ...childCollections];
 }

-export async function getDatasetCollectionPaths({
-  parentId = ''
-}: {
-  parentId?: string;
-}): Promise<ParentTreePathItemType[]> {
-  async function find(parentId?: string): Promise<ParentTreePathItemType[]> {
-    if (!parentId) {
-      return [];
-    }
-
-    const parent = await MongoDatasetCollection.findOne({ _id: parentId }, 'name parentId');
-
-    if (!parent) return [];
-
-    const paths = await find(parent.parentId);
-    paths.push({ parentId, parentName: parent.name });
-
-    return paths;
-  }
-
-  return await find(parentId);
-}
-
 export function getCollectionUpdateTime({ name, time }: { time?: Date; name: string }) {
   if (time) return time;
   if (name.startsWith('手动') || ['manual', 'mark'].includes(name)) return new Date('2999/9/9');

View File

@@ -37,7 +37,7 @@ export async function findDatasetAndAllChildren({
     return datasets;
   };

   const [dataset, childDatasets] = await Promise.all([
-    MongoDataset.findById(datasetId),
+    MongoDataset.findById(datasetId).lean(),
     find(datasetId)
   ]);
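The only change here is .lean(): it tells mongoose to return a plain JavaScript object instead of a full Document instance, skipping hydration, change tracking, and instance methods — a safe win whenever the result is only read. A quick contrast, reusing the query from the hunk:

// Hydrated mongoose document: supports .save(), tracks modifications, heavier.
const doc = await MongoDataset.findById(datasetId);

// Plain object via .lean(): cheaper to construct, but has no document methods.
const plain = await MongoDataset.findById(datasetId).lean();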

View File

@@ -77,27 +77,27 @@ const DatasetDataSchema = new Schema({
   rebuilding: Boolean
 });

-try {
-  // list collection and count data; list data; delete collection(relate data)
-  DatasetDataSchema.index(
-    { teamId: 1, datasetId: 1, collectionId: 1, chunkIndex: 1, updateTime: -1 },
-    { background: true }
-  );
-  // full text index
-  DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' }, { background: true });
-  // Recall vectors after data matching
-  DatasetDataSchema.index(
-    { teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 },
-    { background: true }
-  );
-  DatasetDataSchema.index({ updateTime: 1 }, { background: true });
-  // rebuild data
-  DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 }, { background: true });
-} catch (error) {
-  console.log(error);
-}
 export const MongoDatasetData: Model<DatasetDataSchemaType> =
   models[DatasetDataCollectionName] || model(DatasetDataCollectionName, DatasetDataSchema);
-MongoDatasetData.syncIndexes();
+try {
+  // list collection and count data; list data; delete collection(relate data)
+  DatasetDataSchema.index({
+    teamId: 1,
+    datasetId: 1,
+    collectionId: 1,
+    chunkIndex: 1,
+    updateTime: -1
+  });
+  // full text index
+  DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
+  // Recall vectors after data matching
+  DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
+  DatasetDataSchema.index({ updateTime: 1 });
+  // rebuild data
+  DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
+  MongoDatasetData.syncIndexes({ background: true });
+} catch (error) {
+  console.log(error);
+}
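Among the declarations above, { teamId: 1, datasetId: 1, fullTextToken: 'text' } is a compound index ending in a MongoDB text index, which is what makes the $text / textScore full-text recall in the search controller (further down in this diff) possible. A usage sketch matching the pre-change find-based recall, assuming teamId, datasetId, and query are in scope, and with jiebaSplit pre-segmenting the query into space-separated terms:

// $text only works when a 'text' index covers the searched field.
const results = await MongoDatasetData.find(
  {
    teamId,
    datasetId,
    $text: { $search: jiebaSplit({ text: query }) }
  },
  { score: { $meta: 'textScore' } } // relevance score produced by the text index
)
  .sort({ score: { $meta: 'textScore' } })
  .limit(60)
  .lean();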

View File

@@ -74,11 +74,6 @@ const DatasetSchema = new Schema({
     type: String,
     default: ''
   },
-  permission: {
-    type: String,
-    enum: Object.keys(PermissionTypeMap),
-    default: PermissionTypeEnum.private
-  },
   websiteConfig: {
     type: {
       url: {

View File

@@ -12,13 +12,14 @@ import {
   DatasetDataWithCollectionType,
   SearchDataResponseItemType
 } from '@fastgpt/global/core/dataset/type';
-import { MongoDatasetCollection } from '../collection/schema';
+import { DatasetColCollectionName, MongoDatasetCollection } from '../collection/schema';
 import { reRankRecall } from '../../../core/ai/rerank';
 import { countPromptTokens } from '../../../common/string/tiktoken/index';
 import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { jiebaSplit } from '../../../common/string/jieba';
 import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
+import { Types } from '../../../common/mongo';

 type SearchDatasetDataProps = {
   teamId: string;
@@ -50,9 +51,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
   usingReRank = usingReRank && global.reRankModels.length > 0;

-  // Compatible with topk limit
-  if (maxTokens < 50) {
-    maxTokens = 1500;
-  }
   let set = new Set<string>();
   let usingSimilarityFilter = false;
@@ -75,7 +73,29 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       fullTextLimit: 60
     };
   };
-  const embeddingRecall = async ({ query, limit }: { query: string; limit: number }) => {
+  const getForbidData = async () => {
+    const collections = await MongoDatasetCollection.find(
+      {
+        teamId,
+        datasetId: { $in: datasetIds },
+        forbid: true
+      },
+      '_id'
+    );
+    return {
+      forbidCollectionIdList: collections.map((item) => String(item._id))
+    };
+  };
+
+  const embeddingRecall = async ({
+    query,
+    limit,
+    forbidCollectionIdList
+  }: {
+    query: string;
+    limit: number;
+    forbidCollectionIdList: string[];
+  }) => {
     const { vectors, tokens } = await getVectorsByText({
       model: getVectorModel(model),
       input: query,
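Two small details in getForbidData keep the extra query cheap: the string '_id' second argument is a projection, so only ids are returned, and each ObjectId is normalized with String() so the vector layer can compare it against stored collection ids. Restated with comments (same code, assuming the surrounding searchDatasetData scope):

const collections = await MongoDatasetCollection.find(
  {
    teamId,
    datasetId: { $in: datasetIds },
    forbid: true // only collections that have been switched off
  },
  '_id' // projection: fetch nothing but the id
);
const forbidCollectionIdList = collections.map((item) => String(item._id));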
@@ -86,7 +106,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       teamId,
       datasetIds,
       vector: vectors[0],
-      limit
+      limit,
+      forbidCollectionIdList
     });
@@ -161,27 +182,66 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
     let searchResults = (
       await Promise.all(
-        datasetIds.map((id) =>
-          MongoDatasetData.find(
-            {
-              teamId,
-              datasetId: id,
-              $text: { $search: jiebaSplit({ text: query }) }
-            },
-            {
-              score: { $meta: 'textScore' },
-              _id: 1,
-              datasetId: 1,
-              collectionId: 1,
-              q: 1,
-              a: 1,
-              chunkIndex: 1
-            }
-          )
-            .sort({ score: { $meta: 'textScore' } })
-            .limit(limit)
-            .lean()
-        )
+        datasetIds.map(async (id) => {
+          return MongoDatasetData.aggregate([
+            {
+              $match: {
+                teamId: new Types.ObjectId(teamId),
+                datasetId: new Types.ObjectId(id),
+                $text: { $search: jiebaSplit({ text: query }) }
+              }
+            },
+            {
+              $addFields: {
+                score: { $meta: 'textScore' }
+              }
+            },
+            {
+              $sort: {
+                score: { $meta: 'textScore' }
+              }
+            },
+            {
+              $limit: limit
+            },
+            {
+              $lookup: {
+                from: DatasetColCollectionName,
+                let: { collectionId: '$collectionId' },
+                pipeline: [
+                  {
+                    $match: {
+                      $expr: { $eq: ['$_id', '$$collectionId'] },
+                      forbid: { $eq: false } // filter directly in the $lookup stage
+                    }
+                  },
+                  {
+                    $project: {
+                      _id: 1 // only the _id field is needed to confirm the match
+                    }
+                  }
+                ],
+                as: 'collection'
+              }
+            },
+            {
+              $match: {
+                collection: { $ne: [] }
+              }
+            },
+            {
+              $project: {
+                _id: 1,
+                datasetId: 1,
+                collectionId: 1,
+                q: 1,
+                a: 1,
+                chunkIndex: 1,
+                score: 1
+              }
+            }
+          ]);
+        })
       )
     ).flat() as (DatasetDataSchemaType & { score: number })[];
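The $lookup above doubles as a filter: each data row is joined to its parent collection, the inner pipeline only keeps the collection when forbid is false, and the subsequent $match: { collection: { $ne: [] } } drops every row whose join came back empty. A generic sketch of the pattern against hypothetical items and parents collections; note that forbid: { $ne: true } would additionally admit documents created before the flag existed, whereas { $eq: false } relies on the schema default having been written:

const pipeline = [
  // $text must be the first stage of the pipeline
  { $match: { $text: { $search: 'some query terms' } } },
  { $addFields: { score: { $meta: 'textScore' } } },
  { $sort: { score: { $meta: 'textScore' } } },
  { $limit: 60 },
  {
    $lookup: {
      from: 'parents',
      let: { parentId: '$parentId' },
      pipeline: [
        // keep the parent only when it is not forbidden
        { $match: { $expr: { $eq: ['$_id', '$$parentId'] }, forbid: { $ne: true } } },
        { $project: { _id: 1 } } // existence is all that matters
      ],
      as: 'parent'
    }
  },
  // an empty join result means the parent is missing or forbidden: drop the row
  { $match: { parent: { $ne: [] } } }
];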
@@ -255,27 +315,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       return [];
     }
   };
-  const filterResultsByMaxTokens = async (
-    list: SearchDataResponseItemType[],
-    maxTokens: number
-  ) => {
-    const results: SearchDataResponseItemType[] = [];
-    let totalTokens = 0;
-
-    for await (const item of list) {
-      totalTokens += await countPromptTokens(item.q + item.a);
-
-      if (totalTokens > maxTokens + 500) {
-        break;
-      }
-      results.push(item);
-      if (totalTokens > maxTokens) {
-        break;
-      }
-    }
-
-    return results.length === 0 ? list.slice(0, 1) : results;
-  };
   const multiQueryRecall = async ({
     embeddingLimit,
     fullTextLimit
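The helper deleted here is not gone: the same two-threshold token cut-off reappears inlined as filterMaxTokensResult in the last hunk of this file. The logic: accumulate countPromptTokens(q + a) per item, stop once the running total passes maxTokens (the crossing item is kept), hard-stop and drop the item when the total overshoots by more than 500 tokens, and always return at least one result. The same logic as a standalone sketch, assuming the async countPromptTokens counter and the SearchDataResponseItemType import from the surrounding module:

const filterByTokenBudget = async (list: SearchDataResponseItemType[], maxTokens: number) => {
  const results: SearchDataResponseItemType[] = [];
  let totalTokens = 0;

  for (const item of list) {
    totalTokens += await countPromptTokens(item.q + item.a);
    if (totalTokens > maxTokens + 500) break; // far over budget: drop this item too
    results.push(item);
    if (totalTokens > maxTokens) break; // over budget: keep this item, then stop
  }

  // never return an empty result set: fall back to the single best item
  return results.length === 0 ? list.slice(0, 1) : results;
};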
@@ -288,12 +327,15 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
     const fullTextRecallResList: SearchDataResponseItemType[][] = [];
     let totalTokens = 0;

+    const { forbidCollectionIdList } = await getForbidData();
+
     await Promise.all(
       queries.map(async (query) => {
         const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
           embeddingRecall({
             query,
-            limit: embeddingLimit
+            limit: embeddingLimit,
+            forbidCollectionIdList
           }),
           fullTextRecall({
             query,
@@ -397,8 +439,28 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
return filterSameDataResults;
})();
// token filter
const filterMaxTokensResult = await (async () => {
const results: SearchDataResponseItemType[] = [];
let totalTokens = 0;
for await (const item of scoreFilter) {
totalTokens += await countPromptTokens(item.q + item.a);
if (totalTokens > maxTokens + 500) {
break;
}
results.push(item);
if (totalTokens > maxTokens) {
break;
}
}
return results.length === 0 ? scoreFilter.slice(0, 1) : results;
})();
return {
searchRes: await filterResultsByMaxTokens(scoreFilter, maxTokens),
searchRes: filterMaxTokensResult,
tokens,
searchMode,
limit: maxTokens,