mirror of https://github.com/labring/FastGPT.git
4.8.6 merge (#1943)
* Dataset collection forbid (#1885)
* perf: tool call support same id
* feat: collection forbid
* feat: collection forbid
* Inheritance Permission for apps (#1897)
* feat: app schema define; chore: references of authapp
* feat: authApp method inheritance
* feat: create and update api
* feat: update
* feat: inheritance Permission controller for app
* feat: abstract version of inheritPermission
* feat: ancestorId for apps
* chore: update app
* fix: inheritPermission abstract version
* feat: update folder defaultPermission
* feat: app update api
* chore: inheritance frontend
* chore: app list api
* feat: update defaultPermission in app detail
* feat: backend api finished
* feat: app inheritance permission fe
* fix: app update defaultPermission causes collaborator miss
* fix: ts error
* chore: adjust the codes
* chore: i18n
* chore: fe adjust and i18n
* chore: adjust the code
* feat: resume api; chore: rewrite update api and inheritPermission methods
* chore: something
* chore: fe code adjusting
* feat: frontend adjusting
* chore: fe code adjusting
* chore: adjusting the code
* perf: fe loading
* format
* Inheritance fix (#1908)
* fix: SlideCard
* fix: authapp did not return parent app for inheritance app
* fix: fe adjusting
* feat: fe adjusting
* perf: inherit per ux
* doc
* fix: ts errors (#1916)
* perf: inherit permission
* fix: permission inherit
* Workflow type (#1938)
* perf: workflow type; tmp workflow; feat: custom field config
* perf: dynamic input
* perf: node classify
* perf: node classify
* perf: node classify
* perf: node classify
* fix: workflow custom input
* feat: text editor and customFeedback move to basic nodes
* feat: community system plugin
* fix: ts
* feat: exprEval plugin
* perf: workflow type
* perf: plugin important
* fix: default templates
* perf: markdown hr css
* lock
* perf: fetch url
* perf: new plugin version
* fix: chat histories update
* fix: collection paths invalid
* perf: app card ui

---------

Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
@@ -48,12 +48,15 @@ const DatasetCollectionSchema = new Schema({
     type: Date,
     default: () => new Date()
   },
+  forbid: {
+    type: Boolean,
+    default: false
+  },
 
   // chunk filed
   trainingType: {
     type: String,
-    enum: Object.keys(TrainingTypeMap),
-    required: true
+    enum: Object.keys(TrainingTypeMap)
   },
   chunkSize: {
     type: Number,
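The new forbid flag defaults to false, so existing collections keep working unchanged until something flips it. A minimal sketch of such a toggle (hypothetical helper, not part of this commit):

    // Hypothetical helper, not in this diff: flip the forbid flag on one collection.
    import { MongoDatasetCollection } from './schema';

    export async function setCollectionForbid(collectionId: string, forbid: boolean) {
      // Only the flag changes; search-time filtering reads it via getForbidData() below.
      await MongoDatasetCollection.updateOne({ _id: collectionId }, { $set: { forbid } });
    }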
@@ -91,23 +94,25 @@ const DatasetCollectionSchema = new Schema({
   }
 });
 
+export const MongoDatasetCollection: Model<DatasetCollectionSchemaType> =
+  models[DatasetColCollectionName] || model(DatasetColCollectionName, DatasetCollectionSchema);
+
 try {
   // auth file
-  DatasetCollectionSchema.index({ teamId: 1, fileId: 1 }, { background: true });
+  DatasetCollectionSchema.index({ teamId: 1, fileId: 1 });
 
   // list collection; deep find collections
-  DatasetCollectionSchema.index(
-    {
-      teamId: 1,
-      datasetId: 1,
-      parentId: 1,
-      updateTime: -1
-    },
-    { background: true }
-  );
+  DatasetCollectionSchema.index({
+    teamId: 1,
+    datasetId: 1,
+    parentId: 1,
+    updateTime: -1
+  });
+
+  // get forbid
+  // DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, forbid: 1 });
+
+  MongoDatasetCollection.syncIndexes({ background: true });
 } catch (error) {
   console.log(error);
 }
-
-export const MongoDatasetCollection: Model<DatasetCollectionSchemaType> =
-  models[DatasetColCollectionName] || model(DatasetColCollectionName, DatasetCollectionSchema);
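Two changes here: the model is registered before the index block so syncIndexes can run inside the try/catch, and the per-index { background: true } options move into the single syncIndexes call. syncIndexes diffs the indexes declared on the schema against the live collection, building missing ones and dropping undeclared ones. A standalone sketch of the same pattern (hypothetical model name):

    import { Schema, model } from 'mongoose';

    const DemoSchema = new Schema({ teamId: Schema.Types.ObjectId, updateTime: Date });
    DemoSchema.index({ teamId: 1, updateTime: -1 }); // declared with no per-index options

    const Demo = model('demo', DemoSchema);
    // Build missing indexes (in the background) and drop ones no longer declared.
    Demo.syncIndexes({ background: true });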
@@ -53,29 +53,6 @@ export async function findCollectionAndChild({
   return [collection, ...childCollections];
 }
 
-export async function getDatasetCollectionPaths({
-  parentId = ''
-}: {
-  parentId?: string;
-}): Promise<ParentTreePathItemType[]> {
-  async function find(parentId?: string): Promise<ParentTreePathItemType[]> {
-    if (!parentId) {
-      return [];
-    }
-
-    const parent = await MongoDatasetCollection.findOne({ _id: parentId }, 'name parentId');
-
-    if (!parent) return [];
-
-    const paths = await find(parent.parentId);
-    paths.push({ parentId, parentName: parent.name });
-
-    return paths;
-  }
-
-  return await find(parentId);
-}
-
 export function getCollectionUpdateTime({ name, time }: { time?: Date; name: string }) {
   if (time) return time;
   if (name.startsWith('手动') || ['manual', 'mark'].includes(name)) return new Date('2999/9/9');
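The removed helper resolved ancestors before pushing the current node, so the breadcrumb path came back root-first; per the commit message ("fix: collection paths invalid") the lookup was reworked elsewhere. A self-contained re-creation of the recursion over in-memory data (hypothetical values):

    // In-memory re-creation of the removed recursion (hypothetical data, no DB):
    type PathItem = { parentId: string; parentName: string };
    type Node = { _id: string; name: string; parentId?: string };

    const nodes: Record<string, Node> = {
      root: { _id: 'root', name: 'root' },
      a: { _id: 'a', name: 'folderA', parentId: 'root' },
      b: { _id: 'b', name: 'folderB', parentId: 'a' }
    };

    function findPath(parentId?: string): PathItem[] {
      if (!parentId) return [];
      const parent = nodes[parentId];
      if (!parent) return [];
      // Ancestors resolve first, then the current node is appended: root comes out first.
      return [...findPath(parent.parentId), { parentId, parentName: parent.name }];
    }

    console.log(findPath('b')); // [root, folderA, folderB] — root-first breadcrumb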
@@ -37,7 +37,7 @@ export async function findDatasetAndAllChildren({
     return datasets;
   };
   const [dataset, childDatasets] = await Promise.all([
-    MongoDataset.findById(datasetId),
+    MongoDataset.findById(datasetId).lean(),
     find(datasetId)
   ]);
 
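.lean() makes the query return a plain JavaScript object instead of a hydrated Mongoose document — cheaper, and sufficient here since the dataset is only read. Illustration (sketch only, inside an async function, identifiers from the diff):

    const doc = await MongoDataset.findById(datasetId);          // hydrated document, has .save()
    const plain = await MongoDataset.findById(datasetId).lean(); // plain object, no document methods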
@@ -77,27 +77,27 @@ const DatasetDataSchema = new Schema({
   rebuilding: Boolean
 });
 
-try {
-  // list collection and count data; list data; delete collection(relate data)
-  DatasetDataSchema.index(
-    { teamId: 1, datasetId: 1, collectionId: 1, chunkIndex: 1, updateTime: -1 },
-    { background: true }
-  );
-  // full text index
-  DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' }, { background: true });
-  // Recall vectors after data matching
-  DatasetDataSchema.index(
-    { teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 },
-    { background: true }
-  );
-  DatasetDataSchema.index({ updateTime: 1 }, { background: true });
-  // rebuild data
-  DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 }, { background: true });
-} catch (error) {
-  console.log(error);
-}
-
 export const MongoDatasetData: Model<DatasetDataSchemaType> =
   models[DatasetDataCollectionName] || model(DatasetDataCollectionName, DatasetDataSchema);
 
-MongoDatasetData.syncIndexes();
+try {
+  // list collection and count data; list data; delete collection(relate data)
+  DatasetDataSchema.index({
+    teamId: 1,
+    datasetId: 1,
+    collectionId: 1,
+    chunkIndex: 1,
+    updateTime: -1
+  });
+  // full text index
+  DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
+  // Recall vectors after data matching
+  DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
+  DatasetDataSchema.index({ updateTime: 1 });
+  // rebuild data
+  DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
+
+  MongoDatasetData.syncIndexes({ background: true });
+} catch (error) {
+  console.log(error);
+}
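MongoDB allows only one text index per collection, so fullTextToken shares its index with the teamId/datasetId prefix; the $text stage in the search controller below resolves against it. A small sanity-check sketch (hypothetical script, inside an async function):

    // List the indexes actually present once syncIndexes has run.
    const indexes = await MongoDatasetData.collection.indexes();
    console.log(indexes.map((idx) => idx.name));
    // Expect one *_text entry: the compound teamId/datasetId/fullTextToken index.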
@@ -74,11 +74,6 @@ const DatasetSchema = new Schema({
     type: String,
     default: ''
   },
-  permission: {
-    type: String,
-    enum: Object.keys(PermissionTypeMap),
-    default: PermissionTypeEnum.private
-  },
   websiteConfig: {
     type: {
       url: {
@@ -12,13 +12,14 @@ import {
   DatasetDataWithCollectionType,
   SearchDataResponseItemType
 } from '@fastgpt/global/core/dataset/type';
-import { MongoDatasetCollection } from '../collection/schema';
+import { DatasetColCollectionName, MongoDatasetCollection } from '../collection/schema';
 import { reRankRecall } from '../../../core/ai/rerank';
 import { countPromptTokens } from '../../../common/string/tiktoken/index';
 import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { jiebaSplit } from '../../../common/string/jieba';
 import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
+import { Types } from '../../../common/mongo';
 
 type SearchDatasetDataProps = {
   teamId: string;
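The new Types import matters because, unlike find(), aggregate() bypasses Mongoose's schema casting: a string id inside a $match stage is compared as a raw string against stored ObjectIds and silently matches nothing. Sketch of the pitfall (hypothetical id value):

    import { Types } from 'mongoose';

    const teamId = '0123456789abcdef01234567'; // hypothetical 24-hex id string
    // Inside aggregate(), { $match: { teamId } } would compare the raw string and miss.
    const match = { $match: { teamId: new Types.ObjectId(teamId) } }; // correct: cast explicitly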
@@ -50,9 +51,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
   usingReRank = usingReRank && global.reRankModels.length > 0;
 
-  // Compatible with topk limit
-  if (maxTokens < 50) {
-    maxTokens = 1500;
-  }
   let set = new Set<string>();
   let usingSimilarityFilter = false;
 
@@ -75,7 +73,29 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       fullTextLimit: 60
     };
   };
-  const embeddingRecall = async ({ query, limit }: { query: string; limit: number }) => {
+  const getForbidData = async () => {
+    const collections = await MongoDatasetCollection.find(
+      {
+        teamId,
+        datasetId: { $in: datasetIds },
+        forbid: true
+      },
+      '_id'
+    );
+
+    return {
+      forbidCollectionIdList: collections.map((item) => String(item._id))
+    };
+  };
+
+  const embeddingRecall = async ({
+    query,
+    limit,
+    forbidCollectionIdList
+  }: {
+    query: string;
+    limit: number;
+    forbidCollectionIdList: string[];
+  }) => {
     const { vectors, tokens } = await getVectorsByText({
       model: getVectorModel(model),
       input: query,
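getForbidData runs a single query covering every dataset being searched and reduces the result to a list of collection-id strings, which the recall paths then use to exclude forbidden collections. A self-contained sketch of that consumption (assumed shapes, not FastGPT's actual vector-store code):

    type RecallItem = { collectionId: string; score: number };

    function dropForbidden(items: RecallItem[], forbidCollectionIdList: string[]) {
      const forbidden = new Set(forbidCollectionIdList); // O(1) membership tests
      return items.filter((item) => !forbidden.has(item.collectionId));
    }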
@@ -86,7 +106,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       teamId,
       datasetIds,
       vector: vectors[0],
-      limit
+      limit,
+      forbidCollectionIdList
     });
 
     // get q and a
@@ -161,27 +182,66 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
 
     let searchResults = (
       await Promise.all(
-        datasetIds.map((id) =>
-          MongoDatasetData.find(
-            {
-              teamId,
-              datasetId: id,
-              $text: { $search: jiebaSplit({ text: query }) }
-            },
-            {
-              score: { $meta: 'textScore' },
-              _id: 1,
-              datasetId: 1,
-              collectionId: 1,
-              q: 1,
-              a: 1,
-              chunkIndex: 1
-            }
-          )
-            .sort({ score: { $meta: 'textScore' } })
-            .limit(limit)
-            .lean()
-        )
+        datasetIds.map(async (id) => {
+          return MongoDatasetData.aggregate([
+            {
+              $match: {
+                teamId: new Types.ObjectId(teamId),
+                datasetId: new Types.ObjectId(id),
+                $text: { $search: jiebaSplit({ text: query }) }
+              }
+            },
+            {
+              $addFields: {
+                score: { $meta: 'textScore' }
+              }
+            },
+            {
+              $sort: {
+                score: { $meta: 'textScore' }
+              }
+            },
+            {
+              $limit: limit
+            },
+            {
+              $lookup: {
+                from: DatasetColCollectionName,
+                let: { collectionId: '$collectionId' },
+                pipeline: [
+                  {
+                    $match: {
+                      $expr: { $eq: ['$_id', '$$collectionId'] },
+                      forbid: { $eq: false } // filter directly in the $lookup stage
+                    }
+                  },
+                  {
+                    $project: {
+                      _id: 1 // only _id is needed to confirm the match
+                    }
+                  }
+                ],
+                as: 'collection'
+              }
+            },
+            {
+              $match: {
+                collection: { $ne: [] }
+              }
+            },
+            {
+              $project: {
+                _id: 1,
+                datasetId: 1,
+                collectionId: 1,
+                q: 1,
+                a: 1,
+                chunkIndex: 1,
+                score: 1
+              }
+            }
+          ]);
+        })
       )
     ).flat() as (DatasetDataSchemaType & { score: number })[];
 
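The full-text recall switches from find() to an aggregation so the forbid check can ride along: $lookup emits an array per row, and the following $match: { collection: { $ne: [] } } keeps only rows whose parent collection exists with forbid set to false — effectively an inner join. Note the $match containing $text stays first in the pipeline, as MongoDB requires. A stripped-down sketch of the join trick (physical collection name assumed):

    const rows = await MongoDatasetData.aggregate([
      { $match: { $text: { $search: 'example' } } }, // $text must lead the pipeline
      {
        $lookup: {
          from: 'dataset_collections', // assumed physical collection name
          localField: 'collectionId',
          foreignField: '_id',
          as: 'collection'
        }
      },
      // Rows whose lookup came back empty have no surviving parent — drop them (inner join).
      { $match: { collection: { $ne: [] } } }
    ]);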
@@ -255,27 +315,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       return [];
     }
   };
-  const filterResultsByMaxTokens = async (
-    list: SearchDataResponseItemType[],
-    maxTokens: number
-  ) => {
-    const results: SearchDataResponseItemType[] = [];
-    let totalTokens = 0;
-
-    for await (const item of list) {
-      totalTokens += await countPromptTokens(item.q + item.a);
-
-      if (totalTokens > maxTokens + 500) {
-        break;
-      }
-      results.push(item);
-      if (totalTokens > maxTokens) {
-        break;
-      }
-    }
-
-    return results.length === 0 ? list.slice(0, 1) : results;
-  };
   const multiQueryRecall = async ({
     embeddingLimit,
     fullTextLimit
@@ -288,12 +327,15 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
     const fullTextRecallResList: SearchDataResponseItemType[][] = [];
     let totalTokens = 0;
 
+    const { forbidCollectionIdList } = await getForbidData();
+
     await Promise.all(
       queries.map(async (query) => {
         const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
          embeddingRecall({
            query,
-            limit: embeddingLimit
+            limit: embeddingLimit,
+            forbidCollectionIdList
          }),
          fullTextRecall({
            query,
@@ -397,8 +439,28 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
     return filterSameDataResults;
   })();
 
+  // token filter
+  const filterMaxTokensResult = await (async () => {
+    const results: SearchDataResponseItemType[] = [];
+    let totalTokens = 0;
+
+    for await (const item of scoreFilter) {
+      totalTokens += await countPromptTokens(item.q + item.a);
+
+      if (totalTokens > maxTokens + 500) {
+        break;
+      }
+      results.push(item);
+      if (totalTokens > maxTokens) {
+        break;
+      }
+    }
+
+    return results.length === 0 ? scoreFilter.slice(0, 1) : results;
+  })();
+
   return {
-    searchRes: await filterResultsByMaxTokens(scoreFilter, maxTokens),
+    searchRes: filterMaxTokensResult,
     tokens,
     searchMode,
     limit: maxTokens,
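The filterResultsByMaxTokens helper removed above reappears here inlined as filterMaxTokensResult, with the same budget rule: accumulate items, keep the item that first crosses maxTokens unless the running total overshoots by more than 500 tokens, and always return at least one item. A standalone restatement of that rule (stand-in token counter):

    async function takeWithinBudget<T extends { q: string; a: string }>(
      list: T[],
      maxTokens: number,
      countTokens: (text: string) => Promise<number> // stand-in for countPromptTokens
    ): Promise<T[]> {
      const results: T[] = [];
      let total = 0;
      for (const item of list) {
        total += await countTokens(item.q + item.a);
        if (total > maxTokens + 500) break; // far over budget: stop before keeping it
        results.push(item);
        if (total > maxTokens) break; // slightly over: keep this item, then stop
      }
      return results.length === 0 ? list.slice(0, 1) : results;
    }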