Mirror of https://github.com/labring/FastGPT.git
V4.8.18 feature (#3565)
* feat: org CRUD (#3380)
  * feat: add org schema
  * feat: org manage UI
  * feat: OrgInfoModal
  * feat: org tree view
  * feat: org management
  * fix: init root org
  * feat: org permission for app
  * feat: org support for dataset
  * fix: disable org role control
  * styles: opt type signatures
  * fix: remove unused permission
  * feat: delete org collaborator
* perf: Team org ui (#3499)
  * perf: org ui
  * perf: org ui
* feat: org auth for app & dataset (#3498)
  * feat: auth org resource permission
  * feat: org auth support for app & dataset
* perf: org permission check (#3500)
* i18n (#3501)
  * name
  * i18n
* feat: support dataset changeOwner (#3483)
  * feat: support dataset changeOwner
  * chore: update dataset change owner api
* feat: permission manage UI for org (#3503)
* perf: password check; perf: image upload check; perf: sso login check (#3509)
  * perf: password check
  * perf: image upload check
  * perf: sso login check
* force show update notification modal & fix login page text (#3512)
  * fix login page English text
  * update notification modal
* perf: notify account (#3515)
* perf(plugin): improve searXNG empty result handling and documentation (#3507)
  * perf(plugin): improve searXNG empty result handling and documentation
  * updated the docs and the code's feedback for empty search results
* refactor: org pathId (#3516)
* optimize payment process (#3517)
* feat: support wecom sso (#3518)
  * feat: support wecom sso
  * chore: remove unused wecom js-sdk dependency
* fix qrcode script (#3520)
  * fix qrcode script
  * i18n
* perf: full text collection and search code; perf: rename function (#3519)
  * perf: full text collection and search code
  * perf: rename function
  * perf: notify modal
  * remove invalid code
  * perf: sso login
  * perf: pay process
* 4.8.18 test (#3524)
  * perf: remove local token
  * perf: index
* perf: file encoding; perf: leave team code; @c121914yu perf: full text search code (#3528)
  * perf: text encoding
  * perf: leave team code
  * perf: full text search code
  * fix: http status
  * perf: embedding search and vector avatar
* perf: async read file (#3531)
* refactor: team permission manager (#3535)
  * perf: classify org, group and member
  * refactor: team per manager
  * fix: missing functions
* 4.8.18 test (#3543)
  * perf: login check
  * doc
  * perf: llm model config
  * perf: team clb config
* fix: MemberModal UI (#3553)
  * fix: adapt MemberModal title and icon
  * fix: adapt member modal
  * fix: search input placeholder
  * fix: add button text
* perf: org permission (#3556)
* docs: supplement the official user-FAQ documentation (#3540)
  * docs: supplement the official user-FAQ documentation
  * patch the Q&A answer content
* share link random avatar (#3541)
  * share link random avatar
  * fix
  * delete unused code
* share page avatar (#3558)
  * feat: init 4818
  * share page avatar
* feat: tmp upgrade code (#3559)
  * feat: tmp upgrade code
  * fulltext search test
  * update action
* full text tmp code (#3561)
  * full text tmp code
  * fix: init
  * fix: init
  * remove tmp code
  * remove tmp code
  * 4818-alpha
* 4.8.18 test (#3562)
  * full text tmp code
  * fix: init
  * upgrade code
  * account log
  * account log
  * perf: dockerfile
  * upgrade code
* chore: update docs app template submission (#3564)

Co-authored-by: a.e. <49438478+I-Info@users.noreply.github.com>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Jiangween <145003935+Jiangween@users.noreply.github.com>
@@ -24,6 +24,7 @@ import { pushDataListToTrainingQueue } from '../training/controller';
 import { MongoImage } from '../../../common/file/image/schema';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { addDays } from 'date-fns';
+import { MongoDatasetDataText } from '../data/dataTextSchema';

 export const createCollectionAndInsertData = async ({
   dataset,
@@ -240,12 +241,12 @@ export const delCollectionRelatedSource = async ({
     .map((item) => item?.metadata?.relatedImgId || '')
     .filter(Boolean);

-  // delete files
+  // Delete files
   await delFileByFileIdList({
     bucketName: BucketNameEnum.dataset,
     fileIdList
   });
-  // delete images
+  // Delete images
   await delImgByRelatedId({
     teamId,
     relateIds: relatedImageIds,
@@ -273,7 +274,7 @@ export async function delCollection({
   const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
   const collectionIds = collections.map((item) => String(item._id));

-  // delete training data
+  // Delete training data
   await MongoDatasetTraining.deleteMany({
     teamId,
     datasetIds: { $in: datasetIds },
@@ -285,11 +286,16 @@
     await delCollectionRelatedSource({ collections, session });
   }

-  // delete dataset.datas
+  // Delete dataset_datas
   await MongoDatasetData.deleteMany(
     { teamId, datasetIds: { $in: datasetIds }, collectionId: { $in: collectionIds } },
     { session }
   );
+  // Delete dataset_data_texts
+  await MongoDatasetDataText.deleteMany(
+    { teamId, datasetIds: { $in: datasetIds }, collectionId: { $in: collectionIds } },
+    { session }
+  );

   // delete collections
   await MongoDatasetCollection.deleteMany(
@@ -6,6 +6,7 @@ import { ClientSession } from '../../common/mongo';
 import { MongoDatasetTraining } from './training/schema';
 import { MongoDatasetData } from './data/schema';
 import { deleteDatasetDataVector } from '../../common/vectorStore/controller';
+import { MongoDatasetDataText } from './data/dataTextSchema';

 /* ============= dataset ========== */
 /* find all datasetId by top datasetId */
@@ -92,7 +93,7 @@ export async function delDatasetRelevantData({
     { session }
   ).lean();

-  // image and file
+  // Delete Image and file
   await delCollectionRelatedSource({ collections, session });

   // delete collections
@@ -101,9 +102,15 @@
     datasetId: { $in: datasetIds }
   }).session(session);

-  // delete dataset.datas(Not need session)
+  // No session delete:
+  // Delete dataset_data_texts
+  await MongoDatasetDataText.deleteMany({
+    teamId,
+    datasetId: { $in: datasetIds }
+  });
+  // delete dataset_datas
   await MongoDatasetData.deleteMany({ teamId, datasetId: { $in: datasetIds } });

-  // no session delete: delete files, vector data
+  // Delete vector data
   await deleteDatasetDataVector({ teamId, datasetIds });
 }
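Note on the deletion order in the two hunks above: the records that must stay mutually consistent (training queue, collections) are removed inside the Mongo session, while the bulky `dataset_datas`, `dataset_data_texts`, file, and vector deletes run outside it so a large dataset cannot exceed transaction limits. A minimal sketch of the wrapper such code typically runs under (the helper name is illustrative, not FastGPT's actual API):

```ts
import mongoose, { ClientSession } from 'mongoose';

// Hypothetical wrapper: run `fn` inside a transaction; callers then perform
// the non-transactional bulk deletes only after the commit succeeds.
const withMongoSession = async <T>(fn: (session: ClientSession) => Promise<T>) => {
  const session = await mongoose.startSession();
  try {
    return await session.withTransaction(() => fn(session));
  } finally {
    await session.endSession();
  }
};
```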
packages/service/core/dataset/data/dataTextSchema.ts (new file, 45 lines)
@@ -0,0 +1,45 @@
+import { connectionMongo, getMongoModel } from '../../../common/mongo';
+const { Schema } = connectionMongo;
+import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type.d';
+import { TeamCollectionName } from '@fastgpt/global/support/user/team/constant';
+import { DatasetCollectionName } from '../schema';
+import { DatasetColCollectionName } from '../collection/schema';
+import { DatasetDataCollectionName } from './schema';
+
+export const DatasetDataTextCollectionName = 'dataset_data_texts';
+
+const DatasetDataTextSchema = new Schema({
+  teamId: {
+    type: Schema.Types.ObjectId,
+    ref: TeamCollectionName,
+    required: true
+  },
+  datasetId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetCollectionName,
+    required: true
+  },
+  collectionId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetColCollectionName,
+    required: true
+  },
+  dataId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetDataCollectionName,
+    required: true
+  },
+  fullTextToken: String
+});
+
+try {
+  DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
+  DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
+} catch (error) {
+  console.log(error);
+}
+
+export const MongoDatasetDataText = getMongoModel<DatasetDataSchemaType>(
+  DatasetDataTextCollectionName,
+  DatasetDataTextSchema
+);
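The new collection holds exactly one tokenized row per `dataset_datas` record: the compound `(teamId, datasetId, fullTextToken: 'text')` index scopes MongoDB text search to a single team and dataset, and the unique `dataId` index makes writes and backfills idempotent. A hedged sketch of how a row could be kept in sync when a chunk is written (the upsert shape is illustrative, not the exact FastGPT insert path; `jiebaSplit` is the tokenizer used elsewhere in this diff):

```ts
// Pair a dataset_data_texts row with its dataset_datas record.
await MongoDatasetDataText.updateOne(
  { dataId: data._id }, // unique index -> at most one text row per data row
  {
    $set: {
      teamId: data.teamId,
      datasetId: data.datasetId,
      collectionId: data.collectionId,
      fullTextToken: jiebaSplit({ text: `${data.q}\n${data.a}` })
    }
  },
  { upsert: true }
);
```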
@@ -1,4 +1,4 @@
-import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
+import { connectionMongo, getMongoModel } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type.d';
 import {
@@ -39,10 +39,6 @@ const DatasetDataSchema = new Schema({
     type: String,
     default: ''
   },
-  fullTextToken: {
-    type: String,
-    default: ''
-  },
   indexes: {
     type: [
       {
@@ -71,17 +67,11 @@ const DatasetDataSchema = new Schema({
     type: Number,
     default: 0
   },
-  inited: {
-    type: Boolean
-  },
-  rebuilding: Boolean
-});
+  rebuilding: Boolean,

-DatasetDataSchema.virtual('collection', {
-  ref: DatasetColCollectionName,
-  localField: 'collectionId',
-  foreignField: '_id',
-  justOne: true
+  // Abandon
+  fullTextToken: String,
+  initFullText: Boolean
 });

 try {
@@ -93,13 +83,15 @@ try {
     chunkIndex: 1,
     updateTime: -1
   });
-  // full text index
-  DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
+  // FullText tmp full text index
+  // DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
   // Recall vectors after data matching
   DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
   DatasetDataSchema.index({ updateTime: 1 });
   // rebuild data
   DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
+
+  DatasetDataSchema.index({ initFullText: 1 });
 } catch (error) {
   console.log(error);
 }
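The old text index on `dataset_datas` is only commented out ("tmp"), and `fullTextToken`/`initFullText` are kept as abandoned fields, so the 4.8.18 upgrade can move tokens into `dataset_data_texts` before anything is dropped for good; the new `initFullText` index is there to let that backfill scan cheaply for unmigrated rows. A hedged sketch of what such a batch backfill could look like (the batching, filter, and flag handling are illustrative, not the shipped upgrade script):

```ts
// Move fullTextToken rows from dataset_datas into dataset_data_texts.
while (true) {
  const batch = await MongoDatasetData.find(
    { initFullText: { $exists: false } }, // served by the initFullText index
    '_id teamId datasetId collectionId fullTextToken'
  )
    .limit(1000)
    .lean();
  if (batch.length === 0) break;

  // ordered: false keeps inserting past duplicates on retried batches;
  // the unique dataId index guarantees at most one text row per data row.
  await MongoDatasetDataText.insertMany(
    batch.map((d) => ({
      teamId: d.teamId,
      datasetId: d.datasetId,
      collectionId: d.collectionId,
      dataId: d._id,
      fullTextToken: d.fullTextToken
    })),
    { ordered: false }
  ).catch(() => {}); // tolerate duplicate-key errors from reruns

  await MongoDatasetData.updateMany(
    { _id: { $in: batch.map((d) => d._id) } },
    { $set: { initFullText: true } }
  );
}
```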
@@ -8,8 +8,8 @@ import { getVectorsByText } from '../../ai/embedding';
 import { getVectorModel } from '../../ai/model';
 import { MongoDatasetData } from '../data/schema';
 import {
   DatasetCollectionSchemaType,
   DatasetDataSchemaType,
+  DatasetDataTextSchemaType,
   SearchDataResponseItemType
 } from '@fastgpt/global/core/dataset/type';
 import { MongoDatasetCollection } from '../collection/schema';
@@ -23,6 +23,7 @@ import { Types } from '../../../common/mongo';
 import json5 from 'json5';
 import { MongoDatasetCollectionTags } from '../tag/schema';
 import { readFromSecondary } from '../../../common/mongo/utils';
+import { MongoDatasetDataText } from '../data/dataTextSchema';

 type SearchDatasetDataProps = {
   teamId: string;
@@ -266,57 +267,60 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
     filterCollectionIdList
   });

-  // get q and a
-  const dataList = await MongoDatasetData.find(
-    {
-      teamId,
-      datasetId: { $in: datasetIds },
-      collectionId: { $in: Array.from(new Set(results.map((item) => item.collectionId))) },
-      'indexes.dataId': { $in: results.map((item) => item.id?.trim()) }
-    },
-    'datasetId collectionId updateTime q a chunkIndex indexes'
-  )
-    .populate<{ collection: DatasetCollectionSchemaType }>(
-      'collection',
-      'name fileId rawLink externalFileId externalFileUrl'
-    )
-    .lean();
-
-  // add score to data(It's already sorted. The first one is the one with the most points)
-  const concatResults = dataList.map((data) => {
-    const dataIdList = data.indexes.map((item) => item.dataId);
-
-    const maxScoreResult = results.find((item) => {
-      return dataIdList.includes(item.id);
-    });
-
-    return {
-      ...data,
-      score: maxScoreResult?.score || 0
-    };
-  });
-
-  concatResults.sort((a, b) => b.score - a.score);
-
-  const formatResult = concatResults.map((data, index) => {
-    if (!data.collectionId) {
-      console.log('Collection is not found', data);
-    }
-
-    const result: SearchDataResponseItemType = {
-      id: String(data._id),
-      updateTime: data.updateTime,
-      q: data.q,
-      a: data.a,
-      chunkIndex: data.chunkIndex,
-      datasetId: String(data.datasetId),
-      collectionId: String(data.collectionId),
-      ...getCollectionSourceData(data.collection),
-      score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
-    };
-
-    return result;
-  });
+  // Get data and collections
+  const collectionIdList = Array.from(new Set(results.map((item) => item.collectionId)));
+  const [dataList, collections] = await Promise.all([
+    MongoDatasetData.find(
+      {
+        teamId,
+        datasetId: { $in: datasetIds },
+        collectionId: { $in: collectionIdList },
+        'indexes.dataId': { $in: results.map((item) => item.id?.trim()) }
+      },
+      '_id datasetId collectionId updateTime q a chunkIndex indexes',
+      { ...readFromSecondary }
+    ).lean(),
+    MongoDatasetCollection.find(
+      {
+        _id: { $in: collectionIdList }
+      },
+      '_id name fileId rawLink externalFileId externalFileUrl',
+      { ...readFromSecondary }
+    ).lean()
+  ]);
+
+  const formatResult = results
+    .map((item, index) => {
+      const collection = collections.find((col) => String(col._id) === String(item.collectionId));
+      if (!collection) {
+        console.log('Collection is not found', item);
+        return;
+      }
+      const data = dataList.find((data) =>
+        data.indexes.some((index) => index.dataId === item.id)
+      );
+      if (!data) {
+        console.log('Data is not found', item);
+        return;
+      }
+
+      const score = item?.score || 0;
+
+      const result: SearchDataResponseItemType = {
+        id: String(data._id),
+        updateTime: data.updateTime,
+        q: data.q,
+        a: data.a,
+        chunkIndex: data.chunkIndex,
+        datasetId: String(data.datasetId),
+        collectionId: String(data.collectionId),
+        ...getCollectionSourceData(collection),
+        score: [{ type: SearchScoreTypeEnum.embedding, value: score, index }]
+      };
+
+      return result;
+    })
+    .filter(Boolean) as SearchDataResponseItemType[];

   return {
     embeddingRecallResults: formatResult,
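Two things change in the embedding recall above: the Mongoose `populate('collection')` join is replaced by an explicit parallel `MongoDatasetCollection.find`, and both reads are tagged with `readFromSecondary` so recall traffic can land on replica-set secondaries. A minimal sketch of what that helper plausibly contains (an assumption — the real one lives in `packages/service/common/mongo/utils` and is only imported here):

```ts
// Route heavy, staleness-tolerant reads away from the primary.
export const readFromSecondary = {
  readPreference: 'secondaryPreferred' as const // falls back to primary if no secondary
};
```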
@@ -344,88 +348,224 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
     };
   }

-  let searchResults = (
+  const searchResults = (
     await Promise.all(
       datasetIds.map(async (id) => {
-        return MongoDatasetData.aggregate([
-          {
-            $match: {
-              teamId: new Types.ObjectId(teamId),
-              datasetId: new Types.ObjectId(id),
-              $text: { $search: jiebaSplit({ text: query }) },
-              ...(filterCollectionIdList
-                ? {
-                    collectionId: {
-                      $in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
-                    }
-                  }
-                : {}),
-              ...(forbidCollectionIdList && forbidCollectionIdList.length > 0
-                ? {
-                    collectionId: {
-                      $nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id))
-                    }
-                  }
-                : {})
-            }
-          },
-          {
-            $addFields: {
-              score: { $meta: 'textScore' }
-            }
-          },
-          {
-            $sort: {
-              score: { $meta: 'textScore' }
-            }
-          },
-          {
-            $limit: limit
-          },
-          {
-            $project: {
-              _id: 1,
-              datasetId: 1,
-              collectionId: 1,
-              updateTime: 1,
-              q: 1,
-              a: 1,
-              chunkIndex: 1,
-              score: 1
-            }
-          }
-        ]);
+        return MongoDatasetData.aggregate(
+          [
+            {
+              $match: {
+                teamId: new Types.ObjectId(teamId),
+                datasetId: new Types.ObjectId(id),
+                $text: { $search: jiebaSplit({ text: query }) },
+                ...(filterCollectionIdList
+                  ? {
+                      collectionId: {
+                        $in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
+                      }
+                    }
+                  : {}),
+                ...(forbidCollectionIdList && forbidCollectionIdList.length > 0
+                  ? {
+                      collectionId: {
+                        $nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id))
+                      }
+                    }
+                  : {})
+              }
+            },
+            {
+              $sort: {
+                score: { $meta: 'textScore' }
+              }
+            },
+            {
+              $limit: limit
+            },
+            {
+              $project: {
+                _id: 1,
+                datasetId: 1,
+                collectionId: 1,
+                updateTime: 1,
+                q: 1,
+                a: 1,
+                chunkIndex: 1,
+                score: { $meta: 'textScore' }
+              }
+            }
+          ],
+          {
+            ...readFromSecondary
+          }
+        );
       })
     )
   ).flat() as (DatasetDataSchemaType & { score: number })[];

-  // resort
-  searchResults.sort((a, b) => b.score - a.score);
-  searchResults.slice(0, limit);
-
   // Get data and collections
   const collections = await MongoDatasetCollection.find(
     {
       _id: { $in: searchResults.map((item) => item.collectionId) }
     },
-    '_id name fileId rawLink'
-  );
+    '_id name fileId rawLink externalFileId externalFileUrl',
+    { ...readFromSecondary }
+  ).lean();

   return {
-    fullTextRecallResults: searchResults.map((item, index) => {
-      const collection = collections.find((col) => String(col._id) === String(item.collectionId));
-      return {
-        id: String(item._id),
-        datasetId: String(item.datasetId),
-        collectionId: String(item.collectionId),
-        updateTime: item.updateTime,
-        ...getCollectionSourceData(collection),
-        q: item.q,
-        a: item.a,
-        chunkIndex: item.chunkIndex,
-        indexes: item.indexes,
-        score: [{ type: SearchScoreTypeEnum.fullText, value: item.score, index }]
-      };
-    }),
+    fullTextRecallResults: searchResults
+      .map((data, index) => {
+        const collection = collections.find(
+          (col) => String(col._id) === String(data.collectionId)
+        );
+        if (!collection) {
+          console.log('Collection is not found', data);
+          return;
+        }
+
+        return {
+          id: String(data._id),
+          datasetId: String(data.datasetId),
+          collectionId: String(data.collectionId),
+          updateTime: data.updateTime,
+          q: data.q,
+          a: data.a,
+          chunkIndex: data.chunkIndex,
+          indexes: data.indexes,
+          ...getCollectionSourceData(collection),
+          score: [{ type: SearchScoreTypeEnum.fullText, value: data.score ?? 0, index }]
+        };
+      })
+      .filter(Boolean) as SearchDataResponseItemType[],
     tokenLen: 0
   };
 };
+const fullTextRecall2 = async ({
+  query,
+  limit,
+  filterCollectionIdList,
+  forbidCollectionIdList
+}: {
+  query: string;
+  limit: number;
+  filterCollectionIdList?: string[];
+  forbidCollectionIdList: string[];
+}): Promise<{
+  fullTextRecallResults: SearchDataResponseItemType[];
+  tokenLen: number;
+}> => {
+  if (limit === 0) {
+    return {
+      fullTextRecallResults: [],
+      tokenLen: 0
+    };
+  }
+
+  const searchResults = (
+    await Promise.all(
+      datasetIds.map(async (id) => {
+        return MongoDatasetDataText.aggregate(
+          [
+            {
+              $match: {
+                teamId: new Types.ObjectId(teamId),
+                datasetId: new Types.ObjectId(id),
+                $text: { $search: jiebaSplit({ text: query }) },
+                ...(filterCollectionIdList
+                  ? {
+                      collectionId: {
+                        $in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
+                      }
+                    }
+                  : {}),
+                ...(forbidCollectionIdList && forbidCollectionIdList.length > 0
+                  ? {
+                      collectionId: {
+                        $nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id))
+                      }
+                    }
+                  : {})
+              }
+            },
+            {
+              $sort: {
+                score: { $meta: 'textScore' }
+              }
+            },
+            {
+              $limit: limit
+            },
+            {
+              $project: {
+                _id: 1,
+                collectionId: 1,
+                dataId: 1,
+                score: { $meta: 'textScore' }
+              }
+            }
+          ],
+          {
+            ...readFromSecondary
+          }
+        );
+      })
+    )
+  ).flat() as (DatasetDataTextSchemaType & { score: number })[];
+
+  // Get data and collections
+  const [dataList, collections] = await Promise.all([
+    MongoDatasetData.find(
+      {
+        _id: { $in: searchResults.map((item) => item.dataId) }
+      },
+      '_id datasetId collectionId updateTime q a chunkIndex indexes',
+      { ...readFromSecondary }
+    ).lean(),
+    MongoDatasetCollection.find(
+      {
+        _id: { $in: searchResults.map((item) => item.collectionId) }
+      },
+      '_id name fileId rawLink externalFileId externalFileUrl',
+      { ...readFromSecondary }
+    ).lean()
+  ]);
+
+  return {
+    fullTextRecallResults: searchResults
+      .map((item, index) => {
+        const collection = collections.find(
+          (col) => String(col._id) === String(item.collectionId)
+        );
+        if (!collection) {
+          console.log('Collection is not found', item);
+          return;
+        }
+        const data = dataList.find((data) => String(data._id) === String(item.dataId));
+        if (!data) {
+          console.log('Data is not found', item);
+          return;
+        }
+
+        return {
+          id: String(data._id),
+          datasetId: String(data.datasetId),
+          collectionId: String(data.collectionId),
+          updateTime: data.updateTime,
+          q: data.q,
+          a: data.a,
+          chunkIndex: data.chunkIndex,
+          indexes: data.indexes,
+          ...getCollectionSourceData(collection),
+          score: [
+            {
+              type: SearchScoreTypeEnum.fullText,
+              value: item.score || 0,
+              index
+            }
+          ]
+        };
+      })
+      .filter(Boolean) as SearchDataResponseItemType[],
+    tokenLen: 0
+  };
+};
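`fullTextRecall2` fans out one aggregate per dataset instead of using `datasetId: { $in: ... }` in a single `$match`: a `$text` query must be served by the text index, and MongoDB requires equality matches on the prefix keys of a compound text index, so each query pins `teamId` and one `datasetId` and the per-dataset results are flattened and merged client-side. A toy sketch of that fan-out-and-flatten shape (names reused from the diff; the surrounding wiring is assumed):

```ts
// Fan out per dataset, then merge: each aggregate is bounded by `limit`,
// so the merged candidate pool is at most datasetIds.length * limit rows.
const perDataset = await Promise.all(
  datasetIds.map((id) =>
    MongoDatasetDataText.aggregate([
      {
        $match: {
          teamId: new Types.ObjectId(teamId), // equality on text-index prefix keys
          datasetId: new Types.ObjectId(id),
          $text: { $search: jiebaSplit({ text: query }) }
        }
      },
      { $sort: { score: { $meta: 'textScore' } } },
      { $limit: limit },
      { $project: { dataId: 1, collectionId: 1, score: { $meta: 'textScore' } } }
    ])
  )
);
const candidates = perDataset.flat();
```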
@@ -496,7 +636,8 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       forbidCollectionIdList,
       filterCollectionIdList
     }),
-    fullTextRecall({
+    // FullText tmp
+    fullTextRecall2({
       query,
       limit: fullTextLimit,
       filterCollectionIdList,