4.6.7-alpha commit (#743)

Co-authored-by: Archer <545436317@qq.com>
Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
Authored by Archer on 2024-01-19 11:17:28 +08:00; committed by GitHub.
parent 8ee7407c4c
commit c031e6dcc9
324 changed files with 8509 additions and 4757 deletions

View File

@@ -1,6 +1,20 @@
-import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import {
+  TrainingModeEnum,
+  DatasetCollectionTypeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
+import {
+  CollectionWithDatasetType,
+  DatasetCollectionSchemaType
+} from '@fastgpt/global/core/dataset/type';
+import { MongoDatasetTraining } from '../training/schema';
+import { delay } from '@fastgpt/global/common/system/utils';
+import { MongoDatasetData } from '../data/schema';
+import { delImgByRelatedId } from '../../../common/file/image/controller';
+import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';
+import { delFileByFileIdList } from '../../../common/file/gridfs/controller';
+import { BucketNameEnum } from '@fastgpt/global/common/file/constants';

 export async function createOneCollection({
   teamId,
@@ -85,20 +99,50 @@ export function createDefaultCollection({
   });
 }

-// check same collection
-export const getSameRawTextCollection = async ({
-  datasetId,
-  hashRawText
-}: {
-  datasetId: string;
-  hashRawText?: string;
-}) => {
-  if (!hashRawText) return undefined;
-
-  const collection = await MongoDatasetCollection.findOne({
-    datasetId,
-    hashRawText
-  });
-
-  return collection;
-};
+/**
+ * Delete collections and their related data
+ */
+export async function delCollectionAndRelatedSources({
+  collections
+}: {
+  collections: (CollectionWithDatasetType | DatasetCollectionSchemaType)[];
+}) {
+  if (collections.length === 0) return;
+
+  const teamId = collections[0].teamId;
+  if (!teamId) return Promise.reject('teamId does not exist');
+
+  const collectionIds = collections.map((item) => String(item._id));
+  const fileIdList = collections.map((item) => item?.fileId || '').filter(Boolean);
+  const relatedImageIds = collections
+    .map((item) => item?.metadata?.relatedImgId || '')
+    .filter(Boolean);
+
+  // delete training data
+  await MongoDatasetTraining.deleteMany({
+    teamId,
+    collectionId: { $in: collectionIds }
+  });
+
+  await delay(2000);
+
+  // delete dataset.datas
+  await MongoDatasetData.deleteMany({ teamId, collectionId: { $in: collectionIds } });
+  // delete pg (vector) data
+  await deleteDatasetDataVector({ teamId, collectionIds });
+
+  // delete files and images
+  await Promise.all([
+    delImgByRelatedId(relatedImageIds),
+    delFileByFileIdList({
+      bucketName: BucketNameEnum.dataset,
+      fileIdList
+    })
+  ]);
+
+  // delete collections
+  await MongoDatasetCollection.deleteMany({
+    _id: { $in: collectionIds }
+  });
+}
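
For context, a minimal sketch of how the new cascade delete might be invoked. The wrapper function, its scoping query, and the './controller' import path are assumptions for illustration, not part of this commit:

import { MongoDatasetCollection } from './schema';
import { delCollectionAndRelatedSources } from './controller';

// Hypothetical caller: remove every collection in a dataset, together with
// its training jobs, data rows, vectors, files, and related images.
export async function clearDatasetCollections(teamId: string, datasetId: string) {
  const collections = await MongoDatasetCollection.find({ teamId, datasetId }).lean();
  await delCollectionAndRelatedSources({ collections });
}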

View File

@@ -1,7 +1,7 @@
 import { connectionMongo, type Model } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
-import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
+import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
 import { DatasetCollectionName } from '../schema';
 import {
   TeamCollectionName,
@@ -91,11 +91,19 @@ const DatasetCollectionSchema = new Schema({
 });

 try {
-  DatasetCollectionSchema.index({ teamId: 1 });
-  DatasetCollectionSchema.index({ datasetId: 1 });
-  DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
-  DatasetCollectionSchema.index({ updateTime: -1 });
-  DatasetCollectionSchema.index({ hashRawText: -1 });
+  // auth file
+  DatasetCollectionSchema.index({ teamId: 1, fileId: 1 }, { background: true });
+
+  // list collections; deep-find collections
+  DatasetCollectionSchema.index(
+    {
+      teamId: 1,
+      datasetId: 1,
+      parentId: 1,
+      updateTime: -1
+    },
+    { background: true }
+  );
 } catch (error) {
   console.log(error);
 }
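
The five single-field indexes are replaced because the hot queries (listing a folder, deep-finding children) filter on teamId, datasetId, and parentId and sort by updateTime, so one compound index can cover the whole shape. An illustrative query of that form (not taken from this commit):

// Equality on teamId/datasetId/parentId, then a sort on updateTime:
// exactly the prefix-plus-sort pattern the new compound index serves.
const topLevel = await MongoDatasetCollection.find({
  teamId,
  datasetId,
  parentId: null
})
  .sort({ updateTime: -1 })
  .lean();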

View File

@@ -4,16 +4,32 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { MongoDatasetTraining } from '../training/schema';
 import { urlsFetch } from '../../../common/string/cheerio';
-import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
+import {
+  DatasetCollectionTypeEnum,
+  TrainingModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import { hashStr } from '@fastgpt/global/common/string/tools';

 /**
  * get all collections under a top collectionId
  */
-export async function findCollectionAndChild(id: string, fields = '_id parentId name metadata') {
+export async function findCollectionAndChild({
+  teamId,
+  datasetId,
+  collectionId,
+  fields = '_id parentId name metadata'
+}: {
+  teamId: string;
+  datasetId: string;
+  collectionId: string;
+  fields?: string;
+}) {
   async function find(id: string) {
     // find children
-    const children = await MongoDatasetCollection.find({ parentId: id }, fields);
+    const children = await MongoDatasetCollection.find(
+      { teamId, datasetId, parentId: id },
+      fields
+    ).lean();

     let collections = children;
@@ -25,8 +41,8 @@ export async function findCollectionAndChild(id: string, fields = '_id parentId
     return collections;
   }

   const [collection, childCollections] = await Promise.all([
-    MongoDatasetCollection.findById(id, fields),
-    find(id)
+    MongoDatasetCollection.findById(collectionId, fields),
+    find(collectionId)
   ]);

   if (!collection) {
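
A sketch of what a call site looks like after the signature change; the surrounding variables are placeholders, not code from this commit:

// The helper now requires team and dataset scoping up front instead of
// trusting the collection id alone, so cross-tenant reads cannot slip in.
const collections = await findCollectionAndChild({
  teamId: collection.teamId,
  datasetId: collection.datasetId,
  collectionId: String(collection._id),
  fields: '_id parentId name metadata'
});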
@@ -107,8 +123,8 @@ export const getCollectionAndRawText = async ({
   });

   return {
-    title: result[0].title,
-    rawText: result[0].content
+    title: result[0]?.title,
+    rawText: result[0]?.content
   };
 }
@@ -121,7 +137,7 @@
   })();

   const hashRawText = hashStr(rawText);
-  const isSameRawText = col.hashRawText === hashRawText;
+  const isSameRawText = rawText && col.hashRawText === hashRawText;

   return {
     collection: col,
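
The added rawText && guard means an empty re-fetch is never classified as "same content", so it can no longer silently skip a reload when the stored hash happens to match. A toy illustration of the assumed behavior:

// With the guard, empty content short-circuits to a falsy value:
const rawText = '';
const isSameRawText = rawText && col.hashRawText === hashStr(rawText);
// isSameRawText is '' (falsy), so the caller proceeds to re-split and re-train.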
@@ -161,8 +177,7 @@ export const reloadCollectionChunks = async ({
   // split data
   const { chunks } = splitText2Chunks({
     text: newRawText,
-    chunkLen: col.chunkSize || 512,
-    countTokens: false
+    chunkLen: col.chunkSize || 512
   });

   // insert to training queue
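
Dropping countTokens tracks the slimmer splitText2Chunks options. A standalone example of the remaining call shape; the sample text is made up, and the return is assumed to be { chunks: string[] } as used above:

import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Split source text into roughly chunkLen-sized chunks for the training queue.
const { chunks } = splitText2Chunks({
  text: 'First section...\n\nSecond section...',
  chunkLen: 512
});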