V4.9.11 feature (#4969)

* Feat: Images dataset collection (#4941)

* New pic (#4858)

* Update dataset-related types to add image file ID and preview URL support; improve the dataset import flow with a new image dataset handling component; fix some i18n strings; update the file upload logic to support the new feature.

* Differences from the original code

* Add V4.9.10 release notes: support the PG `systemEnv.hnswMaxScanTuples` setting, optimize LLM stream call timeouts, and fix full-text search ordering across multiple knowledge bases. Also update the dataset index, removing the datasetId field to simplify queries.

* Switch to the fileId_image logic and add training-queue matching logic

* Add image-collection detection and streamline preview URL generation so preview URLs are only generated when the dataset is an image collection; add related debug logging.

* Refactor Docker Compose configuration to comment out exposed ports for production environments, update image versions for pgvector, fastgpt, and mcp_server, and enhance Redis service with a health check. Additionally, standardize dataset collection labels in constants and improve internationalization strings across multiple languages.

* Enhance TrainingStates component by adding internationalization support for the imageParse training mode and update defaultCounts to include imageParse mode in trainingDetail API.

* Enhance dataset import context by adding additional steps for image dataset import process and improve internationalization strings for modal buttons in the useEditTitle hook.

* Update DatasetImportContext to conditionally render MyStep component based on data source type, improving the import process for non-image datasets.

* Refactor image dataset handling by improving internationalization strings, enhancing error messages, and streamlining the preview URL generation process.

* Upload images to the new dataset_collection_images collection and adjust the related logic accordingly

* Fix issues in all parts except the controller

* Integrate the image dataset logic into the controller

* Add missing i18n strings

* Add missing i18n strings

* Resolve review comments: mainly upload-logic changes and component reuse

* Show icons for image names

* Fix naming issues that caused build errors

* Remove the unneeded collectionId parts

* Clean up redundant files and adjust a delete button

* Resolve all review comments except the loading state and the unified imageId

* Fix icon errors

* Reuse MyPhotoView and rename imageFileId to imageId via a global replace

* Remove unnecessary file changes

* Fix errors and adjust fields

* Delete temporary files after a successful upload and revert some changes

* Remove the path field, store images in GridFS, and update the create/delete code accordingly

* Fix build errors

---------

Co-authored-by: archer <545436317@qq.com>

* perf: image dataset

* feat: insert image

* perf: image icon

* fix: training state

---------

Co-authored-by: Zhuangzai fa <143257420+ctrlz526@users.noreply.github.com>

* fix: ts (#4948)

* Thirddatasetmd (#4942)

* add thirddataset.md

* fix thirddataset.md

* fix

* delete wrong png

---------

Co-authored-by: dreamer6680 <146868355@qq.com>

* perf: api dataset code

* perf: log

* add secondary.tsx (#4946)

* add secondary.tsx

* fix

---------

Co-authored-by: dreamer6680 <146868355@qq.com>

* perf: multiple menu

* perf: i18n

* feat: parse queue (#4960)

* feat: parse queue

* feat: sync parse queue

* fix thirddataset.md (#4962)

* fix thirddataset-4.png (#4963)

* feat: Dataset template import (#4934)

* Template import done, except for the documentation

* Fix build errors in template import

* Write the documentation

* compress pictures

* Change some constants to variables

---------

Co-authored-by: Archer <545436317@qq.com>

* perf: template import

* doc

* llm paragraph

* bocha tool

* fix: del collection

---------

Co-authored-by: Zhuangzai fa <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: dreamer6680 <1468683855@qq.com>
Co-authored-by: dreamer6680 <146868355@qq.com>
Author: Archer
Date: 2025-06-06 14:48:44 +08:00
Committed by: GitHub
Parent: bb810a43a1
Commit: c30f069f2f
198 changed files with 4934 additions and 2290 deletions

View File

@@ -1,5 +1,8 @@
import type { ApiDatasetDetailResponse } from '@fastgpt/global/core/dataset/apiDataset';
import { FeishuServer, YuqueServer } from '@fastgpt/global/core/dataset/apiDataset';
import type {
ApiDatasetDetailResponse,
FeishuServer,
YuqueServer
} from '@fastgpt/global/core/dataset/apiDataset/type';
import type {
DeepRagSearchProps,
SearchDatasetDataResponse

View File

@@ -142,23 +142,26 @@ export const updateRawTextBufferExpiredTime = async ({
};
export const clearExpiredRawTextBufferCron = async () => {
const gridBucket = getGridBucket();
const clearExpiredRawTextBuffer = async () => {
addLog.debug('Clear expired raw text buffer start');
const gridBucket = getGridBucket();
return retryFn(async () => {
const data = await MongoRawTextBufferSchema.find(
{
'metadata.expiredTime': { $lt: new Date() }
},
'_id'
).lean();
const data = await MongoRawTextBufferSchema.find(
{
'metadata.expiredTime': { $lt: new Date() }
},
'_id'
).lean();
for (const item of data) {
for (const item of data) {
try {
await gridBucket.delete(item._id);
} catch (error) {
addLog.error('Delete expired raw text buffer error', error);
}
addLog.debug('Clear expired raw text buffer end');
});
}
addLog.debug('Clear expired raw text buffer end');
};
setCron('*/10 * * * *', async () => {
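The mixed before/after lines above are hard to follow once the diff markers are lost. As a plausible reading, not a verbatim copy of the refactor, the new shape wraps the find-and-delete pass in retryFn and registers the cleanup on a 10-minute cron:

// Sketch of the refactored cleanup (exact error handling in the commit may differ).
const clearExpiredRawTextBuffer = async () => {
  addLog.debug('Clear expired raw text buffer start');
  const gridBucket = getGridBucket();
  await retryFn(async () => {
    const data = await MongoRawTextBufferSchema.find(
      { 'metadata.expiredTime': { $lt: new Date() } },
      '_id'
    ).lean();
    for (const item of data) {
      await gridBucket.delete(item._id);
    }
  });
  addLog.debug('Clear expired raw text buffer end');
};
setCron('*/10 * * * *', clearExpiredRawTextBuffer);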

View File

@@ -7,12 +7,13 @@ import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { readRawContentByFileBuffer } from '../read/utils';
import { gridFsStream2Buffer, stream2Encoding } from './utils';
import { computeGridFsChunSize, gridFsStream2Buffer, stream2Encoding } from './utils';
import { addLog } from '../../system/log';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { Readable } from 'stream';
import { addRawTextBuffer, getRawTextBuffer } from '../../buffer/rawText/controller';
import { addMinutes } from 'date-fns';
import { retryFn } from '@fastgpt/global/common/system/utils';
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
MongoDatasetFileSchema;
@@ -64,23 +65,7 @@ export async function uploadFile({
// create a gridfs bucket
const bucket = getGridBucket(bucketName);
const fileSize = stats.size;
// Chunk size: as large as possible, but no more than 14MB and no less than 512KB
const chunkSizeBytes = (() => {
// Ideal chunk size: file size ÷ target chunk count (10); each chunk must also stay under 14MB
const idealChunkSize = Math.min(Math.ceil(fileSize / 10), 14 * 1024 * 1024);
// Ensure the chunk size is at least 512KB
const minChunkSize = 512 * 1024; // 512KB
// Take the larger of the ideal and minimum chunk sizes
let chunkSize = Math.max(idealChunkSize, minChunkSize);
// Round the chunk size up to the nearest multiple of 64KB to keep it tidy
chunkSize = Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024);
return chunkSize;
})();
const chunkSizeBytes = computeGridFsChunSize(stats.size);
const stream = bucket.openUploadStream(filename, {
metadata,
@@ -173,24 +158,18 @@ export async function getFileById({
export async function delFileByFileIdList({
bucketName,
fileIdList,
retry = 3
fileIdList
}: {
bucketName: `${BucketNameEnum}`;
fileIdList: string[];
retry?: number;
}): Promise<any> {
try {
return retryFn(async () => {
const bucket = getGridBucket(bucketName);
for await (const fileId of fileIdList) {
await bucket.delete(new Types.ObjectId(fileId));
}
} catch (error) {
if (retry > 0) {
return delFileByFileIdList({ bucketName, fileIdList, retry: retry - 1 });
}
}
});
}
export async function getDownloadStream({
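The manual retry recursion above is replaced by retryFn from @fastgpt/global/common/system/utils. As a rough illustration only (the real helper's signature and defaults may differ), a generic retry wrapper in this spirit looks like:

// Hedged sketch of a retry wrapper; not the actual retryFn implementation.
const retrySketch = async <T>(fn: () => Promise<T>, attempts = 3): Promise<T> => {
  try {
    return await fn();
  } catch (error) {
    if (attempts <= 1) throw error; // out of attempts: surface the last error
    return retrySketch(fn, attempts - 1);
  }
};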

View File

@@ -105,3 +105,20 @@ export const stream2Encoding = async (stream: NodeJS.ReadableStream) => {
stream: copyStream
};
};
// Chunk size: as large as possible, but no more than 14MB and no less than 512KB
export const computeGridFsChunSize = (fileSize: number) => {
// Ideal chunk size: file size ÷ target chunk count (10); each chunk must also stay under 14MB
const idealChunkSize = Math.min(Math.ceil(fileSize / 10), 14 * 1024 * 1024);
// Ensure the chunk size is at least 512KB
const minChunkSize = 512 * 1024; // 512KB
// Take the larger of the ideal and minimum chunk sizes
let chunkSize = Math.max(idealChunkSize, minChunkSize);
// Round the chunk size up to the nearest multiple of 64KB to keep it tidy
chunkSize = Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024);
return chunkSize;
};
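A quick sanity check of the rule above, with illustrative file sizes (the values below are examples, not taken from the commit):

// Target ~10 chunks per file, clamp to [512 KB, 14 MB], round up to a 64 KB multiple.
computeGridFsChunSize(1 * 1024 * 1024); // 1 MB file -> 524288 (the 512 KB floor wins)
computeGridFsChunSize(100 * 1024 * 1024); // 100 MB file -> 10485760 (100 MB / 10, already a 64 KB multiple)
computeGridFsChunSize(200 * 1024 * 1024); // 200 MB file -> 14680064 (capped at the 14 MB ceiling)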

View File

@@ -22,7 +22,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
maxSize *= 1024 * 1024;
class UploadModel {
uploader = multer({
uploaderSingle = multer({
limits: {
fieldSize: maxSize
},
@@ -41,8 +41,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
}
})
}).single('file');
async doUpload<T = any>(
async getUploadFile<T = any>(
req: NextApiRequest,
res: NextApiResponse,
originBucketName?: `${BucketNameEnum}`
@@ -54,7 +53,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
bucketName?: `${BucketNameEnum}`;
}>((resolve, reject) => {
// @ts-ignore
this.uploader(req, res, (error) => {
this.uploaderSingle(req, res, (error) => {
if (error) {
return reject(error);
}
@@ -94,6 +93,58 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
});
});
}
uploaderMultiple = multer({
limits: {
fieldSize: maxSize
},
preservePath: true,
storage: multer.diskStorage({
// destination: (_req, _file, cb) => {
// cb(null, tmpFileDirPath);
// },
filename: (req, file, cb) => {
if (!file?.originalname) {
cb(new Error('File not found'), '');
} else {
const { ext } = path.parse(decodeURIComponent(file.originalname));
cb(null, `${getNanoid()}${ext}`);
}
}
})
}).array('file', global.feConfigs?.uploadFileMaxSize);
async getUploadFiles<T = any>(req: NextApiRequest, res: NextApiResponse) {
return new Promise<{
files: FileType[];
data: T;
}>((resolve, reject) => {
// @ts-ignore
this.uploaderMultiple(req, res, (error) => {
if (error) {
console.log(error);
return reject(error);
}
// @ts-ignore
const files = req.files as FileType[];
resolve({
files: files.map((file) => ({
...file,
originalname: decodeURIComponent(file.originalname)
})),
data: (() => {
if (!req.body?.data) return {};
try {
return JSON.parse(req.body.data);
} catch (error) {
return {};
}
})()
});
});
});
}
}
return new UploadModel();
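A minimal usage sketch for the new multi-file path (the route shape and payload type are illustrative assumptions, not part of this commit):

import type { NextApiRequest, NextApiResponse } from 'next';
// getUploadModel is the factory shown above; body parsing must stay disabled so multer
// can read the multipart stream (the usual Next.js `bodyParser: false` route config).
async function handler(req: NextApiRequest, res: NextApiResponse) {
  const upload = getUploadModel({ maxSize: 20 });
  // files: multer disk-stored temp files; data: the JSON-parsed `data` body field.
  const { files, data } = await upload.getUploadFiles<{ datasetId: string }>(req, res);
  // ...persist each file (e.g. into GridFS), then delete the temp paths...
  res.json({ received: files.length, datasetId: data.datasetId });
}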

View File

@@ -4,7 +4,8 @@ import { MongoFrequencyLimit } from './schema';
export const authFrequencyLimit = async ({
eventId,
maxAmount,
expiredTime
expiredTime,
num = 1
}: AuthFrequencyLimitProps) => {
try {
// For the given eventId, increment amount by 1; if the record does not exist, create it
@@ -14,7 +15,7 @@ export const authFrequencyLimit = async ({
expiredTime: { $gte: new Date() }
},
{
$inc: { amount: 1 },
$inc: { amount: num },
// If not exist, set the expiredTime
$setOnInsert: { expiredTime }
},
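With the new num parameter, one call can charge several units against the same rate-limit window in a single atomic upsert. A hedged usage sketch (the event key and numbers are made up; addMinutes is from date-fns):

await authFrequencyLimit({
  eventId: 'image-upload:team-demo', // hypothetical event key
  maxAmount: 100, // allowed total within the window
  expiredTime: addMinutes(new Date(), 5), // window end, only set on insert
  num: 8 // charge 8 units at once, e.g. one per uploaded image
});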

View File

@@ -6,7 +6,9 @@ export enum TimerIdEnum {
updateStandardPlan = 'updateStandardPlan',
scheduleTriggerApp = 'scheduleTriggerApp',
notification = 'notification',
clearExpiredRawTextBuffer = 'clearExpiredRawTextBuffer'
clearExpiredRawTextBuffer = 'clearExpiredRawTextBuffer',
clearExpiredDatasetImage = 'clearExpiredDatasetImage'
}
export enum LockNotificationEnum {