perf: buffer; fix: back up split (#4913)

* perf: buffer
* fix: back up split
* fix: app limit
* doc
2025-10-15 15:41:05 +00:00 · 2025-05-28 18:18:25 +08:00
parent 802de11363
commit a171c7b11c
11 changed files with 208 additions and 93 deletions
--- a/packages/service/core/dataset/collection/controller.ts
+++ b/packages/service/core/dataset/collection/controller.ts
@@ -77,7 +77,10 @@ export const createCollectionAndInsertData = async ({
  const chunkSplitter = computeChunkSplitter(createCollectionParams);
  const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);

-  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
+  if (
+    trainingType === DatasetCollectionDataProcessModeEnum.qa ||
+    trainingType === DatasetCollectionDataProcessModeEnum.backup
+  ) {
    delete createCollectionParams.chunkTriggerType;
    delete createCollectionParams.chunkTriggerMinSize;
    delete createCollectionParams.dataEnhanceCollectionName;
--- a/packages/service/core/dataset/read.ts
+++ b/packages/service/core/dataset/read.ts
@@ -218,6 +218,10 @@ export const rawText2Chunks = ({
    };
  };

+  if (backupParse) {
+    return parseDatasetBackup2Chunks(rawText).chunks;
+  }
+
  // Chunk condition
  // 1. 选择最大值条件，只有超过了最大值(默认为模型的最大值*0.7），才会触发分块
  if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
@@ -240,10 +244,6 @@ export const rawText2Chunks = ({
    }
  }

-  if (backupParse) {
-    return parseDatasetBackup2Chunks(rawText).chunks;
-  }
-
  const { chunks } = splitText2Chunks({
    text: rawText,
    chunkSize,
--- a/packages/service/core/workflow/dispatch/tools/readFiles.ts
+++ b/packages/service/core/workflow/dispatch/tools/readFiles.ts
@@ -5,8 +5,6 @@ import { NodeOutputKeyEnum } from '@fastgpt/global/core/workflow/constants';
 import { type DispatchNodeResultType } from '@fastgpt/global/core/workflow/runtime/type';
 import axios from 'axios';
 import { serverRequestBaseUrl } from '../../../../common/api/serverRequest';
-import { MongoRawTextBuffer } from '../../../../common/buffer/rawText/schema';
-import { readFromSecondary } from '../../../../common/mongo/utils';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { detectFileEncoding, parseUrlToFileType } from '@fastgpt/global/common/file/tools';
 import { readRawContentByFileBuffer } from '../../../../common/file/read/utils';
@@ -14,6 +12,8 @@ import { ChatRoleEnum } from '@fastgpt/global/core/chat/constants';
 import { type ChatItemType, type UserChatItemValueItemType } from '@fastgpt/global/core/chat/type';
 import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
 import { addLog } from '../../../../common/system/log';
+import { addRawTextBuffer, getRawTextBuffer } from '../../../../common/buffer/rawText/controller';
+import { addMinutes } from 'date-fns';

 type Props = ModuleDispatchProps<{
  [NodeInputKeyEnum.fileUrlList]: string[];
@@ -158,14 +158,12 @@ export const getFileContentFromLinks = async ({
    parseUrlList
      .map(async (url) => {
        // Get from buffer
-        const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: url }, undefined, {
-          ...readFromSecondary
-        }).lean();
+        const fileBuffer = await getRawTextBuffer(url);
        if (fileBuffer) {
          return formatResponseObject({
-            filename: fileBuffer.metadata?.filename || url,
+            filename: fileBuffer.sourceName || url,
            url,
-            content: fileBuffer.rawText
+            content: fileBuffer.text
          });
        }

@@ -220,17 +218,12 @@ export const getFileContentFromLinks = async ({
          });

          // Add to buffer
-          try {
-            if (buffer.length < 14 * 1024 * 1024 && rawText.trim()) {
-              MongoRawTextBuffer.create({
-                sourceId: url,
-                rawText,
-                metadata: {
-                  filename: filename
-                }
-              });
-            }
-          } catch (error) {}
+          addRawTextBuffer({
+            sourceId: url,
+            sourceName: filename,
+            text: rawText,
+            expiredTime: addMinutes(new Date(), 20)
+          });

          return formatResponseObject({ filename, url, content: rawText });
        } catch (error) {