From 01ff56b42b6fcd1122a93e9636c04c47443180a8 Mon Sep 17 00:00:00 2001
From: Archer <545436317@qq.com>
Date: Tue, 10 Jun 2025 00:05:54 +0800
Subject: [PATCH] perf: password special chars;feat: llm paragraph;perf: chunk setting params;perf: text splitter worker (#4984)

* perf: password special chars
* feat: llm paragraph;perf: chunk setting params
* perf: text splitter worker
* perf: get rawtext buffer
* fix: test
* fix: test
* doc
* min chunk size
---
 .../zh-cn/docs/development/upgrading/4912.md | 22 +++
 packages/global/common/string/password.ts | 4 +-
 packages/global/common/string/textSplitter.ts | 10 +-
 packages/global/common/system/constants.ts | 1 +
 packages/global/core/dataset/constants.ts | 3 +-
 .../global/core/dataset/training/utils.ts | 118 +++++++-----
 packages/global/package.json | 4 +-
 .../common/buffer/rawText/controller.ts | 41 ++--
 packages/service/common/file/gridfs/utils.ts | 6 +-
 packages/service/common/file/read/utils.ts | 7 +-
 packages/service/common/mongo/index.ts | 3 +-
 .../core/dataset/collection/controller.ts | 99 +++++-----
 packages/service/core/dataset/read.ts | 21 +-
 .../core/dataset/training/controller.ts | 17 +-
 .../support/wallet/usage/controller.ts | 64 ++++++
 packages/service/worker/controller.ts | 18 ++
 packages/service/worker/function.ts | 24 +++
 packages/service/worker/htmlStr2Md/index.ts | 12 +-
 packages/service/worker/readFile/index.ts | 14 +-
 packages/service/worker/text2Chunks/index.ts | 14 ++
 packages/service/worker/utils.ts | 3 +-
 .../web/components/common/MySelect/index.tsx | 5 +-
 packages/web/i18n/en/account_usage.json | 2 +
 packages/web/i18n/en/dataset.json | 6 +
 packages/web/i18n/zh-CN/account_usage.json | 2 +
 packages/web/i18n/zh-CN/dataset.json | 6 +
 packages/web/i18n/zh-Hant/account_usage.json | 2 +
 packages/web/i18n/zh-Hant/dataset.json | 6 +
 pnpm-lock.yaml | 6 +
 .../detail/CollectionCard/WebsiteConfig.tsx | 21 +-
 .../detail/Form/CollectionChunkForm.tsx | 182 +++++++-----------
 .../dataset/detail/Import/Context.tsx | 6 +-
 .../Import/commonProgress/PreviewData.tsx | 7 +-
 .../detail/Import/commonProgress/Upload.tsx | 9 +-
 .../api/core/dataset/file/getPreviewChunks.ts | 60 ++----
 .../app/src/pages/api/core/dataset/update.ts | 12 +-
 .../service/core/dataset/data/controller.ts | 32 +--
 .../core/dataset/queues/datasetParse.ts | 50 +++--
 .../service/core/dataset/queues/generateQA.ts | 15 +-
 .../src/service/support/wallet/usage/push.ts | 36 ----
 .../service/core/dataset/textSplitter.test.ts | 24 +--
 41 files changed, 546 insertions(+), 448 deletions(-)
 create mode 100644 docSite/content/zh-cn/docs/development/upgrading/4912.md
 create mode 100644 packages/service/worker/controller.ts
 create mode 100644 packages/service/worker/function.ts
 create mode 100644 packages/service/worker/text2Chunks/index.ts

diff --git a/docSite/content/zh-cn/docs/development/upgrading/4912.md b/docSite/content/zh-cn/docs/development/upgrading/4912.md
new file mode 100644
index 000000000..836ecd57d
--- /dev/null
+++ b/docSite/content/zh-cn/docs/development/upgrading/4912.md
@@ -0,0 +1,22 @@
+---
+title: 'V4.9.12(进行中)'
+description: 'FastGPT V4.9.12 更新说明'
+icon: 'upgrade'
+draft: false
+toc: true
+weight: 789
+---
+
+## 🚀 新增内容
+
+1. 商业版支持知识库分块时,LLM 进行自动分段识别。
+
+## ⚙️ 优化
+
+1. 密码校验时,增加更多的特殊字符
+2. 后端全量计算知识库 chunk 参数,避免自动模式下部分参数未正确使用默认值。
+3. 将文本分块移至 worker 线程,避免阻塞。
+
+## 🐛 修复
+
+1. 
自定义问答提取提示词被覆盖。 \ No newline at end of file diff --git a/packages/global/common/string/password.ts b/packages/global/common/string/password.ts index 68b92d544..4042019a0 100644 --- a/packages/global/common/string/password.ts +++ b/packages/global/common/string/password.ts @@ -3,9 +3,9 @@ export const checkPasswordRule = (password: string) => { /\d/, // Contains digits /[a-z]/, // Contains lowercase letters /[A-Z]/, // Contains uppercase letters - /[!@#$%^&*()_+=-]/ // Contains special characters + /[!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]/ // Contains special characters ]; - const validChars = /^[\dA-Za-z!@#$%^&*()_+=-]{8,100}$/; + const validChars = /^[\dA-Za-z!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]{8,100}$/; // Check length and valid characters if (!validChars.test(password)) return false; diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts index 30dfe5174..6ded96260 100644 --- a/packages/global/common/string/textSplitter.ts +++ b/packages/global/common/string/textSplitter.ts @@ -1,10 +1,11 @@ import { defaultMaxChunkSize } from '../../core/dataset/training/utils'; import { getErrText } from '../error/utils'; +import { simpleText } from './tools'; import { getTextValidLength } from './utils'; export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----'; -type SplitProps = { +export type SplitProps = { text: string; chunkSize: number; @@ -19,7 +20,7 @@ export type TextSplitProps = Omit & { chunkSize?: number; }; -type SplitResponse = { +export type SplitResponse = { chunks: string[]; chars: number; }; @@ -474,7 +475,10 @@ export const splitText2Chunks = (props: SplitProps): SplitResponse => { }); return { - chunks: splitResult.map((item) => item.chunks).flat(), + chunks: splitResult + .map((item) => item.chunks) + .flat() + .map((chunk) => simpleText(chunk)), chars: splitResult.reduce((sum, item) => sum + item.chars, 0) }; }; diff --git a/packages/global/common/system/constants.ts b/packages/global/common/system/constants.ts index 4dcc4d276..ab7a281be 100644 --- a/packages/global/common/system/constants.ts +++ b/packages/global/common/system/constants.ts @@ -7,3 +7,4 @@ export const DEFAULT_ORG_AVATAR = '/imgs/avatar/defaultOrgAvatar.svg'; export const DEFAULT_USER_AVATAR = '/imgs/avatar/BlueAvatar.svg'; export const isProduction = process.env.NODE_ENV === 'production'; +export const isTestEnv = process.env.NODE_ENV === 'test'; diff --git a/packages/global/core/dataset/constants.ts b/packages/global/core/dataset/constants.ts index 8e74b7c2a..5f759247d 100644 --- a/packages/global/core/dataset/constants.ts +++ b/packages/global/core/dataset/constants.ts @@ -211,7 +211,8 @@ export enum DataChunkSplitModeEnum { } export enum ParagraphChunkAIModeEnum { auto = 'auto', - force = 'force' + force = 'force', + forbid = 'forbid' } /* ------------ data -------------- */ diff --git a/packages/global/core/dataset/training/utils.ts b/packages/global/core/dataset/training/utils.ts index d98390e9c..ac9715eb3 100644 --- a/packages/global/core/dataset/training/utils.ts +++ b/packages/global/core/dataset/training/utils.ts @@ -3,8 +3,11 @@ import { type EmbeddingModelItemType, type LLMModelItemType } from '../../../cor import { ChunkSettingModeEnum, DataChunkSplitModeEnum, - DatasetCollectionDataProcessModeEnum + DatasetCollectionDataProcessModeEnum, + ParagraphChunkAIModeEnum } from '../constants'; +import type { ChunkSettingsType } from '../type'; +import { cloneDeep } from 'lodash'; export const minChunkSize = 64; // min index and chunk size @@ -103,53 
+106,78 @@ export const getIndexSizeSelectList = (max = 512) => { }; // Compute -export const computeChunkSize = (params: { - trainingType: DatasetCollectionDataProcessModeEnum; - chunkSettingMode?: ChunkSettingModeEnum; - chunkSplitMode?: DataChunkSplitModeEnum; +export const computedCollectionChunkSettings = ({ + llmModel, + vectorModel, + ...data +}: { llmModel?: LLMModelItemType; - chunkSize?: number; -}) => { - if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) { - if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { - return getLLMDefaultChunkSize(params.llmModel); + vectorModel?: EmbeddingModelItemType; +} & T) => { + const { + trainingType = DatasetCollectionDataProcessModeEnum.chunk, + chunkSettingMode = ChunkSettingModeEnum.auto, + chunkSplitMode, + chunkSize, + paragraphChunkDeep = 5, + indexSize, + autoIndexes + } = data; + const cloneChunkSettings = cloneDeep(data); + + if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) { + delete cloneChunkSettings.qaPrompt; + } + + // Format training type indexSize/chunkSize + const trainingModeSize: { + autoChunkSize: number; + autoIndexSize: number; + chunkSize?: number; + indexSize?: number; + } = (() => { + if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { + return { + autoChunkSize: getLLMDefaultChunkSize(llmModel), + autoIndexSize: getMaxIndexSize(vectorModel), + chunkSize, + indexSize: getMaxIndexSize(vectorModel) + }; + } else if (autoIndexes) { + return { + autoChunkSize: chunkAutoChunkSize, + autoIndexSize: getAutoIndexSize(vectorModel), + chunkSize, + indexSize + }; + } else { + return { + autoChunkSize: chunkAutoChunkSize, + autoIndexSize: getAutoIndexSize(vectorModel), + chunkSize, + indexSize + }; } + })(); + + if (chunkSettingMode === ChunkSettingModeEnum.auto) { + cloneChunkSettings.chunkSplitMode = DataChunkSplitModeEnum.paragraph; + cloneChunkSettings.paragraphChunkAIMode = ParagraphChunkAIModeEnum.forbid; + cloneChunkSettings.paragraphChunkDeep = 5; + cloneChunkSettings.paragraphChunkMinSize = 100; + cloneChunkSettings.chunkSize = trainingModeSize.autoChunkSize; + cloneChunkSettings.indexSize = trainingModeSize.autoIndexSize; + + cloneChunkSettings.chunkSplitter = undefined; } else { - // chunk - if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { - return chunkAutoChunkSize; - } + cloneChunkSettings.paragraphChunkDeep = + chunkSplitMode === DataChunkSplitModeEnum.paragraph ? paragraphChunkDeep : 0; + + cloneChunkSettings.chunkSize = trainingModeSize.chunkSize + ? Math.min(trainingModeSize.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(llmModel)) + : undefined; + cloneChunkSettings.indexSize = trainingModeSize.indexSize; } - if (params.chunkSplitMode === DataChunkSplitModeEnum.char) { - return getLLMMaxChunkSize(params.llmModel); - } - - return Math.min(params.chunkSize ?? 
chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel)); -}; -export const computeChunkSplitter = (params: { - chunkSettingMode?: ChunkSettingModeEnum; - chunkSplitMode?: DataChunkSplitModeEnum; - chunkSplitter?: string; -}) => { - if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { - return undefined; - } - if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) { - return undefined; - } - return params.chunkSplitter; -}; -export const computeParagraphChunkDeep = (params: { - chunkSettingMode?: ChunkSettingModeEnum; - chunkSplitMode?: DataChunkSplitModeEnum; - paragraphChunkDeep?: number; -}) => { - if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { - return 5; - } - if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) { - return params.paragraphChunkDeep; - } - return 0; + return cloneChunkSettings; }; diff --git a/packages/global/package.json b/packages/global/package.json index 376a115bd..93dd88a28 100644 --- a/packages/global/package.json +++ b/packages/global/package.json @@ -15,9 +15,11 @@ "next": "14.2.28", "openai": "4.61.0", "openapi-types": "^12.1.3", - "timezones-list": "^3.0.2" + "timezones-list": "^3.0.2", + "lodash": "^4.17.21" }, "devDependencies": { + "@types/lodash": "^4.14.191", "@types/js-yaml": "^4.0.9", "@types/node": "20.14.0" } diff --git a/packages/service/common/buffer/rawText/controller.ts b/packages/service/common/buffer/rawText/controller.ts index 8750494a3..306edac32 100644 --- a/packages/service/common/buffer/rawText/controller.ts +++ b/packages/service/common/buffer/rawText/controller.ts @@ -5,6 +5,8 @@ import { addLog } from '../../system/log'; import { setCron } from '../../system/cron'; import { checkTimerLock } from '../../system/timerLock/utils'; import { TimerIdEnum } from '../../system/timerLock/constants'; +import { gridFsStream2Buffer } from '../../file/gridfs/utils'; +import { readRawContentFromBuffer } from '../../../worker/function'; const getGridBucket = () => { return new connectionMongo.mongo.GridFSBucket(connectionMongo.connection.db!, { @@ -85,30 +87,27 @@ export const getRawTextBuffer = async (sourceId: string) => { // Read file content const downloadStream = gridBucket.openDownloadStream(bufferData._id); - const chunks: Buffer[] = []; - return new Promise<{ - text: string; - sourceName: string; - } | null>((resolve, reject) => { - downloadStream.on('data', (chunk) => { - chunks.push(chunk); - }); + const fileBuffers = await gridFsStream2Buffer(downloadStream); - downloadStream.on('end', () => { - const buffer = Buffer.concat(chunks); - const text = buffer.toString('utf8'); - resolve({ - text, - sourceName: bufferData.metadata?.sourceName || '' - }); - }); + const rawText = await (async () => { + if (fileBuffers.length < 10000000) { + return fileBuffers.toString('utf8'); + } else { + return ( + await readRawContentFromBuffer({ + extension: 'txt', + encoding: 'utf8', + buffer: fileBuffers + }) + ).rawText; + } + })(); - downloadStream.on('error', (error) => { - addLog.error('getRawTextBuffer error', error); - resolve(null); - }); - }); + return { + text: rawText, + sourceName: bufferData.metadata?.sourceName || '' + }; }); }; diff --git a/packages/service/common/file/gridfs/utils.ts b/packages/service/common/file/gridfs/utils.ts index 4c72fb61d..691d85a4f 100644 --- a/packages/service/common/file/gridfs/utils.ts +++ b/packages/service/common/file/gridfs/utils.ts @@ -55,13 +55,17 @@ export const createFileFromText = async ({ export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => { return new 
Promise((resolve, reject) => { + if (!stream.readable) { + return resolve(Buffer.from([])); + } + const chunks: Uint8Array[] = []; stream.on('data', (chunk) => { chunks.push(chunk); }); stream.on('end', () => { - const resultBuffer = Buffer.concat(chunks); // 一次性拼接 + const resultBuffer = Buffer.concat(chunks); // One-time splicing resolve(resultBuffer); }); stream.on('error', (err) => { diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index b08d36137..461da09dc 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -1,6 +1,5 @@ import { uploadMongoImg } from '../image/controller'; import FormData from 'form-data'; -import { WorkerNameEnum, runWorker } from '../../../worker/utils'; import fs from 'fs'; import type { ReadFileResponse } from '../../../worker/readFile/type'; import axios from 'axios'; @@ -9,6 +8,7 @@ import { batchRun } from '@fastgpt/global/common/system/utils'; import { matchMdImg } from '@fastgpt/global/common/string/markdown'; import { createPdfParseUsage } from '../../../support/wallet/usage/controller'; import { useDoc2xServer } from '../../../thirdProvider/doc2x'; +import { readRawContentFromBuffer } from '../../../worker/function'; export type readRawTextByLocalFileParams = { teamId: string; @@ -63,11 +63,10 @@ export const readRawContentByFileBuffer = async ({ rawText: string; }> => { const systemParse = () => - runWorker(WorkerNameEnum.readFile, { + readRawContentFromBuffer({ extension, encoding, - buffer, - teamId + buffer }); const parsePdfFromCustomService = async (): Promise => { const url = global.systemEnv.customPdfParse?.url; diff --git a/packages/service/common/mongo/index.ts b/packages/service/common/mongo/index.ts index 6153714be..0abb1e615 100644 --- a/packages/service/common/mongo/index.ts +++ b/packages/service/common/mongo/index.ts @@ -1,3 +1,4 @@ +import { isTestEnv } from '@fastgpt/global/common/system/constants'; import { addLog } from '../../common/system/log'; import type { Model } from 'mongoose'; import mongoose, { Mongoose } from 'mongoose'; @@ -70,7 +71,7 @@ const addCommonMiddleware = (schema: mongoose.Schema) => { export const getMongoModel = (name: string, schema: mongoose.Schema) => { if (connectionMongo.models[name]) return connectionMongo.models[name] as Model; - if (process.env.NODE_ENV !== 'test') console.log('Load model======', name); + if (!isTestEnv) console.log('Load model======', name); addCommonMiddleware(schema); const model = connectionMongo.model(name, schema); diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index 646c5ab4e..9a032b5f3 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -32,10 +32,7 @@ import { MongoDatasetDataText } from '../data/dataTextSchema'; import { retryFn } from '@fastgpt/global/common/system/utils'; import { getTrainingModeByCollection } from './utils'; import { - computeChunkSize, - computeChunkSplitter, - computeParagraphChunkDeep, - getAutoIndexSize, + computedCollectionChunkSettings, getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; @@ -68,31 +65,50 @@ export const createCollectionAndInsertData = async ({ createCollectionParams.autoIndexes = true; } - const teamId = createCollectionParams.teamId; - const tmbId = 
createCollectionParams.tmbId; + const formatCreateCollectionParams = computedCollectionChunkSettings({ + ...createCollectionParams, + llmModel: getLLMModel(dataset.agentModel), + vectorModel: getEmbeddingModel(dataset.vectorModel) + }); + + const teamId = formatCreateCollectionParams.teamId; + const tmbId = formatCreateCollectionParams.tmbId; // Set default params const trainingType = - createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk; - const chunkSplitter = computeChunkSplitter(createCollectionParams); - const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams); + formatCreateCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk; const trainingMode = getTrainingModeByCollection({ trainingType: trainingType, - autoIndexes: createCollectionParams.autoIndexes, - imageIndex: createCollectionParams.imageIndex + autoIndexes: formatCreateCollectionParams.autoIndexes, + imageIndex: formatCreateCollectionParams.imageIndex }); if ( trainingType === DatasetCollectionDataProcessModeEnum.qa || - trainingType === DatasetCollectionDataProcessModeEnum.backup + trainingType === DatasetCollectionDataProcessModeEnum.backup || + trainingType === DatasetCollectionDataProcessModeEnum.template ) { - delete createCollectionParams.chunkTriggerType; - delete createCollectionParams.chunkTriggerMinSize; - delete createCollectionParams.dataEnhanceCollectionName; - delete createCollectionParams.imageIndex; - delete createCollectionParams.autoIndexes; - delete createCollectionParams.indexSize; - delete createCollectionParams.qaPrompt; + delete formatCreateCollectionParams.chunkTriggerType; + delete formatCreateCollectionParams.chunkTriggerMinSize; + delete formatCreateCollectionParams.dataEnhanceCollectionName; + delete formatCreateCollectionParams.imageIndex; + delete formatCreateCollectionParams.autoIndexes; + + if ( + trainingType === DatasetCollectionDataProcessModeEnum.backup || + trainingType === DatasetCollectionDataProcessModeEnum.template + ) { + delete formatCreateCollectionParams.paragraphChunkAIMode; + delete formatCreateCollectionParams.paragraphChunkDeep; + delete formatCreateCollectionParams.paragraphChunkMinSize; + delete formatCreateCollectionParams.chunkSplitMode; + delete formatCreateCollectionParams.chunkSize; + delete formatCreateCollectionParams.chunkSplitter; + delete formatCreateCollectionParams.indexSize; + } + } + if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) { + delete formatCreateCollectionParams.qaPrompt; } // 1. 
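For reference, a minimal sketch of how the normalization above behaves, assuming auto mode and the computedCollectionChunkSettings export from this patch; llmModel and vectorModel are optional and omitted here:

    import {
      ChunkSettingModeEnum,
      DatasetCollectionDataProcessModeEnum
    } from '@fastgpt/global/core/dataset/constants';
    import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';

    // Auto mode resolves every chunk parameter on the backend, so callers
    // such as createCollectionAndInsertData no longer apply per-field defaults.
    const settings = computedCollectionChunkSettings({
      trainingType: DatasetCollectionDataProcessModeEnum.chunk,
      chunkSettingMode: ChunkSettingModeEnum.auto
      // llmModel / vectorModel may be passed to pick model-specific sizes
    });
    // Expected per the implementation above: chunkSplitMode 'paragraph',
    // paragraphChunkAIMode 'forbid', paragraphChunkDeep 5,
    // paragraphChunkMinSize 100, and chunkSplitter undefined.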
split chunks or create image chunks @@ -109,30 +125,27 @@ export const createCollectionAndInsertData = async ({ }>; chunkSize?: number; indexSize?: number; - } = (() => { + } = await (async () => { if (rawText) { - const chunkSize = computeChunkSize({ - ...createCollectionParams, - trainingType, - llmModel: getLLMModel(dataset.agentModel) - }); // Process text chunks - const chunks = rawText2Chunks({ + const chunks = await rawText2Chunks({ rawText, - chunkTriggerType: createCollectionParams.chunkTriggerType, - chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize, - chunkSize, - paragraphChunkDeep, - paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize, + chunkTriggerType: formatCreateCollectionParams.chunkTriggerType, + chunkTriggerMinSize: formatCreateCollectionParams.chunkTriggerMinSize, + chunkSize: formatCreateCollectionParams.chunkSize, + paragraphChunkDeep: formatCreateCollectionParams.paragraphChunkDeep, + paragraphChunkMinSize: formatCreateCollectionParams.paragraphChunkMinSize, maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, - customReg: chunkSplitter ? [chunkSplitter] : [], + customReg: formatCreateCollectionParams.chunkSplitter + ? [formatCreateCollectionParams.chunkSplitter] + : [], backupParse }); return { chunks, - chunkSize, - indexSize: createCollectionParams.indexSize ?? getAutoIndexSize(dataset.vectorModel) + chunkSize: formatCreateCollectionParams.chunkSize, + indexSize: formatCreateCollectionParams.indexSize }; } @@ -147,12 +160,8 @@ export const createCollectionAndInsertData = async ({ return { chunks: [], - chunkSize: computeChunkSize({ - ...createCollectionParams, - trainingType, - llmModel: getLLMModel(dataset.agentModel) - }), - indexSize: createCollectionParams.indexSize ?? getAutoIndexSize(dataset.vectorModel) + chunkSize: formatCreateCollectionParams.chunkSize, + indexSize: formatCreateCollectionParams.indexSize }; })(); @@ -165,11 +174,9 @@ export const createCollectionAndInsertData = async ({ const fn = async (session: ClientSession) => { // 3. Create collection const { _id: collectionId } = await createOneCollection({ - ...createCollectionParams, + ...formatCreateCollectionParams, trainingType, - paragraphChunkDeep, chunkSize, - chunkSplitter, indexSize, hashRawText: rawText ? 
hashStr(rawText) : undefined, @@ -179,7 +186,7 @@ export const createCollectionAndInsertData = async ({ if (!dataset.autoSync && dataset.type === DatasetTypeEnum.websiteDataset) return undefined; if ( [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes( - createCollectionParams.type + formatCreateCollectionParams.type ) ) { return addDays(new Date(), 1); @@ -195,7 +202,7 @@ export const createCollectionAndInsertData = async ({ const { billId: newBillId } = await createTrainingUsage({ teamId, tmbId, - appName: createCollectionParams.name, + appName: formatCreateCollectionParams.name, billSource: UsageSourceEnum.training, vectorModel: getEmbeddingModel(dataset.vectorModel)?.name, agentModel: getLLMModel(dataset.agentModel)?.name, @@ -218,7 +225,7 @@ export const createCollectionAndInsertData = async ({ vlmModel: dataset.vlmModel, indexSize, mode: trainingMode, - prompt: createCollectionParams.qaPrompt, + prompt: formatCreateCollectionParams.qaPrompt, billId: traingBillId, data: chunks.map((item, index) => ({ ...item, diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts index 20b34e3bc..f2a4a9c1c 100644 --- a/packages/service/core/dataset/read.ts +++ b/packages/service/core/dataset/read.ts @@ -5,13 +5,14 @@ import { } from '@fastgpt/global/core/dataset/constants'; import { readFileContentFromMongo } from '../../common/file/gridfs/controller'; import { urlsFetch } from '../../common/string/cheerio'; -import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; +import { type TextSplitProps } from '@fastgpt/global/common/string/textSplitter'; import axios from 'axios'; import { readRawContentByFileBuffer } from '../../common/file/read/utils'; import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools'; import { getApiDatasetRequest } from './apiDataset'; import Papa from 'papaparse'; import type { ApiDatasetServerType } from '@fastgpt/global/core/dataset/apiDataset/type'; +import { text2Chunks } from '../../worker/function'; export const readFileRawTextByUrl = async ({ teamId, @@ -165,7 +166,7 @@ export const readApiServerFileContent = async ({ }); }; -export const rawText2Chunks = ({ +export const rawText2Chunks = async ({ rawText, chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize, chunkTriggerMinSize = 1000, @@ -182,12 +183,14 @@ export const rawText2Chunks = ({ backupParse?: boolean; tableParse?: boolean; -} & TextSplitProps): { - q: string; - a: string; - indexes?: string[]; - imageIdList?: string[]; -}[] => { +} & TextSplitProps): Promise< + { + q: string; + a: string; + indexes?: string[]; + imageIdList?: string[]; + }[] +> => { const parseDatasetBackup2Chunks = (rawText: string) => { const csvArr = Papa.parse(rawText).data as string[][]; @@ -233,7 +236,7 @@ export const rawText2Chunks = ({ } } - const { chunks } = splitText2Chunks({ + const { chunks } = await text2Chunks({ text: rawText, chunkSize, ...splitProps diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts index 990cfa427..7385d783e 100644 --- a/packages/service/core/dataset/training/controller.ts +++ b/packages/service/core/dataset/training/controller.ts @@ -112,24 +112,15 @@ export async function pushDataListToTrainingQueue({ // format q and a, remove empty char data = data.filter((item) => { - item.q = simpleText(item.q); - item.a = simpleText(item.a); - - item.indexes = item.indexes - ?.map((index) => { - return { - ...index, - text: 
simpleText(index.text) - }; - }) - .filter(Boolean); + const q = item.q || ''; + const a = item.a || ''; // filter repeat content - if (!item.imageId && !item.q) { + if (!item.imageId && !q) { return; } - const text = item.q + item.a; + const text = q + a; // Oversize llm tokens if (text.length > maxToken) { diff --git a/packages/service/support/wallet/usage/controller.ts b/packages/service/support/wallet/usage/controller.ts index ec840e35b..8ff311650 100644 --- a/packages/service/support/wallet/usage/controller.ts +++ b/packages/service/support/wallet/usage/controller.ts @@ -8,6 +8,8 @@ import { type CreateUsageProps } from '@fastgpt/global/support/wallet/usage/api'; import { i18nT } from '../../../../web/i18n/utils'; +import { formatModelChars2Points } from './utils'; +import { ModelTypeEnum } from '@fastgpt/global/core/ai/model'; export async function createUsage(data: CreateUsageProps) { try { @@ -67,6 +69,14 @@ export const createChatUsage = ({ return { totalPoints }; }; +export type DatasetTrainingMode = 'paragraph' | 'qa' | 'autoIndex' | 'imageIndex' | 'imageParse'; +export const datasetTrainingUsageIndexMap: Record = { + paragraph: 1, + qa: 2, + autoIndex: 3, + imageIndex: 4, + imageParse: 5 +}; export const createTrainingUsage = async ({ teamId, tmbId, @@ -108,6 +118,13 @@ export const createTrainingUsage = async ({ : []), ...(agentModel ? [ + { + moduleName: i18nT('account_usage:llm_paragraph'), + model: agentModel, + amount: 0, + inputTokens: 0, + outputTokens: 0 + }, { moduleName: i18nT('account_usage:qa'), model: agentModel, @@ -126,6 +143,13 @@ export const createTrainingUsage = async ({ : []), ...(vllmModel ? [ + { + moduleName: i18nT('account_usage:image_index'), + model: vllmModel, + amount: 0, + inputTokens: 0, + outputTokens: 0 + }, { moduleName: i18nT('account_usage:image_parse'), model: vllmModel, @@ -171,3 +195,43 @@ export const createPdfParseUsage = async ({ ] }); }; + +export const pushLLMTrainingUsage = async ({ + teamId, + tmbId, + model, + inputTokens, + outputTokens, + billId, + mode +}: { + teamId: string; + tmbId: string; + model: string; + inputTokens: number; + outputTokens: number; + billId: string; + mode: DatasetTrainingMode; +}) => { + const index = datasetTrainingUsageIndexMap[mode]; + + // Compute points + const { totalPoints } = formatModelChars2Points({ + model, + modelType: ModelTypeEnum.llm, + inputTokens, + outputTokens + }); + + concatUsage({ + billId, + teamId, + tmbId, + totalPoints, + inputTokens, + outputTokens, + listIndex: index + }); + + return { totalPoints }; +}; diff --git a/packages/service/worker/controller.ts b/packages/service/worker/controller.ts new file mode 100644 index 000000000..0b9db3717 --- /dev/null +++ b/packages/service/worker/controller.ts @@ -0,0 +1,18 @@ +import type { MessagePort } from 'worker_threads'; + +export const workerResponse = ({ + parentPort, + status, + data +}: { + parentPort: MessagePort | null; + status: 'success' | 'error'; + data: any; +}) => { + parentPort?.postMessage({ + type: status, + data: data + }); + + process.exit(); +}; diff --git a/packages/service/worker/function.ts b/packages/service/worker/function.ts new file mode 100644 index 000000000..6e1e76168 --- /dev/null +++ b/packages/service/worker/function.ts @@ -0,0 +1,24 @@ +import { + splitText2Chunks, + type SplitProps, + type SplitResponse +} from '@fastgpt/global/common/string/textSplitter'; +import { runWorker, WorkerNameEnum } from './utils'; +import type { ReadFileResponse } from './readFile/type'; +import { isTestEnv } from 
'@fastgpt/global/common/system/constants'; + +export const text2Chunks = (props: SplitProps) => { + // Test env, not run worker + if (isTestEnv) { + return splitText2Chunks(props); + } + return runWorker(WorkerNameEnum.text2Chunks, props); +}; + +export const readRawContentFromBuffer = (props: { + extension: string; + encoding: string; + buffer: Buffer; +}) => { + return runWorker(WorkerNameEnum.readFile, props); +}; diff --git a/packages/service/worker/htmlStr2Md/index.ts b/packages/service/worker/htmlStr2Md/index.ts index 22a998760..bc63c6d1b 100644 --- a/packages/service/worker/htmlStr2Md/index.ts +++ b/packages/service/worker/htmlStr2Md/index.ts @@ -1,19 +1,21 @@ import { parentPort } from 'worker_threads'; import { html2md } from './utils'; +import { workerResponse } from '../controller'; parentPort?.on('message', (params: { html: string }) => { try { const md = html2md(params?.html || ''); - parentPort?.postMessage({ - type: 'success', + workerResponse({ + parentPort, + status: 'success', data: md }); } catch (error) { - parentPort?.postMessage({ - type: 'error', + workerResponse({ + parentPort, + status: 'error', data: error }); } - process.exit(); }); diff --git a/packages/service/worker/readFile/index.ts b/packages/service/worker/readFile/index.ts index 40a55025a..78c3edc5b 100644 --- a/packages/service/worker/readFile/index.ts +++ b/packages/service/worker/readFile/index.ts @@ -7,6 +7,7 @@ import { readDocsFile } from './extension/docx'; import { readPptxRawText } from './extension/pptx'; import { readXlsxRawText } from './extension/xlsx'; import { readCsvRawText } from './extension/csv'; +import { workerResponse } from '../controller'; parentPort?.on('message', async (props: ReadRawTextProps) => { const read = async (params: ReadRawTextByBuffer) => { @@ -41,17 +42,16 @@ parentPort?.on('message', async (props: ReadRawTextProps) => { }; try { - parentPort?.postMessage({ - type: 'success', + workerResponse({ + parentPort, + status: 'success', data: await read(newProps) }); } catch (error) { - console.log(error); - parentPort?.postMessage({ - type: 'error', + workerResponse({ + parentPort, + status: 'error', data: error }); } - - process.exit(); }); diff --git a/packages/service/worker/text2Chunks/index.ts b/packages/service/worker/text2Chunks/index.ts new file mode 100644 index 000000000..9a9fc1147 --- /dev/null +++ b/packages/service/worker/text2Chunks/index.ts @@ -0,0 +1,14 @@ +import { parentPort } from 'worker_threads'; +import type { SplitProps } from '@fastgpt/global/common/string/textSplitter'; +import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; +import { workerResponse } from '../controller'; + +parentPort?.on('message', async (props: SplitProps) => { + const result = splitText2Chunks(props); + + workerResponse({ + parentPort, + status: 'success', + data: result + }); +}); diff --git a/packages/service/worker/utils.ts b/packages/service/worker/utils.ts index b7508def0..fb541ae90 100644 --- a/packages/service/worker/utils.ts +++ b/packages/service/worker/utils.ts @@ -6,7 +6,8 @@ export enum WorkerNameEnum { readFile = 'readFile', htmlStr2Md = 'htmlStr2Md', countGptMessagesTokens = 'countGptMessagesTokens', - systemPluginRun = 'systemPluginRun' + systemPluginRun = 'systemPluginRun', + text2Chunks = 'text2Chunks' } export const getSafeEnv = () => { diff --git a/packages/web/components/common/MySelect/index.tsx b/packages/web/components/common/MySelect/index.tsx index 6534a4d41..6a17f2b92 100644 --- a/packages/web/components/common/MySelect/index.tsx 
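The worker files above share one request/response shape; a short usage sketch, assuming runWorker resolves with the payload that workerResponse posts before the thread exits:

    import { text2Chunks } from '@fastgpt/service/worker/function';

    // In the test env this runs splitText2Chunks synchronously; otherwise it
    // spawns the text2Chunks worker thread so splitting never blocks the
    // main event loop.
    const { chunks, chars } = await text2Chunks({
      text: '# Title\n\nSome long document text...',
      chunkSize: 512,
      maxSize: 8000
    });
    console.log(chunks.length, chars);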
+++ b/packages/web/components/common/MySelect/index.tsx
@@ -151,8 +151,7 @@ const MySelect = (
         ? {
             ref: SelectedItemRef,
             color: 'primary.700',
-            bg: 'myGray.100',
-            fontWeight: '600'
+            bg: 'myGray.100'
           }
         : {
             color: 'myGray.900'
@@ -167,7 +166,7 @@ const MySelect = (
             display={'block'}
             mb={0.5}
           >
-            
+            
               {item.icon && (
                 
               )}
diff --git a/packages/web/i18n/en/account_usage.json b/packages/web/i18n/en/account_usage.json
index 6a07f8f05..a754ef15d 100644
--- a/packages/web/i18n/en/account_usage.json
+++ b/packages/web/i18n/en/account_usage.json
@@ -20,8 +20,10 @@
   "export_title": "Time,Members,Type,Project name,AI points",
   "feishu": "Feishu",
   "generation_time": "Generation time",
+  "image_index": "Image index",
   "image_parse": "Image tagging",
   "input_token_length": "input tokens",
+  "llm_paragraph": "LLM segmentation",
   "mcp": "MCP call",
   "member": "member",
   "member_name": "Member name",
diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json
index fa29d8b66..e693d1e9a 100644
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -45,6 +45,7 @@
   "core.dataset.import.Adjust parameters": "Adjust parameters",
   "custom_data_process_params": "Custom",
   "custom_data_process_params_desc": "Customize data processing rules",
+  "custom_split_char": "Separator",
   "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
   "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
   "data_error_amount": "{{errorAmount}} Group training exception",
@@ -117,6 +118,11 @@
   "insert_images_success": "The new picture is successfully added, and you need to wait for the training to be completed before it will be displayed.",
   "is_open_schedule": "Enable scheduled synchronization",
   "keep_image": "Keep the picture",
+  "llm_paragraph_mode": "LLM paragraph recognition (Beta)",
+  "llm_paragraph_mode_auto": "Automatic",
+  "llm_paragraph_mode_auto_desc": "Use the model to automatically recognize headings when the file content contains no Markdown headings.",
+  "llm_paragraph_mode_forbid": "Disabled",
+  "llm_paragraph_mode_forbid_desc": "Forcibly disable the model's automatic paragraph recognition",
   "loading": "Loading...",
   "max_chunk_size": "Maximum chunk size",
   "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
diff --git a/packages/web/i18n/zh-CN/account_usage.json b/packages/web/i18n/zh-CN/account_usage.json
index 13b6a1151..8befa05ab 100644
--- a/packages/web/i18n/zh-CN/account_usage.json
+++ b/packages/web/i18n/zh-CN/account_usage.json
@@ -20,8 +20,10 @@
   "export_title": "时间,成员,类型,项目名,AI 积分消耗",
   "feishu": "飞书",
   "generation_time": "生成时间",
+  "image_index": "图片索引",
   "image_parse": "图片标注",
   "input_token_length": "输入 tokens",
+  "llm_paragraph": "模型分段",
   "mcp": "MCP 调用",
   "member": "成员",
   "member_name": "成员名",
diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json
index eb9a41ea3..d785aef8b 100644
--- a/packages/web/i18n/zh-CN/dataset.json
+++ b/packages/web/i18n/zh-CN/dataset.json
@@ -45,6 +45,7 @@
   "core.dataset.import.Adjust parameters": "调整参数",
   "custom_data_process_params": "自定义",
   "custom_data_process_params_desc": 
"自定义设置数据处理规则", + "custom_split_char": "分隔符", "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。", "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引", "data_error_amount": "{{errorAmount}} 组训练异常", @@ -117,6 +118,11 @@ "insert_images_success": "新增图片成功,需等待训练完成才会展示", "is_open_schedule": "启用定时同步", "keep_image": "保留图片", + "llm_paragraph_mode": "模型识别段落(Beta)", + "llm_paragraph_mode_auto": "自动", + "llm_paragraph_mode_auto_desc": "当文件内容不包含 Markdown 标题时,启用模型自动识别标题。", + "llm_paragraph_mode_forbid": "禁用", + "llm_paragraph_mode_forbid_desc": "强制禁用模型自动识别段落", "loading": "加载中...", "max_chunk_size": "最大分块大小", "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。", diff --git a/packages/web/i18n/zh-Hant/account_usage.json b/packages/web/i18n/zh-Hant/account_usage.json index fe1145801..db2f54fbd 100644 --- a/packages/web/i18n/zh-Hant/account_usage.json +++ b/packages/web/i18n/zh-Hant/account_usage.json @@ -20,8 +20,10 @@ "export_title": "時間,成員,類型,項目名,AI 積分消耗", "feishu": "飛書", "generation_time": "生成時間", + "image_index": "圖片索引", "image_parse": "圖片標註", "input_token_length": "輸入 tokens", + "llm_paragraph": "模型分段", "mcp": "MCP 調用", "member": "成員", "member_name": "成員名", diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json index d5bc64a43..6d5d1789e 100644 --- a/packages/web/i18n/zh-Hant/dataset.json +++ b/packages/web/i18n/zh-Hant/dataset.json @@ -44,6 +44,7 @@ "core.dataset.import.Adjust parameters": "調整參數", "custom_data_process_params": "自訂", "custom_data_process_params_desc": "自訂資料處理規則", + "custom_split_char": "分隔符", "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的資料,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.”表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如:* () [] {} 等。", "data_amount": "{{dataAmount}} 組資料,{{indexAmount}} 組索引", "data_error_amount": "{{errorAmount}} 組訓練異常", @@ -116,6 +117,11 @@ "insert_images_success": "新增圖片成功,需等待訓練完成才會展示", "is_open_schedule": "啟用定時同步", "keep_image": "保留圖片", + "llm_paragraph_mode": "模型識別段落(Beta)", + "llm_paragraph_mode_auto": "自動", + "llm_paragraph_mode_auto_desc": "當文件內容不包含 Markdown 標題時,啟用模型自動識別標題。", + "llm_paragraph_mode_forbid": "禁用", + "llm_paragraph_mode_forbid_desc": "強制禁用模型自動識別段落", "loading": "加載中...", "max_chunk_size": "最大分塊大小", "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 145765dff..2d422b810 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -89,6 +89,9 @@ importers: json5: specifier: ^2.2.3 version: 2.2.3 + lodash: + specifier: ^4.17.21 + version: 4.17.21 nanoid: specifier: ^5.1.3 version: 5.1.3 @@ -108,6 +111,9 @@ importers: '@types/js-yaml': specifier: ^4.0.9 version: 4.0.9 + '@types/lodash': + specifier: ^4.14.191 + version: 4.17.16 '@types/node': specifier: 20.14.0 version: 20.14.0 diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx index 3c5d7f5c1..ace2df1a9 100644 --- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx +++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx @@ -9,25 +9,14 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep'; import MyDivider from '@fastgpt/web/components/common/MyDivider'; import React from 'react'; import { Box, Link, Input, Button, ModalBody, ModalFooter, Stack } from '@chakra-ui/react'; -import { - DataChunkSplitModeEnum, - DatasetCollectionDataProcessModeEnum -} from 
'@fastgpt/global/core/dataset/constants'; -import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; import { useContextSelector } from 'use-context-selector'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; -import CollectionChunkForm, { - collectionChunkForm2StoreChunkData, - type CollectionChunkFormType -} from '../Form/CollectionChunkForm'; -import { - getAutoIndexSize, - getLLMDefaultChunkSize -} from '@fastgpt/global/core/dataset/training/utils'; +import CollectionChunkForm, { type CollectionChunkFormType } from '../Form/CollectionChunkForm'; import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type'; import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm'; import { defaultFormData } from '../Import/Context'; +import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils'; export type WebsiteConfigFormType = { websiteConfig: { @@ -80,7 +69,7 @@ const WebsiteConfigModal = ({ const form = useForm({ defaultValues: { - trainingType: chunkSettings?.trainingType, + trainingType: chunkSettings?.trainingType || defaultFormData.trainingType, chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType, chunkTriggerMinSize: @@ -204,9 +193,9 @@ const WebsiteConfigModal = ({ form.handleSubmit((data) => onSuccess({ websiteConfig: websiteInfoGetValues(), - chunkSettings: collectionChunkForm2StoreChunkData({ + chunkSettings: computedCollectionChunkSettings({ ...data, - agentModel: datasetDetail.agentModel, + llmModel: datasetDetail.agentModel, vectorModel: datasetDetail.vectorModel }) }) diff --git a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx index b7d9d31be..3ac7164cb 100644 --- a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx @@ -17,7 +17,7 @@ import { } from '@chakra-ui/react'; import MyIcon from '@fastgpt/web/components/common/Icon'; import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio'; -import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; +import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { DataChunkSplitModeEnum, @@ -133,6 +133,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn { const list = { @@ -362,11 +363,35 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn { setValue('chunkSplitMode', e); }} + fontSize={'md'} /> {chunkSplitMode === DataChunkSplitModeEnum.paragraph && ( <> - + + {t('dataset:llm_paragraph_mode')} + + size={'sm'} + bg={'myGray.50'} + value={paragraphChunkAIMode} + onChange={(e) => { + setValue('paragraphChunkAIMode', e); + }} + list={[ + { + label: t('dataset:llm_paragraph_mode_forbid'), + value: ParagraphChunkAIModeEnum.forbid, + description: t('dataset:llm_paragraph_mode_forbid_desc') + }, + { + label: t('dataset:llm_paragraph_mode_auto'), + value: ParagraphChunkAIModeEnum.auto, + description: t('dataset:llm_paragraph_mode_auto_desc') + } + ]} + /> + + {t('dataset:paragraph_max_deep')} - + {t('dataset:max_chunk_size')} + {t('dataset:chunk_size')} - - - list={customSplitList} - size={'sm'} - bg={'myGray.50'} - 
value={customListSelectValue} - h={'32px'} - onChange={(val) => { - if (val === 'Other') { - setValue('chunkSplitter', ''); - } else { - setValue('chunkSplitter', val); - } - setCustomListSelectValue(val); - }} - /> - - {customListSelectValue === 'Other' && ( - - )} - + + {t('dataset:custom_split_char')} + + + + list={customSplitList} + size={'sm'} + bg={'myGray.50'} + value={customListSelectValue} + h={'32px'} + onChange={(val) => { + if (val === 'Other') { + setValue('chunkSplitter', ''); + } else { + setValue('chunkSplitter', val); + } + setCustomListSelectValue(val); + }} + /> + + {customListSelectValue === 'Other' && ( + + )} + + )} {trainingType === DatasetCollectionDataProcessModeEnum.chunk && ( - - + + {t('dataset:index_size')} - + bg={'myGray.50'} list={indexSizeSeletorList} @@ -490,7 +518,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn + {t('common:core.dataset.collection.QA Prompt')} { - const { - trainingType, - autoIndexes, - chunkSettingMode, - chunkSize, - chunkSplitter, - indexSize, - qaPrompt - } = data; - - // 根据处理方式,获取 auto 和 custom 的参数。 - const trainingModeSize: { - autoChunkSize: number; - autoIndexSize: number; - chunkSize: number; - indexSize: number; - } = (() => { - if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { - return { - autoChunkSize: getLLMDefaultChunkSize(agentModel), - autoIndexSize: getMaxIndexSize(vectorModel), - chunkSize, - indexSize: getMaxIndexSize(vectorModel) - }; - } else if (autoIndexes) { - return { - autoChunkSize: chunkAutoChunkSize, - autoIndexSize: getAutoIndexSize(vectorModel), - chunkSize, - indexSize - }; - } else { - return { - autoChunkSize: chunkAutoChunkSize, - autoIndexSize: getAutoIndexSize(vectorModel), - chunkSize, - indexSize - }; - } - })(); - - // 获取真实参数 - const { - chunkSize: formatChunkIndex, - indexSize: formatIndexSize, - chunkSplitter: formatChunkSplitter - } = (() => { - if (chunkSettingMode === ChunkSettingModeEnum.auto) { - return { - chunkSize: trainingModeSize.autoChunkSize, - indexSize: trainingModeSize.autoIndexSize, - chunkSplitter: '' - }; - } else { - return { - chunkSize: trainingModeSize.chunkSize, - indexSize: trainingModeSize.indexSize, - chunkSplitter - }; - } - })(); - - return { - ...data, - chunkSize: formatChunkIndex, - indexSize: formatIndexSize, - chunkSplitter: formatChunkSplitter, - qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? 
qaPrompt : undefined - }; -}; diff --git a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx index a55ccf4c7..c2cbb9d92 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx @@ -52,7 +52,7 @@ export const defaultFormData: ImportFormType = { chunkSettingMode: ChunkSettingModeEnum.auto, chunkSplitMode: DataChunkSplitModeEnum.paragraph, - paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto, + paragraphChunkAIMode: ParagraphChunkAIModeEnum.forbid, paragraphChunkDeep: 5, paragraphChunkMinSize: 100, @@ -198,10 +198,10 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode const vectorModel = datasetDetail.vectorModel; const processParamsForm = useForm({ - defaultValues: { + defaultValues: (() => ({ ...defaultFormData, indexSize: getAutoIndexSize(vectorModel) - } + }))() }); const [sources, setSources] = useState([]); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx index 56dd189f8..9116f6495 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx @@ -17,7 +17,6 @@ import MyBox from '@fastgpt/web/components/common/MyBox'; import Markdown from '@/components/Markdown'; import { useToast } from '@fastgpt/web/hooks/useToast'; import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; -import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm'; const PreviewData = () => { const { t } = useTranslation(); @@ -37,11 +36,7 @@ const PreviewData = () => { async () => { if (!previewFile) return { chunks: [], total: 0 }; - const chunkData = collectionChunkForm2StoreChunkData({ - ...processParamsForm.getValues(), - vectorModel: datasetDetail.vectorModel, - agentModel: datasetDetail.agentModel - }); + const chunkData = processParamsForm.getValues(); if (importSource === ImportDataSourceEnum.fileCustom) { const chunkSplitter = processParamsForm.getValues('chunkSplitter'); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx index bffc8e16e..c992c5141 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx @@ -37,7 +37,6 @@ import { useContextSelector } from 'use-context-selector'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { DatasetImportContext, type ImportFormType } from '../Context'; import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d'; -import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm'; const Upload = () => { const { t } = useTranslation(); @@ -82,12 +81,6 @@ const Upload = () => { const { runAsync: startUpload, loading: isLoading } = useRequest2( async ({ customPdfParse, webSelector, ...data }: ImportFormType) => { - const chunkData = collectionChunkForm2StoreChunkData({ - ...data, - vectorModel: datasetDetail.vectorModel, - agentModel: datasetDetail.agentModel - }); - if (sources.length === 0) return; const filterWaitingSources = 
sources.filter((item) => item.createStatus === 'waiting'); @@ -108,7 +101,7 @@ const Upload = () => { const commonParams: ApiCreateDatasetCollectionParams & { name: string; } = { - ...chunkData, + ...data, parentId, datasetId: datasetDetail._id, name: item.sourceName, diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts index 540ec199d..b6a8c1dfb 100644 --- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts +++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts @@ -1,7 +1,3 @@ -import { - ChunkSettingModeEnum, - DatasetCollectionDataProcessModeEnum -} from '@fastgpt/global/core/dataset/constants'; import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read'; import { NextAPI } from '@/service/middleware/entry'; @@ -13,13 +9,11 @@ import { import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { - computeChunkSize, - computeChunkSplitter, - computeParagraphChunkDeep, + computedCollectionChunkSettings, getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; -import { getLLMModel } from '@fastgpt/service/core/ai/model'; +import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model'; import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type'; export type PostPreviewFilesChunksProps = ChunkSettingsType & { @@ -52,22 +46,12 @@ async function handler( sourceId, customPdfParse = false, - trainingType = DatasetCollectionDataProcessModeEnum.chunk, - - chunkTriggerType, - chunkTriggerMinSize, - - chunkSettingMode = ChunkSettingModeEnum.auto, - chunkSplitMode, - paragraphChunkDeep, - paragraphChunkMinSize, - chunkSize, - chunkSplitter, - overlapRatio, selector, datasetId, - externalFileId + externalFileId, + + ...chunkSettings } = req.body; if (!sourceId) { @@ -97,22 +81,10 @@ async function handler( return Promise.reject(CommonErrEnum.unAuthFile); } - chunkSize = computeChunkSize({ - trainingType, - chunkSettingMode, - chunkSplitMode, - chunkSize, - llmModel: getLLMModel(dataset.agentModel) - }); - chunkSplitter = computeChunkSplitter({ - chunkSettingMode, - chunkSplitMode, - chunkSplitter - }); - paragraphChunkDeep = computeParagraphChunkDeep({ - chunkSettingMode, - chunkSplitMode, - paragraphChunkDeep + const formatChunkSettings = computedCollectionChunkSettings({ + ...chunkSettings, + llmModel: getLLMModel(dataset.agentModel), + vectorModel: getEmbeddingModel(dataset.vectorModel) }); const { rawText } = await readDatasetSourceRawText({ @@ -126,16 +98,16 @@ async function handler( apiDatasetServer: dataset.apiDatasetServer }); - const chunks = rawText2Chunks({ + const chunks = await rawText2Chunks({ rawText, - chunkTriggerType, - chunkTriggerMinSize, - chunkSize, - paragraphChunkDeep, - paragraphChunkMinSize, + chunkTriggerType: formatChunkSettings.chunkTriggerType, + chunkTriggerMinSize: formatChunkSettings.chunkTriggerMinSize, + chunkSize: formatChunkSettings.chunkSize, + paragraphChunkDeep: formatChunkSettings.paragraphChunkDeep, + paragraphChunkMinSize: formatChunkSettings.paragraphChunkMinSize, maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), overlapRatio, - customReg: chunkSplitter ? 
[chunkSplitter] : [] + customReg: formatChunkSettings.chunkSplitter ? [formatChunkSettings.chunkSplitter] : [] }); return { diff --git a/projects/app/src/pages/api/core/dataset/update.ts b/projects/app/src/pages/api/core/dataset/update.ts index 7ea50dd42..02b3aaa7d 100644 --- a/projects/app/src/pages/api/core/dataset/update.ts +++ b/projects/app/src/pages/api/core/dataset/update.ts @@ -40,6 +40,8 @@ import { isEqual } from 'lodash'; import { addOperationLog } from '@fastgpt/service/support/operationLog/addOperationLog'; import { OperationLogEventEnum } from '@fastgpt/global/support/operationLog/constants'; import { getI18nDatasetType } from '@fastgpt/service/support/operationLog/util'; +import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model'; +import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils'; export type DatasetUpdateQuery = {}; export type DatasetUpdateResponse = any; @@ -59,7 +61,7 @@ async function handler( req: ApiRequestProps, _res: ApiResponseType ): Promise { - const { + let { id, parentId, name, @@ -89,6 +91,14 @@ async function handler( let targetName = ''; + chunkSettings = chunkSettings + ? computedCollectionChunkSettings({ + ...chunkSettings, + llmModel: getLLMModel(dataset.agentModel), + vectorModel: getEmbeddingModel(dataset.vectorModel) + }) + : undefined; + if (isMove) { if (parentId) { // move to a folder, check the target folder's permission diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts index d076f78e3..336455c24 100644 --- a/projects/app/src/service/core/dataset/data/controller.ts +++ b/projects/app/src/service/core/dataset/data/controller.ts @@ -16,9 +16,9 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { type ClientSession } from '@fastgpt/service/common/mongo'; import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; -import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken'; import { deleteDatasetImage } from '@fastgpt/service/core/dataset/image/controller'; +import { text2Chunks } from '@fastgpt/service/worker/function'; const formatIndexes = async ({ indexes = [], @@ -40,7 +40,7 @@ const formatIndexes = async ({ }[] > => { /* get dataset data default index */ - const getDefaultIndex = ({ + const getDefaultIndex = async ({ q = '', a, indexSize @@ -49,13 +49,15 @@ const formatIndexes = async ({ a?: string; indexSize: number; }) => { - const qChunks = splitText2Chunks({ - text: q, - chunkSize: indexSize, - maxSize: maxIndexSize - }).chunks; + const qChunks = ( + await text2Chunks({ + text: q, + chunkSize: indexSize, + maxSize: maxIndexSize + }) + ).chunks; const aChunks = a - ? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks + ? 
(await text2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize })).chunks : []; return [ @@ -80,7 +82,7 @@ const formatIndexes = async ({ .filter((item) => !!item.text.trim()); // Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds - const defaultIndexes = getDefaultIndex({ q, a, indexSize }); + const defaultIndexes = await getDefaultIndex({ q, a, indexSize }); const concatDefaultIndexes = defaultIndexes.map((item) => { const oldIndex = indexes!.find((index) => index.text === item.text); @@ -114,11 +116,13 @@ const formatIndexes = async ({ // If oversize tokens, split it const tokens = await countPromptTokens(item.text); if (tokens > maxIndexSize) { - const splitText = splitText2Chunks({ - text: item.text, - chunkSize: indexSize, - maxSize: maxIndexSize - }).chunks; + const splitText = ( + await text2Chunks({ + text: item.text, + chunkSize: indexSize, + maxSize: maxIndexSize + }) + ).chunks; return splitText.map((text) => ({ text, type: item.type diff --git a/projects/app/src/service/core/dataset/queues/datasetParse.ts b/projects/app/src/service/core/dataset/queues/datasetParse.ts index eccd4a8b5..f68abfbb4 100644 --- a/projects/app/src/service/core/dataset/queues/datasetParse.ts +++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts @@ -1,6 +1,6 @@ /* Dataset collection source parse, not max size. */ -import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; +import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; import { DatasetCollectionDataProcessModeEnum, DatasetCollectionTypeEnum, @@ -29,7 +29,7 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; import { hashStr } from '@fastgpt/global/common/string/tools'; import { POST } from '@fastgpt/service/common/api/plusRequest'; -import { deleteRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/controller'; +import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller'; const requestLLMPargraph = async ({ rawText, @@ -42,13 +42,11 @@ const requestLLMPargraph = async ({ billId: string; paragraphChunkAIMode: ParagraphChunkAIModeEnum; }) => { - return { - resultText: rawText, - totalInputTokens: 0, - totalOutputTokens: 0 - }; - - if (!global.feConfigs?.isPlus || !paragraphChunkAIMode) { + if ( + !global.feConfigs?.isPlus || + !paragraphChunkAIMode || + paragraphChunkAIMode === ParagraphChunkAIModeEnum.forbid + ) { return { resultText: rawText, totalInputTokens: 0, @@ -57,16 +55,16 @@ const requestLLMPargraph = async ({ } // Check is markdown text(Include 1 group of title) - // if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) { - // const isMarkdown = /^(#+)\s/.test(rawText); - // if (isMarkdown) { - // return { - // resultText: rawText, - // totalInputTokens: 0, - // totalOutputTokens: 0 - // }; - // } - // } + if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) { + const isMarkdown = /^(#+)\s/.test(rawText); + if (isMarkdown) { + return { + resultText: rawText, + totalInputTokens: 0, + totalOutputTokens: 0 + }; + } + } const data = await POST<{ resultText: string; @@ -226,15 +224,25 @@ export const datasetParseQueue = async (): Promise => { }); // 3. 
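Usage rows for training are positional: pushLLMTrainingUsage maps each mode to the row that createTrainingUsage inserted, per the datasetTrainingUsageIndexMap shown earlier in this patch. A sketch with placeholder values (all ids and the model name below are illustrative):

    import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';

    // mode 'paragraph' resolves to listIndex 1, the 'llm_paragraph' row;
    // 'qa' -> 2, 'autoIndex' -> 3, 'imageIndex' -> 4, 'imageParse' -> 5.
    await pushLLMTrainingUsage({
      teamId: '<teamId>',
      tmbId: '<tmbId>',
      model: '<agent model name>',
      inputTokens: 1200,
      outputTokens: 300,
      billId: '<billId>',
      mode: 'paragraph'
    });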
diff --git a/projects/app/src/service/core/dataset/queues/datasetParse.ts b/projects/app/src/service/core/dataset/queues/datasetParse.ts
index eccd4a8b5..f68abfbb4 100644
--- a/projects/app/src/service/core/dataset/queues/datasetParse.ts
+++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts
@@ -1,6 +1,6 @@
 /* Dataset collection source parse, not max size. */
-import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
+import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
 import {
   DatasetCollectionDataProcessModeEnum,
   DatasetCollectionTypeEnum,
@@ -29,7 +29,7 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
 import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { POST } from '@fastgpt/service/common/api/plusRequest';
-import { deleteRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/controller';
+import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
 
 const requestLLMPargraph = async ({
   rawText,
   model,
   billId,
   paragraphChunkAIMode
@@ -42,13 +42,11 @@ const requestLLMPargraph = async ({
   billId: string;
   paragraphChunkAIMode: ParagraphChunkAIModeEnum;
 }) => {
-  return {
-    resultText: rawText,
-    totalInputTokens: 0,
-    totalOutputTokens: 0
-  };
-
-  if (!global.feConfigs?.isPlus || !paragraphChunkAIMode) {
+  if (
+    !global.feConfigs?.isPlus ||
+    !paragraphChunkAIMode ||
+    paragraphChunkAIMode === ParagraphChunkAIModeEnum.forbid
+  ) {
     return {
       resultText: rawText,
       totalInputTokens: 0,
@@ -57,16 +55,16 @@ const requestLLMPargraph = async ({
   }
 
   // Check whether the text is already markdown (contains at least one group of headings)
-  // if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
-  //   const isMarkdown = /^(#+)\s/.test(rawText);
-  //   if (isMarkdown) {
-  //     return {
-  //       resultText: rawText,
-  //       totalInputTokens: 0,
-  //       totalOutputTokens: 0
-  //     };
-  //   }
-  // }
+  if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
+    const isMarkdown = /^(#+)\s/.test(rawText);
+    if (isMarkdown) {
+      return {
+        resultText: rawText,
+        totalInputTokens: 0,
+        totalOutputTokens: 0
+      };
+    }
+  }
 
   const data = await POST<{
     resultText: string;
@@ -226,15 +224,25 @@ export const datasetParseQueue = async (): Promise => {
     });
 
     // 3. LLM Paragraph
-    const { resultText } = await requestLLMPargraph({
+    const { resultText, totalInputTokens, totalOutputTokens } = await requestLLMPargraph({
       rawText,
       model: dataset.agentModel,
       billId: data.billId,
       paragraphChunkAIMode: collection.paragraphChunkAIMode
     });
+    // Push usage
+    pushLLMTrainingUsage({
+      teamId: data.teamId,
+      tmbId: data.tmbId,
+      model: dataset.agentModel,
+      inputTokens: totalInputTokens,
+      outputTokens: totalOutputTokens,
+      billId: data.billId,
+      mode: 'paragraph'
+    });
 
     // 4. Chunk split
-    const chunks = rawText2Chunks({
+    const chunks = await rawText2Chunks({
       rawText: resultText,
       chunkTriggerType: collection.chunkTriggerType,
       chunkTriggerMinSize: collection.chunkTriggerMinSize,
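Condensed, the gate that requestLLMPargraph now applies before calling the plus endpoint looks like the predicate below. The enum values mirror the diff; the helper itself is an illustrative distillation, not code from the repo:

enum ParagraphChunkAIModeEnum {
  auto = 'auto',
  force = 'force',
  forbid = 'forbid'
}

export const shouldRequestLLMParagraph = ({
  rawText,
  paragraphChunkAIMode,
  isPlus
}: {
  rawText: string;
  paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
  isPlus: boolean;
}): boolean => {
  // Commercial-only feature; an unset mode or 'forbid' skips the LLM entirely.
  if (
    !isPlus ||
    !paragraphChunkAIMode ||
    paragraphChunkAIMode === ParagraphChunkAIModeEnum.forbid
  ) {
    return false;
  }
  // In auto mode, text that already opens with a markdown heading keeps its own structure.
  if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto && /^(#+)\s/.test(rawText)) {
    return false;
  }
  // 'force' (and non-markdown 'auto') falls through to the LLM paragraph request.
  return true;
};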
diff --git a/projects/app/src/service/core/dataset/queues/generateQA.ts b/projects/app/src/service/core/dataset/queues/generateQA.ts
index e4fb1d355..e7b5c6b6f 100644
--- a/projects/app/src/service/core/dataset/queues/generateQA.ts
+++ b/projects/app/src/service/core/dataset/queues/generateQA.ts
@@ -1,10 +1,9 @@
 import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
-import { pushQAUsage } from '@/service/support/wallet/usage/push';
+import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
 import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { createChatCompletion } from '@fastgpt/service/core/ai/config';
 import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d';
 import { addLog } from '@fastgpt/service/common/system/log';
-import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { replaceVariable } from '@fastgpt/global/common/string/tools';
 import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
 import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';
@@ -24,6 +23,7 @@ import {
   getLLMMaxChunkSize
 } from '@fastgpt/global/core/dataset/training/utils';
 import { getErrText } from '@fastgpt/global/common/error/utils';
+import { text2Chunks } from '@fastgpt/service/worker/function';
 
 const reduceQueue = () => {
   global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -144,7 +144,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
       const inputTokens = usage?.prompt_tokens || (await countGptMessagesTokens(messages));
       const outputTokens = usage?.completion_tokens || (await countPromptTokens(answer));
 
-      const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // Formatted QA pairs
+      const qaArr = await formatSplitText({ answer, rawText: text, llmModel: modelData }); // Formatted QA pairs
 
       // get vector and insert
       await pushDataListToTrainingQueueByCollectionId({
@@ -163,13 +163,14 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
       await MongoDatasetTraining.findByIdAndDelete(data._id);
 
       // add bill
-      pushQAUsage({
+      pushLLMTrainingUsage({
         teamId: data.teamId,
         tmbId: data.tmbId,
         inputTokens,
         outputTokens,
         billId: data.billId,
-        model: modelData.model
+        model: modelData.model,
+        mode: 'qa'
       });
       addLog.info(`[QA Queue] Finish`, {
         time: Date.now() - startTime,
@@ -196,7 +197,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
 }
 
 // Format QA answer
-function formatSplitText({
+async function formatSplitText({
   answer,
   rawText,
   llmModel
@@ -223,7 +224,7 @@ function formatSplitText({
 
   // Empty result: split the chunk directly
   if (result.length === 0) {
-    const { chunks } = splitText2Chunks({
+    const { chunks } = await text2Chunks({
       text: rawText,
       chunkSize: chunkAutoChunkSize,
       maxSize: getLLMMaxChunkSize(llmModel)
diff --git a/projects/app/src/service/support/wallet/usage/push.ts b/projects/app/src/service/support/wallet/usage/push.ts
index 33997582d..d8b2e5e16 100644
--- a/projects/app/src/service/support/wallet/usage/push.ts
+++ b/projects/app/src/service/support/wallet/usage/push.ts
@@ -5,42 +5,6 @@ import { i18nT } from '@fastgpt/web/i18n/utils';
 import { ModelTypeEnum } from '@fastgpt/global/core/ai/model';
 import { getDefaultTTSModel } from '@fastgpt/service/core/ai/model';
 
-export const pushQAUsage = async ({
-  teamId,
-  tmbId,
-  model,
-  inputTokens,
-  outputTokens,
-  billId
-}: {
-  teamId: string;
-  tmbId: string;
-  model: string;
-  inputTokens: number;
-  outputTokens: number;
-  billId: string;
-}) => {
-  // Calculate the price
-  const { totalPoints } = formatModelChars2Points({
-    model,
-    modelType: ModelTypeEnum.llm,
-    inputTokens,
-    outputTokens
-  });
-
-  concatUsage({
-    billId,
-    teamId,
-    tmbId,
-    totalPoints,
-    inputTokens,
-    outputTokens,
-    listIndex: 1
-  });
-
-  return { totalPoints };
-};
-
 export const pushGenerateVectorUsage = ({
   billId,
   teamId,
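With pushQAUsage deleted above, both training queues bill through a single pushLLMTrainingUsage helper whose mode flag ('paragraph' in datasetParse.ts, 'qa' in generateQA.ts) only distinguishes the usage item. A sketch of its call-site-facing shape, inferred from the two callers in this patch; the body is a stand-in, since the real pricing goes through formatModelChars2Points and concatUsage:

type LLMTrainingUsageParams = {
  teamId: string;
  tmbId: string;
  model: string;
  inputTokens: number;
  outputTokens: number;
  billId: string;
  mode: 'paragraph' | 'qa';
};

export const pushLLMTrainingUsage = ({
  teamId,
  tmbId,
  model,
  inputTokens,
  outputTokens,
  billId,
  mode
}: LLMTrainingUsageParams) => {
  // Stand-in pricing: the real helper converts tokens to points per model.
  const totalPoints = (inputTokens + outputTokens) / 1000;
  // Stand-in sink: the real helper concatenates an item onto the bill for billId.
  console.log(`[training usage:${mode}]`, { teamId, tmbId, model, billId, totalPoints });
  return { totalPoints };
};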
diff --git a/test/cases/function/packages/service/core/dataset/textSplitter.test.ts b/test/cases/function/packages/service/core/dataset/textSplitter.test.ts
index d1bcd79ba..ef61821c9 100644
--- a/test/cases/function/packages/service/core/dataset/textSplitter.test.ts
+++ b/test/cases/function/packages/service/core/dataset/textSplitter.test.ts
@@ -16,7 +16,7 @@ const formatResult = (result: string[]) => {
 };
 
 // Max-size chunk test: below max size, no split
-it(`Test splitText2Chunks 1`, () => {
+it(`Test splitText2Chunks 1`, async () => {
   const mock = {
     text: `# A
 
@@ -61,7 +61,7 @@ dsgsgfsgs22sddddddd`
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
     chunkTriggerMinSize: 1000,
@@ -72,7 +72,7 @@ dsgsgfsgs22sddddddd`
   expect(formatChunks(data)).toEqual(formatResult(mock.result));
 });
 // Max-size chunk test: above max size, split
-it(`Test splitText2Chunks 2`, () => {
+it(`Test splitText2Chunks 2`, async () => {
   const mock = {
     text: `# A
 
@@ -122,7 +122,7 @@ dsgsgfsgs22sddddddd`
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
     chunkTriggerMinSize: 10,
@@ -135,7 +135,7 @@ dsgsgfsgs22sddddddd`
 });
 
 // Min-size chunk test: above min size, no split
-it(`Test splitText2Chunks 3`, () => {
+it(`Test splitText2Chunks 3`, async () => {
   const mock = {
     text: `# A
 
@@ -179,7 +179,7 @@ it(`Test splitText2Chunks 3`, () => {
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
     chunkTriggerMinSize: 1000,
@@ -191,7 +191,7 @@ it(`Test splitText2Chunks 3`, () => {
   expect(formatChunks(data)).toEqual(formatResult(mock.result));
 });
 // Min-size chunk test: below min size, split
-it(`Test splitText2Chunks 4`, () => {
+it(`Test splitText2Chunks 4`, async () => {
   const mock = {
     text: `# A
 
@@ -241,7 +241,7 @@ dsgsgfsgs22sddddddd`,
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
     chunkTriggerMinSize: 10,
@@ -254,7 +254,7 @@ dsgsgfsgs22sddddddd`,
 });
 
 // Force-chunk test: below both min and max size
-it(`Test splitText2Chunks 5`, () => {
+it(`Test splitText2Chunks 5`, async () => {
   const mock = {
     text: `# A
 
@@ -304,7 +304,7 @@ dsgsgfsgs22sddddddd`,
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
     chunkTriggerMinSize: 1000,
@@ -317,7 +317,7 @@ dsgsgfsgs22sddddddd`,
 });
 
 // Force-chunk test: above min size
-it(`Test splitText2Chunks 6`, () => {
+it(`Test splitText2Chunks 6`, async () => {
   const mock = {
     text: `# A
 
@@ -367,7 +367,7 @@ dsgsgfsgs22sddddddd`,
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
     chunkTriggerMinSize: 10,