feat: custom dataset split sign (#4221)

* feat: custom dataset split sign * feat: custom dataset split sign
2025-10-15 07:31:19 +00:00 · 2025-03-18 23:15:20 +08:00
parent cb29076e5b
commit ec30d79286
9 changed files with 121 additions and 26 deletions
--- a/packages/global/common/string/textSplitter.ts
+++ b/packages/global/common/string/textSplitter.ts
@@ -1,5 +1,4 @@
 import { getErrText } from '../error/utils';
-import { replaceRegChars } from './tools';

 export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';

@@ -115,9 +114,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
  // The larger maxLen is, the next sentence is less likely to trigger splitting
  const markdownIndex = 4;
  const forbidOverlapIndex = 8;
-  const stepReges: { reg: RegExp; maxLen: number }[] = [
+
+  const stepReges: { reg: RegExp | string; maxLen: number }[] = [
    ...customReg.map((text) => ({
-      reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
+      reg: text.replaceAll('\\n', '\n'),
      maxLen: chunkLen * 1.4
    })),
    { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
@@ -161,17 +161,32 @@ const commonSplit = (props: SplitProps): SplitResponse => {

    const { reg } = stepReges[step];

-    const splitTexts = text
-      .replace(
+    const replaceText = (() => {
+      if (typeof reg === 'string') {
+        let tmpText = text;
+        reg.split('|').forEach((itemReg) => {
+          tmpText = tmpText.replaceAll(
+            itemReg,
+            (() => {
+              if (isCustomStep) return splitMarker;
+              if (isMarkdownSplit) return `${splitMarker}$1`;
+              return `$1${splitMarker}`;
+            })()
+          );
+        });
+        return tmpText;
+      }
+
+      return text.replace(
        reg,
        (() => {
          if (isCustomStep) return splitMarker;
          if (isMarkdownSplit) return `${splitMarker}$1`;
          return `$1${splitMarker}`;
        })()
-      )
-      .split(`${splitMarker}`)
-      .filter((part) => part.trim());
+      );
+    })();
+    const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim());

    return splitTexts
      .map((text) => {
--- a/packages/web/i18n/en/common.json
+++ b/packages/web/i18n/en/common.json
@@ -570,7 +570,6 @@
  "core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules",
  "core.dataset.import.Custom prompt": "Custom Prompt",
  "core.dataset.import.Custom split char": "Custom Separator",
-  "core.dataset.import.Custom split char Tips": "Allows you to segment based on custom separators. Usually used for pre-processed data, using specific separators for precise segmentation.",
  "core.dataset.import.Custom text": "Custom Text",
  "core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset",
  "core.dataset.import.Data process params": "Data Processing Parameters",
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -25,6 +25,7 @@
  "core.dataset.import.Adjust parameters": "Adjust parameters",
  "custom_data_process_params": "Custom",
  "custom_data_process_params_desc": "Customize data processing rules",
+  "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for pre-processed data, using specific separators for precise chunking. \nYou can use the | symbol to specify multiple separators, for example: \"。|.\" matches both the Chinese and the English period.\n\nTry to avoid special regular-expression characters, such as: * () [] {}, etc.",
  "data.ideal_chunk_length": "ideal block length",
  "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
  "data_index_num": "Index {{index}}",
@@ -86,6 +87,14 @@
  "retain_collection": "Adjust Training Parameters",
  "retrain_task_submitted": "The retraining task has been submitted",
  "same_api_collection": "The same API set exists",
+  "split_sign_break": "1 newline character",
+  "split_sign_break2": "2 newline characters",
+  "split_sign_custom": "Customize",
+  "split_sign_exclamatiob": "exclamation mark",
+  "split_sign_null": "Not set",
+  "split_sign_period": "period",
+  "split_sign_question": "question mark",
+  "split_sign_semicolon": "semicolon",
  "start_sync_website_tip": "Confirm to start synchronizing data? \nThe old data will be deleted and retrieved again, please confirm!",
  "sync_collection_failed": "Synchronization collection error, please check whether the source file can be accessed normally",
  "sync_schedule": "Timing synchronization",
--- a/packages/web/i18n/zh-CN/common.json
+++ b/packages/web/i18n/zh-CN/common.json
@@ -574,7 +574,6 @@
  "core.dataset.import.Custom process desc": "自定义设置数据处理规则",
  "core.dataset.import.Custom prompt": "自定义提示词",
  "core.dataset.import.Custom split char": "自定义分隔符",
-  "core.dataset.import.Custom split char Tips": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据，使用特定的分隔符来精确分块。",
  "core.dataset.import.Custom text": "自定义文本",
  "core.dataset.import.Custom text desc": "手动输入一段文本作为数据集",
  "core.dataset.import.Data process params": "数据处理参数",
--- a/packages/web/i18n/zh-CN/dataset.json
+++ b/packages/web/i18n/zh-CN/dataset.json
@@ -25,6 +25,7 @@
  "core.dataset.import.Adjust parameters": "调整参数",
  "custom_data_process_params": "自定义",
  "custom_data_process_params_desc": "自定义设置数据处理规则",
+  "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据，使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符，例如：“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号，例如: * () [] {} 等。",
  "data.ideal_chunk_length": "理想分块长度",
  "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
  "data_index_num": "索引 {{index}}",
@@ -86,6 +87,14 @@
  "retain_collection": "调整训练参数",
  "retrain_task_submitted": "重新训练任务已提交",
  "same_api_collection": "存在相同的 API 集合",
+  "split_sign_break": "1 个换行符",
+  "split_sign_break2": "2 个换行符",
+  "split_sign_custom": "自定义",
+  "split_sign_exclamatiob": "感叹号",
+  "split_sign_null": "不设置",
+  "split_sign_period": "句号",
+  "split_sign_question": "问号",
+  "split_sign_semicolon": "分号",
  "start_sync_website_tip": "确认开始同步数据？将会删除旧数据后重新获取，请确认！",
  "sync_collection_failed": "同步集合错误，请检查是否能正常访问源文件",
  "sync_schedule": "定时同步",
--- a/packages/web/i18n/zh-Hant/common.json
+++ b/packages/web/i18n/zh-Hant/common.json
@@ -569,7 +569,6 @@
  "core.dataset.import.Custom process desc": "自訂設定資料處理規則",
  "core.dataset.import.Custom prompt": "自訂提示詞",
  "core.dataset.import.Custom split char": "自訂分隔符",
-  "core.dataset.import.Custom split char Tips": "允許您根據自訂的分隔符進行分割。通常用於已處理好的資料，使用特定的分隔符來精確分割。",
  "core.dataset.import.Custom text": "自訂文字",
  "core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集",
  "core.dataset.import.Data process params": "資料處理參數",
--- a/packages/web/i18n/zh-Hant/dataset.json
+++ b/packages/web/i18n/zh-Hant/dataset.json
@@ -25,6 +25,7 @@
  "core.dataset.import.Adjust parameters": "調整參數",
  "custom_data_process_params": "自訂",
  "custom_data_process_params_desc": "自訂資料處理規則",
+  "custom_split_sign_tip": "允許你根據自訂的分隔符進行分塊。\n通常用於已處理好的資料，使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分隔符，例如：“。|.” 表示中英文句號。\n\n盡量避免使用正則相關特殊符號，例如: * () [] {} 等。",
  "data.ideal_chunk_length": "理想分塊長度",
  "data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引",
  "data_index_num": "索引 {{index}}",
@@ -86,6 +87,14 @@
  "retain_collection": "調整訓練參數",
  "retrain_task_submitted": "重新訓練任務已提交",
  "same_api_collection": "存在相同的 API 集合",
+  "split_sign_break": "1 個換行符",
+  "split_sign_break2": "2 個換行符",
+  "split_sign_custom": "自訂",
+  "split_sign_exclamatiob": "驚嘆號",
+  "split_sign_null": "不設置",
+  "split_sign_period": "句號",
+  "split_sign_question": "問號",
+  "split_sign_semicolon": "分號",
  "start_sync_website_tip": "確認開始同步資料？\n將會刪除舊資料後重新獲取，請確認！",
  "sync_collection_failed": "同步集合錯誤，請檢查是否能正常存取來源文件",
  "sync_schedule": "定時同步",