feat: custom dataset split sign (#4221)

* feat: custom dataset split sign * feat: custom dataset split sign
2025-07-23 13:03:50 +00:00 · 2025-03-18 23:15:20 +08:00
parent cb29076e5b
commit ec30d79286
9 changed files with 121 additions and 26 deletions
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -25,6 +25,7 @@
  "core.dataset.import.Adjust parameters": "Adjust parameters",
  "custom_data_process_params": "Custom",
  "custom_data_process_params_desc": "Customize data processing rules",
+  "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to specify multiple delimiters, such as: \"。|.\" to match both the Chinese and English periods.\n\nTry to avoid special characters that have meaning in regular expressions, such as: * () [] {}, etc.",
  "data.ideal_chunk_length": "ideal block length",
  "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
  "data_index_num": "Index {{index}}",
@@ -86,6 +87,14 @@
  "retain_collection": "Adjust Training Parameters",
  "retrain_task_submitted": "The retraining task has been submitted",
  "same_api_collection": "The same API set exists",
+  "split_sign_break": "1 newline character",
+  "split_sign_break2": "2 newline characters",
+  "split_sign_custom": "Customize",
+  "split_sign_exclamatiob": "exclamation mark",
+  "split_sign_null": "Not set",
+  "split_sign_period": "period",
+  "split_sign_question": "question mark",
+  "split_sign_semicolon": "semicolon",
  "start_sync_website_tip": "Confirm to start synchronizing data? \nThe old data will be deleted and retrieved again, please confirm!",
  "sync_collection_failed": "Synchronization collection error, please check whether the source file can be accessed normally",
  "sync_schedule": "Timing synchronization",