feat: chunk index independent config (#4271)

* sync collection
* remove lock
* feat: chunk index independent config
* feat: add max chunksize to split chunk function
* remove log
* update doc
* remove
* remove log
2025-07-23 05:12:39 +00:00 · 2025-03-21 16:44:25 +08:00
parent 222ff0d49a
commit e812ad6e84
47 changed files with 784 additions and 443 deletions
--- a/packages/web/i18n/en/common.json
+++ b/packages/web/i18n/en/common.json
@@ -569,7 +569,6 @@
  "core.dataset.import.Custom process": "Custom Rules",
  "core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules",
  "core.dataset.import.Custom prompt": "Custom Prompt",
-  "core.dataset.import.Custom split char": "Custom Separator",
  "core.dataset.import.Custom text": "Custom Text",
  "core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset",
  "core.dataset.import.Data process params": "Data Processing Parameters",
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -27,7 +27,6 @@
  "custom_data_process_params": "Custom",
  "custom_data_process_params_desc": "Customize data processing rules",
  "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple separators, such as: \"。|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular expressions, such as: * () [] {}, etc.",
-  "data.ideal_chunk_length": "ideal block length",
  "data_amount": "{{dataAmount}} Data, {{indexAmount}} Indexes",
  "data_index_num": "Index {{index}}",
  "data_process_params": "Params",
@@ -53,8 +52,6 @@
  "file_model_function_tip": "Enhances indexing and QA generation",
  "filename": "Filename",
  "folder_dataset": "Folder",
-  "ideal_chunk_length": "ideal block length",
-  "ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.",
  "image_auto_parse": "Automatic image indexing",
  "image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes",
  "image_training_queue": "Queue of image processing",
@@ -68,6 +65,8 @@
  "import_param_setting": "Parameter settings",
  "import_select_file": "Select a file",
  "import_select_link": "Enter link",
+  "index_size": "Index size",
+  "index_size_tips": "During vectorization, the system will automatically split the chunks further according to this size.",
  "is_open_schedule": "Enable scheduled synchronization",
  "keep_image": "Keep the picture",
  "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
@@ -89,6 +88,8 @@
  "retain_collection": "Adjust Training Parameters",
  "retrain_task_submitted": "The retraining task has been submitted",
  "same_api_collection": "The same API set exists",
+  "split_chunk_char": "Chunk by specified separator",
+  "split_chunk_size": "Chunk by length",
  "split_sign_break": "1 newline character",
  "split_sign_break2": "2 newline characters",
  "split_sign_custom": "Customize",