Add image index and pdf parse (#3956)

* feat: think tag parse * feat: parse think tag test * feat: pdf parse ux * feat: doc2x parse * perf: rewrite training mode setting * feat: image parse queue * perf: image index * feat: image parse process * feat: add init sh * fix: ts
2025-07-23 13:03:50 +00:00 · 2025-03-03 23:08:29 +08:00
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -3,11 +3,16 @@
  "add_file": "Import",
  "api_file": "API Dataset",
  "api_url": "API Url",
+  "auto_indexes": "Automatically generate supplementary indexes",
+  "auto_indexes_tips": "Additional index generation is performed through large models to improve semantic richness and improve retrieval accuracy.",
  "chunk_max_tokens": "max_tokens",
  "close_auto_sync": "Are you sure you want to turn off automatic sync?",
  "collection.Create update time": "Creation/Update Time",
  "collection.Training type": "Training",
+  "collection.training_type": "Chunk type",
  "collection_data_count": "Data amount",
+  "collection_metadata_custom_pdf_parse": "PDF enhancement analysis",
+  "collection_metadata_image_parse": "Image tagging",
  "collection_not_support_retraining": "This collection type does not support retuning parameters",
  "collection_not_support_sync": "This collection does not support synchronization",
  "collection_sync": "Sync data",
@@ -22,12 +27,21 @@
  "custom_data_process_params_desc": "Customize data processing rules",
  "data.ideal_chunk_length": "ideal block length",
  "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
+  "data_index_custom": "Custom index",
+  "data_index_default": "Default index",
+  "data_index_image": "Image Index",
+  "data_index_num": "Index {{index}}",
+  "data_index_question": "Inferred question index",
+  "data_index_summary": "Summary index",
  "data_process_params": "Params",
  "data_process_setting": "Processing config",
  "dataset.Unsupported operation": "dataset.Unsupported operation",
  "dataset.no_collections": "No datasets available",
  "dataset.no_tags": "No tags available",
+  "default_params": "default",
+  "default_params_desc": "Use system default parameters and rules",
  "edit_dataset_config": "Edit knowledge base configuration",
+  "enhanced_indexes": "Index enhancement",
  "error.collectionNotFound": "Collection not found~",
  "external_file": "External File Library",
  "external_file_dataset_desc": "Import files from an external file library to build a Dataset. The files will not be stored again.",
@@ -38,19 +52,38 @@
  "feishu_dataset": "Feishu Dataset",
  "feishu_dataset_config": "Feishu Dataset Config",
  "feishu_dataset_desc": "Can build a dataset using Feishu documents by configuring permissions, without secondary storage",
+  "file_list": "File list",
  "file_model_function_tip": "Enhances indexing and QA generation",
  "filename": "Filename",
  "folder_dataset": "Folder",
  "ideal_chunk_length": "ideal block length",
  "ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.",
+  "image_auto_parse": "Automatic image indexing",
+  "image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes",
  "import.Auto mode Estimated Price Tips": "The text understanding model needs to be called, which requires more points: {{price}} points/1K tokens",
  "import.Embedding Estimated Price Tips": "Only use the index model and consume a small amount of AI points: {{price}} points/1K tokens",
+  "import_confirm": "Confirm upload",
+  "import_data_preview": "Data preview",
+  "import_data_process_setting": "Data processing method settings",
+  "import_file_parse_setting": "File parsing settings",
+  "import_model_config": "Model selection",
+  "import_param_setting": "Parameter settings",
+  "import_select_file": "Select a file",
  "is_open_schedule": "Enable scheduled synchronization",
+  "keep_image": "Keep the picture",
  "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
  "open_auto_sync": "After scheduled synchronization is turned on, the system will try to synchronize the collection from time to time every day. During the collection synchronization period, the collection data will not be searched.",
+  "params_setting": "Parameter settings",
+  "pdf_enhance_parse": "PDF enhancement analysis",
+  "pdf_enhance_parse_price": "{{price}} points/page",
+  "pdf_enhance_parse_tips": "When parsing a PDF file, the PDF recognition model is called for recognition, which can be converted into Markdown and retained the pictures in the document, and can also identify the scanned files.",
  "permission.des.manage": "Can manage the entire knowledge base data and information",
  "permission.des.read": "View knowledge base content",
  "permission.des.write": "Ability to add and change knowledge base content",
+  "preview_chunk": "Preview chunks",
+  "preview_chunk_empty": "Unable to read the contents of the file",
+  "preview_chunk_intro": "Display up to 10 pieces",
+  "preview_chunk_not_selected": "Click on the file on the left to preview",
  "rebuild_embedding_start_tip": "Index model switching task has started",
  "rebuilding_index_count": "Number of indexes being rebuilt: {{count}}",
  "request_headers": "Request headers, will automatically append 'Bearer '",
@@ -72,8 +105,10 @@
  "tag.tags": "Tags",
  "tag.total_tags": "Total {{total}} tags",
  "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "The Dataset has indexes that are being trained or rebuilt",
+  "total_num_files": "Total {{total}} files",
  "training_mode": "Chunk mode",
  "vector_model_max_tokens_tip": "Each chunk of data has a maximum length of 3000 tokens",
+  "vllm_model": "Image understanding model",
  "website_dataset": "Website Sync",
  "website_dataset_desc": "Website sync allows you to build a Dataset directly using a web link.",
  "yuque_dataset": "Yuque Dataset",