Add image index and pdf parse (#3956)

* feat: think tag parse * feat: parse think tag test * feat: pdf parse ux * feat: doc2x parse * perf: rewrite training mode setting * feat: image parse queue * perf: image index * feat: image parse process * feat: add init sh * fix: ts
2025-07-23 13:03:50 +00:00 · 2025-03-03 23:08:29 +08:00
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions
--- a/packages/web/i18n/en/account_usage.json
+++ b/packages/web/i18n/en/account_usage.json
@@ -2,6 +2,7 @@
  "ai_model": "AI model",
  "all": "all",
  "app_name": "Application name",
+  "auto_index": "Auto index",
  "billing_module": "Deduction module",
  "confirm_export": "A total of {{total}} pieces of data were filtered out. Are you sure to export?",
  "current_filter_conditions": "Current filter conditions",
@@ -9,6 +10,7 @@
  "details": "Details",
  "dingtalk": "DingTalk",
  "duration_seconds": "Duration (seconds)",
+  "embedding_index": "Embedding",
  "every_day": "Day",
  "every_month": "Moon",
  "export_confirm": "Export confirmation",
@@ -16,6 +18,7 @@
  "export_title": "Time,Members,Type,Project name,AI points",
  "feishu": "Feishu",
  "generation_time": "Generation time",
+  "image_parse": "Image tagging",
  "input_token_length": "input tokens",
  "member": "member",
  "member_name": "Member name",
@@ -25,8 +28,12 @@
  "official_account": "Official Account",
  "order_number": "Order number",
  "output_token_length": "output tokens",
+  "pages": "Pages",
+  "pdf_enhanced_parse": "PDF Enhanced Analysis",
+  "pdf_parse": "PDF Analysis",
  "points": "Points",
  "project_name": "Project name",
+  "qa": "QA",
  "select_member_and_source_first": "Please select members and types first",
  "share": "Share Link",
  "source": "source",
--- a/packages/web/i18n/en/common.json
+++ b/packages/web/i18n/en/common.json
@@ -562,10 +562,7 @@
  "core.dataset.file": "File",
  "core.dataset.folder": "Directory",
  "core.dataset.import.Auto mode Estimated Price Tips": "Requires calling the file processing model, which consumes a lot of tokens: {{price}} points/1K tokens",
-  "core.dataset.import.Auto process": "Automatic",
-  "core.dataset.import.Auto process desc": "Automatically set segmentation and preprocessing rules",
  "core.dataset.import.Chunk Range": "Range: {{min}}~{{max}}",
-  "core.dataset.import.Chunk Split": "Chunks",
  "core.dataset.import.Chunk Split Tip": "Segment the text according to certain rules and convert it into a format that can be semantically searched. Suitable for most scenarios. No additional model processing is required, and the cost is low.",
  "core.dataset.import.Continue upload": "Continue upload",
  "core.dataset.import.Custom process": "Custom Rules",
@@ -575,7 +572,6 @@
  "core.dataset.import.Custom split char Tips": "Allows you to segment based on custom separators. Usually used for pre-processed data, using specific separators for precise segmentation.",
  "core.dataset.import.Custom text": "Custom Text",
  "core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset",
-  "core.dataset.import.Data Preprocessing": "Data Processing",
  "core.dataset.import.Data process params": "Data Processing Parameters",
  "core.dataset.import.Down load csv template": "Click to Download CSV Template",
  "core.dataset.import.Embedding Estimated Price Tips": "Only use the index model, consuming a small amount of AI points: {{price}} points/1K tokens",
@@ -597,7 +593,6 @@
  "core.dataset.import.Source name": "Source Name",
  "core.dataset.import.Sources list": "Sources",
  "core.dataset.import.Start upload": "Start Upload",
-  "core.dataset.import.Total files": "Total {{total}} Files",
  "core.dataset.import.Upload complete": "Upload complete",
  "core.dataset.import.Upload data": "Confirm Upload",
  "core.dataset.import.Upload file progress": "File Upload Progress",
@@ -649,10 +644,10 @@
  "core.dataset.training.Agent queue": "QA Training Queue",
  "core.dataset.training.Auto mode": "Auto index",
  "core.dataset.training.Auto mode Tip": "Increase the semantic richness of data blocks by generating related questions and summaries through sub-indexes and calling models, making it more conducive to retrieval. Requires more storage space and increases AI call times.",
-  "core.dataset.training.Chunk mode": "Default",
+  "core.dataset.training.Chunk mode": "Chunk",
  "core.dataset.training.Full": "Estimated Over 5 Minutes",
  "core.dataset.training.Leisure": "Idle",
-  "core.dataset.training.QA mode": "QA Chunks",
+  "core.dataset.training.QA mode": "QA",
  "core.dataset.training.Vector queue": "Index Queue",
  "core.dataset.training.Waiting": "Estimated 5 Minutes",
  "core.dataset.training.Website Sync": "Website Sync",
@@ -861,7 +856,6 @@
  "dataset.collections.Select Collection": "Select File",
  "dataset.collections.Select One Collection To Store": "Select a File to Store",
  "dataset.data.Can not edit": "No Edit Permission",
-  "dataset.data.Custom Index Number": "Custom Index {{number}}",
  "dataset.data.Default Index": "Default Index",
  "dataset.data.Delete Tip": "Confirm to Delete This Data?",
  "dataset.data.Index Placeholder": "Enter Index Text Content",
@@ -956,6 +950,7 @@
  "new_create": "Create New",
  "no": "No",
  "no_laf_env": "System Not Configured with Laf Environment",
+  "not_model_config": "No related model configured",
  "not_yet_introduced": "No Introduction Yet",
  "option": "Option",
  "pay.amount": "Amount",
@@ -1121,7 +1116,6 @@
  "support.wallet.invoice_detail": "Invoice Details",
  "support.wallet.invoice_info": "The invoice will be sent to the email within 3-7 working days, please wait patiently",
  "support.wallet.invoicing": "Invoicing",
-  "support.wallet.moduleName.index": "Index Generation",
  "support.wallet.moduleName.qa": "QA Split",
  "support.wallet.noBill": "No Bill Records",
  "support.wallet.no_invoice": "No Invoice Records",
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -3,11 +3,16 @@
  "add_file": "Import",
  "api_file": "API Dataset",
  "api_url": "API Url",
+  "auto_indexes": "Automatically generate supplementary indexes",
+  "auto_indexes_tips": "Additional index generation is performed through large models to improve semantic richness and improve retrieval accuracy.",
  "chunk_max_tokens": "max_tokens",
  "close_auto_sync": "Are you sure you want to turn off automatic sync?",
  "collection.Create update time": "Creation/Update Time",
  "collection.Training type": "Training",
+  "collection.training_type": "Chunk type",
  "collection_data_count": "Data amount",
+  "collection_metadata_custom_pdf_parse": "PDF enhancement analysis",
+  "collection_metadata_image_parse": "Image tagging",
  "collection_not_support_retraining": "This collection type does not support retuning parameters",
  "collection_not_support_sync": "This collection does not support synchronization",
  "collection_sync": "Sync data",
@@ -22,12 +27,21 @@
  "custom_data_process_params_desc": "Customize data processing rules",
  "data.ideal_chunk_length": "ideal block length",
  "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
+  "data_index_custom": "Custom index",
+  "data_index_default": "Default index",
+  "data_index_image": "Image Index",
+  "data_index_num": "Index {{index}}",
+  "data_index_question": "Inferred question index",
+  "data_index_summary": "Summary index",
  "data_process_params": "Params",
  "data_process_setting": "Processing config",
  "dataset.Unsupported operation": "dataset.Unsupported operation",
  "dataset.no_collections": "No datasets available",
  "dataset.no_tags": "No tags available",
+  "default_params": "default",
+  "default_params_desc": "Use system default parameters and rules",
  "edit_dataset_config": "Edit knowledge base configuration",
+  "enhanced_indexes": "Index enhancement",
  "error.collectionNotFound": "Collection not found~",
  "external_file": "External File Library",
  "external_file_dataset_desc": "Import files from an external file library to build a Dataset. The files will not be stored again.",
@@ -38,19 +52,38 @@
  "feishu_dataset": "Feishu Dataset",
  "feishu_dataset_config": "Feishu Dataset Config",
  "feishu_dataset_desc": "Can build a dataset using Feishu documents by configuring permissions, without secondary storage",
+  "file_list": "File list",
  "file_model_function_tip": "Enhances indexing and QA generation",
  "filename": "Filename",
  "folder_dataset": "Folder",
  "ideal_chunk_length": "ideal block length",
  "ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.",
+  "image_auto_parse": "Automatic image indexing",
+  "image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes",
  "import.Auto mode Estimated Price Tips": "The text understanding model needs to be called, which requires more points: {{price}} points/1K tokens",
  "import.Embedding Estimated Price Tips": "Only use the index model and consume a small amount of AI points: {{price}} points/1K tokens",
+  "import_confirm": "Confirm upload",
+  "import_data_preview": "Data preview",
+  "import_data_process_setting": "Data processing method settings",
+  "import_file_parse_setting": "File parsing settings",
+  "import_model_config": "Model selection",
+  "import_param_setting": "Parameter settings",
+  "import_select_file": "Select a file",
  "is_open_schedule": "Enable scheduled synchronization",
+  "keep_image": "Keep the picture",
  "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
  "open_auto_sync": "After scheduled synchronization is turned on, the system will try to synchronize the collection from time to time every day. During the collection synchronization period, the collection data will not be searched.",
+  "params_setting": "Parameter settings",
+  "pdf_enhance_parse": "PDF enhancement analysis",
+  "pdf_enhance_parse_price": "{{price}} points/page",
+  "pdf_enhance_parse_tips": "When parsing a PDF file, the PDF recognition model is called for recognition, which can be converted into Markdown and retained the pictures in the document, and can also identify the scanned files.",
  "permission.des.manage": "Can manage the entire knowledge base data and information",
  "permission.des.read": "View knowledge base content",
  "permission.des.write": "Ability to add and change knowledge base content",
+  "preview_chunk": "Preview chunks",
+  "preview_chunk_empty": "Unable to read the contents of the file",
+  "preview_chunk_intro": "Display up to 10 pieces",
+  "preview_chunk_not_selected": "Click on the file on the left to preview",
  "rebuild_embedding_start_tip": "Index model switching task has started",
  "rebuilding_index_count": "Number of indexes being rebuilt: {{count}}",
  "request_headers": "Request headers, will automatically append 'Bearer '",
@@ -72,8 +105,10 @@
  "tag.tags": "Tags",
  "tag.total_tags": "Total {{total}} tags",
  "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "The Dataset has indexes that are being trained or rebuilt",
+  "total_num_files": "Total {{total}} files",
  "training_mode": "Chunk mode",
  "vector_model_max_tokens_tip": "Each chunk of data has a maximum length of 3000 tokens",
+  "vllm_model": "Image understanding model",
  "website_dataset": "Website Sync",
  "website_dataset_desc": "Website sync allows you to build a Dataset directly using a web link.",
  "yuque_dataset": "Yuque Dataset",