perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split

* update max size computed

* perf: i18n

* remove table
This commit is contained in:
Archer
2025-05-26 18:57:22 +08:00
committed by GitHub
parent 874300a56a
commit c25cd48e72
23 changed files with 859 additions and 164 deletions

View File

@@ -554,7 +554,7 @@
"core.dataset.training.Agent queue": "問答訓練排隊中",
"core.dataset.training.Auto mode": "補充索引",
"core.dataset.training.Auto mode Tip": "透過子索引以及呼叫模型產生相關問題與摘要,來增加資料區塊的語意豐富度,更有利於檢索。需要消耗更多的儲存空間並增加 AI 呼叫次數。",
"core.dataset.training.Chunk mode": "直接分塊",
"core.dataset.training.Chunk mode": "分塊存儲",
"core.dataset.training.Full": "預計 20 分鐘以上",
"core.dataset.training.Leisure": "閒置",
"core.dataset.training.QA mode": "問答對提取",

View File

@@ -14,7 +14,12 @@
"backup_dataset_tip": "可以將導出知識庫時,下載的 csv 文件重新導入。",
"backup_mode": "備份導入",
"chunk_max_tokens": "分塊上限",
"chunk_process_params": "分塊處理參數",
"chunk_size": "分塊大小",
"chunk_trigger": "分塊條件",
"chunk_trigger_force_chunk": "強制分塊",
"chunk_trigger_max_size": "原文長度小於文件處理模型最大上下文 70%",
"chunk_trigger_min_size": "原文長度大於",
"close_auto_sync": "確認關閉自動同步功能?",
"collection.Create update time": "建立/更新時間",
"collection.Training type": "分段模式",
@@ -28,6 +33,7 @@
"collection_tags": "集合標籤",
"common_dataset": "通用資料集",
"common_dataset_desc": "通過導入文件、網頁鏈接或手動錄入形式構建知識庫",
"condition": "條件",
"config_sync_schedule": "設定定時同步",
"confirm_to_rebuild_embedding_tip": "確定要為資料集切換索引嗎?\n切換索引是一個重要的操作,需要對您資料集內所有資料重新建立索引,可能需要較長時間,請確保帳號內剩餘點數充足。\n\n此外,您還需要注意修改使用此資料集的應用程式,避免與其他索引模型資料集混用。",
"core.dataset.import.Adjust parameters": "調整參數",
@@ -99,6 +105,7 @@
"is_open_schedule": "啟用定時同步",
"keep_image": "保留圖片",
"loading": "加載中...",
"max_chunk_size": "最大分塊大小",
"move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",
"noChildren": "無子目錄",
"noSelectedFolder": "沒有選擇文件夾",
@@ -106,8 +113,10 @@
"noValidId": "沒有有效的 ID",
"open_auto_sync": "開啟定時同步後,系統將每天不定時嘗試同步集合,集合同步期間,會出現無法搜尋到該集合資料現象。",
"other_dataset": "第三方知識庫",
"paragraph_max_deep": "最大段落深度",
"paragraph_split": "按段落分塊",
"paragraph_split_tip": "優先按 Markdown 標題段落進行分塊,如果分塊過長,再按長度進行二次分塊",
"params_config": "設定",
"params_setting": "參數設定",
"pdf_enhance_parse": "PDF 增強解析",
"pdf_enhance_parse_price": "{{price}}積分/頁",
"pdf_enhance_parse_tips": "呼叫 PDF 識別模型進行解析,可以將其轉換成 Markdown 並保留文件中的圖片,同時也可以對掃描件進行識別,識別時間較長。",