perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split

* update max size computed

* perf: i18n

* remove table
This commit is contained in:
Archer
2025-05-26 18:57:22 +08:00
committed by GitHub
parent 874300a56a
commit c25cd48e72
23 changed files with 859 additions and 164 deletions

View File

@@ -554,7 +554,7 @@
"core.dataset.training.Agent queue": "QA 训练排队",
"core.dataset.training.Auto mode": "补充索引",
"core.dataset.training.Auto mode Tip": "通过子索引以及调用模型生成相关问题与摘要,来增加数据块的语义丰富度,更利于检索。需要消耗更多的存储空间和增加 AI 调用次数。",
"core.dataset.training.Chunk mode": "直接分块",
"core.dataset.training.Chunk mode": "分块存储",
"core.dataset.training.Full": "预计 20 分钟以上",
"core.dataset.training.Leisure": "空闲",
"core.dataset.training.QA mode": "问答对提取",

View File

@@ -15,7 +15,13 @@
"backup_dataset_tip": "可以将导出知识库时下载的 csv 文件重新导入。",
"backup_mode": "备份导入",
"chunk_max_tokens": "分块上限",
"chunk_process_params": "分块处理参数",
"chunk_size": "分块大小",
"chunk_trigger": "分块条件",
"chunk_trigger_force_chunk": "强制分块",
"chunk_trigger_max_size": "原文长度小于文件处理模型最大上下文70%",
"chunk_trigger_min_size": "原文长度大于",
"chunk_trigger_tips": "当满足一定条件时才触发分块存储,否则会直接完整存储原文",
"close_auto_sync": "确认关闭自动同步功能?",
"collection.Create update time": "创建/更新时间",
"collection.Training type": "训练模式",
@@ -29,6 +35,7 @@
"collection_tags": "集合标签",
"common_dataset": "通用知识库",
"common_dataset_desc": "通过导入文件、网页链接或手动录入形式构建知识库",
"condition": "条件",
"config_sync_schedule": "配置定时同步",
"confirm_to_rebuild_embedding_tip": "确认为知识库切换索引?\n切换索引是一个非常重量的操作,需要对您知识库内所有数据进行重新索引,时间可能较长,请确保账号内剩余积分充足。\n\n此外,你还需要注意修改选择该知识库的应用,避免它们与其他索引模型知识库混用。",
"core.dataset.import.Adjust parameters": "调整参数",
@@ -100,6 +107,7 @@
"is_open_schedule": "启用定时同步",
"keep_image": "保留图片",
"loading": "加载中...",
"max_chunk_size": "最大分块大小",
"move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",
"noChildren": "无子目录",
"noSelectedFolder": "没有选择文件夹",
@@ -107,8 +115,10 @@
"noValidId": "没有有效的 ID",
"open_auto_sync": "开启定时同步后,系统将会每天不定时尝试同步集合,集合同步期间,会出现无法搜索到该集合数据现象。",
"other_dataset": "第三方知识库",
"paragraph_max_deep": "最大段落深度",
"paragraph_split": "按段落分块",
"paragraph_split_tip": "优先按 Markdown 标题段落进行分块,如果分块过长,再按长度进行二次分块",
"params_config": "配置",
"params_setting": "参数设置",
"pdf_enhance_parse": "PDF增强解析",
"pdf_enhance_parse_price": "{{price}}积分/页",
"pdf_enhance_parse_tips": "调用 PDF 识别模型进行解析,可以将其转换成 Markdown 并保留文档中的图片,同时也可以对扫描件进行识别,识别时间较长。",