diff --git a/docSite/content/zh-cn/docs/development/upgrading/492.md b/docSite/content/zh-cn/docs/development/upgrading/492.md new file mode 100644 index 000000000..fb98bd53a --- /dev/null +++ b/docSite/content/zh-cn/docs/development/upgrading/492.md @@ -0,0 +1,20 @@ +--- +title: 'V4.9.2(进行中)' +description: 'FastGPT V4.9.2 更新说明' +icon: 'upgrade' +draft: false +toc: true +weight: 799 +--- + + +## 🚀 新增内容 + +1. 知识库分块增加自定义分隔符预设值,同时支持自定义换行符分割。 + +## ⚙️ 优化 + +1. 导出对话日志时,支持导出成员名。 + +## 🐛 修复 + diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts index fe1212ba7..8c56029dd 100644 --- a/packages/global/common/string/textSplitter.ts +++ b/packages/global/common/string/textSplitter.ts @@ -1,5 +1,4 @@ import { getErrText } from '../error/utils'; -import { replaceRegChars } from './tools'; export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----'; @@ -115,9 +114,10 @@ const commonSplit = (props: SplitProps): SplitResponse => { // The larger maxLen is, the next sentence is less likely to trigger splitting const markdownIndex = 4; const forbidOverlapIndex = 8; - const stepReges: { reg: RegExp; maxLen: number }[] = [ + + const stepReges: { reg: RegExp | string; maxLen: number }[] = [ ...customReg.map((text) => ({ - reg: new RegExp(`(${replaceRegChars(text)})`, 'g'), + reg: text.replaceAll('\\n', '\n'), maxLen: chunkLen * 1.4 })), { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 }, @@ -161,17 +161,32 @@ const commonSplit = (props: SplitProps): SplitResponse => { const { reg } = stepReges[step]; - const splitTexts = text - .replace( + const replaceText = (() => { + if (typeof reg === 'string') { + let tmpText = text; + reg.split('|').forEach((itemReg) => { + tmpText = tmpText.replaceAll( + itemReg, + (() => { + if (isCustomStep) return splitMarker; + if (isMarkdownSplit) return `${splitMarker}$1`; + return `$1${splitMarker}`; + })() + ); + }); + return tmpText; + } + + return text.replace( reg, (() => { if (isCustomStep) 
return splitMarker; if (isMarkdownSplit) return `${splitMarker}$1`; return `$1${splitMarker}`; })() - ) - .split(`${splitMarker}`) - .filter((part) => part.trim()); + ); + })(); + const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim()); return splitTexts .map((text) => { diff --git a/packages/web/i18n/en/common.json b/packages/web/i18n/en/common.json index d62ea1bd9..cd36d6a6a 100644 --- a/packages/web/i18n/en/common.json +++ b/packages/web/i18n/en/common.json @@ -570,7 +570,6 @@ "core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules", "core.dataset.import.Custom prompt": "Custom Prompt", "core.dataset.import.Custom split char": "Custom Separator", - "core.dataset.import.Custom split char Tips": "Allows you to segment based on custom separators. Usually used for pre-processed data, using specific separators for precise segmentation.", "core.dataset.import.Custom text": "Custom Text", "core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset", "core.dataset.import.Data process params": "Data Processing Parameters", diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json index 7a57c3e97..e4760e04d 100644 --- a/packages/web/i18n/en/dataset.json +++ b/packages/web/i18n/en/dataset.json @@ -25,6 +25,7 @@ "core.dataset.import.Adjust parameters": "Adjust parameters", "custom_data_process_params": "Custom", "custom_data_process_params_desc": "Customize data processing rules", + "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. 
\nYou can use the | symbol to specify multiple separators, such as: \"。|.\" to match both the Chinese and English periods.\n\nTry to avoid special regular-expression symbols, such as: * () [] {}, etc.", "data.ideal_chunk_length": "ideal block length", "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes", "data_index_num": "Index {{index}}", @@ -86,6 +87,14 @@ "retain_collection": "Adjust Training Parameters", "retrain_task_submitted": "The retraining task has been submitted", "same_api_collection": "The same API set exists", + "split_sign_break": "1 newline character", + "split_sign_break2": "2 newline characters", + "split_sign_custom": "Customize", + "split_sign_exclamatiob": "exclamation mark", + "split_sign_null": "Not set", + "split_sign_period": "period", + "split_sign_question": "question mark", + "split_sign_semicolon": "semicolon", "start_sync_website_tip": "Confirm to start synchronizing data? \nThe old data will be deleted and retrieved again, please confirm!", "sync_collection_failed": "Synchronization collection error, please check whether the source file can be accessed normally", "sync_schedule": "Timing synchronization", diff --git a/packages/web/i18n/zh-CN/common.json b/packages/web/i18n/zh-CN/common.json index 3c2d6ae85..52445499c 100644 --- a/packages/web/i18n/zh-CN/common.json +++ b/packages/web/i18n/zh-CN/common.json @@ -574,7 +574,6 @@ "core.dataset.import.Custom process desc": "自定义设置数据处理规则", "core.dataset.import.Custom prompt": "自定义提示词", "core.dataset.import.Custom split char": "自定义分隔符", - "core.dataset.import.Custom split char Tips": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。", "core.dataset.import.Custom text": "自定义文本", "core.dataset.import.Custom text desc": "手动输入一段文本作为数据集", "core.dataset.import.Data process params": "数据处理参数", diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json index 7dd79ee32..11b48d81c 100644 --- a/packages/web/i18n/zh-CN/dataset.json +++ 
b/packages/web/i18n/zh-CN/dataset.json @@ -25,6 +25,7 @@ "core.dataset.import.Adjust parameters": "调整参数", "custom_data_process_params": "自定义", "custom_data_process_params_desc": "自定义设置数据处理规则", + "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。", "data.ideal_chunk_length": "理想分块长度", "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引", "data_index_num": "索引 {{index}}", @@ -86,6 +87,14 @@ "retain_collection": "调整训练参数", "retrain_task_submitted": "重新训练任务已提交", "same_api_collection": "存在相同的 API 集合", + "split_sign_break": "1 个换行符", + "split_sign_break2": "2 个换行符", + "split_sign_custom": "自定义", + "split_sign_exclamatiob": "感叹号", + "split_sign_null": "不设置", + "split_sign_period": "句号", + "split_sign_question": "问号", + "split_sign_semicolon": "分号", "start_sync_website_tip": "确认开始同步数据?将会删除旧数据后重新获取,请确认!", "sync_collection_failed": "同步集合错误,请检查是否能正常访问源文件", "sync_schedule": "定时同步", diff --git a/packages/web/i18n/zh-Hant/common.json b/packages/web/i18n/zh-Hant/common.json index fe11b5474..fff2a0c78 100644 --- a/packages/web/i18n/zh-Hant/common.json +++ b/packages/web/i18n/zh-Hant/common.json @@ -569,7 +569,6 @@ "core.dataset.import.Custom process desc": "自訂設定資料處理規則", "core.dataset.import.Custom prompt": "自訂提示詞", "core.dataset.import.Custom split char": "自訂分隔符", - "core.dataset.import.Custom split char Tips": "允許您根據自訂的分隔符進行分割。通常用於已處理好的資料,使用特定的分隔符來精確分割。", "core.dataset.import.Custom text": "自訂文字", "core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集", "core.dataset.import.Data process params": "資料處理參數", diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json index 712956760..4a7a9fa97 100644 --- a/packages/web/i18n/zh-Hant/dataset.json +++ b/packages/web/i18n/zh-Hant/dataset.json @@ -25,6 +25,7 @@ "core.dataset.import.Adjust parameters": "調整參數", "custom_data_process_params": "自訂", "custom_data_process_params_desc": "自訂資料處理規則", + "custom_split_sign_tip": 
"允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的數據,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.” 表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如: * () [] {} 等。", "data.ideal_chunk_length": "理想分塊長度", "data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引", "data_index_num": "索引 {{index}}", @@ -86,6 +87,14 @@ "retain_collection": "調整訓練參數", "retrain_task_submitted": "重新訓練任務已提交", "same_api_collection": "存在相同的 API 集合", + "split_sign_break": "1 個換行符", + "split_sign_break2": "2 個換行符", + "split_sign_custom": "自定義", + "split_sign_exclamatiob": "驚嘆號", + "split_sign_null": "不設置", + "split_sign_period": "句號", + "split_sign_question": "問號", + "split_sign_semicolon": "分號", "start_sync_website_tip": "確認開始同步資料?\n將會刪除舊資料後重新獲取,請確認!", "sync_collection_failed": "同步集合錯誤,請檢查是否能正常存取來源文件", "sync_schedule": "定時同步", diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx index 898a2297b..5a1dd5065 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx @@ -1,4 +1,4 @@ -import React, { useCallback, useEffect, useMemo, useRef } from 'react'; +import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import { Box, Flex, @@ -36,6 +36,7 @@ import MyNumberInput from '@fastgpt/web/components/common/Input/NumberInput'; import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip'; import { shadowLight } from '@fastgpt/web/styles/theme'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; +import MySelect from '@fastgpt/web/components/common/MySelect'; function DataProcess() { const { t } = useTranslation(); @@ -44,18 +45,39 @@ function DataProcess() { const { goToNext, processParamsForm, chunkSizeField, minChunkSize, maxChunkSize } = useContextSelector(DatasetImportContext, (v) => v); const datasetDetail = 
useContextSelector(DatasetPageContext, (v) => v.datasetDetail); - const { setValue, register, watch } = processParamsForm; + const { setValue, register, watch, getValues } = processParamsForm; const trainingType = watch('trainingType'); const chunkSettingMode = watch('chunkSettingMode'); - const qaPrompt = watch('qaPrompt'); + const qaPrompt = watch('qaPrompt'); const { isOpen: isOpenCustomPrompt, onOpen: onOpenCustomPrompt, onClose: onCloseCustomPrompt } = useDisclosure(); + const customSplitList = [ + { label: t('dataset:split_sign_null'), value: '' }, + { label: t('dataset:split_sign_break'), value: '\\n' }, + { label: t('dataset:split_sign_break2'), value: '\\n\\n' }, + { label: t('dataset:split_sign_period'), value: '.|。' }, + { label: t('dataset:split_sign_exclamatiob'), value: '!|!' }, + { label: t('dataset:split_sign_question'), value: '?|?' }, + { label: t('dataset:split_sign_semicolon'), value: ';|;' }, + { label: '=====', value: '=====' }, + { label: t('dataset:split_sign_custom'), value: 'Other' } + ]; + + const [customListSelectValue, setCustomListSelectValue] = useState(getValues('customSplitChar')); + useEffect(() => { + if (customListSelectValue === 'Other') { + setValue('customSplitChar', ''); + } else { + setValue('customSplitChar', customListSelectValue); + } + }, [customListSelectValue, setValue]); + const trainingModeList = useMemo(() => { const list = Object.entries(DatasetCollectionDataProcessModeMap); return list @@ -248,19 +270,33 @@ function DataProcess() { {t('common:core.dataset.import.Custom split char')} - - - - + + + + + + list={customSplitList} + size={'sm'} + bg={'myGray.50'} + value={customListSelectValue} + h={'32px'} + onChange={(val) => { + setCustomListSelectValue(val); + }} + /> + + {customListSelectValue === 'Other' && ( + + )} + {showQAPromptInput && (