mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-21 11:43:56 +00:00
feat: custom dataset split sign (#4221)
* feat: custom dataset split sign * feat: custom dataset split sign
This commit is contained in:
20
docSite/content/zh-cn/docs/development/upgrading/492.md
Normal file
20
docSite/content/zh-cn/docs/development/upgrading/492.md
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
title: 'V4.9.2(进行中)'
|
||||||
|
description: 'FastGPT V4.9.2 更新说明'
|
||||||
|
icon: 'upgrade'
|
||||||
|
draft: false
|
||||||
|
toc: true
|
||||||
|
weight: 799
|
||||||
|
---
|
||||||
|
|
||||||
|
|
||||||
|
## 🚀 新增内容
|
||||||
|
|
||||||
|
1. 知识库分块增加自定义分隔符预设值,同时支持自定义换行符分割。
|
||||||
|
|
||||||
|
## ⚙️ 优化
|
||||||
|
|
||||||
|
1. 导出对话日志时,支持导出成员名。
|
||||||
|
|
||||||
|
## 🐛 修复
|
||||||
|
|
@@ -1,5 +1,4 @@
|
|||||||
import { getErrText } from '../error/utils';
|
import { getErrText } from '../error/utils';
|
||||||
import { replaceRegChars } from './tools';
|
|
||||||
|
|
||||||
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
|
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
|
||||||
|
|
||||||
@@ -115,9 +114,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
// The larger maxLen is, the next sentence is less likely to trigger splitting
|
// The larger maxLen is, the next sentence is less likely to trigger splitting
|
||||||
const markdownIndex = 4;
|
const markdownIndex = 4;
|
||||||
const forbidOverlapIndex = 8;
|
const forbidOverlapIndex = 8;
|
||||||
const stepReges: { reg: RegExp; maxLen: number }[] = [
|
|
||||||
|
const stepReges: { reg: RegExp | string; maxLen: number }[] = [
|
||||||
...customReg.map((text) => ({
|
...customReg.map((text) => ({
|
||||||
reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
|
reg: text.replaceAll('\\n', '\n'),
|
||||||
maxLen: chunkLen * 1.4
|
maxLen: chunkLen * 1.4
|
||||||
})),
|
})),
|
||||||
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
|
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
|
||||||
@@ -161,17 +161,32 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
|
|
||||||
const { reg } = stepReges[step];
|
const { reg } = stepReges[step];
|
||||||
|
|
||||||
const splitTexts = text
|
const replaceText = (() => {
|
||||||
.replace(
|
if (typeof reg === 'string') {
|
||||||
|
let tmpText = text;
|
||||||
|
reg.split('|').forEach((itemReg) => {
|
||||||
|
tmpText = tmpText.replaceAll(
|
||||||
|
itemReg,
|
||||||
|
(() => {
|
||||||
|
if (isCustomStep) return splitMarker;
|
||||||
|
if (isMarkdownSplit) return `${splitMarker}$1`;
|
||||||
|
return `$1${splitMarker}`;
|
||||||
|
})()
|
||||||
|
);
|
||||||
|
});
|
||||||
|
return tmpText;
|
||||||
|
}
|
||||||
|
|
||||||
|
return text.replace(
|
||||||
reg,
|
reg,
|
||||||
(() => {
|
(() => {
|
||||||
if (isCustomStep) return splitMarker;
|
if (isCustomStep) return splitMarker;
|
||||||
if (isMarkdownSplit) return `${splitMarker}$1`;
|
if (isMarkdownSplit) return `${splitMarker}$1`;
|
||||||
return `$1${splitMarker}`;
|
return `$1${splitMarker}`;
|
||||||
})()
|
})()
|
||||||
)
|
);
|
||||||
.split(`${splitMarker}`)
|
})();
|
||||||
.filter((part) => part.trim());
|
const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim());
|
||||||
|
|
||||||
return splitTexts
|
return splitTexts
|
||||||
.map((text) => {
|
.map((text) => {
|
||||||
|
@@ -570,7 +570,6 @@
|
|||||||
"core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules",
|
"core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules",
|
||||||
"core.dataset.import.Custom prompt": "Custom Prompt",
|
"core.dataset.import.Custom prompt": "Custom Prompt",
|
||||||
"core.dataset.import.Custom split char": "Custom Separator",
|
"core.dataset.import.Custom split char": "Custom Separator",
|
||||||
"core.dataset.import.Custom split char Tips": "Allows you to segment based on custom separators. Usually used for pre-processed data, using specific separators for precise segmentation.",
|
|
||||||
"core.dataset.import.Custom text": "Custom Text",
|
"core.dataset.import.Custom text": "Custom Text",
|
||||||
"core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset",
|
"core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset",
|
||||||
"core.dataset.import.Data process params": "Data Processing Parameters",
|
"core.dataset.import.Data process params": "Data Processing Parameters",
|
||||||
|
@@ -25,6 +25,7 @@
|
|||||||
"core.dataset.import.Adjust parameters": "Adjust parameters",
|
"core.dataset.import.Adjust parameters": "Adjust parameters",
|
||||||
"custom_data_process_params": "Custom",
|
"custom_data_process_params": "Custom",
|
||||||
"custom_data_process_params_desc": "Customize data processing rules",
|
"custom_data_process_params_desc": "Customize data processing rules",
|
||||||
|
"custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
|
||||||
"data.ideal_chunk_length": "ideal block length",
|
"data.ideal_chunk_length": "ideal block length",
|
||||||
"data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
|
"data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
|
||||||
"data_index_num": "Index {{index}}",
|
"data_index_num": "Index {{index}}",
|
||||||
@@ -86,6 +87,14 @@
|
|||||||
"retain_collection": "Adjust Training Parameters",
|
"retain_collection": "Adjust Training Parameters",
|
||||||
"retrain_task_submitted": "The retraining task has been submitted",
|
"retrain_task_submitted": "The retraining task has been submitted",
|
||||||
"same_api_collection": "The same API set exists",
|
"same_api_collection": "The same API set exists",
|
||||||
|
"split_sign_break": "1 newline character",
|
||||||
|
"split_sign_break2": "2 newline characters",
|
||||||
|
"split_sign_custom": "Customize",
|
||||||
|
"split_sign_exclamatiob": "exclamation mark",
|
||||||
|
"split_sign_null": "Not set",
|
||||||
|
"split_sign_period": "period",
|
||||||
|
"split_sign_question": "question mark",
|
||||||
|
"split_sign_semicolon": "semicolon",
|
||||||
"start_sync_website_tip": "Confirm to start synchronizing data? \nThe old data will be deleted and retrieved again, please confirm!",
|
"start_sync_website_tip": "Confirm to start synchronizing data? \nThe old data will be deleted and retrieved again, please confirm!",
|
||||||
"sync_collection_failed": "Synchronization collection error, please check whether the source file can be accessed normally",
|
"sync_collection_failed": "Synchronization collection error, please check whether the source file can be accessed normally",
|
||||||
"sync_schedule": "Timing synchronization",
|
"sync_schedule": "Timing synchronization",
|
||||||
|
@@ -574,7 +574,6 @@
|
|||||||
"core.dataset.import.Custom process desc": "自定义设置数据处理规则",
|
"core.dataset.import.Custom process desc": "自定义设置数据处理规则",
|
||||||
"core.dataset.import.Custom prompt": "自定义提示词",
|
"core.dataset.import.Custom prompt": "自定义提示词",
|
||||||
"core.dataset.import.Custom split char": "自定义分隔符",
|
"core.dataset.import.Custom split char": "自定义分隔符",
|
||||||
"core.dataset.import.Custom split char Tips": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。",
|
|
||||||
"core.dataset.import.Custom text": "自定义文本",
|
"core.dataset.import.Custom text": "自定义文本",
|
||||||
"core.dataset.import.Custom text desc": "手动输入一段文本作为数据集",
|
"core.dataset.import.Custom text desc": "手动输入一段文本作为数据集",
|
||||||
"core.dataset.import.Data process params": "数据处理参数",
|
"core.dataset.import.Data process params": "数据处理参数",
|
||||||
|
@@ -25,6 +25,7 @@
|
|||||||
"core.dataset.import.Adjust parameters": "调整参数",
|
"core.dataset.import.Adjust parameters": "调整参数",
|
||||||
"custom_data_process_params": "自定义",
|
"custom_data_process_params": "自定义",
|
||||||
"custom_data_process_params_desc": "自定义设置数据处理规则",
|
"custom_data_process_params_desc": "自定义设置数据处理规则",
|
||||||
|
"custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。",
|
||||||
"data.ideal_chunk_length": "理想分块长度",
|
"data.ideal_chunk_length": "理想分块长度",
|
||||||
"data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
|
"data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
|
||||||
"data_index_num": "索引 {{index}}",
|
"data_index_num": "索引 {{index}}",
|
||||||
@@ -86,6 +87,14 @@
|
|||||||
"retain_collection": "调整训练参数",
|
"retain_collection": "调整训练参数",
|
||||||
"retrain_task_submitted": "重新训练任务已提交",
|
"retrain_task_submitted": "重新训练任务已提交",
|
||||||
"same_api_collection": "存在相同的 API 集合",
|
"same_api_collection": "存在相同的 API 集合",
|
||||||
|
"split_sign_break": "1 个换行符",
|
||||||
|
"split_sign_break2": "2 个换行符",
|
||||||
|
"split_sign_custom": "自定义",
|
||||||
|
"split_sign_exclamatiob": "感叹号",
|
||||||
|
"split_sign_null": "不设置",
|
||||||
|
"split_sign_period": "句号",
|
||||||
|
"split_sign_question": "问号",
|
||||||
|
"split_sign_semicolon": "分号",
|
||||||
"start_sync_website_tip": "确认开始同步数据?将会删除旧数据后重新获取,请确认!",
|
"start_sync_website_tip": "确认开始同步数据?将会删除旧数据后重新获取,请确认!",
|
||||||
"sync_collection_failed": "同步集合错误,请检查是否能正常访问源文件",
|
"sync_collection_failed": "同步集合错误,请检查是否能正常访问源文件",
|
||||||
"sync_schedule": "定时同步",
|
"sync_schedule": "定时同步",
|
||||||
|
@@ -569,7 +569,6 @@
|
|||||||
"core.dataset.import.Custom process desc": "自訂設定資料處理規則",
|
"core.dataset.import.Custom process desc": "自訂設定資料處理規則",
|
||||||
"core.dataset.import.Custom prompt": "自訂提示詞",
|
"core.dataset.import.Custom prompt": "自訂提示詞",
|
||||||
"core.dataset.import.Custom split char": "自訂分隔符",
|
"core.dataset.import.Custom split char": "自訂分隔符",
|
||||||
"core.dataset.import.Custom split char Tips": "允許您根據自訂的分隔符進行分割。通常用於已處理好的資料,使用特定的分隔符來精確分割。",
|
|
||||||
"core.dataset.import.Custom text": "自訂文字",
|
"core.dataset.import.Custom text": "自訂文字",
|
||||||
"core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集",
|
"core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集",
|
||||||
"core.dataset.import.Data process params": "資料處理參數",
|
"core.dataset.import.Data process params": "資料處理參數",
|
||||||
|
@@ -25,6 +25,7 @@
|
|||||||
"core.dataset.import.Adjust parameters": "調整參數",
|
"core.dataset.import.Adjust parameters": "調整參數",
|
||||||
"custom_data_process_params": "自訂",
|
"custom_data_process_params": "自訂",
|
||||||
"custom_data_process_params_desc": "自訂資料處理規則",
|
"custom_data_process_params_desc": "自訂資料處理規則",
|
||||||
|
"custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的數據,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.” 表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如: * () [] {} 等。",
|
||||||
"data.ideal_chunk_length": "理想分塊長度",
|
"data.ideal_chunk_length": "理想分塊長度",
|
||||||
"data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引",
|
"data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引",
|
||||||
"data_index_num": "索引 {{index}}",
|
"data_index_num": "索引 {{index}}",
|
||||||
@@ -86,6 +87,14 @@
|
|||||||
"retain_collection": "調整訓練參數",
|
"retain_collection": "調整訓練參數",
|
||||||
"retrain_task_submitted": "重新訓練任務已提交",
|
"retrain_task_submitted": "重新訓練任務已提交",
|
||||||
"same_api_collection": "存在相同的 API 集合",
|
"same_api_collection": "存在相同的 API 集合",
|
||||||
|
"split_sign_break": "1 個換行符",
|
||||||
|
"split_sign_break2": "2 個換行符",
|
||||||
|
"split_sign_custom": "自定義",
|
||||||
|
"split_sign_exclamatiob": "驚嘆號",
|
||||||
|
"split_sign_null": "不設置",
|
||||||
|
"split_sign_period": "句號",
|
||||||
|
"split_sign_question": "問號",
|
||||||
|
"split_sign_semicolon": "分號",
|
||||||
"start_sync_website_tip": "確認開始同步資料?\n將會刪除舊資料後重新獲取,請確認!",
|
"start_sync_website_tip": "確認開始同步資料?\n將會刪除舊資料後重新獲取,請確認!",
|
||||||
"sync_collection_failed": "同步集合錯誤,請檢查是否能正常存取來源文件",
|
"sync_collection_failed": "同步集合錯誤,請檢查是否能正常存取來源文件",
|
||||||
"sync_schedule": "定時同步",
|
"sync_schedule": "定時同步",
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
import React, { useCallback, useEffect, useMemo, useRef } from 'react';
|
import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react';
|
||||||
import {
|
import {
|
||||||
Box,
|
Box,
|
||||||
Flex,
|
Flex,
|
||||||
@@ -36,6 +36,7 @@ import MyNumberInput from '@fastgpt/web/components/common/Input/NumberInput';
|
|||||||
import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
|
import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
|
||||||
import { shadowLight } from '@fastgpt/web/styles/theme';
|
import { shadowLight } from '@fastgpt/web/styles/theme';
|
||||||
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
|
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
|
||||||
|
import MySelect from '@fastgpt/web/components/common/MySelect';
|
||||||
|
|
||||||
function DataProcess() {
|
function DataProcess() {
|
||||||
const { t } = useTranslation();
|
const { t } = useTranslation();
|
||||||
@@ -44,18 +45,39 @@ function DataProcess() {
|
|||||||
const { goToNext, processParamsForm, chunkSizeField, minChunkSize, maxChunkSize } =
|
const { goToNext, processParamsForm, chunkSizeField, minChunkSize, maxChunkSize } =
|
||||||
useContextSelector(DatasetImportContext, (v) => v);
|
useContextSelector(DatasetImportContext, (v) => v);
|
||||||
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
|
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
|
||||||
const { setValue, register, watch } = processParamsForm;
|
const { setValue, register, watch, getValues } = processParamsForm;
|
||||||
|
|
||||||
const trainingType = watch('trainingType');
|
const trainingType = watch('trainingType');
|
||||||
const chunkSettingMode = watch('chunkSettingMode');
|
const chunkSettingMode = watch('chunkSettingMode');
|
||||||
const qaPrompt = watch('qaPrompt');
|
|
||||||
|
|
||||||
|
const qaPrompt = watch('qaPrompt');
|
||||||
const {
|
const {
|
||||||
isOpen: isOpenCustomPrompt,
|
isOpen: isOpenCustomPrompt,
|
||||||
onOpen: onOpenCustomPrompt,
|
onOpen: onOpenCustomPrompt,
|
||||||
onClose: onCloseCustomPrompt
|
onClose: onCloseCustomPrompt
|
||||||
} = useDisclosure();
|
} = useDisclosure();
|
||||||
|
|
||||||
|
const customSplitList = [
|
||||||
|
{ label: t('dataset:split_sign_null'), value: '' },
|
||||||
|
{ label: t('dataset:split_sign_break'), value: '\\n' },
|
||||||
|
{ label: t('dataset:split_sign_break2'), value: '\\n\\n' },
|
||||||
|
{ label: t('dataset:split_sign_period'), value: '.|。' },
|
||||||
|
{ label: t('dataset:split_sign_exclamatiob'), value: '!|!' },
|
||||||
|
{ label: t('dataset:split_sign_question'), value: '?|?' },
|
||||||
|
{ label: t('dataset:split_sign_semicolon'), value: ';|;' },
|
||||||
|
{ label: '=====', value: '=====' },
|
||||||
|
{ label: t('dataset:split_sign_custom'), value: 'Other' }
|
||||||
|
];
|
||||||
|
|
||||||
|
const [customListSelectValue, setCustomListSelectValue] = useState(getValues('customSplitChar'));
|
||||||
|
useEffect(() => {
|
||||||
|
if (customListSelectValue === 'Other') {
|
||||||
|
setValue('customSplitChar', '');
|
||||||
|
} else {
|
||||||
|
setValue('customSplitChar', customListSelectValue);
|
||||||
|
}
|
||||||
|
}, [customListSelectValue, setValue]);
|
||||||
|
|
||||||
const trainingModeList = useMemo(() => {
|
const trainingModeList = useMemo(() => {
|
||||||
const list = Object.entries(DatasetCollectionDataProcessModeMap);
|
const list = Object.entries(DatasetCollectionDataProcessModeMap);
|
||||||
return list
|
return list
|
||||||
@@ -248,19 +270,33 @@ function DataProcess() {
|
|||||||
<Box mt={3}>
|
<Box mt={3}>
|
||||||
<Box>
|
<Box>
|
||||||
{t('common:core.dataset.import.Custom split char')}
|
{t('common:core.dataset.import.Custom split char')}
|
||||||
<QuestionTip
|
<QuestionTip label={t('dataset:custom_split_sign_tip')} />
|
||||||
label={t('common:core.dataset.import.Custom split char Tips')}
|
|
||||||
/>
|
|
||||||
</Box>
|
|
||||||
<Box mt={1}>
|
|
||||||
<Input
|
|
||||||
size={'sm'}
|
|
||||||
bg={'myGray.50'}
|
|
||||||
defaultValue={''}
|
|
||||||
placeholder="\n;======;==SPLIT=="
|
|
||||||
{...register('customSplitChar')}
|
|
||||||
/>
|
|
||||||
</Box>
|
</Box>
|
||||||
|
|
||||||
|
<HStack mt={1}>
|
||||||
|
<Box flex={'1 0 0'}>
|
||||||
|
<MySelect<string>
|
||||||
|
list={customSplitList}
|
||||||
|
size={'sm'}
|
||||||
|
bg={'myGray.50'}
|
||||||
|
value={customListSelectValue}
|
||||||
|
h={'32px'}
|
||||||
|
onChange={(val) => {
|
||||||
|
setCustomListSelectValue(val);
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
</Box>
|
||||||
|
{customListSelectValue === 'Other' && (
|
||||||
|
<Input
|
||||||
|
flex={'1 0 0'}
|
||||||
|
h={'32px'}
|
||||||
|
size={'sm'}
|
||||||
|
bg={'myGray.50'}
|
||||||
|
placeholder="\n;======;==SPLIT=="
|
||||||
|
{...register('customSplitChar')}
|
||||||
|
/>
|
||||||
|
)}
|
||||||
|
</HStack>
|
||||||
</Box>
|
</Box>
|
||||||
|
|
||||||
{showQAPromptInput && (
|
{showQAPromptInput && (
|
||||||
|
Reference in New Issue
Block a user