From 4f550259065638db63052ad974b0826e554edf9f Mon Sep 17 00:00:00 2001 From: Archer <545436317@qq.com> Date: Thu, 21 Nov 2024 12:01:55 +0800 Subject: [PATCH] rFix textspliter (#3200) * fix: text splitter * perf: splitter --- .../zh-cn/docs/development/upgrading/4814.md | 4 ++ packages/global/common/string/textSplitter.ts | 57 ++++++++++++------- projects/app/.env.template | 2 +- 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/docSite/content/zh-cn/docs/development/upgrading/4814.md b/docSite/content/zh-cn/docs/development/upgrading/4814.md index 0671dd4f8..eff9168be 100644 --- a/docSite/content/zh-cn/docs/development/upgrading/4814.md +++ b/docSite/content/zh-cn/docs/development/upgrading/4814.md @@ -11,3 +11,7 @@ weight: 810 1. 2. 新增 - 工作流支持进入聊天框/点击开始对话后,自动触发一轮对话。 +3. 新增 - 重写 chatContext,对话测试也会有日志,并且刷新后不会丢失对话。 +4. 新增 - 分享链接支持配置是否允许查看原文。 +5. 优化 - 工作流 ui 细节。 +6. 修复 - 分块策略,四级标题会被丢失。 同时新增了五级标题的支持。 diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts index c90f526b8..7694e604a 100644 --- a/packages/global/common/string/textSplitter.ts +++ b/packages/global/common/string/textSplitter.ts @@ -99,7 +99,7 @@ ${mdSplitString} 5. 标点分割:重叠 */ const commonSplit = (props: SplitProps): SplitResponse => { - let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props; + let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props; const splitMarker = 'SPLIT_HERE_SPLIT_HERE'; const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER'; @@ -113,6 +113,8 @@ const commonSplit = (props: SplitProps): SplitResponse => { text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n'); // The larger maxLen is, the next sentence is less likely to trigger splitting + const markdownIndex = 4; + const forbidOverlapIndex = 8; const stepReges: { reg: RegExp; maxLen: number }[] = [ ...customReg.map((text) => ({ reg: new RegExp(`(${replaceRegChars(text)})`, 'g'), @@ -122,9 +124,11 @@ const commonSplit = (props: SplitProps): SplitResponse => { { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 }, { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 }, { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 }, + { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 }, { reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block - { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char + { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char + { reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 }, { reg: /([\n])/g, maxLen: chunkLen * 1.2 }, // ------ There's no overlap on the top { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 }, @@ -136,8 +140,9 @@ const commonSplit = (props: SplitProps): SplitResponse => { const customRegLen = customReg.length; const checkIsCustomStep = (step: number) => step < customRegLen; - const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen; - const checkForbidOverlap = (step: number) => step <= 6 + customRegLen; + const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= markdownIndex; + +customReg.length; + const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customReg.length; // if use markdown title split, Separate record title const getSplitTexts = ({ text, step }: { text: string; step: number }) => { @@ -231,7 +236,7 @@ const commonSplit = (props: SplitProps): SplitResponse => { // use slice-chunkLen to split text const chunks: string[] = []; for (let i = 0; i < text.length; i += chunkLen - overlapLen) { - chunks.push(`${parentTitle}${text.slice(i, i + chunkLen)}`); + chunks.push(text.slice(i, i + chunkLen)); } return chunks; } @@ -241,7 +246,6 @@ const commonSplit = (props: SplitProps): SplitResponse => { const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen; const minChunkLen = chunkLen * 0.7; - // console.log(splitTexts, stepReges[step].reg); const chunks: string[] = []; for (let i = 0; i < splitTexts.length; i++) { @@ -249,12 +253,34 @@ const commonSplit = (props: SplitProps): SplitResponse => { const lastTextLen = lastText.length; const currentText = item.text; - const currentTextLen = currentText.length; const newText = lastText + currentText; - const newTextLen = lastTextLen + currentTextLen; + const newTextLen = newText.length; + + // Markdown 模式下,会强制向下拆分最小块,并再最后一个标题时候,给小块都补充上所有标题(包含父级标题) + if (isMarkdownStep) { + // split new Text, split chunks must will greater 1 (small lastText) + const innerChunks = splitTextRecursively({ + text: newText, + step: step + 1, + lastText: '', + parentTitle: parentTitle + item.title + }); + + const lastChunk = innerChunks[innerChunks.length - 1]; + if (!lastChunk) continue; + + chunks.push( + ...innerChunks.map( + (chunk) => + step === markdownIndex + customRegLen ? `${parentTitle}${item.title}${chunk}` : chunk // 合并进 Markdown 分块时,需要补标题 + ) + ); + + continue; + } // newText is too large(now, The lastText must be smaller than chunkLen) - if (newTextLen > maxLen || isMarkdownStep) { + if (newTextLen > maxLen) { // lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText) if (lastTextLen > minChunkLen) { chunks.push(lastText); @@ -278,15 +304,6 @@ const commonSplit = (props: SplitProps): SplitResponse => { if (!lastChunk) continue; - if (forbidConcat) { - chunks.push( - ...innerChunks.map( - (chunk) => (step === 3 + customRegLen ? `${parentTitle}${chunk}` : chunk) // 合并进 Markdown 分块时,需要补标题 - ) - ); - continue; - } - // last chunk is too small, concat it to lastText(next chunk start) if (lastChunk.length < minChunkLen) { chunks.push(...innerChunks.slice(0, -1)); @@ -304,11 +321,11 @@ const commonSplit = (props: SplitProps): SplitResponse => { continue; } - // new text is small + // New text is small // Not overlap if (forbidConcat) { - chunks.push(`${parentTitle}${item.title}${item.text}`); + chunks.push(item.text); continue; } diff --git a/projects/app/.env.template b/projects/app/.env.template index 60d4a3752..5e043417d 100644 --- a/projects/app/.env.template +++ b/projects/app/.env.template @@ -33,7 +33,7 @@ MILVUS_TOKEN=133964348b00b4b4e4b51bef680a61350950385c8c64a3ec16b1ab92d3c67dcc4e0 SANDBOX_URL=http://localhost:3001 # 商业版地址 PRO_URL= -# 页面的地址,用于自动补全相对路径资源的 domain +# 页面的地址,用于自动补全相对路径资源的 domain,注意后面不要跟 / FE_DOMAIN=http://localhost:3000 # 二级路由,需要打包时候就确定 # NEXT_PUBLIC_BASE_URL=/fastai