Fix text splitter (#3200)

* fix: text splitter

* perf: splitter
This commit is contained in:
Archer
2024-11-21 12:01:55 +08:00
committed by GitHub
parent 489bb076a3
commit 4f55025906
3 changed files with 42 additions and 21 deletions

View File

@@ -11,3 +11,7 @@ weight: 810
1. 1.
2. 新增 - 工作流支持进入聊天框/点击开始对话后,自动触发一轮对话。 2. 新增 - 工作流支持进入聊天框/点击开始对话后,自动触发一轮对话。
3. 新增 - 重写 chatContext,对话测试也会有日志,并且刷新后不会丢失对话。
4. 新增 - 分享链接支持配置是否允许查看原文。
5. 优化 - 工作流 ui 细节。
6. 修复 - 分块策略,四级标题会被丢失。 同时新增了五级标题的支持。

View File

@@ -99,7 +99,7 @@ ${mdSplitString}
5. 标点分割:重叠 5. 标点分割:重叠
*/ */
const commonSplit = (props: SplitProps): SplitResponse => { const commonSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props; let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE'; const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER'; const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
@@ -113,6 +113,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n'); text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');
// The larger maxLen is, the next sentence is less likely to trigger splitting // The larger maxLen is, the next sentence is less likely to trigger splitting
const markdownIndex = 4;
const forbidOverlapIndex = 8;
const stepReges: { reg: RegExp; maxLen: number }[] = [ const stepReges: { reg: RegExp; maxLen: number }[] = [
...customReg.map((text) => ({ ...customReg.map((text) => ({
reg: new RegExp(`(${replaceRegChars(text)})`, 'g'), reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
@@ -122,9 +124,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 }, { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 }, { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 }, { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block { reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
{ reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 },
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 }, { reg: /([\n])/g, maxLen: chunkLen * 1.2 },
// ------ There's no overlap on the top // ------ There's no overlap on the top
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 }, { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
@@ -136,8 +140,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const customRegLen = customReg.length; const customRegLen = customReg.length;
const checkIsCustomStep = (step: number) => step < customRegLen; const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen; const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= markdownIndex;
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen; +customReg.length;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customReg.length;
// if use markdown title split, Separate record title // if use markdown title split, Separate record title
const getSplitTexts = ({ text, step }: { text: string; step: number }) => { const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
@@ -231,7 +236,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// use slice-chunkLen to split text // use slice-chunkLen to split text
const chunks: string[] = []; const chunks: string[] = [];
for (let i = 0; i < text.length; i += chunkLen - overlapLen) { for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
chunks.push(`${parentTitle}${text.slice(i, i + chunkLen)}`); chunks.push(text.slice(i, i + chunkLen));
} }
return chunks; return chunks;
} }
@@ -241,7 +246,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen; const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
const minChunkLen = chunkLen * 0.7; const minChunkLen = chunkLen * 0.7;
// console.log(splitTexts, stepReges[step].reg);
const chunks: string[] = []; const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) { for (let i = 0; i < splitTexts.length; i++) {
@@ -249,12 +253,34 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const lastTextLen = lastText.length; const lastTextLen = lastText.length;
const currentText = item.text; const currentText = item.text;
const currentTextLen = currentText.length;
const newText = lastText + currentText; const newText = lastText + currentText;
const newTextLen = lastTextLen + currentTextLen; const newTextLen = newText.length;
// Markdown 模式下,会强制向下拆分最小块,并在最后一个标题时候,给小块都补充上所有标题(包含父级标题)
if (isMarkdownStep) {
// Split newText; the resulting chunks must number more than 1 (small lastText)
const innerChunks = splitTextRecursively({
text: newText,
step: step + 1,
lastText: '',
parentTitle: parentTitle + item.title
});
const lastChunk = innerChunks[innerChunks.length - 1];
if (!lastChunk) continue;
chunks.push(
...innerChunks.map(
(chunk) =>
step === markdownIndex + customRegLen ? `${parentTitle}${item.title}${chunk}` : chunk // 合并进 Markdown 分块时,需要补标题
)
);
continue;
}
// newText is too large(now, The lastText must be smaller than chunkLen) // newText is too large(now, The lastText must be smaller than chunkLen)
if (newTextLen > maxLen || isMarkdownStep) { if (newTextLen > maxLen) {
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText) // lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
if (lastTextLen > minChunkLen) { if (lastTextLen > minChunkLen) {
chunks.push(lastText); chunks.push(lastText);
@@ -278,15 +304,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
if (!lastChunk) continue; if (!lastChunk) continue;
if (forbidConcat) {
chunks.push(
...innerChunks.map(
(chunk) => (step === 3 + customRegLen ? `${parentTitle}${chunk}` : chunk) // 合并进 Markdown 分块时,需要补标题
)
);
continue;
}
// last chunk is too small, concat it to lastText(next chunk start) // last chunk is too small, concat it to lastText(next chunk start)
if (lastChunk.length < minChunkLen) { if (lastChunk.length < minChunkLen) {
chunks.push(...innerChunks.slice(0, -1)); chunks.push(...innerChunks.slice(0, -1));
@@ -304,11 +321,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
continue; continue;
} }
// new text is small // New text is small
// Not overlap // Not overlap
if (forbidConcat) { if (forbidConcat) {
chunks.push(`${parentTitle}${item.title}${item.text}`); chunks.push(item.text);
continue; continue;
} }

View File

@@ -33,7 +33,7 @@ MILVUS_TOKEN=133964348b00b4b4e4b51bef680a61350950385c8c64a3ec16b1ab92d3c67dcc4e0
SANDBOX_URL=http://localhost:3001 SANDBOX_URL=http://localhost:3001
# 商业版地址 # 商业版地址
PRO_URL= PRO_URL=
# 页面的地址,用于自动补全相对路径资源的 domain # 页面的地址,用于自动补全相对路径资源的 domain,注意后面不要跟 /
FE_DOMAIN=http://localhost:3000 FE_DOMAIN=http://localhost:3000
# 二级路由,需要打包时候就确定 # 二级路由,需要打包时候就确定
# NEXT_PUBLIC_BASE_URL=/fastai # NEXT_PUBLIC_BASE_URL=/fastai