Files
FastGPT/packages/global/common/string/textSplitter.ts
Archer e25d7efb5b feature: V4.11.1 (#5350)
* perf: system toolset & mcp (#5200)

* feat: support system toolset

* fix: type

* fix: system tool config

* chore: mcptool config migrate

* refactor: mcp toolset

* fix: fe type error

* fix: type error

* fix: show version

* chore: support extract tool's secretInputConfig out of inputs

* chore: compatible with old version mcp

* chore: adjust

* deps: update dependency @fastgpt-skd/plugin

* fix: version

* fix: some bug (#5316)

* chore: compatible with old version mcp

* fix: version

* fix: compatible bug

* fix: mcp object params

* fix: type error

* chore: update test cases

* chore: remove log

* fix: toolset node name

* optimize app logs sort (#5310)

* log keys config modal

* multiple select

* api

* fontsize

* code

* chatid

* fix build

* fix

* fix component

* change name

* log keys config

* fix

* delete unused

* fix

* perf: log code

* perf: send auth code modal enter press

* fix log (#5328)

* perf: mcp toolset comment

* perf: log ui

* remove log (#5347)

* doc

* fix: action

* remove log

* fix: Table Optimization (#5319)

* feat: table test: 1

* feat: table test: 2

* feat: table test: 3

* feat: table test: 4

* feat: table test : 5 把maxSize改回chunkSize

* feat: table test : 6 都删了,只看maxSize

* feat: table test : 7 恢复初始,接下来删除标签功能

* feat: table test : 8 删除标签功能

* feat: table test : 9 删除标签功能成功

* feat: table test : 10 继续调试,修改trainingStates

* feat: table test : 11 修改第一步

* feat: table test : 12 修改第二步

* feat: table test : 13 修改了HtmlTable2Md

* feat: table test : 14 修改表头分块规则

* feat: table test : 15 前面表格分的太细了

* feat: table test : 16 改着改着表头又不加了

* feat: table test : 17 用CUSTOM_SPLIT_SIGN不行,重新改

* feat: table test : 18 表头仍然还会多加,但现在分块搞的合理了终于

* feat: table test : 19 还是需要搞好表头问题,先保存一下调试情况

* feat: table test : 20 调试结束,看一下replace有没有问题,没问题就pr

* feat: table test : 21 先把注释删了

* feat: table test : 21 注释replace都改了,下面切main分支看看情况

* feat: table test : 22 修改旧文件

* feat: table test : 23 修改测试文件

* feat: table test : 24 xlsx表格处理

* feat: table test : 25 刚才没保存先com了

* feat: table test : 26 fix

* feat: table test : 27 先com一版调试

* feat: table test : 28 试试放format2csv里

* feat: table test : 29 xlsx解决

* feat: table test : 30 tablesplit解决

* feat: table test : 31

* feat: table test : 32

* perf: table split

* perf: mcp old version compatibility (#5342)

* fix: system-tool secret inputs

* fix: rewrite runtime node i18n for system tool

* perf: mcp old version compatibility

* fix: splitPluginId

* fix: old mcp toolId

* fix: filter secret key

* feat: support system toolset activation

* chore: remove log

* perf: mcp update

* perf: rewrite toolset

* fix:delete variable id (#5335)

* perf: variable update

* fix: multiple select ui

* perf: model config move to plugin

* fix: var conflit

* perf: variable checker

* Avoid empty number

* update doc time

* fix: test

* fix: mcp object

* update count app

* update count app

---------

Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: colnii <1286949794@qq.com>
Co-authored-by: dreamer6680 <1468683855@qq.com>
2025-08-01 16:08:20 +08:00

493 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { defaultMaxChunkSize } from '../../core/dataset/training/utils';
import { getErrText } from '../error/utils';
import { simpleText } from './tools';
import { getTextValidLength } from './utils';
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
export type SplitProps = {
text: string;
chunkSize: number;
paragraphChunkDeep?: number; // Paragraph deep
paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
maxSize?: number;
overlapRatio?: number;
customReg?: string[];
};
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkSize'> & {
chunkSize?: number;
};
export type SplitResponse = {
chunks: string[];
chars: number;
};
// 判断字符串是否为markdown的表格形式
const strIsMdTable = (str: string) => {
// 检查是否包含表格分隔符 |
if (!str.includes('|')) {
return false;
}
const lines = str.split('\n');
// 检查表格是否至少有两行
if (lines.length < 2) {
return false;
}
// 检查表头行是否包含 |
const headerLine = lines[0].trim();
if (!headerLine.startsWith('|') || !headerLine.endsWith('|')) {
return false;
}
// 检查分隔行是否由 | 和 - 组成
const separatorLine = lines[1].trim();
const separatorRegex = /^(\|[\s:]*-+[\s:]*)+\|$/;
if (!separatorRegex.test(separatorLine)) {
return false;
}
// 检查数据行是否包含 |
for (let i = 2; i < lines.length; i++) {
const dataLine = lines[i].trim();
if (dataLine && (!dataLine.startsWith('|') || !dataLine.endsWith('|'))) {
return false;
}
}
return true;
};
const markdownTableSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkSize } = props;
// split by rows
const splitText2Lines = text.split('\n').filter((line) => line.trim());
// If there are not enough rows to form a table, return directly
if (splitText2Lines.length < 2) {
return { chunks: [text], chars: text.length };
}
const header = splitText2Lines[0];
const headerSize = header.split('|').length - 2;
const mdSplitString = `| ${new Array(headerSize > 0 ? headerSize : 1)
.fill(0)
.map(() => '---')
.join(' | ')} |`;
const chunks: string[] = [];
let chunk = `${header}
${mdSplitString}
`;
for (let i = 2; i < splitText2Lines.length; i++) {
const chunkLength = getTextValidLength(chunk);
const nextLineLength = getTextValidLength(splitText2Lines[i]);
// Over size
if (chunkLength + nextLineLength > chunkSize) {
chunks.push(chunk);
chunk = `${header}
${mdSplitString}
`;
}
chunk += `${splitText2Lines[i]}\n`;
}
if (chunk) {
chunks.push(chunk);
}
return {
chunks,
chars: chunks.reduce((sum, chunk) => sum + chunk.length, 0)
};
};
/*
1. 自定义分隔符:不需要重叠,不需要小块合并
2. Markdown 标题:不需要重叠;标题嵌套共享,需要小块合并
3. 特殊 markdown 语法:不需要重叠,需要小块合并
4. 段落:尽可能保证它是一个完整的段落。
5. 标点分割:重叠
*/
const commonSplit = (props: SplitProps): SplitResponse => {
let {
text = '',
chunkSize,
paragraphChunkDeep = 5,
paragraphChunkMinSize = 100,
maxSize = defaultMaxChunkSize,
overlapRatio = 0.15,
customReg = []
} = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkSize * overlapRatio);
// 特殊模块处理
// 1. 代码块处理 - 去除空字符
// replace code block all \n to codeBlockMarker
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
return match.replace(/\n/g, codeBlockMarker);
});
// replace invalid \n
text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');
// The larger maxLen is, the next sentence is less likely to trigger splitting
const customRegLen = customReg.length;
const markdownIndex = paragraphChunkDeep - 1;
const forbidOverlapIndex = customRegLen + markdownIndex + 4;
const markdownHeaderRules = ((deep?: number): { reg: RegExp; maxLen: number }[] => {
if (!deep || deep === 0) return [];
const maxDeep = Math.min(deep, 8); // Maximum 8 levels
const rules: { reg: RegExp; maxLen: number }[] = [];
for (let i = 1; i <= maxDeep; i++) {
const hashSymbols = '#'.repeat(i);
rules.push({
reg: new RegExp(`^(${hashSymbols}\\s[^\\n]+\\n)`, 'gm'),
maxLen: chunkSize
});
}
return rules;
})(paragraphChunkDeep);
const stepReges: { reg: RegExp | string; maxLen: number }[] = [
...customReg.map((text) => ({
reg: text.replace(/\\n/g, '\n'),
maxLen: chunkSize
})),
...markdownHeaderRules,
{ reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
// HTML Table tag 尽可能保障完整
{
reg: /(\n\|(?:[^\n|]*\|)+\n\|(?:[:\-\s]*\|)+\n(?:\|(?:[^\n|]*\|)*\n)*)/g,
maxLen: chunkSize
}, // Markdown Table 尽可能保证完整性
{ reg: /(\n{2,})/g, maxLen: chunkSize },
{ reg: /([\n])/g, maxLen: chunkSize },
// ------ There's no overlap on the top
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkSize },
{ reg: /([]|!\s)/g, maxLen: chunkSize },
{ reg: /([]|\?\s)/g, maxLen: chunkSize },
{ reg: /([]|;\s)/g, maxLen: chunkSize },
{ reg: /([]|,\s)/g, maxLen: chunkSize }
];
const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) =>
step >= customRegLen && step <= markdownIndex + customRegLen;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex;
// if use markdown title split, Separate record title
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
return [
{
text,
title: '',
chunkMaxSize: chunkSize
}
];
}
const isCustomStep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step);
const { reg, maxLen } = stepReges[step];
const replaceText = (() => {
if (typeof reg === 'string') {
let tmpText = text;
reg.split('|').forEach((itemReg) => {
tmpText = tmpText.replaceAll(
itemReg,
(() => {
if (isCustomStep) return splitMarker;
if (isMarkdownSplit) return `${splitMarker}$1`;
return `$1${splitMarker}`;
})()
);
});
return tmpText;
}
return text.replace(
reg,
(() => {
if (isCustomStep) return splitMarker;
if (isMarkdownSplit) return `${splitMarker}$1`;
return `$1${splitMarker}`;
})()
);
})();
const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim());
return splitTexts
.map((text) => {
const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
// 如果一个分块没有匹配到,则使用默认块大小,否则使用最大块大小
const chunkMaxSize = text.match(reg) === null ? chunkSize : maxLen;
return {
text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
title: matchTitle,
chunkMaxSize
};
})
.filter((item) => !!item.title || !!item.text?.trim());
};
/* Gets the overlap at the end of a text as the beginning of the next block */
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
const forbidOverlap = checkForbidOverlap(step);
const maxOverlapLen = chunkSize * 0.4;
// step >= stepReges.length: Do not overlap incomplete sentences
if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return '';
const splitTexts = getSplitTexts({ text, step });
let overlayText = '';
for (let i = splitTexts.length - 1; i >= 0; i--) {
const currentText = splitTexts[i].text;
const newText = currentText + overlayText;
const newTextLen = getTextValidLength(newText);
if (newTextLen > overlapLen) {
if (newTextLen > maxOverlapLen) {
const text = getOneTextOverlapText({ text: newText, step: step + 1 });
return text || overlayText;
}
return newText;
}
overlayText = newText;
}
return overlayText;
};
const splitTextRecursively = ({
text = '',
step,
lastText,
parentTitle = ''
}: {
text: string;
step: number;
lastText: string; // 上一个分块末尾数据会通过这个参数传入。
parentTitle: string;
}): string[] => {
const isMarkdownStep = checkIsMarkdownSplit(step);
const isCustomStep = checkIsCustomStep(step);
const forbidConcat = isCustomStep; // forbid=true时候lastText肯定为空
const textLength = getTextValidLength(text);
// Over step
if (step >= stepReges.length) {
if (textLength < maxSize) {
return [text];
}
// use slice-chunkSize to split text
const chunks: string[] = [];
for (let i = 0; i < textLength; i += chunkSize - overlapLen) {
chunks.push(text.slice(i, i + chunkSize));
}
return chunks;
}
// split text by special char
const splitTexts = getSplitTexts({ text, step });
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i];
const maxLen = item.chunkMaxSize; // 当前块最大长度
const lastTextLen = getTextValidLength(lastText);
const currentText = item.text;
const newText = lastText + currentText;
const newTextLen = getTextValidLength(newText);
// split the current table if it will exceed after adding
if (strIsMdTable(currentText) && newTextLen > maxLen) {
if (lastTextLen > 0) {
chunks.push(lastText);
lastText = '';
}
const { chunks: tableChunks } = markdownTableSplit({
text: currentText,
chunkSize: chunkSize * 1.2
});
chunks.push(...tableChunks);
continue;
}
// Markdown 模式下,会强制向下拆分最小块,并再最后一个标题深度,给小块都补充上所有标题(包含父级标题)
if (isMarkdownStep) {
// split new Text, split chunks must will greater 1 (small lastText)
const innerChunks = splitTextRecursively({
text: newText,
step: step + 1,
lastText: '',
parentTitle: parentTitle + item.title
});
// 只有标题,没有内容。
if (innerChunks.length === 0) {
chunks.push(`${parentTitle}${item.title}`);
continue;
}
// 在合并最深级标题时,需要补充标题
chunks.push(
...innerChunks.map(
(chunk) =>
step === markdownIndex + customRegLen ? `${parentTitle}${item.title}${chunk}` : chunk // 合并进 Markdown 分块时,需要补标题
)
);
continue;
}
// newText is too large(now, The lastText must be smaller than chunkSize)
if (newTextLen > maxLen) {
const minChunkLen = maxLen * 0.8; // 当前块最小长度
const maxChunkLen = maxLen * 1.2; // 当前块最大长度
// 新文本没有非常大,直接认为它是一个新的块
if (newTextLen < maxChunkLen) {
chunks.push(newText);
lastText = getOneTextOverlapText({ text: newText, step }); // next chunk will start with overlayText
continue;
}
// 上一个文本块已经挺大的,单独做一个块
if (lastTextLen > minChunkLen) {
chunks.push(lastText);
lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
i--;
continue;
}
// 说明是当前文本比较大,需要进一步拆分
// 把新的文本块进行一个拆分,并追加到 latestText 中
const innerChunks = splitTextRecursively({
text: currentText,
step: step + 1,
lastText,
parentTitle: parentTitle + item.title
});
const lastChunk = innerChunks[innerChunks.length - 1];
if (!lastChunk) continue;
// last chunk is too small, concat it to lastText(next chunk start)
if (getTextValidLength(lastChunk) < minChunkLen) {
chunks.push(...innerChunks.slice(0, -1));
lastText = lastChunk;
continue;
}
// Last chunk is large enough
chunks.push(...innerChunks);
// compute new overlapText
lastText = getOneTextOverlapText({
text: lastChunk,
step
});
continue;
}
// New text is small
// Not overlap
if (forbidConcat) {
chunks.push(currentText);
continue;
}
lastText = newText;
}
/* If the last chunk is independent, it needs to be push chunks. */
if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) {
if (getTextValidLength(lastText) < chunkSize * 0.4) {
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
} else {
chunks.push(lastText);
}
} else if (lastText && chunks.length === 0) {
// 只分出一个很小的块,则直接追加到末尾(如果大于 1 个块,说明这个小块内容已经被上一个块拿到了)
chunks.push(lastText);
}
return chunks;
};
try {
const chunks = splitTextRecursively({
text,
step: 0,
lastText: '',
parentTitle: ''
}).map((chunk) => chunk?.replaceAll(codeBlockMarker, '\n')?.trim() || ''); // restore code block
const chars = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
return {
chunks,
chars
};
} catch (err) {
throw new Error(getErrText(err));
}
};
/**
* text split into chunks
* chunkSize - one chunk len. max: 3500
* overlapLen - The size of the before and after Text
* chunkSize > overlapLen
* markdown
*/
export const splitText2Chunks = (props: SplitProps): SplitResponse => {
let { text = '' } = props;
const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);
const splitResult = splitWithCustomSign.map((item) => {
if (strIsMdTable(item)) {
return markdownTableSplit({ ...props, text: item });
}
return commonSplit({ ...props, text: item });
});
return {
chunks: splitResult
.map((item) => item.chunks)
.flat()
.map((chunk) => simpleText(chunk)),
chars: splitResult.reduce((sum, item) => sum + item.chars, 0)
};
};