Files
FastGPT/packages/global/common/string/textSplitter.ts
Archer 1aebe5f185 V4.8.15 feature (#3331)
* feat: add customize toolkit (#3205)

* chaoyang

* fix-auth

* add toolkit

* add order

* plugin usage

* fix

* delete console:

* Fix: Fix fullscreen preview top positioning and improve Markdown rendering logic (#3247)

* 完成任务:修复全屏预览顶部固定问题,优化 Markdown 渲染逻辑

* 有问题修改

* 问题再修改

* 修正问题

* fix: plugin standalone display issue (#3254)

* 4.8.15 test (#3246)

* o1 config

* perf: system plugin code

* 调整系统插件代码。增加html 渲染安全配置。 (#3258)

* perf: base64 picker

* perf: list app or dataset

* perf: plugin config code

* 小窗适配等问题 (#3257)

* 小窗适配等问题

* git问题

* 小窗剩余问题

* feat: system plugin auth and lock version (#3265)

* feat: system plugin auth and lock version

* update comment

* 4.8.15 test (#3267)

* tmp log

* perf: login direct

* perf: iframe html code

* remove log

* fix: plugin standalone display (#3277)

* refactor: 页面拆分&i18n拆分 (#3281)

* refactor: account组件拆成独立页面

* script: 新增i18n json文件创建脚本

* refactor: 页面i18n拆分

* i18n: add en&hant

* 4.8.15 test (#3285)

* tmp log

* remove log

* fix: watch avatar refresh

* perf: i18n code

* fix(plugin): use intro instead of userguide (#3290)

* Universal SSO (#3292)

* tmp log

* remove log

* feat: common oauth

* readme

* perf: sso provider

* remove sso code

* perf: refresh plugins

* feat: add api dataset (#3272)

* add api-dataset

* fix api-dataset

* fix api dataset

* fix ts

* perf: create collection code (#3301)

* tmp log

* remove log

* perf: i18n change

* update version doc

* feat: question guide from chatId

* perf: create collection code

* fix: request api

* fix: request api

* fix: tts auth and response type (#3303)

* perf: md splitter

* fix: tts auth and response type

* fix: api file dataset (#3307)

* perf: api dataset init (#3310)

* perf: collection schema

* perf: api dataset init

* refactor: 团队管理独立页面 (#3302)

* ui: 团队管理独立页面

* 代码优化

* fix

* perf: sync collection and ui check (#3314)

* perf: sync collection

* remove script

* perf: update api server

* perf: api dataset parent

* perf: team ui

* perf: team 18n

* update team ui

* perf: ui check

* perf: i18n

* fix: debug variables & cronjob & system plugin callback load (#3315)

* fix: debug variables & cronjob & system plugin callback load

* fix type

* fix

* fix

* fix: plugin dataset quote;perf: system variables init (#3316)

* fix: plugin dataset quote

* perf: system variables init

* perf: node templates ui;fix: dataset import ui (#3318)

* fix: dataset import ui

* perf: node templates ui

* perf: ui refresh

* feat:套餐改名和套餐跳转配置 (#3309)

* fixing:except Sidebar

* 去除了多余的代码

* 修正了套餐说明的代码

* 修正了误删除的show_git代码

* 修正了名字部分等代码

* 修正了问题,遗留了其他和ui讨论不一致的部分

* 4.8.15 test (#3319)

* remove log

* pref: bill ui

* pref: bill ui

* perf: log

* html渲染文档 (#3270)

* html渲染文档

* 文档有点小问题

* feat: doc (#3322)

* 集合重训练 (#3282)

* rebaser

* 一点补充

* 小问题

* 其他问题修正,删除集合保留文件的参数还没找到...

* reTraining

* delete uesless

* 删除了一行错误代码

* 集合重训练部分

* fixing

* 删除console代码

* feat: navbar item config (#3326)

* perf: custom navbar code;perf: retraining code;feat: api dataset and dataset api doc (#3329)

* feat: api dataset and dataset api doc

* perf: retraining code

* perf: custom navbar code

* fix: ts (#3330)

* fix: ts

* fix: ts

* retraining ui

* perf: api collection filter

* perf: retrining button

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Jiangween <145003935+Jiangween@users.noreply.github.com>
Co-authored-by: papapatrick <109422393+Patrickill@users.noreply.github.com>
2024-12-06 10:56:53 +08:00

397 lines
12 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { getErrText } from '../error/utils';
import { replaceRegChars } from './tools';
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
type SplitProps = {
text: string;
chunkLen: number;
overlapRatio?: number;
customReg?: string[];
};
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
chunkLen?: number;
};
type SplitResponse = {
chunks: string[];
chars: number;
};
// 判断字符串是否为markdown的表格形式
const strIsMdTable = (str: string) => {
// 检查是否包含表格分隔符 |
if (!str.includes('|')) {
return false;
}
const lines = str.split('\n');
// 检查表格是否至少有两行
if (lines.length < 2) {
return false;
}
// 检查表头行是否包含 |
const headerLine = lines[0].trim();
if (!headerLine.startsWith('|') || !headerLine.endsWith('|')) {
return false;
}
// 检查分隔行是否由 | 和 - 组成
const separatorLine = lines[1].trim();
const separatorRegex = /^(\|[\s:]*-+[\s:]*)+\|$/;
if (!separatorRegex.test(separatorLine)) {
return false;
}
// 检查数据行是否包含 |
for (let i = 2; i < lines.length; i++) {
const dataLine = lines[i].trim();
if (dataLine && (!dataLine.startsWith('|') || !dataLine.endsWith('|'))) {
return false;
}
}
return true;
};
const markdownTableSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkLen } = props;
const splitText2Lines = text.split('\n');
const header = splitText2Lines[0];
const headerSize = header.split('|').length - 2;
const mdSplitString = `| ${new Array(headerSize > 0 ? headerSize : 1)
.fill(0)
.map(() => '---')
.join(' | ')} |`;
const chunks: string[] = [];
let chunk = `${header}
${mdSplitString}
`;
for (let i = 2; i < splitText2Lines.length; i++) {
if (chunk.length + splitText2Lines[i].length > chunkLen * 1.2) {
chunks.push(chunk);
chunk = `${header}
${mdSplitString}
`;
}
chunk += `${splitText2Lines[i]}\n`;
}
if (chunk) {
chunks.push(chunk);
}
return {
chunks,
chars: chunks.reduce((sum, chunk) => sum + chunk.length, 0)
};
};
/*
1. 自定义分隔符:不需要重叠,不需要小块合并
2. Markdown 标题:不需要重叠;标题嵌套共享,不需要小块合并
3. 特殊 markdown 语法:不需要重叠,需要小块合并
4. 段落:尽可能保证它是一个完整的段落。
5. 标点分割:重叠
*/
const commonSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkLen * overlapRatio);
// replace code block all \n to codeBlockMarker
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
return match.replace(/\n/g, codeBlockMarker);
});
// replace invalid \n
text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');
// The larger maxLen is, the next sentence is less likely to trigger splitting
const markdownIndex = 4;
const forbidOverlapIndex = 8;
const stepReges: { reg: RegExp; maxLen: number }[] = [
...customReg.map((text) => ({
reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
maxLen: chunkLen * 1.4
})),
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
{ reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 },
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
// ------ There's no overlap on the top
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([]|!\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([]|\?\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([]|;\s)/g, maxLen: chunkLen * 1.6 },
{ reg: /([]|,\s)/g, maxLen: chunkLen * 2 }
];
const customRegLen = customReg.length;
const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) =>
step >= customRegLen && step <= markdownIndex + customRegLen;
+customReg.length;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
// if use markdown title split, Separate record title
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
return [
{
text,
title: ''
}
];
}
const isCustomStep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step);
const { reg } = stepReges[step];
const splitTexts = text
.replace(
reg,
(() => {
if (isCustomStep) return splitMarker;
if (isMarkdownSplit) return `${splitMarker}$1`;
return `$1${splitMarker}`;
})()
)
.split(`${splitMarker}`)
.filter((part) => part.trim());
return splitTexts
.map((text) => {
const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
return {
text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
title: matchTitle
};
})
.filter((item) => !!item.title || !!item.text?.trim());
};
/* Gets the overlap at the end of a text as the beginning of the next block */
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
const forbidOverlap = checkForbidOverlap(step);
const maxOverlapLen = chunkLen * 0.4;
// step >= stepReges.length: Do not overlap incomplete sentences
if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return '';
const splitTexts = getSplitTexts({ text, step });
let overlayText = '';
for (let i = splitTexts.length - 1; i >= 0; i--) {
const currentText = splitTexts[i].text;
const newText = currentText + overlayText;
const newTextLen = newText.length;
if (newTextLen > overlapLen) {
if (newTextLen > maxOverlapLen) {
const text = getOneTextOverlapText({ text: newText, step: step + 1 });
return text || overlayText;
}
return newText;
}
overlayText = newText;
}
return overlayText;
};
const splitTextRecursively = ({
text = '',
step,
lastText,
parentTitle = ''
}: {
text: string;
step: number;
lastText: string; // 上一个分块末尾数据会通过这个参数传入。
parentTitle: string;
}): string[] => {
const isMarkdownStep = checkIsMarkdownSplit(step);
const isCustomStep = checkIsCustomStep(step);
const forbidConcat = isMarkdownStep || isCustomStep; // forbid=true时候lastText肯定为空
// oversize
if (step >= stepReges.length) {
if (text.length < chunkLen * 3) {
return [text];
}
// use slice-chunkLen to split text
const chunks: string[] = [];
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
chunks.push(text.slice(i, i + chunkLen));
}
return chunks;
}
// split text by special char
const splitTexts = getSplitTexts({ text, step });
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
const minChunkLen = chunkLen * 0.7;
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i];
const lastTextLen = lastText.length;
const currentText = item.text;
const newText = lastText + currentText;
const newTextLen = newText.length;
// Markdown 模式下,会强制向下拆分最小块,并再最后一个标题时候,给小块都补充上所有标题(包含父级标题)
if (isMarkdownStep) {
// split new Text, split chunks must will greater 1 (small lastText)
const innerChunks = splitTextRecursively({
text: newText,
step: step + 1,
lastText: '',
parentTitle: parentTitle + item.title
});
if (innerChunks.length === 0) {
chunks.push(`${parentTitle}${item.title}`);
continue;
}
chunks.push(
...innerChunks.map(
(chunk) =>
step === markdownIndex + customRegLen ? `${parentTitle}${item.title}${chunk}` : chunk // 合并进 Markdown 分块时,需要补标题
)
);
continue;
}
// newText is too large(now, The lastText must be smaller than chunkLen)
if (newTextLen > maxLen) {
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
if (lastTextLen > minChunkLen) {
chunks.push(lastText);
lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
i--;
continue;
}
// 说明是新的文本块比较大,需要进一步拆分
// split new Text, split chunks must will greater 1 (small lastText)
const innerChunks = splitTextRecursively({
text: newText,
step: step + 1,
lastText: '',
parentTitle: parentTitle + item.title
});
const lastChunk = innerChunks[innerChunks.length - 1];
if (!lastChunk) continue;
// last chunk is too small, concat it to lastText(next chunk start)
if (lastChunk.length < minChunkLen) {
chunks.push(...innerChunks.slice(0, -1));
lastText = lastChunk;
continue;
}
// Last chunk is large enough
chunks.push(...innerChunks);
// compute new overlapText
lastText = getOneTextOverlapText({
text: lastChunk,
step
});
continue;
}
// New text is small
// Not overlap
if (forbidConcat) {
chunks.push(item.text);
continue;
}
lastText += item.text;
}
/* If the last chunk is independent, it needs to be push chunks. */
if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) {
if (lastText.length < chunkLen * 0.4) {
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
} else {
chunks.push(lastText);
}
} else if (lastText && chunks.length === 0) {
// 只分出一个很小的块,则直接追加到末尾(如果大于 1 个块,说明这个小块内容已经被上一个块拿到了)
chunks.push(lastText);
}
return chunks;
};
try {
const chunks = splitTextRecursively({
text,
step: 0,
lastText: '',
parentTitle: ''
}).map((chunk) => chunk?.replaceAll(codeBlockMarker, '\n')?.trim() || ''); // restore code block
const chars = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
return {
chunks,
chars
};
} catch (err) {
throw new Error(getErrText(err));
}
};
/**
* text split into chunks
* chunkLen - one chunk len. max: 3500
* overlapLen - The size of the before and after Text
* chunkLen > overlapLen
* markdown
*/
export const splitText2Chunks = (props: SplitProps): SplitResponse => {
let { text = '' } = props;
const start = Date.now();
const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);
const splitResult = splitWithCustomSign.map((item) => {
if (strIsMdTable(item)) {
return markdownTableSplit(props);
}
return commonSplit(props);
});
return {
chunks: splitResult.map((item) => item.chunks).flat(),
chars: splitResult.reduce((sum, item) => sum + item.chars, 0)
};
};