perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split

* update max size computed

* perf: i18n

* remove table
This commit is contained in:
Archer
2025-05-26 18:57:22 +08:00
committed by GitHub
parent 874300a56a
commit c25cd48e72
23 changed files with 859 additions and 164 deletions

View File

@@ -7,6 +7,10 @@ export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
type SplitProps = {
text: string;
chunkSize: number;
paragraphChunkDeep?: number; // Paragraph deep
paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
maxSize?: number;
overlapRatio?: number;
customReg?: string[];
@@ -108,6 +112,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
let {
text = '',
chunkSize,
paragraphChunkDeep = 5,
paragraphChunkMinSize = 100,
maxSize = defaultMaxChunkSize,
overlapRatio = 0.15,
customReg = []
@@ -123,7 +129,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
return match.replace(/\n/g, codeBlockMarker);
});
// 2. 表格处理 - 单独提取表格出来,进行表头合并
// 2. Markdown 表格处理 - 单独提取表格出来,进行表头合并
const tableReg =
/(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n?)*)(?:\n|$)/g;
const tableDataList = text.match(tableReg);
@@ -143,25 +149,40 @@ const commonSplit = (props: SplitProps): SplitResponse => {
text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');
// The larger maxLen is, the next sentence is less likely to trigger splitting
const markdownIndex = 4;
const forbidOverlapIndex = 8;
const customRegLen = customReg.length;
const markdownIndex = paragraphChunkDeep - 1;
const forbidOverlapIndex = customRegLen + markdownIndex + 4;
const markdownHeaderRules = ((deep?: number): { reg: RegExp; maxLen: number }[] => {
if (!deep || deep === 0) return [];
const maxDeep = Math.min(deep, 8); // Maximum 8 levels
const rules: { reg: RegExp; maxLen: number }[] = [];
for (let i = 1; i <= maxDeep; i++) {
const hashSymbols = '#'.repeat(i);
rules.push({
reg: new RegExp(`^(${hashSymbols}\\s[^\\n]+\\n)`, 'gm'),
maxLen: chunkSize
});
}
return rules;
})(paragraphChunkDeep);
const stepReges: { reg: RegExp | string; maxLen: number }[] = [
...customReg.map((text) => ({
reg: text.replaceAll('\\n', '\n'),
maxLen: chunkSize
})),
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
...markdownHeaderRules,
{ reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
// HTML Table tag 尽可能保障完整
{
reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
maxLen: Math.min(chunkSize * 1.5, maxSize)
}, // Table 尽可能保证完整性
maxLen: chunkSize
}, // Markdown Table 尽可能保证完整性
{ reg: /(\n{2,})/g, maxLen: chunkSize },
{ reg: /([\n])/g, maxLen: chunkSize },
// ------ There's no overlap on the top
@@ -172,12 +193,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
{ reg: /([]|,\s)/g, maxLen: chunkSize }
];
const customRegLen = customReg.length;
const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) =>
step >= customRegLen && step <= markdownIndex + customRegLen;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex;
// if use markdown title split, Separate record title
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
@@ -301,6 +320,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const splitTexts = getSplitTexts({ text, step });
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i];
@@ -443,7 +463,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
*/
export const splitText2Chunks = (props: SplitProps): SplitResponse => {
let { text = '' } = props;
const start = Date.now();
const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);
const splitResult = splitWithCustomSign.map((item) => {