Files
FastGPT/packages/global/common/string/textSplitter.ts
Archer e75d81d05a V4.9.1 feature (#4206)
* fix: remove DefaultTeam (#4037)

* fix :Get application bound knowledge base information logical rewrite (#4057)

* fix :Get application bound knowledge base information logical rewrite

* fix :Get application bound knowledge base information logical rewrite

* fix :Get application bound knowledge base information logical rewrite

* fix :Get application bound knowledge base information logical rewrite

* update package

* fix: import dataset step error;perf: ai proxy avatar (#4074)

* perf: pg config params

* perf: ai proxy avatar

* fix: import dataset step error

* feat: data input ux

* perf: app dataset rewite

* fix: 文本提取不支持arrayString,arrayNumber等jsonSchema (#4079)

* update doc ;perf: model test (#4098)

* perf: extract array

* update doc

* perf: model test

* perf: model test

* perf: think tag parse (#4102)

* chat quote reader (#3912)

* init chat quote full text reader

* linked structure

* dataset data linked

* optimize code

* fix ts build

* test finish

* delete log

* fix

* fix ts

* fix ts

* remove nextId

* initial scroll

* fix

* fix

* perf: chunk read   (#4109)

* package

* perf: chunk read

* feat: api dataset support pdf parse;fix: chunk reader auth (#4117)

* feat: api dataset support pdf parse

* fix: chunk reader auth

* feat: invitation link (#3979)

* feat: invitation link schema and apis

* feat: add invitation link

* feat: member status: active, leave, forbidden

* fix: expires show hours and minutes

* feat: invalid invitation link hint

* fix: typo

* chore: fix typo & i18n

* fix

* pref: fe

* feat: add ttl index for 30-day-clean-up

* perf: invite member code (#4118)

* perf: invite member code

* fix: ts

* fix: model test channel id;fix: quote reader (#4123)

* fix: model test channel id

* fix: quote reader

* fix chat quote reader (#4125)

* perf: model test;perf: sidebar trigger (#4127)

* fix: import dataset step error;perf: ai proxy avatar (#4074)

* perf: pg config params

* perf: ai proxy avatar

* fix: import dataset step error

* feat: data input ux

* perf: app dataset rewite

* perf: model test

* perf: sidebar trigger

* lock

* update nanoid version

* fix: select component ux

* fix: ts

* fix: vitest

* remove test

* fix: prompt toolcall ui (#4139)

* load log error adapt

* fix: prompt toolcall ui

* perf: commercial function tip

* update package

* pref: copy link (#4147)

* fix(i18n): namespace (#4143)

* hiden dataset source (#4152)

* hiden dataset source

* perf: reader

* chore: move all tests into a single folder (#4160)

* fix modal close scroll (#4162)

* fix modal close scroll

* update refresh

* feat: rerank modal select and weight (#4164)

* fix loadInitData refresh (#4169)

* fix

* fix

* form input number default & api dataset max token

* feat: mix search weight (#4170)

* feat: mix search weight

* feat: svg render

* fix: avatar error remove (#4173)

* fix: avatar error remove

* fix: index

* fix: guide

* fix: auth

* update package;fix: input data model ui (#4181)

* update package

* fix: ts

* update config

* update jieba package

* add type sign

* fix: input data ui

* fix: page title refresh (#4186)

* fix: ts

* update jieba package

* fix: page title refresh

* fix: remove member length check when opening invite create modal (#4193)

* add env to check internal ip (#4187)

* fix: ts

* update jieba package

* add env to check internal ip

* package

* fix: jieba

* reset package

* update config

* fix: jieba package

* init shell

* init version

* change team reload

* update jieba package (#4200)

* update jieba package

* package

* update package

* remove invalid code

* action

* package (#4201)

* package

* update package

* remove invalid code

* package

* remove i18n tip (#4202)

* doc (#4205)

* fix: i18n (#4208)

* fix: next config (#4207)

* reset package

* i18n

* update config

* i18n

* remove log

---------

Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: gggaaallleee <91131304+gggaaallleee@users.noreply.github.com>
Co-authored-by: shilin <39396378+shilin66@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
2025-03-18 14:40:41 +08:00

397 lines
12 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { getErrText } from '../error/utils';
import { replaceRegChars } from './tools';
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
type SplitProps = {
text: string;
chunkLen: number;
overlapRatio?: number;
customReg?: string[];
};
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
chunkLen?: number;
};
type SplitResponse = {
chunks: string[];
chars: number;
};
// 判断字符串是否为markdown的表格形式
const strIsMdTable = (str: string) => {
// 检查是否包含表格分隔符 |
if (!str.includes('|')) {
return false;
}
const lines = str.split('\n');
// 检查表格是否至少有两行
if (lines.length < 2) {
return false;
}
// 检查表头行是否包含 |
const headerLine = lines[0].trim();
if (!headerLine.startsWith('|') || !headerLine.endsWith('|')) {
return false;
}
// 检查分隔行是否由 | 和 - 组成
const separatorLine = lines[1].trim();
const separatorRegex = /^(\|[\s:]*-+[\s:]*)+\|$/;
if (!separatorRegex.test(separatorLine)) {
return false;
}
// 检查数据行是否包含 |
for (let i = 2; i < lines.length; i++) {
const dataLine = lines[i].trim();
if (dataLine && (!dataLine.startsWith('|') || !dataLine.endsWith('|'))) {
return false;
}
}
return true;
};
const markdownTableSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkLen } = props;
const splitText2Lines = text.split('\n');
const header = splitText2Lines[0];
const headerSize = header.split('|').length - 2;
const mdSplitString = `| ${new Array(headerSize > 0 ? headerSize : 1)
.fill(0)
.map(() => '---')
.join(' | ')} |`;
const chunks: string[] = [];
let chunk = `${header}
${mdSplitString}
`;
for (let i = 2; i < splitText2Lines.length; i++) {
if (chunk.length + splitText2Lines[i].length > chunkLen * 1.2) {
chunks.push(chunk);
chunk = `${header}
${mdSplitString}
`;
}
chunk += `${splitText2Lines[i]}\n`;
}
if (chunk) {
chunks.push(chunk);
}
return {
chunks,
chars: chunks.reduce((sum, chunk) => sum + chunk.length, 0)
};
};
/*
1. 自定义分隔符:不需要重叠,不需要小块合并
2. Markdown 标题:不需要重叠;标题嵌套共享,需要小块合并
3. 特殊 markdown 语法:不需要重叠,需要小块合并
4. 段落:尽可能保证它是一个完整的段落。
5. 标点分割:重叠
*/
const commonSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkLen * overlapRatio);
// replace code block all \n to codeBlockMarker
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
return match.replace(/\n/g, codeBlockMarker);
});
// replace invalid \n
text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');
// The larger maxLen is, the next sentence is less likely to trigger splitting
const markdownIndex = 4;
const forbidOverlapIndex = 8;
const stepReges: { reg: RegExp; maxLen: number }[] = [
...customReg.map((text) => ({
reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
maxLen: chunkLen * 1.4
})),
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
{ reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 },
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
// ------ There's no overlap on the top
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([]|!\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([]|\?\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([]|;\s)/g, maxLen: chunkLen * 1.6 },
{ reg: /([]|,\s)/g, maxLen: chunkLen * 2 }
];
const customRegLen = customReg.length;
const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) =>
step >= customRegLen && step <= markdownIndex + customRegLen;
+customReg.length;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
// if use markdown title split, Separate record title
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
return [
{
text,
title: ''
}
];
}
const isCustomStep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step);
const { reg } = stepReges[step];
const splitTexts = text
.replace(
reg,
(() => {
if (isCustomStep) return splitMarker;
if (isMarkdownSplit) return `${splitMarker}$1`;
return `$1${splitMarker}`;
})()
)
.split(`${splitMarker}`)
.filter((part) => part.trim());
return splitTexts
.map((text) => {
const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
return {
text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
title: matchTitle
};
})
.filter((item) => !!item.title || !!item.text?.trim());
};
/* Gets the overlap at the end of a text as the beginning of the next block */
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
const forbidOverlap = checkForbidOverlap(step);
const maxOverlapLen = chunkLen * 0.4;
// step >= stepReges.length: Do not overlap incomplete sentences
if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return '';
const splitTexts = getSplitTexts({ text, step });
let overlayText = '';
for (let i = splitTexts.length - 1; i >= 0; i--) {
const currentText = splitTexts[i].text;
const newText = currentText + overlayText;
const newTextLen = newText.length;
if (newTextLen > overlapLen) {
if (newTextLen > maxOverlapLen) {
const text = getOneTextOverlapText({ text: newText, step: step + 1 });
return text || overlayText;
}
return newText;
}
overlayText = newText;
}
return overlayText;
};
const splitTextRecursively = ({
text = '',
step,
lastText,
parentTitle = ''
}: {
text: string;
step: number;
lastText: string; // 上一个分块末尾数据会通过这个参数传入。
parentTitle: string;
}): string[] => {
const isMarkdownStep = checkIsMarkdownSplit(step);
const isCustomStep = checkIsCustomStep(step);
const forbidConcat = isCustomStep; // forbid=true时候lastText肯定为空
// oversize
if (step >= stepReges.length) {
if (text.length < chunkLen * 3) {
return [text];
}
// use slice-chunkLen to split text
const chunks: string[] = [];
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
chunks.push(text.slice(i, i + chunkLen));
}
return chunks;
}
// split text by special char
const splitTexts = getSplitTexts({ text, step });
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
const minChunkLen = chunkLen * 0.7;
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i];
const lastTextLen = lastText.length;
const currentText = item.text;
const newText = lastText + currentText;
const newTextLen = newText.length;
// Markdown 模式下,会强制向下拆分最小块,并再最后一个标题时候,给小块都补充上所有标题(包含父级标题)
if (isMarkdownStep) {
// split new Text, split chunks must will greater 1 (small lastText)
const innerChunks = splitTextRecursively({
text: newText,
step: step + 1,
lastText: '',
parentTitle: parentTitle + item.title
});
if (innerChunks.length === 0) {
chunks.push(`${parentTitle}${item.title}`);
continue;
}
chunks.push(
...innerChunks.map(
(chunk) =>
step === markdownIndex + customRegLen ? `${parentTitle}${item.title}${chunk}` : chunk // 合并进 Markdown 分块时,需要补标题
)
);
continue;
}
// newText is too large(now, The lastText must be smaller than chunkLen)
if (newTextLen > maxLen) {
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
if (lastTextLen > minChunkLen) {
chunks.push(lastText);
lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
i--;
continue;
}
// 说明是新的文本块比较大,需要进一步拆分
// split new Text, split chunks must will greater 1 (small lastText)
const innerChunks = splitTextRecursively({
text: newText,
step: step + 1,
lastText: '',
parentTitle: parentTitle + item.title
});
const lastChunk = innerChunks[innerChunks.length - 1];
if (!lastChunk) continue;
// last chunk is too small, concat it to lastText(next chunk start)
if (lastChunk.length < minChunkLen) {
chunks.push(...innerChunks.slice(0, -1));
lastText = lastChunk;
continue;
}
// Last chunk is large enough
chunks.push(...innerChunks);
// compute new overlapText
lastText = getOneTextOverlapText({
text: lastChunk,
step
});
continue;
}
// New text is small
// Not overlap
if (forbidConcat) {
chunks.push(item.text);
continue;
}
lastText += item.text;
}
/* If the last chunk is independent, it needs to be push chunks. */
if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) {
if (lastText.length < chunkLen * 0.4) {
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
} else {
chunks.push(lastText);
}
} else if (lastText && chunks.length === 0) {
// 只分出一个很小的块,则直接追加到末尾(如果大于 1 个块,说明这个小块内容已经被上一个块拿到了)
chunks.push(lastText);
}
return chunks;
};
try {
const chunks = splitTextRecursively({
text,
step: 0,
lastText: '',
parentTitle: ''
}).map((chunk) => chunk?.replaceAll(codeBlockMarker, '\n')?.trim() || ''); // restore code block
const chars = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
return {
chunks,
chars
};
} catch (err) {
throw new Error(getErrText(err));
}
};
/**
* text split into chunks
* chunkLen - one chunk len. max: 3500
* overlapLen - The size of the before and after Text
* chunkLen > overlapLen
* markdown
*/
export const splitText2Chunks = (props: SplitProps): SplitResponse => {
let { text = '' } = props;
const start = Date.now();
const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);
const splitResult = splitWithCustomSign.map((item) => {
if (strIsMdTable(item)) {
return markdownTableSplit(props);
}
return commonSplit(props);
});
return {
chunks: splitResult.map((item) => item.chunks).flat(),
chars: splitResult.reduce((sum, item) => sum + item.chars, 0)
};
};