perf: text splitter (#4313)

* sync collection

* remove lock

* perf: text splitter

* update comment
This commit is contained in:
Archer
2025-03-25 17:44:38 +08:00
committed by archer
parent 826a53dcb6
commit 37b4a1919b
8 changed files with 716 additions and 70 deletions

View File

@@ -30,6 +30,10 @@ weight: 799
6. 工作流节点数组字符串类型,自动适配 string 输入。 6. 工作流节点数组字符串类型,自动适配 string 输入。
7. 工作流节点数组类型,自动进行 JSON parse 解析 string 输入。 7. 工作流节点数组类型,自动进行 JSON parse 解析 string 输入。
8. AI proxy 日志优化,去除重试失败的日志,仅保留最后一份错误日志。 8. AI proxy 日志优化,去除重试失败的日志,仅保留最后一份错误日志。
9. 分块算法小调整:
* 跨处理符号之间连续性更强。
* 代码块分割时,用 LLM 模型上下文作为分块大小,尽可能保证代码块完整性。
* 表格分割时,用 LLM 模型上下文作为分块大小,尽可能保证表格完整性。
## 🐛 修复 ## 🐛 修复
@@ -37,3 +41,4 @@ weight: 799
2. 渠道测试时,如果配置了模型自定义请求地址,会走自定义请求地址,而不是渠道请求地址。 2. 渠道测试时,如果配置了模型自定义请求地址,会走自定义请求地址,而不是渠道请求地址。
3. 语音识别模型测试未启用的模型时,无法正常测试。 3. 语音识别模型测试未启用的模型时,无法正常测试。
4. 管理员配置系统插件时,如果插件包含其他系统应用,无法正常鉴权。 4. 管理员配置系统插件时,如果插件包含其他系统应用,无法正常鉴权。
5. 移除 TTS 自定义请求地址时,必须填写 requestAuth 字段。

View File

@@ -134,8 +134,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize }, { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize }, { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block { reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
{ reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char {
reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
maxLen: maxSize
}, // Table 尽可能保证完整性
{ reg: /(\n{2,})/g, maxLen: chunkSize }, { reg: /(\n{2,})/g, maxLen: chunkSize },
{ reg: /([\n])/g, maxLen: chunkSize }, { reg: /([\n])/g, maxLen: chunkSize },
// ------ There's no overlap on the top // ------ There's no overlap on the top
@@ -150,7 +153,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const checkIsCustomStep = (step: number) => step < customRegLen; const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) => const checkIsMarkdownSplit = (step: number) =>
step >= customRegLen && step <= markdownIndex + customRegLen; step >= customRegLen && step <= markdownIndex + customRegLen;
+customReg.length;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen; const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
// if use markdown title split, Separate record title // if use markdown title split, Separate record title
@@ -159,7 +162,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
return [ return [
{ {
text, text,
title: '' title: '',
chunkMaxSize: chunkSize
} }
]; ];
} }
@@ -167,7 +171,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const isCustomStep = checkIsCustomStep(step); const isCustomStep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step); const isMarkdownSplit = checkIsMarkdownSplit(step);
const { reg } = stepReges[step]; const { reg, maxLen } = stepReges[step];
const replaceText = (() => { const replaceText = (() => {
if (typeof reg === 'string') { if (typeof reg === 'string') {
@@ -194,15 +198,19 @@ const commonSplit = (props: SplitProps): SplitResponse => {
})() })()
); );
})(); })();
const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim()); const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim());
return splitTexts return splitTexts
.map((text) => { .map((text) => {
const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : ''; const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
// 如果一个分块没有匹配到,则使用默认块大小,否则使用最大块大小
const chunkMaxSize = text.match(reg) === null ? chunkSize : maxLen;
return { return {
text: isMarkdownSplit ? text.replace(matchTitle, '') : text, text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
title: matchTitle title: matchTitle,
chunkMaxSize
}; };
}) })
.filter((item) => !!item.title || !!item.text?.trim()); .filter((item) => !!item.title || !!item.text?.trim());
@@ -252,9 +260,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const isCustomStep = checkIsCustomStep(step); const isCustomStep = checkIsCustomStep(step);
const forbidConcat = isCustomStep; // forbid=true时候lastText肯定为空 const forbidConcat = isCustomStep; // forbid=true时候lastText肯定为空
// oversize // Over step
if (step >= stepReges.length) { if (step >= stepReges.length) {
if (text.length < chunkSize * 3) { if (text.length < maxSize) {
return [text]; return [text];
} }
// use slice-chunkSize to split text // use slice-chunkSize to split text
@@ -268,19 +276,18 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// split text by special char // split text by special char
const splitTexts = getSplitTexts({ text, step }); const splitTexts = getSplitTexts({ text, step });
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
const minChunkLen = chunkSize * 0.7;
const chunks: string[] = []; const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) { for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i]; const item = splitTexts[i];
const maxLen = item.chunkMaxSize; // 当前块最大长度
const lastTextLen = lastText.length; const lastTextLen = lastText.length;
const currentText = item.text; const currentText = item.text;
const newText = lastText + currentText; const newText = lastText + currentText;
const newTextLen = newText.length; const newTextLen = newText.length;
// Markdown 模式下,会强制向下拆分最小块,并在最后一个标题时候,给小块都补充上所有标题(包含父级标题) // Markdown 模式下,会强制向下拆分最小块,并在最后一个标题深度,给小块都补充上所有标题(包含父级标题)
if (isMarkdownStep) { if (isMarkdownStep) {
// split new Text, split chunks must be greater than 1 (small lastText) // split new Text, split chunks must be greater than 1 (small lastText)
const innerChunks = splitTextRecursively({ const innerChunks = splitTextRecursively({
@@ -290,11 +297,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
parentTitle: parentTitle + item.title parentTitle: parentTitle + item.title
}); });
// 只有标题,没有内容。
if (innerChunks.length === 0) { if (innerChunks.length === 0) {
chunks.push(`${parentTitle}${item.title}`); chunks.push(`${parentTitle}${item.title}`);
continue; continue;
} }
// 在合并最深级标题时,需要补充标题
chunks.push( chunks.push(
...innerChunks.map( ...innerChunks.map(
(chunk) => (chunk) =>
@@ -307,7 +316,16 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// newText is too large(now, The lastText must be smaller than chunkSize) // newText is too large(now, The lastText must be smaller than chunkSize)
if (newTextLen > maxLen) { if (newTextLen > maxLen) {
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText) const minChunkLen = maxLen * 0.8; // 当前块最小长度
const maxChunkLen = maxLen * 1.2; // 当前块最大长度
// 新文本没有非常大,直接认为它是一个新的块
if (newTextLen < maxChunkLen) {
chunks.push(newText);
lastText = getOneTextOverlapText({ text: newText, step }); // next chunk will start with overlapText
continue;
}
// 上一个文本块已经挺大的,单独做一个块
if (lastTextLen > minChunkLen) { if (lastTextLen > minChunkLen) {
chunks.push(lastText); chunks.push(lastText);
@@ -317,13 +335,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
continue; continue;
} }
// 说明是新的文本比较大,需要进一步拆分 // 说明是当前文本比较大,需要进一步拆分
// split new Text, split chunks must be greater than 1 (small lastText) // 把新的文本块进行一个拆分,并追加到 lastText
const innerChunks = splitTextRecursively({ const innerChunks = splitTextRecursively({
text: newText, text: currentText,
step: step + 1, step: step + 1,
lastText: '', lastText,
parentTitle: parentTitle + item.title parentTitle: parentTitle + item.title
}); });
const lastChunk = innerChunks[innerChunks.length - 1]; const lastChunk = innerChunks[innerChunks.length - 1];
@@ -351,11 +369,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// Not overlap // Not overlap
if (forbidConcat) { if (forbidConcat) {
chunks.push(item.text); chunks.push(currentText);
continue; continue;
} }
lastText += item.text; lastText = newText;
} }
/* If the last chunk is independent, it needs to be push chunks. */ /* If the last chunk is independent, it needs to be push chunks. */

View File

@@ -30,11 +30,11 @@ export async function text2Speech({
response_format: 'mp3', response_format: 'mp3',
speed speed
}, },
modelData.requestUrl && modelData.requestAuth modelData.requestUrl
? { ? {
path: modelData.requestUrl, path: modelData.requestUrl,
headers: { headers: {
Authorization: `Bearer ${modelData.requestAuth}` ...(modelData.requestAuth ? { Authorization: `Bearer ${modelData.requestAuth}` } : {})
} }
} }
: {} : {}

View File

@@ -65,6 +65,7 @@ export const llmCompletionsBodyFormat = <T extends CompletionsBodyType>(
const requestBody: T = { const requestBody: T = {
...body, ...body,
model: modelData.model,
temperature: temperature:
typeof body.temperature === 'number' typeof body.temperature === 'number'
? computedTemperature({ ? computedTemperature({

View File

@@ -196,7 +196,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
isLoading={isLoadingOrgs} isLoading={isLoadingOrgs}
> >
<Box mb={3}> <Box mb={3}>
<Path paths={paths} rootName={userInfo?.team?.teamName} onClick={setPath} /> <Path paths={paths} rootName={userInfo?.team?.teamName} />
</Box> </Box>
<Flex flex={'1 0 0'} h={0} w={'100%'} gap={'4'}> <Flex flex={'1 0 0'} h={0} w={'100%'} gap={'4'}>
<MemberScrollData flex="1"> <MemberScrollData flex="1">
@@ -420,7 +420,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
<ActionButton <ActionButton
icon="common/administrator" icon="common/administrator"
text={t('account_team:manage_member')} text={t('account_team:manage_member')}
onClick={() => setManageMemberOrg(currentOrg ?? rootOrg)} onClick={() => setManageMemberOrg(currentOrg)}
/> />
{currentOrg && currentOrg?.path !== '' && ( {currentOrg && currentOrg?.path !== '' && (
<> <>

View File

@@ -94,7 +94,7 @@ async function handler(
per: WritePermissionVal per: WritePermissionVal
}); });
if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) { if (fileAuthRes && String(fileAuthRes.tmbId) !== String(tmbId) && !fileAuthRes.isRoot) {
return Promise.reject(CommonErrEnum.unAuthFile); return Promise.reject(CommonErrEnum.unAuthFile);
} }

View File

@@ -15,18 +15,19 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken'; import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
const formatIndexes = async ({ const formatIndexes = async ({
indexes, indexes,
q, q,
a = '', a = '',
indexSize indexSize,
maxIndexSize
}: { }: {
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[]; indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
q: string; q: string;
a?: string; a?: string;
indexSize: number; indexSize: number;
maxIndexSize: number;
}): Promise< }): Promise<
{ {
type: `${DatasetDataIndexTypeEnum}`; type: `${DatasetDataIndexTypeEnum}`;
@@ -46,9 +47,12 @@ const formatIndexes = async ({
}) => { }) => {
const qChunks = splitText2Chunks({ const qChunks = splitText2Chunks({
text: q, text: q,
chunkSize: indexSize chunkSize: indexSize,
maxSize: maxIndexSize
}).chunks; }).chunks;
const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : []; const aChunks = a
? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
: [];
return [ return [
...qChunks.map((text) => ({ ...qChunks.map((text) => ({
@@ -100,7 +104,11 @@ const formatIndexes = async ({
// If oversize tokens, split it // If oversize tokens, split it
const tokens = await countPromptTokens(item.text); const tokens = await countPromptTokens(item.text);
if (tokens > indexSize) { if (tokens > indexSize) {
const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks; const splitText = splitText2Chunks({
text: item.text,
chunkSize: 512,
maxSize: maxIndexSize
}).chunks;
return splitText.map((text) => ({ return splitText.map((text) => ({
text, text,
type: item.type type: item.type
@@ -151,7 +159,8 @@ export async function insertData2Dataset({
indexes, indexes,
q, q,
a, a,
indexSize indexSize,
maxIndexSize: embModel.maxToken
}); });
// insert to vector store // insert to vector store
@@ -236,7 +245,13 @@ export async function updateData2Dataset({
if (!mongoData) return Promise.reject('core.dataset.error.Data not found'); if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// 2. Compute indexes // 2. Compute indexes
const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize }); const formatIndexesResult = await formatIndexes({
indexes,
q,
a,
indexSize,
maxIndexSize: getEmbeddingModel(model).maxToken
});
// 3. Patch indexes, create, update, delete // 3. Patch indexes, create, update, delete
const patchResult: PatchIndexesProps[] = []; const patchResult: PatchIndexesProps[] = [];

File diff suppressed because one or more lines are too long