perf: text splitter (#4313)

* sync collection

* remove lock

* perf: text splitter

* update comment
Archer
2025-03-25 17:44:38 +08:00
committed by archer
parent 826a53dcb6
commit 37b4a1919b
8 changed files with 716 additions and 70 deletions

View File

@@ -30,10 +30,15 @@ weight: 799
6. Workflow node array-of-string types now automatically adapt to plain string input.
7. Workflow node array types now automatically JSON-parse string input.
8. AI proxy logging optimized: logs from failed retries are dropped and only the final error log is kept.
9. Minor chunking-algorithm adjustments (see the sketch after this list):
* Stronger continuity across split delimiters.
* When splitting code blocks, the LLM model context is used as the chunk size to keep code blocks intact wherever possible.
* When splitting tables, the LLM model context is used as the chunk size to keep tables intact wherever possible.
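
A minimal sketch of the adjusted call, based on the splitter diff in this commit; the sample document and the 16000 ceiling are illustrative assumptions, not values from the source:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// A document whose fenced code block and table should survive splitting intact.
const markdownDoc = '# Title\n\n```ts\nconst a = 1;\n```\n\n| a | b |\n| - | - |\n| 1 | 2 |\n';

const { chunks } = splitText2Chunks({
  text: markdownDoc,
  chunkSize: 512, // default target size for ordinary paragraphs
  maxSize: 16000 // hypothetical LLM-context-derived ceiling for code blocks and tables
});
```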
## 🐛 Fixes
1. Feishu and Yuque knowledge bases could not be synced.
2. During channel testing, if a custom model request URL was configured, requests went to the custom URL instead of the channel request URL.
3. Speech-recognition model tests failed when the model under test was not enabled.
4. When an administrator configured system plugins, authentication failed if the plugin contained other system apps.
5. Removing a custom TTS request URL incorrectly required the requestAuth field to be filled in.

View File

@@ -134,8 +134,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block
{ reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // Enlarge the chunk to keep it a complete paragraph where possible. (?![\*\-|>`0-9]): markdown special chars
{ reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
{
reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
maxLen: maxSize
}, // Table: keep the table intact where possible
{ reg: /(\n{2,})/g, maxLen: chunkSize },
{ reg: /([\n])/g, maxLen: chunkSize },
// ------ There's no overlap on the top
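
A hedged demo of the table rule added above: the regex is copied from the diff and captures a whole GFM table (header row, delimiter row, body rows) as one match, so the splitter can keep it in a single chunk up to maxSize.

```ts
// Regex copied verbatim from the stepReges entry above.
const tableReg =
  /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g;

const sample = '\n| name | age |\n| --- | --- |\n| Bob | 3 |\n';
console.log(tableReg.test(sample)); // true — the entire table is one match
```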
@@ -150,7 +153,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) =>
step >= customRegLen && step <= markdownIndex + customReg.length;
step >= customRegLen && step <= markdownIndex + customRegLen;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
// If splitting by markdown titles, record the title separately
@@ -159,7 +162,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
return [
{
text,
title: ''
title: '',
chunkMaxSize: chunkSize
}
];
}
@@ -167,7 +171,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const isCustomStep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step);
const { reg } = stepReges[step];
const { reg, maxLen } = stepReges[step];
const replaceText = (() => {
if (typeof reg === 'string') {
@@ -194,15 +198,19 @@ const commonSplit = (props: SplitProps): SplitResponse => {
})()
);
})();
const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim());
return splitTexts
.map((text) => {
const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
// If a piece did not match the regex, use the default chunk size; otherwise use this step's max size
const chunkMaxSize = text.match(reg) === null ? chunkSize : maxLen;
return {
text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
title: matchTitle
title: matchTitle,
chunkMaxSize
};
})
.filter((item) => !!item.title || !!item.text?.trim());
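
A small sketch of the per-piece size decision above, using an assumed code-block step: pieces that still contain the step's pattern keep the larger maxLen budget, while plain pieces fall back to the default chunkSize.

```ts
const codeBlockReg = /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g;
const chunkSize = 512;
const maxLen = 16000; // hypothetical maxSize for this step

const pickChunkMaxSize = (piece: string): number =>
  piece.match(codeBlockReg) === null ? chunkSize : maxLen;

console.log(pickChunkMaxSize('plain paragraph')); // 512
console.log(pickChunkMaxSize('\n```ts\nconst a = 1;\n```')); // 16000
```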
@@ -252,9 +260,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const isCustomStep = checkIsCustomStep(step);
const forbidConcat = isCustomStep; // when forbid is true, lastText is guaranteed to be empty
// oversize
// Over step
if (step >= stepReges.length) {
if (text.length < chunkSize * 3) {
if (text.length < maxSize) {
return [text];
}
// use slice-chunkSize to split text
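
A minimal sketch (not the exact source) of the fallback above: once every regex step is exhausted, text under maxSize is kept whole, and anything larger is sliced into fixed chunkSize windows.

```ts
const sliceSplit = (text: string, chunkSize: number, maxSize: number): string[] => {
  // Small enough for the model context: keep it as a single chunk.
  if (text.length < maxSize) return [text];

  // Otherwise fall back to plain fixed-width slicing.
  const chunks: string[] = [];
  for (let i = 0; i < text.length; i += chunkSize) {
    chunks.push(text.slice(i, i + chunkSize));
  }
  return chunks;
};
```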
@@ -268,19 +276,18 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// split text by special char
const splitTexts = getSplitTexts({ text, step });
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
const minChunkLen = chunkSize * 0.7;
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i];
const maxLen = item.chunkMaxSize; // max length of the current chunk
const lastTextLen = lastText.length;
const currentText = item.text;
const newText = lastText + currentText;
const newTextLen = newText.length;
// In Markdown mode, chunks are force-split down to the smallest size, and at the last title, all titles (including parent titles) are prepended to each small chunk
// In Markdown mode, chunks are force-split down to the smallest size, and at the deepest title level, all titles (including parent titles) are prepended to each small chunk
if (isMarkdownStep) {
// Split the new text; the resulting chunk count is always at least 1 (lastText is small)
const innerChunks = splitTextRecursively({
@@ -290,11 +297,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
parentTitle: parentTitle + item.title
});
// Title only, no content.
if (innerChunks.length === 0) {
chunks.push(`${parentTitle}${item.title}`);
continue;
}
// When merging at the deepest title level, the titles need to be prepended
chunks.push(
...innerChunks.map(
(chunk) =>
@@ -307,7 +316,16 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// newText is too large (at this point, lastText must be smaller than chunkSize)
if (newTextLen > maxLen) {
// If lastText is longer than minChunkLen, push it straight to chunks instead of carrying it into the next chunk. (large lastText)
const minChunkLen = maxLen * 0.8; // min length of the current chunk
const maxChunkLen = maxLen * 1.2; // max length of the current chunk
// The new text is not overly large; treat it directly as a complete chunk
if (newTextLen < maxChunkLen) {
chunks.push(newText);
lastText = getOneTextOverlapText({ text: newText, step }); // the next chunk will start with the overlap text
continue;
}
// The previous text block is already fairly large; make it a standalone chunk
if (lastTextLen > minChunkLen) {
chunks.push(lastText);
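
A sketch of the overflow decision above, using the diff's 0.8/1.2 factors; the action labels are illustrative, not names from the source.

```ts
const handleOverflow = (lastText: string, currentText: string, maxLen: number) => {
  const newText = lastText + currentText;
  const minChunkLen = maxLen * 0.8; // below this, a leftover is carried forward
  const maxChunkLen = maxLen * 1.2; // above this, the merge is too big to accept

  if (newText.length <= maxLen) return 'merge'; // fits: keep accumulating
  if (newText.length < maxChunkLen) return 'accept-oversized'; // slightly over: emit as one chunk
  if (lastText.length > minChunkLen) return 'flush-lastText'; // big leftover: emit it alone
  return 'split-current'; // genuinely large: recurse into the next step
};
```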
@@ -317,13 +335,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
continue;
}
// The new text is fairly large and needs further splitting
// The current text is fairly large and needs further splitting
// Split the new text; the resulting chunk count is always at least 1 (lastText is small)
// Split the new text block and append it to lastText
const innerChunks = splitTextRecursively({
text: newText,
text: currentText,
step: step + 1,
lastText: '',
lastText,
parentTitle: parentTitle + item.title
});
const lastChunk = innerChunks[innerChunks.length - 1];
@@ -351,11 +369,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// No overlap
if (forbidConcat) {
chunks.push(item.text);
chunks.push(currentText);
continue;
}
lastText += item.text;
lastText = newText;
}
/* If the last chunk is independent, it needs to be pushed to chunks. */
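
The real overlap comes from getOneTextOverlapText; a hypothetical sketch of the hand-off it performs — after a chunk is emitted, the next chunk is seeded with a tail slice of the previous one so context carries over:

```ts
// Hypothetical helper; the actual implementation lives in the splitter.
const overlapTail = (text: string, overlapLen: number): string =>
  overlapLen > 0 ? text.slice(-overlapLen) : '';

const prevChunk = 'Sentence one. Sentence two. Sentence three.';
const nextSeed = overlapTail(prevChunk, 15); // 'Sentence three.' seeds the next chunk
```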

View File

@@ -30,11 +30,11 @@ export async function text2Speech({
response_format: 'mp3',
speed
},
modelData.requestUrl && modelData.requestAuth
modelData.requestUrl
? {
path: modelData.requestUrl,
headers: {
Authorization: `Bearer ${modelData.requestAuth}`
...(modelData.requestAuth ? { Authorization: `Bearer ${modelData.requestAuth}` } : {})
}
}
: {}
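
A sketch of the fixed option builder above: the custom request path is honored even without requestAuth, and the Authorization header is attached only when a token is actually configured (the function name is illustrative):

```ts
const buildTTSRequestOptions = (requestUrl?: string, requestAuth?: string) =>
  requestUrl
    ? {
        path: requestUrl,
        headers: {
          // Only attach auth when a token was configured.
          ...(requestAuth ? { Authorization: `Bearer ${requestAuth}` } : {})
        }
      }
    : {};

console.log(buildTTSRequestOptions('https://tts.example.com/v1')); // path set, no auth header
```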

View File

@@ -65,6 +65,7 @@ export const llmCompletionsBodyFormat = <T extends CompletionsBodyType>(
const requestBody: T = {
...body,
model: modelData.model,
temperature:
typeof body.temperature === 'number'
? computedTemperature({
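
The hunk above is cut off by the diff view; a hedged reading is that temperature is forwarded only when the caller supplied a number, with computedTemperature (signature assumed, not shown here) rescaling it for the target model:

```ts
// Illustrative stand-in for computedTemperature; the real signature is not shown in this hunk.
const normalizeTemperature = (raw: unknown, rescale: (t: number) => number): number | undefined =>
  typeof raw === 'number' ? rescale(raw) : undefined;

console.log(normalizeTemperature(7, (t) => (t / 10) * 2)); // 1.4 — e.g. a 0-10 UI scale mapped to 0-2
console.log(normalizeTemperature(undefined, (t) => t)); // undefined — field omitted from the body
```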

View File

@@ -196,7 +196,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
isLoading={isLoadingOrgs}
>
<Box mb={3}>
<Path paths={paths} rootName={userInfo?.team?.teamName} onClick={setPath} />
<Path paths={paths} rootName={userInfo?.team?.teamName} />
</Box>
<Flex flex={'1 0 0'} h={0} w={'100%'} gap={'4'}>
<MemberScrollData flex="1">
@@ -420,7 +420,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
<ActionButton
icon="common/administrator"
text={t('account_team:manage_member')}
onClick={() => setManageMemberOrg(currentOrg ?? rootOrg)}
onClick={() => setManageMemberOrg(currentOrg)}
/>
{currentOrg && currentOrg?.path !== '' && (
<>

View File

@@ -94,7 +94,7 @@ async function handler(
per: WritePermissionVal
});
if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
if (fileAuthRes && String(fileAuthRes.tmbId) !== String(tmbId) && !fileAuthRes.isRoot) {
return Promise.reject(CommonErrEnum.unAuthFile);
}
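
A sketch of the fixed predicate above: write access is denied only when the caller is neither the file owner nor root; the old `||` form also rejected owners who were not root.

```ts
type FileAuth = { tmbId: string; isRoot: boolean };

const isDenied = (auth: FileAuth | null, tmbId: string): boolean =>
  !!auth && String(auth.tmbId) !== String(tmbId) && !auth.isRoot;

console.log(isDenied({ tmbId: 'a', isRoot: false }, 'a')); // false — owner passes
console.log(isDenied({ tmbId: 'b', isRoot: true }, 'a')); // false — root passes
console.log(isDenied({ tmbId: 'b', isRoot: false }, 'a')); // true  — denied
```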

View File

@@ -15,18 +15,19 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
const formatIndexes = async ({
indexes,
q,
a = '',
indexSize
indexSize,
maxIndexSize
}: {
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
q: string;
a?: string;
indexSize: number;
maxIndexSize: number;
}): Promise<
{
type: `${DatasetDataIndexTypeEnum}`;
@@ -46,9 +47,12 @@ const formatIndexes = async ({
}) => {
const qChunks = splitText2Chunks({
text: q,
chunkSize: indexSize
chunkSize: indexSize,
maxSize: maxIndexSize
}).chunks;
const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
const aChunks = a
? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
: [];
return [
...qChunks.map((text) => ({
@@ -100,7 +104,11 @@ const formatIndexes = async ({
// If oversize tokens, split it
const tokens = await countPromptTokens(item.text);
if (tokens > indexSize) {
const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
const splitText = splitText2Chunks({
text: item.text,
chunkSize: 512,
maxSize: maxIndexSize
}).chunks;
return splitText.map((text) => ({
text,
type: item.type
@@ -151,7 +159,8 @@ export async function insertData2Dataset({
indexes,
q,
a,
indexSize
indexSize,
maxIndexSize: embModel.maxToken
});
// insert to vector store
@@ -236,7 +245,13 @@ export async function updateData2Dataset({
if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// 2. Compute indexes
const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
const formatIndexesResult = await formatIndexes({
indexes,
q,
a,
indexSize,
maxIndexSize: getEmbeddingModel(model).maxToken
});
// 3. Patch indexes, create, update, delete
const patchResult: PatchIndexesProps[] = [];
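
A hedged sketch of the new maxIndexSize plumbing: the embedding model's token ceiling now caps index re-splitting, mirroring the oversize branch in the diff above (the helper name is illustrative):

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

const resplitOversizedIndex = (text: string, maxIndexSize: number): string[] =>
  splitText2Chunks({
    text,
    chunkSize: 512, // fixed re-split size, as in the diff
    maxSize: maxIndexSize // e.g. embModel.maxToken
  }).chunks;
```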

File diff suppressed because one or more lines are too long