mirror of https://github.com/labring/FastGPT.git

perf: text splitter (#4313)

* sync collection
* remove lock
* perf: text splitter
* update comment
@@ -30,10 +30,15 @@ weight: 799
 6. Workflow nodes: array-of-string inputs now automatically adapt to plain string values.
 7. Workflow nodes: array-typed inputs now automatically JSON-parse string values.
 8. AI proxy logging cleanup: retry-failure logs are dropped; only the final error log is kept.
+9. Minor chunking-algorithm adjustments (illustrated in the sketch after this changelog):
+   * Stronger continuity across split markers.
+   * Code blocks are split using the LLM model context as the chunk size, keeping code blocks intact where possible.
+   * Tables are split using the LLM model context as the chunk size, keeping tables intact where possible.

 ## 🐛 Fixes

 1. Feishu and Yuque knowledge bases could not sync.
 2. When testing a channel, a configured model-level custom request URL was used instead of the channel's request URL.
 3. Speech-to-text models could not be tested while disabled.
 4. When an administrator configured a system plugin that contains other system apps, authentication failed.
 5. When a custom TTS request URL was set, the requestAuth field was mandatory.
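For a concrete sense of item 9: splitText2Chunks (see the textSplitter diff below) takes both a default chunkSize and a maxSize ceiling, and only code-block and table chunks are allowed to reach the ceiling. A hedged usage sketch — the option names and import path come from the diffs in this commit, the values are arbitrary:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

const markdown = `# Title

A normal paragraph that is split at the usual chunkSize.

| a | b |
| - | - |
| 1 | 2 |
`;

// chunkSize: the normal target; maxSize: the ceiling (e.g. the LLM context
// limit) that code-block and table chunks may grow to.
const { chunks } = splitText2Chunks({
  text: markdown,
  chunkSize: 512,
  maxSize: 8000
});

console.log(chunks);
```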
@@ -134,8 +134,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
       { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
       { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },

-      { reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block
-      { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // Larger chunks: try to keep a complete paragraph. (?![\*\-|>`0-9]): markdown special char
+      { reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
+      {
+        reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
+        maxLen: maxSize
+      }, // Table: keep tables intact where possible
       { reg: /(\n{2,})/g, maxLen: chunkSize },
       { reg: /([\n])/g, maxLen: chunkSize },
       // ------ There's no overlap on the top
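The new table pattern above is what lets a whole markdown table ride through the splitter as a single fragment, capped by maxSize (the LLM context) rather than chunkSize. A minimal standalone check — the regex is copied from the diff, everything else is scaffolding:

```ts
// The table-detection regex from the diff above.
const tableReg =
  /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g;

const doc = `Intro paragraph.
| Name | Role |
| ---- | ---- |
| Ada  | Engineer |
| Lin  | Designer |
Outro paragraph.`;

// The single match spans header, separator and all body rows,
// so the table is never cut between rows.
console.log(doc.match(tableReg));
```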
@@ -150,7 +153,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
   const checkIsCustomStep = (step: number) => step < customRegLen;
   const checkIsMarkdownSplit = (step: number) =>
-    step >= customRegLen && step <= markdownIndex + customRegLen;
+    step >= customRegLen && step <= markdownIndex + customReg.length;

   const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;

   // If using markdown title split, record the title separately
@@ -159,7 +162,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     return [
       {
         text,
-        title: ''
+        title: '',
+        chunkMaxSize: chunkSize
       }
     ];
   }
@@ -167,7 +171,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     const isCustomStep = checkIsCustomStep(step);
     const isMarkdownSplit = checkIsMarkdownSplit(step);

-    const { reg } = stepReges[step];
+    const { reg, maxLen } = stepReges[step];

     const replaceText = (() => {
       if (typeof reg === 'string') {
@@ -194,15 +198,19 @@ const commonSplit = (props: SplitProps): SplitResponse => {
         })()
       );
     })();

     const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim());

     return splitTexts
       .map((text) => {
         const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
+        // If this chunk matched nothing, use the default chunk size; otherwise use the step's max size
+        const chunkMaxSize = text.match(reg) === null ? chunkSize : maxLen;

         return {
           text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
-          title: matchTitle
+          title: matchTitle,
+          chunkMaxSize
         };
       })
       .filter((item) => !!item.title || !!item.text?.trim());
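The mechanic introduced here: each fragment now carries its own chunkMaxSize, so the merge loop later can let a fragment grow to the step's maxLen only if the step's pattern actually matched it. A self-contained sketch of just that selection — the names chunkSize, maxLen and reg mirror the diff, the values are made up:

```ts
type SplitItem = { text: string; title: string; chunkMaxSize: number };

const chunkSize = 512; // default target size
const maxLen = 8000; // step-specific ceiling, e.g. the LLM context for code/table steps
const reg = /~~~[\s\S]*?~~~/g; // pretend this step targets ~~~ fenced code blocks

const toSplitItem = (text: string): SplitItem => ({
  text,
  title: '',
  // unmatched fragments keep the default size; matched ones may grow to maxLen
  chunkMaxSize: text.match(reg) === null ? chunkSize : maxLen
});

console.log(toSplitItem('plain paragraph').chunkMaxSize); // 512
console.log(toSplitItem('~~~\nconsole.log(1)\n~~~').chunkMaxSize); // 8000
```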
@@ -252,9 +260,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     const isCustomStep = checkIsCustomStep(step);
     const forbidConcat = isCustomStep; // when forbid is true, lastText is guaranteed to be empty

-    // oversize
+    // Over step
     if (step >= stepReges.length) {
-      if (text.length < chunkSize * 3) {
+      if (text.length < maxSize) {
         return [text];
       }
       // use slice-chunkSize to split text
@@ -268,19 +276,18 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     // split text by special char
     const splitTexts = getSplitTexts({ text, step });

-    const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
-    const minChunkLen = chunkSize * 0.7;

     const chunks: string[] = [];
     for (let i = 0; i < splitTexts.length; i++) {
       const item = splitTexts[i];

+      const maxLen = item.chunkMaxSize; // max length of the current chunk

       const lastTextLen = lastText.length;
       const currentText = item.text;
       const newText = lastText + currentText;
       const newTextLen = newText.length;

-      // In Markdown mode, force-split down to the smallest chunks, and at the last title, prepend every title (including parent titles) to each small chunk
+      // In Markdown mode, force-split down to the smallest chunks, and at the deepest title level, prepend every title (including parent titles) to each small chunk
       if (isMarkdownStep) {
         // split newText; the resulting chunks will number at least 1 (small lastText)
         const innerChunks = splitTextRecursively({
@@ -290,11 +297,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
           parentTitle: parentTitle + item.title
         });

+        // Only a title, no content
+        if (innerChunks.length === 0) {
+          chunks.push(`${parentTitle}${item.title}`);
+          continue;
+        }

         // When merging at the deepest title level, the titles need to be prepended
         chunks.push(
           ...innerChunks.map(
             (chunk) =>
@@ -307,7 +316,16 @@ const commonSplit = (props: SplitProps): SplitResponse => {

       // newText is too large (at this point, lastText must be smaller than chunkSize)
       if (newTextLen > maxLen) {
-        // lastText greater than minChunkLen: push it straight to chunks, do not add it to the next chunk. (large lastText)
+        const minChunkLen = maxLen * 0.8; // minimum length for the current chunk
+        const maxChunkLen = maxLen * 1.2; // maximum length for the current chunk
+
+        // The new text is not that large; treat it directly as a new chunk
+        if (newTextLen < maxChunkLen) {
+          chunks.push(newText);
+          lastText = getOneTextOverlapText({ text: newText, step }); // next chunk will start with overlayText
+          continue;
+        }
+        // The previous text block is already quite large; make it a chunk of its own
         if (lastTextLen > minChunkLen) {
           chunks.push(lastText);
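The 0.8/1.2 band around maxLen is what keeps chunks from ending up either tiny or wildly oversized. A minimal sketch of just that decision — decideChunk and the return labels are hypothetical, only the thresholds follow the diff:

```ts
// Sketch: how a merged text (lastText + currentText) is routed, assuming the
// same 0.8 / 1.2 band as the diff above. Not the real implementation.
type Decision = 'push-merged' | 'push-last-then-retry' | 'split-further';

const decideChunk = (lastLen: number, mergedLen: number, maxLen: number): Decision => {
  if (mergedLen <= maxLen) return 'push-merged'; // fits outright
  const minChunkLen = maxLen * 0.8;
  const maxChunkLen = maxLen * 1.2;
  if (mergedLen < maxChunkLen) return 'push-merged'; // slightly over: accept as one chunk
  if (lastLen > minChunkLen) return 'push-last-then-retry'; // lastText can stand alone
  return 'split-further'; // recurse with a finer splitter step
};

console.log(decideChunk(100, 450, 512)); // push-merged
console.log(decideChunk(100, 600, 512)); // push-merged (within 1.2x)
console.log(decideChunk(450, 900, 512)); // push-last-then-retry
console.log(decideChunk(100, 900, 512)); // split-further
```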
@@ -317,13 +335,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
           continue;
         }

-        // The new text block is fairly large; it needs further splitting
-        // split newText; the resulting chunks will number at least 1 (small lastText)
+        // The current text is fairly large; it needs further splitting
+        // Split the new text block and append it after lastText
         const innerChunks = splitTextRecursively({
-          text: newText,
+          text: currentText,
           step: step + 1,
-          lastText: '',
+          lastText,
           parentTitle: parentTitle + item.title
         });
         const lastChunk = innerChunks[innerChunks.length - 1];
@@ -351,11 +369,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {

       // Not overlap
       if (forbidConcat) {
-        chunks.push(item.text);
+        chunks.push(currentText);
         continue;
       }

-      lastText += item.text;
+      lastText = newText;
     }

     /* If the last chunk is independent, it needs to be pushed to chunks. */
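For context on the `lastText = getOneTextOverlapText(...)` line above: the helper itself is not shown in this diff, but per its inline comment it seeds the next chunk with the tail of the chunk just pushed, which is how neighbouring chunks come to share overlap. A toy stand-in — the 0.2 ratio and the overlapTail name are assumptions, not the real implementation:

```ts
// Toy stand-in for the overlap-seeding idea behind getOneTextOverlapText.
const overlapTail = (text: string, chunkSize: number, overlapRatio = 0.2): string => {
  const overlapLen = Math.floor(chunkSize * overlapRatio);
  // keep the last overlapLen characters so the next chunk starts with them
  return text.length > overlapLen ? text.slice(-overlapLen) : '';
};

const chunk = 'the quick brown fox jumps over the lazy dog';
console.log(overlapTail(chunk, 50)); // the last ~10 chars seed the next chunk
```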
@@ -30,11 +30,11 @@ export async function text2Speech({
       response_format: 'mp3',
       speed
     },
-    modelData.requestUrl && modelData.requestAuth
+    modelData.requestUrl
       ? {
           path: modelData.requestUrl,
           headers: {
-            Authorization: `Bearer ${modelData.requestAuth}`
+            ...(modelData.requestAuth ? { Authorization: `Bearer ${modelData.requestAuth}` } : {})
           }
         }
       : {}
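This is changelog fix 5: a custom TTS request URL alone is now enough, and the Authorization header is attached only when requestAuth exists, via the conditional-spread idiom. In isolation — buildRequestOptions is a hypothetical name, only the spread pattern is the point:

```ts
// Sketch of the conditional-spread idiom used by the fix above.
const buildRequestOptions = (requestUrl?: string, requestAuth?: string) =>
  requestUrl
    ? {
        path: requestUrl,
        headers: {
          // spread an empty object when requestAuth is missing, so the
          // Authorization key is absent rather than "Bearer undefined"
          ...(requestAuth ? { Authorization: `Bearer ${requestAuth}` } : {})
        }
      }
    : {};

console.log(buildRequestOptions('https://example.com/v1/audio/speech'));
// => { path: '...', headers: {} }
console.log(buildRequestOptions('https://example.com/v1/audio/speech', 'sk-xxx'));
// => { path: '...', headers: { Authorization: 'Bearer sk-xxx' } }
```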
@@ -65,6 +65,7 @@ export const llmCompletionsBodyFormat = <T extends CompletionsBodyType>(

   const requestBody: T = {
     ...body,
     model: modelData.model,
     temperature:
       typeof body.temperature === 'number'
         ? computedTemperature({
@@ -196,7 +196,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
       isLoading={isLoadingOrgs}
     >
       <Box mb={3}>
-        <Path paths={paths} rootName={userInfo?.team?.teamName} onClick={setPath} />
+        <Path paths={paths} rootName={userInfo?.team?.teamName} />
       </Box>
       <Flex flex={'1 0 0'} h={0} w={'100%'} gap={'4'}>
         <MemberScrollData flex="1">
@@ -420,7 +420,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
           <ActionButton
             icon="common/administrator"
             text={t('account_team:manage_member')}
-            onClick={() => setManageMemberOrg(currentOrg ?? rootOrg)}
+            onClick={() => setManageMemberOrg(currentOrg)}
           />
           {currentOrg && currentOrg?.path !== '' && (
             <>
@@ -94,7 +94,7 @@ async function handler(
     per: WritePermissionVal
   });

-  if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
+  if (fileAuthRes && String(fileAuthRes.tmbId) !== String(tmbId) && !fileAuthRes.isRoot) {
    return Promise.reject(CommonErrEnum.unAuthFile);
  }
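This is changelog fix 4, and it is a pure boolean change: previously a request passed only if it was both the owning tmb and root; now either owning the file or being root is enough. A standalone truth-table check with shortened names:

```ts
// old: reject when (differentOwner || !isRoot) — root users with a different
//      tmbId were still rejected
// new: reject when (differentOwner && !isRoot) — owner OR root passes
const rejectOld = (differentOwner: boolean, isRoot: boolean) => differentOwner || !isRoot;
const rejectNew = (differentOwner: boolean, isRoot: boolean) => differentOwner && !isRoot;

console.log(rejectOld(true, true), rejectNew(true, true)); // true false  <- root now passes
console.log(rejectOld(false, false), rejectNew(false, false)); // true false <- owner now passes
console.log(rejectOld(true, false), rejectNew(true, false)); // true true
```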
@@ -15,18 +15,19 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
 import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
+import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

 const formatIndexes = async ({
   indexes,
   q,
   a = '',
-  indexSize
+  indexSize,
+  maxIndexSize
 }: {
   indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
   q: string;
   a?: string;
   indexSize: number;
+  maxIndexSize: number;
 }): Promise<
   {
     type: `${DatasetDataIndexTypeEnum}`;
@@ -46,9 +47,12 @@ const formatIndexes = async ({
   }) => {
     const qChunks = splitText2Chunks({
       text: q,
-      chunkSize: indexSize
+      chunkSize: indexSize,
+      maxSize: maxIndexSize
     }).chunks;
-    const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
+    const aChunks = a
+      ? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
+      : [];

     return [
       ...qChunks.map((text) => ({
@@ -100,7 +104,11 @@ const formatIndexes = async ({
     // If oversize tokens, split it
     const tokens = await countPromptTokens(item.text);
     if (tokens > indexSize) {
-      const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
+      const splitText = splitText2Chunks({
+        text: item.text,
+        chunkSize: 512,
+        maxSize: maxIndexSize
+      }).chunks;
       return splitText.map((text) => ({
         text,
         type: item.type
@@ -151,7 +159,8 @@ export async function insertData2Dataset({
     indexes,
     q,
     a,
-    indexSize
+    indexSize,
+    maxIndexSize: embModel.maxToken
   });

   // insert to vector store
@@ -236,7 +245,13 @@ export async function updateData2Dataset({
   if (!mongoData) return Promise.reject('core.dataset.error.Data not found');

   // 2. Compute indexes
-  const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
+  const formatIndexesResult = await formatIndexes({
+    indexes,
+    q,
+    a,
+    indexSize,
+    maxIndexSize: getEmbeddingModel(model).maxToken
+  });

   // 3. Patch indexes, create, update, delete
   const patchResult: PatchIndexesProps[] = [];
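The thread through these dataset hunks: every splitText2Chunks call that builds indexes now passes the embedding model's maxToken as maxSize, so no index chunk can exceed what the vector model accepts. A condensed sketch — the embModel.maxToken access is taken from the diff, while buildIndexChunks itself is a hypothetical helper:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Hypothetical helper condensing the pattern used by formatIndexes above.
const buildIndexChunks = (
  q: string,
  a: string,
  indexSize: number,
  embModel: { maxToken: number }
) => {
  const maxIndexSize = embModel.maxToken; // embedding model's hard ceiling
  const qChunks = splitText2Chunks({ text: q, chunkSize: indexSize, maxSize: maxIndexSize }).chunks;
  const aChunks = a
    ? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
    : [];
  return [...qChunks, ...aChunks];
};
```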
File diff suppressed because one or more lines are too long