Mirror of https://github.com/labring/FastGPT.git (synced 2025-10-17 16:45:02 +00:00)
perf: text splitter (#4313)
* sync collection
* remove lock
* perf: text splitter
* update comment
```diff
@@ -30,6 +30,10 @@ weight: 799
 6. Workflow node array-of-string types now automatically adapt string input.
 7. Workflow node array types now automatically JSON-parse string input.
 8. AI proxy logging improvement: logs from failed retries are dropped; only the final error log is kept.
+9. Minor chunking-algorithm adjustments (see the sketch after this diff):
+   * Stronger continuity across split delimiters.
+   * When splitting code blocks, the LLM model context is used as the chunk size to keep code blocks intact as far as possible.
+   * When splitting tables, the LLM model context is used as the chunk size to keep tables intact as far as possible.

 ## 🐛 Fixes
@@ -37,3 +41,4 @@ weight: 799
 2. When testing a channel, if a custom request URL is configured for the model, the custom request URL is used instead of the channel request URL.
 3. Testing a speech-recognition model that was not enabled did not work.
 4. When an administrator configured a system plugin that contains other system apps, authentication failed.
+5. Removed the requirement to fill in the requestAuth field when a custom TTS request URL is set.
```
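The chunking adjustments in item 9 correspond to a new `maxSize` parameter on `splitText2Chunks`, which the diffs below thread through the codebase. A minimal sketch of how a caller might pass both sizes; the concrete numbers and the `markdownDoc` sample are assumptions for illustration, not values from the commit:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Assumed sample input containing a table that should stay intact.
const markdownDoc = '# Title\n\nSome prose.\n\n| a | b |\n| - | - |\n| 1 | 2 |\n';

// chunkSize is the ordinary target length; maxSize (new in this commit) is the
// larger ceiling, derived from the LLM context, used for code blocks and tables.
const { chunks } = splitText2Chunks({
  text: markdownDoc,
  chunkSize: 512, // assumed default target
  maxSize: 16000 // assumed LLM-context ceiling
});

console.log(chunks.length);
```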
```diff
@@ -134,8 +134,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
     { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },

-    { reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block
-    { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // enlarge the chunk, trying to keep it a complete paragraph. (?![\*\-|>`0-9]): markdown special char
+    { reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
+    {
+      reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
+      maxLen: maxSize
+    }, // Table: keep tables intact as far as possible
     { reg: /(\n{2,})/g, maxLen: chunkSize },
     { reg: /([\n])/g, maxLen: chunkSize },
     // ------ There's no overlap on the top
```
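The new table rule captures an entire GFM pipe table (header row, separator row, body rows) as a single match, so the splitter can treat it as one unit up to `maxSize` instead of cutting between rows. A quick standalone check of the regex copied from the diff; the sample markdown is invented:

```ts
// Regex copied verbatim from the diff above.
const tableReg =
  /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g;

const md = [
  'Intro paragraph.',
  '',
  '| Name | Qty |',
  '| ---- | --- |',
  '| foo | 1 |',
  '| bar | 2 |',
  ''
].join('\n');

// Expected: a single match containing the whole table.
console.log(md.match(tableReg));
```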
```diff
@@ -150,7 +153,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
   const checkIsCustomStep = (step: number) => step < customRegLen;
   const checkIsMarkdownSplit = (step: number) =>
     step >= customRegLen && step <= markdownIndex + customRegLen;
   const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;

   // if markdown title split is used, record the title separately
```
```diff
@@ -159,7 +162,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     return [
       {
         text,
-        title: ''
+        title: '',
+        chunkMaxSize: chunkSize
       }
     ];
   }
```
```diff
@@ -167,7 +171,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     const isCustomStep = checkIsCustomStep(step);
     const isMarkdownSplit = checkIsMarkdownSplit(step);

-    const { reg } = stepReges[step];
+    const { reg, maxLen } = stepReges[step];

     const replaceText = (() => {
       if (typeof reg === 'string') {
```
```diff
@@ -194,15 +198,19 @@ const commonSplit = (props: SplitProps): SplitResponse => {
         })()
       );
     })();

     const splitTexts = replaceText.split(splitMarker).filter((part) => part.trim());

     return splitTexts
       .map((text) => {
         const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
+        // If a piece does not match the step's regex, use the default chunk size; otherwise use the step's max size
+        const chunkMaxSize = text.match(reg) === null ? chunkSize : maxLen;

         return {
           text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
-          title: matchTitle
+          title: matchTitle,
+          chunkMaxSize
         };
       })
       .filter((item) => !!item.title || !!item.text?.trim());
```
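Each split piece now records its own size ceiling: pieces that match the current step's regex (a fenced code block, a whole table) keep the step's larger `maxLen`, while ordinary prose falls back to `chunkSize`. A hypothetical trace of that selection; the values and sample pieces are invented:

```ts
// Assumed illustration of the chunkMaxSize selection above.
const chunkSize = 512;
const maxLen = 16000; // e.g. the table step's maxSize

const tableReg =
  /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g;

const pieces = ['Plain paragraph text.', '\n| a | b |\n| - | - |\n| 1 | 2 |\n'];

for (const text of pieces) {
  // Mirrors the diff: no match means default size; a match keeps the larger ceiling.
  const chunkMaxSize = text.match(tableReg) === null ? chunkSize : maxLen;
  console.log(chunkMaxSize); // 512 for the prose piece, 16000 for the table piece
}
```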
```diff
@@ -252,9 +260,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     const isCustomStep = checkIsCustomStep(step);
     const forbidConcat = isCustomStep; // when forbid=true, lastText must be empty

-    // oversize
+    // Over step
     if (step >= stepReges.length) {
-      if (text.length < chunkSize * 3) {
+      if (text.length < maxSize) {
         return [text];
       }
       // use slice-chunkSize to split text
```
```diff
@@ -268,19 +276,18 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     // split text by special char
     const splitTexts = getSplitTexts({ text, step });

-    const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
-    const minChunkLen = chunkSize * 0.7;

     const chunks: string[] = [];
     for (let i = 0; i < splitTexts.length; i++) {
       const item = splitTexts[i];

+      const maxLen = item.chunkMaxSize; // max length of the current chunk

       const lastTextLen = lastText.length;
       const currentText = item.text;
       const newText = lastText + currentText;
       const newTextLen = newText.length;

-      // In Markdown mode, the smallest chunks are forcibly split out, and at the last heading, all titles (including parent titles) are prepended to the small chunks
+      // In Markdown mode, the smallest chunks are forcibly split out, and at the deepest heading level, all titles (including parent titles) are prepended to the small chunks
       if (isMarkdownStep) {
         // split newText; the resulting chunk count must be greater than 1 (lastText is small)
         const innerChunks = splitTextRecursively({
```
```diff
@@ -290,11 +297,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
           parentTitle: parentTitle + item.title
         });

+        // Title only, no content.
         if (innerChunks.length === 0) {
           chunks.push(`${parentTitle}${item.title}`);
           continue;
         }

+        // When merging at the deepest heading level, the titles need to be prepended
        chunks.push(
          ...innerChunks.map(
            (chunk) =>
```
```diff
@@ -307,7 +316,16 @@ const commonSplit = (props: SplitProps): SplitResponse => {

       // newText is too large (at this point, lastText must be smaller than chunkSize)
       if (newTextLen > maxLen) {
-        // lastText greater than minChunkLen: push it directly to chunks instead of adding it to the next chunk. (large lastText)
+        const minChunkLen = maxLen * 0.8; // minimum length of the current chunk
+        const maxChunkLen = maxLen * 1.2; // maximum length of the current chunk
+
+        // The new text is not overly large; treat it directly as a new chunk
+        if (newTextLen < maxChunkLen) {
+          chunks.push(newText);
+          lastText = getOneTextOverlapText({ text: newText, step }); // next chunk will start with the overlap text
+          continue;
+        }
+        // The previous text block is already fairly large; make it a chunk of its own
         if (lastTextLen > minChunkLen) {
           chunks.push(lastText);
```
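The 0.8x/1.2x band above tolerates slightly oversized chunks instead of always re-splitting them. A worked example of the accept/split decision under assumed numbers:

```ts
// Assumed numbers to illustrate the decision above.
const maxLen = 1000; // current chunk's ceiling (item.chunkMaxSize)
const minChunkLen = maxLen * 0.8; // 800
const maxChunkLen = maxLen * 1.2; // 1200

const lastTextLen = 300;
const currentTextLen = 850;
const newTextLen = lastTextLen + currentTextLen; // 1150

// 1150 exceeds maxLen but stays under maxChunkLen, so the merged text is
// accepted as one chunk and the next chunk starts from its overlap text;
// only texts past 1.2 * maxLen are flushed via lastText or split further.
console.log(newTextLen > maxLen && newTextLen < maxChunkLen); // true
```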
```diff
@@ -317,13 +335,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
           continue;
         }

-        // The new text block is fairly large and needs further splitting
+        // The current text is fairly large and needs further splitting

-        // split newText; the resulting chunk count must be greater than 1 (lastText is small)
+        // Split the new text block and append it to lastText
         const innerChunks = splitTextRecursively({
-          text: newText,
+          text: currentText,
           step: step + 1,
-          lastText: '',
+          lastText,
           parentTitle: parentTitle + item.title
         });
         const lastChunk = innerChunks[innerChunks.length - 1];
```
```diff
@@ -351,11 +369,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {

       // Not overlap
       if (forbidConcat) {
-        chunks.push(item.text);
+        chunks.push(currentText);
         continue;
       }

-      lastText += item.text;
+      lastText = newText;
     }

     /* If the last chunk is independent, it needs to be pushed to chunks. */
```
```diff
@@ -30,11 +30,11 @@ export async function text2Speech({
       response_format: 'mp3',
       speed
     },
-    modelData.requestUrl && modelData.requestAuth
+    modelData.requestUrl
       ? {
           path: modelData.requestUrl,
           headers: {
-            Authorization: `Bearer ${modelData.requestAuth}`
+            ...(modelData.requestAuth ? { Authorization: `Bearer ${modelData.requestAuth}` } : {})
           }
         }
       : {}
```
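After this change, a custom `requestUrl` alone is enough to route the request, and the Authorization header is attached only when `requestAuth` is present. The conditional-spread pattern in isolation; the option shape here is a simplified assumption, not the real type:

```ts
// Hypothetical simplified shape of the per-model request options.
type TTSModel = { requestUrl?: string; requestAuth?: string };

const buildOptions = (modelData: TTSModel) =>
  modelData.requestUrl
    ? {
        path: modelData.requestUrl,
        headers: {
          // Spread in the header only when an auth token exists.
          ...(modelData.requestAuth ? { Authorization: `Bearer ${modelData.requestAuth}` } : {})
        }
      }
    : {};

console.log(buildOptions({ requestUrl: 'https://tts.example.com/v1/audio/speech' }));
// => { path: '...', headers: {} }  (no Authorization required anymore)
```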
```diff
@@ -65,6 +65,7 @@ export const llmCompletionsBodyFormat = <T extends CompletionsBodyType>(

   const requestBody: T = {
     ...body,
+    model: modelData.model,
     temperature:
       typeof body.temperature === 'number'
         ? computedTemperature({
```
```diff
@@ -196,7 +196,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
           isLoading={isLoadingOrgs}
         >
           <Box mb={3}>
-            <Path paths={paths} rootName={userInfo?.team?.teamName} onClick={setPath} />
+            <Path paths={paths} rootName={userInfo?.team?.teamName} />
           </Box>
           <Flex flex={'1 0 0'} h={0} w={'100%'} gap={'4'}>
             <MemberScrollData flex="1">
```
```diff
@@ -420,7 +420,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
             <ActionButton
               icon="common/administrator"
               text={t('account_team:manage_member')}
-              onClick={() => setManageMemberOrg(currentOrg ?? rootOrg)}
+              onClick={() => setManageMemberOrg(currentOrg)}
             />
             {currentOrg && currentOrg?.path !== '' && (
               <>
```
```diff
@@ -94,7 +94,7 @@ async function handler(
     per: WritePermissionVal
   });

-  if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
+  if (fileAuthRes && String(fileAuthRes.tmbId) !== String(tmbId) && !fileAuthRes.isRoot) {
     return Promise.reject(CommonErrEnum.unAuthFile);
   }
```
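The operator change tightens the rejection condition: previously `(mismatch || !isRoot)` rejected every non-root caller even when the tmbId matched; now rejection requires both a tmbId mismatch and a non-root caller, so file owners and root both pass. The two predicates in a hypothetical standalone form:

```ts
// Hypothetical standalone form of the old and new predicates.
type FileAuth = { tmbId: string; isRoot: boolean };

const rejectOld = (res: FileAuth, tmbId: string) =>
  String(res.tmbId) !== String(tmbId) || !res.isRoot;

const rejectNew = (res: FileAuth, tmbId: string) =>
  String(res.tmbId) !== String(tmbId) && !res.isRoot;

// A non-root owner of the file: old logic rejected, new logic allows.
console.log(rejectOld({ tmbId: 'a', isRoot: false }, 'a')); // true (rejected)
console.log(rejectNew({ tmbId: 'a', isRoot: false }, 'a')); // false (allowed)
```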
```diff
@@ -15,18 +15,19 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
 import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
+import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

 const formatIndexes = async ({
   indexes,
   q,
   a = '',
-  indexSize
+  indexSize,
+  maxIndexSize
 }: {
   indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
   q: string;
   a?: string;
   indexSize: number;
+  maxIndexSize: number;
 }): Promise<
   {
     type: `${DatasetDataIndexTypeEnum}`;
```
```diff
@@ -46,9 +47,12 @@ const formatIndexes = async ({
 }) => {
   const qChunks = splitText2Chunks({
     text: q,
-    chunkSize: indexSize
+    chunkSize: indexSize,
+    maxSize: maxIndexSize
   }).chunks;
-  const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
+  const aChunks = a
+    ? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
+    : [];

   return [
     ...qChunks.map((text) => ({
```
```diff
@@ -100,7 +104,11 @@ const formatIndexes = async ({
       // If oversize tokens, split it
       const tokens = await countPromptTokens(item.text);
       if (tokens > indexSize) {
-        const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
+        const splitText = splitText2Chunks({
+          text: item.text,
+          chunkSize: 512,
+          maxSize: maxIndexSize
+        }).chunks;
         return splitText.map((text) => ({
           text,
           type: item.type
```
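Throughout `formatIndexes` the pattern is the same: index chunks target `indexSize`, while `maxSize: maxIndexSize` caps any single chunk at the embedding model's `maxToken` so nothing oversized reaches the vector store. A hedged sketch of the re-split guard from the hunk above; the helper name and taking `tokens` as a parameter are illustration choices, not the commit's code:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Hypothetical re-split guard mirroring the hunk above: an index text whose
// token count exceeds indexSize is re-split at a 512 target, but still capped
// by the embedding model's context (maxIndexSize) so no chunk can overflow it.
const resplitIfOversize = (
  text: string,
  tokens: number,
  indexSize: number,
  maxIndexSize: number
): string[] =>
  tokens > indexSize
    ? splitText2Chunks({ text, chunkSize: 512, maxSize: maxIndexSize }).chunks
    : [text];
```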
```diff
@@ -151,7 +159,8 @@ export async function insertData2Dataset({
     indexes,
     q,
     a,
-    indexSize
+    indexSize,
+    maxIndexSize: embModel.maxToken
   });

   // insert to vector store
```
```diff
@@ -236,7 +245,13 @@ export async function updateData2Dataset({
   if (!mongoData) return Promise.reject('core.dataset.error.Data not found');

   // 2. Compute indexes
-  const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
+  const formatIndexesResult = await formatIndexes({
+    indexes,
+    q,
+    a,
+    indexSize,
+    maxIndexSize: getEmbeddingModel(model).maxToken
+  });

   // 3. Patch indexes, create, update, delete
   const patchResult: PatchIndexesProps[] = [];
```
File diff suppressed because one or more lines are too long