Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 13:03:50 +00:00)
perf: chunk trigger and paragraph split (#4893)
* perf: chunk trigger and paragraph split
* update max size computed
* perf: i18n
* remove table
@@ -7,6 +7,10 @@ export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
 type SplitProps = {
   text: string;
   chunkSize: number;
+
+  paragraphChunkDeep?: number; // Paragraph depth
+  paragraphChunkMinSize?: number; // Paragraph min size; if a paragraph is smaller, it will be merged
+
   maxSize?: number;
   overlapRatio?: number;
   customReg?: string[];
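
For context, a minimal sketch (not part of the commit) of how the two new options ride alongside the existing SplitProps fields:

// Illustrative values only; the field names come from the type above.
const props: SplitProps = {
  text: '# Title\n\nFirst paragraph...\n\n## Section\n\nSecond paragraph...',
  chunkSize: 512,             // target chunk size
  paragraphChunkDeep: 4,      // consider headings # through ####
  paragraphChunkMinSize: 100  // paragraphs below this size are merged
};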
@@ -108,6 +112,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
   let {
     text = '',
     chunkSize,
+    paragraphChunkDeep = 5,
+    paragraphChunkMinSize = 100,
     maxSize = defaultMaxChunkSize,
     overlapRatio = 0.15,
     customReg = []
@@ -123,7 +129,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
   text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
     return match.replace(/\n/g, codeBlockMarker);
   });
-  // 2. Table handling - extract tables separately and merge in the header row
+  // 2. Markdown table handling - extract tables separately and merge in the header row
   const tableReg =
     /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n?)*)(?:\n|$)/g;
   const tableDataList = text.match(tableReg);
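
As a quick illustration (my own test string, not from the commit), tableReg matches a GFM table that begins on its own line: header row, delimiter row, then body rows:

// Standalone check of the table regex above.
const tableReg =
  /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n?)*)(?:\n|$)/g;
const sample = 'intro\n| a | b |\n| --- | --- |\n| 1 | 2 |\n\nafter';
console.log(sample.match(tableReg));
// -> one match covering the whole table block (plus its surrounding newlines)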
@@ -143,25 +149,40 @@ const commonSplit = (props: SplitProps): SplitResponse => {
   text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');

   // The larger maxLen is, the less likely the next sentence is to trigger a split
-  const markdownIndex = 4;
-  const forbidOverlapIndex = 8;
+  const customRegLen = customReg.length;
+  const markdownIndex = paragraphChunkDeep - 1;
+  const forbidOverlapIndex = customRegLen + markdownIndex + 4;
+
+  const markdownHeaderRules = ((deep?: number): { reg: RegExp; maxLen: number }[] => {
+    if (!deep || deep === 0) return [];
+
+    const maxDeep = Math.min(deep, 8); // Maximum 8 levels
+    const rules: { reg: RegExp; maxLen: number }[] = [];
+
+    for (let i = 1; i <= maxDeep; i++) {
+      const hashSymbols = '#'.repeat(i);
+      rules.push({
+        reg: new RegExp(`^(${hashSymbols}\\s[^\\n]+\\n)`, 'gm'),
+        maxLen: chunkSize
+      });
+    }
+
+    return rules;
+  })(paragraphChunkDeep);

   const stepReges: { reg: RegExp | string; maxLen: number }[] = [
     ...customReg.map((text) => ({
       reg: text.replaceAll('\\n', '\n'),
       maxLen: chunkSize
     })),
-    { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize },
-    { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize },
-    { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize },
-    { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
-    { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
+    ...markdownHeaderRules,

     { reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
-    // HTML table tags - keep as intact as possible
     {
       reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
-      maxLen: Math.min(chunkSize * 1.5, maxSize)
-    }, // Table - keep as intact as possible
+      maxLen: chunkSize
+    }, // Markdown table - keep as intact as possible
     { reg: /(\n{2,})/g, maxLen: chunkSize },
     { reg: /([\n])/g, maxLen: chunkSize },
     // ------ No overlap for the rules above this line
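
To make the generated rules concrete, here is a standalone sketch (chunkSize assumed to be 512) of what the IIFE above yields for paragraphChunkDeep = 3:

// Standalone re-run of the header-rule generation; chunkSize is assumed.
const chunkSize = 512;
const rules = ((deep?: number): { reg: RegExp; maxLen: number }[] => {
  if (!deep || deep === 0) return [];
  const maxDeep = Math.min(deep, 8);
  const out: { reg: RegExp; maxLen: number }[] = [];
  for (let i = 1; i <= maxDeep; i++) {
    out.push({ reg: new RegExp(`^(${'#'.repeat(i)}\\s[^\\n]+\\n)`, 'gm'), maxLen: chunkSize });
  }
  return out;
})(3);
// rules[0].reg -> /^(#\s[^\n]+\n)/gm   (H1)
// rules[1].reg -> /^(##\s[^\n]+\n)/gm  (H2)
// rules[2].reg -> /^(###\s[^\n]+\n)/gm (H3)
// i.e. the five hard-coded heading rules are now generated up to the requested depth.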
@@ -172,12 +193,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     { reg: /([,]|,\s)/g, maxLen: chunkSize }
   ];

-  const customRegLen = customReg.length;
   const checkIsCustomStep = (step: number) => step < customRegLen;
   const checkIsMarkdownSplit = (step: number) =>
     step >= customRegLen && step <= markdownIndex + customRegLen;
-
-  const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
+  const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex;

   // If the markdown title split is used, record the title separately
   const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
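
A worked example of the index bookkeeping (numbers mine, not from the commit): take two custom regexes and the default paragraphChunkDeep of 5.

// Assumed inputs, for illustration only.
const customRegLen = 2;                                      // two custom split patterns
const paragraphChunkDeep = 5;                                // headings # through #####
const markdownIndex = paragraphChunkDeep - 1;                // 4
const forbidOverlapIndex = customRegLen + markdownIndex + 4; // 10

// stepReges layout: steps 0-1 custom, 2-6 markdown headings,
// 7 code block, 8 table, 9 blank lines, 10 single newline, then sentence rules.
// checkIsCustomStep(1)    -> true  (1 < 2)
// checkIsMarkdownSplit(6) -> true  (6 >= 2 && 6 <= 4 + 2)
// checkForbidOverlap(10)  -> true  (overlap stays off through the newline rule)
// checkForbidOverlap(11)  -> false (sentence-level rules may overlap)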
@@ -301,6 +320,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     const splitTexts = getSplitTexts({ text, step });
+
     const chunks: string[] = [];

     for (let i = 0; i < splitTexts.length; i++) {
       const item = splitTexts[i];
@@ -443,7 +463,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
 */
 export const splitText2Chunks = (props: SplitProps): SplitResponse => {
   let { text = '' } = props;
-  const start = Date.now();
   const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);

   const splitResult = splitWithCustomSign.map((item) => {
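
A sketch of the call surface after this change (the exact SplitResponse shape is not shown in this diff, so the destructured chunks field is an assumption):

// Hypothetical usage; `chunks` is assumed to be part of SplitResponse.
const { chunks } = splitText2Chunks({
  text: '# Guide\n\nIntro paragraph.\n\n## Setup\n\nStep one...',
  chunkSize: 512,
  paragraphChunkDeep: 4,      // new: heading depth used for paragraph splits
  paragraphChunkMinSize: 100  // new: undersized paragraphs get merged
});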
@@ -120,7 +120,6 @@ export const computeChunkSize = (params: {

   return Math.min(params.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
 };

 export const computeChunkSplitter = (params: {
   chunkSettingMode?: ChunkSettingModeEnum;
   chunkSplitMode?: DataChunkSplitModeEnum;
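
For intuition, the size clamp behaves like this (the constants are assumptions; chunkAutoChunkSize and getLLMMaxChunkSize are defined elsewhere and not shown here):

// Assumed values, for illustration only.
const chunkAutoChunkSize = 1000; // assumed default when no size is requested
const llmMaxChunkSize = 8000;    // assumed model-dependent ceiling
const requested: number | undefined = undefined;

const effective = Math.min(requested ?? chunkAutoChunkSize, llmMaxChunkSize);
// -> 1000 here; a request of 16000 would be clamped down to 8000.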
@@ -129,8 +128,21 @@ export const computeChunkSplitter = (params: {
   if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
     return undefined;
   }
-  if (params.chunkSplitMode === DataChunkSplitModeEnum.size) {
+  if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) {
     return undefined;
   }
   return params.chunkSplitter;
 };

+export const computeParagraphChunkDeep = (params: {
+  chunkSettingMode?: ChunkSettingModeEnum;
+  chunkSplitMode?: DataChunkSplitModeEnum;
+  paragraphChunkDeep?: number;
+}) => {
+  if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
+    return 5;
+  }
+  if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) {
+    return params.paragraphChunkDeep;
+  }
+  return 0;
+};
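
Hypothetical calls showing the three branches (the enum members appear in the diff; their underlying values do not):

// Illustration only; return values follow the branches above.
computeParagraphChunkDeep({ chunkSettingMode: ChunkSettingModeEnum.auto }); // -> 5
computeParagraphChunkDeep({
  chunkSplitMode: DataChunkSplitModeEnum.paragraph,
  paragraphChunkDeep: 3
}); // -> 3
computeParagraphChunkDeep({ chunkSplitMode: DataChunkSplitModeEnum.size }); // -> 0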
packages/global/core/dataset/type.d.ts (vendored, 8 lines changed)
@@ -9,7 +9,8 @@ import type {
   DatasetTypeEnum,
   SearchScoreTypeEnum,
   TrainingModeEnum,
-  ChunkSettingModeEnum
+  ChunkSettingModeEnum,
+  ChunkTriggerConfigTypeEnum
 } from './constants';
 import type { DatasetPermission } from '../../support/permission/dataset/controller';
 import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
@@ -37,11 +38,10 @@ export type ChunkSettingsType = {
   paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
   paragraphChunkDeep?: number; // Paragraph depth
   paragraphChunkMinSize?: number; // Paragraph min size; if a paragraph is smaller, it will be merged
-  paragraphChunkMaxSize?: number; // Paragraph max size; if a paragraph is larger, it will be split
   // Size split
-  chunkSize?: number;
+  chunkSize?: number; // chunk/qa chunk size; also the paragraph max chunk size
   // Char split
-  chunkSplitter?: string;
+  chunkSplitter?: string; // chunk/qa chunk splitter
   indexSize?: number;

   qaPrompt?: string;
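
Putting the revised fields together, a hypothetical settings object (values mine; paragraphChunkMaxSize is gone, its role folded into chunkSize):

// Illustrative values only.
const settings: ChunkSettingsType = {
  paragraphChunkDeep: 4,      // split on headings up to ####
  paragraphChunkMinSize: 100, // merge paragraphs smaller than this
  chunkSize: 512,             // chunk/qa chunk size; also caps paragraph chunks
  chunkSplitter: '\\n---\\n'  // custom splitter for char-split mode
};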