mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-21 11:43:56 +00:00
@@ -11,3 +11,7 @@ weight: 810
|
|||||||
|
|
||||||
1.
|
1.
|
||||||
2. 新增 - 工作流支持进入聊天框/点击开始对话后,自动触发一轮对话。
|
2. 新增 - 工作流支持进入聊天框/点击开始对话后,自动触发一轮对话。
|
||||||
|
3. 新增 - 重写 chatContext,对话测试也会有日志,并且刷新后不会丢失对话。
|
||||||
|
4. 新增 - 分享链接支持配置是否允许查看原文。
|
||||||
|
5. 优化 - 工作流 ui 细节。
|
||||||
|
6. 修复 - 分块策略,四级标题会被丢失。 同时新增了五级标题的支持。
|
||||||
|
@@ -99,7 +99,7 @@ ${mdSplitString}
|
|||||||
5. 标点分割:重叠
|
5. 标点分割:重叠
|
||||||
*/
|
*/
|
||||||
const commonSplit = (props: SplitProps): SplitResponse => {
|
const commonSplit = (props: SplitProps): SplitResponse => {
|
||||||
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props;
|
let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props;
|
||||||
|
|
||||||
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
|
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
|
||||||
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
|
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
|
||||||
@@ -113,6 +113,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');
|
text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');
|
||||||
|
|
||||||
// The larger maxLen is, the next sentence is less likely to trigger splitting
|
// The larger maxLen is, the next sentence is less likely to trigger splitting
|
||||||
|
const markdownIndex = 4;
|
||||||
|
const forbidOverlapIndex = 8;
|
||||||
const stepReges: { reg: RegExp; maxLen: number }[] = [
|
const stepReges: { reg: RegExp; maxLen: number }[] = [
|
||||||
...customReg.map((text) => ({
|
...customReg.map((text) => ({
|
||||||
reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
|
reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
|
||||||
@@ -122,9 +124,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
|
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
|
||||||
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
|
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
|
||||||
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
|
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
|
||||||
|
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
|
||||||
|
|
||||||
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
|
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
|
||||||
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
|
{ reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
|
||||||
|
{ reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 },
|
||||||
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
|
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
|
||||||
// ------ There's no overlap on the top
|
// ------ There's no overlap on the top
|
||||||
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
|
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
|
||||||
@@ -136,8 +140,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
|
|
||||||
const customRegLen = customReg.length;
|
const customRegLen = customReg.length;
|
||||||
const checkIsCustomStep = (step: number) => step < customRegLen;
|
const checkIsCustomStep = (step: number) => step < customRegLen;
|
||||||
const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen;
|
// True when `step` indexes one of the markdown-heading entries of stepReges.
// Custom regs are spread in front of the built-in entries, so both bounds must be
// shifted by customRegLen (the same offset is applied where markdownIndex is compared elsewhere).
const checkIsMarkdownSplit = (step: number) =>
  step >= customRegLen && step <= markdownIndex + customRegLen;
|
||||||
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
|
+customReg.length;
|
||||||
|
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customReg.length;
|
||||||
|
|
||||||
// if use markdown title split, Separate record title
|
// if use markdown title split, Separate record title
|
||||||
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
|
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
|
||||||
@@ -231,7 +236,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
// use slice-chunkLen to split text
|
// use slice-chunkLen to split text
|
||||||
const chunks: string[] = [];
|
const chunks: string[] = [];
|
||||||
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
|
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
|
||||||
chunks.push(`${parentTitle}${text.slice(i, i + chunkLen)}`);
|
chunks.push(text.slice(i, i + chunkLen));
|
||||||
}
|
}
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
@@ -241,7 +246,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
|
|
||||||
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
|
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
|
||||||
const minChunkLen = chunkLen * 0.7;
|
const minChunkLen = chunkLen * 0.7;
|
||||||
// console.log(splitTexts, stepReges[step].reg);
|
|
||||||
|
|
||||||
const chunks: string[] = [];
|
const chunks: string[] = [];
|
||||||
for (let i = 0; i < splitTexts.length; i++) {
|
for (let i = 0; i < splitTexts.length; i++) {
|
||||||
@@ -249,12 +253,34 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
|
|
||||||
const lastTextLen = lastText.length;
|
const lastTextLen = lastText.length;
|
||||||
const currentText = item.text;
|
const currentText = item.text;
|
||||||
const currentTextLen = currentText.length;
|
|
||||||
const newText = lastText + currentText;
|
const newText = lastText + currentText;
|
||||||
const newTextLen = lastTextLen + currentTextLen;
|
const newTextLen = newText.length;
|
||||||
|
|
||||||
|
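// In Markdown mode, always split further down into the smallest blocks; at the last heading level, prepend all titles (including parent titles) to every small block.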
|
||||||
|
if (isMarkdownStep) {
|
||||||
|
// Split newText; the result must contain at least one chunk (lastText is small here)
|
||||||
|
const innerChunks = splitTextRecursively({
|
||||||
|
text: newText,
|
||||||
|
step: step + 1,
|
||||||
|
lastText: '',
|
||||||
|
parentTitle: parentTitle + item.title
|
||||||
|
});
|
||||||
|
|
||||||
|
const lastChunk = innerChunks[innerChunks.length - 1];
|
||||||
|
if (!lastChunk) continue;
|
||||||
|
|
||||||
|
chunks.push(
|
||||||
|
...innerChunks.map(
|
||||||
|
(chunk) =>
|
||||||
|
step === markdownIndex + customRegLen ? `${parentTitle}${item.title}${chunk}` : chunk // 合并进 Markdown 分块时,需要补标题
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// newText is too large(now, The lastText must be smaller than chunkLen)
|
// newText is too large(now, The lastText must be smaller than chunkLen)
|
||||||
if (newTextLen > maxLen || isMarkdownStep) {
|
if (newTextLen > maxLen) {
|
||||||
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
|
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
|
||||||
if (lastTextLen > minChunkLen) {
|
if (lastTextLen > minChunkLen) {
|
||||||
chunks.push(lastText);
|
chunks.push(lastText);
|
||||||
@@ -278,15 +304,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
|
|
||||||
if (!lastChunk) continue;
|
if (!lastChunk) continue;
|
||||||
|
|
||||||
if (forbidConcat) {
|
|
||||||
chunks.push(
|
|
||||||
...innerChunks.map(
|
|
||||||
(chunk) => (step === 3 + customRegLen ? `${parentTitle}${chunk}` : chunk) // 合并进 Markdown 分块时,需要补标题
|
|
||||||
)
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// last chunk is too small, concat it to lastText(next chunk start)
|
// last chunk is too small, concat it to lastText(next chunk start)
|
||||||
if (lastChunk.length < minChunkLen) {
|
if (lastChunk.length < minChunkLen) {
|
||||||
chunks.push(...innerChunks.slice(0, -1));
|
chunks.push(...innerChunks.slice(0, -1));
|
||||||
@@ -304,11 +321,11 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// new text is small
|
// New text is small
|
||||||
|
|
||||||
// Not overlap
|
// Not overlap
|
||||||
if (forbidConcat) {
|
if (forbidConcat) {
|
||||||
chunks.push(`${parentTitle}${item.title}${item.text}`);
|
chunks.push(item.text);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -33,7 +33,7 @@ MILVUS_TOKEN=133964348b00b4b4e4b51bef680a61350950385c8c64a3ec16b1ab92d3c67dcc4e0
|
|||||||
SANDBOX_URL=http://localhost:3001
|
SANDBOX_URL=http://localhost:3001
|
||||||
# 商业版地址
|
# 商业版地址
|
||||||
PRO_URL=
|
PRO_URL=
|
||||||
# 页面的地址,用于自动补全相对路径资源的 domain
|
# 页面的地址,用于自动补全相对路径资源的 domain,注意后面不要跟 /
|
||||||
FE_DOMAIN=http://localhost:3000
|
FE_DOMAIN=http://localhost:3000
|
||||||
# 二级路由,需要打包时候就确定
|
# 二级路由,需要打包时候就确定
|
||||||
# NEXT_PUBLIC_BASE_URL=/fastai
|
# NEXT_PUBLIC_BASE_URL=/fastai
|
||||||
|
Reference in New Issue
Block a user