mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
update text splitter (#3020)
This commit is contained in:
@@ -65,3 +65,4 @@ curl --location --request POST 'https://{{host}}/api/admin/resetMilvus' \
|
|||||||
19. 修复 - 拥有多个循环节点时,错误运行。
|
19. 修复 - 拥有多个循环节点时,错误运行。
|
||||||
20. 修复 - 循环节点中修改变量,无法传递。
|
20. 修复 - 循环节点中修改变量,无法传递。
|
||||||
21. 修复 - 非 stream 模式,嵌套子应用/插件执行时无法获取子应用响应。
|
21. 修复 - 非 stream 模式,嵌套子应用/插件执行时无法获取子应用响应。
|
||||||
|
22. 修复 - 数据分块策略,同时将每个 Markdown 独立分块。
|
||||||
|
@@ -92,9 +92,9 @@ ${mdSplitString}
|
|||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
1. 自定义分隔符:不需要重叠
|
1. 自定义分隔符:不需要重叠,不需要小块合并
|
||||||
2. Markdown 标题:不需要重叠;标题嵌套共享。
|
2. Markdown 标题:不需要重叠;标题嵌套共享,不需要小块合并
|
||||||
3. 特殊 markdown 语法:不需要重叠
|
3. 特殊 markdown 语法:不需要重叠,需要小块合并
|
||||||
4. 段落:尽可能保证它是一个完整的段落。
|
4. 段落:尽可能保证它是一个完整的段落。
|
||||||
5. 标点分割:重叠
|
5. 标点分割:重叠
|
||||||
*/
|
*/
|
||||||
@@ -118,10 +118,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
|
reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
|
||||||
maxLen: chunkLen * 1.4
|
maxLen: chunkLen * 1.4
|
||||||
})),
|
})),
|
||||||
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
|
||||||
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
|
||||||
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
|
||||||
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
|
||||||
|
|
||||||
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
|
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
|
||||||
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
|
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
|
||||||
@@ -137,7 +137,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
const customRegLen = customReg.length;
|
const customRegLen = customReg.length;
|
||||||
const checkIsCustomStep = (step: number) => step < customRegLen;
|
const checkIsCustomStep = (step: number) => step < customRegLen;
|
||||||
const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen;
|
const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen;
|
||||||
const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
|
|
||||||
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
|
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
|
||||||
|
|
||||||
// if use markdown title split, Separate record title
|
// if use markdown title split, Separate record title
|
||||||
@@ -153,7 +152,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
|
|
||||||
const isCustomStep = checkIsCustomStep(step);
|
const isCustomStep = checkIsCustomStep(step);
|
||||||
const isMarkdownSplit = checkIsMarkdownSplit(step);
|
const isMarkdownSplit = checkIsMarkdownSplit(step);
|
||||||
const independentChunk = checkIndependentChunk(step);
|
|
||||||
|
|
||||||
const { reg } = stepReges[step];
|
const { reg } = stepReges[step];
|
||||||
|
|
||||||
@@ -162,7 +160,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
reg,
|
reg,
|
||||||
(() => {
|
(() => {
|
||||||
if (isCustomStep) return splitMarker;
|
if (isCustomStep) return splitMarker;
|
||||||
if (independentChunk) return `${splitMarker}$1`;
|
if (isMarkdownSplit) return `${splitMarker}$1`;
|
||||||
return `$1${splitMarker}`;
|
return `$1${splitMarker}`;
|
||||||
})()
|
})()
|
||||||
)
|
)
|
||||||
@@ -178,7 +176,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
title: matchTitle
|
title: matchTitle
|
||||||
};
|
};
|
||||||
})
|
})
|
||||||
.filter((item) => item.text.trim());
|
.filter((item) => item.text?.trim());
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Gets the overlap at the end of a text as the beginning of the next block */
|
/* Gets the overlap at the end of a text as the beginning of the next block */
|
||||||
@@ -214,15 +212,16 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
text = '',
|
text = '',
|
||||||
step,
|
step,
|
||||||
lastText,
|
lastText,
|
||||||
mdTitle = ''
|
parentTitle = ''
|
||||||
}: {
|
}: {
|
||||||
text: string;
|
text: string;
|
||||||
step: number;
|
step: number;
|
||||||
lastText: string;
|
lastText: string; // 上一个分块末尾数据会通过这个参数传入。
|
||||||
mdTitle: string;
|
parentTitle: string;
|
||||||
}): string[] => {
|
}): string[] => {
|
||||||
const independentChunk = checkIndependentChunk(step);
|
const isMarkdownStep = checkIsMarkdownSplit(step);
|
||||||
const isCustomStep = checkIsCustomStep(step);
|
const isCustomStep = checkIsCustomStep(step);
|
||||||
|
const forbidConcat = isMarkdownStep || isCustomStep; // forbid=true时候,lastText肯定为空
|
||||||
|
|
||||||
// oversize
|
// oversize
|
||||||
if (step >= stepReges.length) {
|
if (step >= stepReges.length) {
|
||||||
@@ -232,7 +231,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
// use slice-chunkLen to split text
|
// use slice-chunkLen to split text
|
||||||
const chunks: string[] = [];
|
const chunks: string[] = [];
|
||||||
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
|
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
|
||||||
chunks.push(`${mdTitle}${text.slice(i, i + chunkLen)}`);
|
chunks.push(`${parentTitle}${text.slice(i, i + chunkLen)}`);
|
||||||
}
|
}
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
@@ -242,67 +241,78 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
|
|
||||||
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
|
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
|
||||||
const minChunkLen = chunkLen * 0.7;
|
const minChunkLen = chunkLen * 0.7;
|
||||||
const miniChunkLen = 30;
|
|
||||||
// console.log(splitTexts, stepReges[step].reg);
|
// console.log(splitTexts, stepReges[step].reg);
|
||||||
|
|
||||||
const chunks: string[] = [];
|
const chunks: string[] = [];
|
||||||
for (let i = 0; i < splitTexts.length; i++) {
|
for (let i = 0; i < splitTexts.length; i++) {
|
||||||
const item = splitTexts[i];
|
const item = splitTexts[i];
|
||||||
const currentTitle = `${mdTitle}${item.title}`;
|
|
||||||
|
|
||||||
|
const lastTextLen = lastText.length;
|
||||||
const currentText = item.text;
|
const currentText = item.text;
|
||||||
const currentTextLen = currentText.length;
|
const currentTextLen = currentText.length;
|
||||||
const lastTextLen = lastText.length;
|
|
||||||
const newText = lastText + currentText;
|
const newText = lastText + currentText;
|
||||||
const newTextLen = lastTextLen + currentTextLen;
|
const newTextLen = lastTextLen + currentTextLen;
|
||||||
|
|
||||||
// newText is too large(now, The lastText must be smaller than chunkLen)
|
// newText is too large(now, The lastText must be smaller than chunkLen)
|
||||||
if (newTextLen > maxLen) {
|
if (newTextLen > maxLen || isMarkdownStep) {
|
||||||
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
|
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
|
||||||
if (lastTextLen > minChunkLen) {
|
if (lastTextLen > minChunkLen) {
|
||||||
chunks.push(`${currentTitle}${lastText}`);
|
chunks.push(lastText);
|
||||||
lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
|
|
||||||
i--;
|
|
||||||
|
|
||||||
|
lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
|
||||||
|
|
||||||
|
i--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 说明是新的文本块比较大,需要进一步拆分
|
||||||
|
|
||||||
// split new Text, split chunks must will greater 1 (small lastText)
|
// split new Text, split chunks must will greater 1 (small lastText)
|
||||||
const innerChunks = splitTextRecursively({
|
const innerChunks = splitTextRecursively({
|
||||||
text: newText,
|
text: newText,
|
||||||
step: step + 1,
|
step: step + 1,
|
||||||
lastText: '',
|
lastText: '',
|
||||||
mdTitle: currentTitle
|
parentTitle: parentTitle + item.title
|
||||||
});
|
});
|
||||||
const lastChunk = innerChunks[innerChunks.length - 1];
|
const lastChunk = innerChunks[innerChunks.length - 1];
|
||||||
|
|
||||||
|
if (!lastChunk) continue;
|
||||||
|
|
||||||
|
if (forbidConcat) {
|
||||||
|
chunks.push(
|
||||||
|
...innerChunks.map(
|
||||||
|
(chunk) => (step === 3 + customRegLen ? `${parentTitle}${chunk}` : chunk) // 合并进 Markdown 分块时,需要补标题
|
||||||
|
)
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// last chunk is too small, concat it to lastText(next chunk start)
|
// last chunk is too small, concat it to lastText(next chunk start)
|
||||||
if (!independentChunk && lastChunk.length < minChunkLen) {
|
if (lastChunk.length < minChunkLen) {
|
||||||
chunks.push(...innerChunks.slice(0, -1));
|
chunks.push(...innerChunks.slice(0, -1));
|
||||||
lastText = lastChunk;
|
lastText = lastChunk;
|
||||||
} else {
|
continue;
|
||||||
chunks.push(...innerChunks);
|
|
||||||
// compute new overlapText
|
|
||||||
lastText = getOneTextOverlapText({
|
|
||||||
text: lastChunk,
|
|
||||||
step
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Last chunk is large enough
|
||||||
|
chunks.push(...innerChunks);
|
||||||
|
// compute new overlapText
|
||||||
|
lastText = getOneTextOverlapText({
|
||||||
|
text: lastChunk,
|
||||||
|
step
|
||||||
|
});
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// size less than chunkLen, push text to last chunk. now, text definitely less than maxLen
|
// new text is small
|
||||||
lastText = newText;
|
|
||||||
|
|
||||||
// markdown paragraph block: Direct addition; If the chunk size reaches, add a chunk
|
// Not overlap
|
||||||
if (
|
if (forbidConcat) {
|
||||||
isCustomStep ||
|
chunks.push(`${parentTitle}${item.title}${item.text}`);
|
||||||
(independentChunk && newTextLen > miniChunkLen) ||
|
continue;
|
||||||
newTextLen >= chunkLen
|
|
||||||
) {
|
|
||||||
chunks.push(`${currentTitle}${lastText}`);
|
|
||||||
|
|
||||||
lastText = getOneTextOverlapText({ text: lastText, step });
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lastText += item.text;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If the last chunk is independent, it needs to be push chunks. */
|
/* If the last chunk is independent, it needs to be push chunks. */
|
||||||
@@ -310,9 +320,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
if (lastText.length < chunkLen * 0.4) {
|
if (lastText.length < chunkLen * 0.4) {
|
||||||
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
|
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
|
||||||
} else {
|
} else {
|
||||||
chunks.push(`${mdTitle}${lastText}`);
|
chunks.push(lastText);
|
||||||
}
|
}
|
||||||
} else if (lastText && chunks.length === 0) {
|
} else if (lastText && chunks.length === 0) {
|
||||||
|
// 只分出一个很小的块,则直接追加到末尾(如果大于 1 个块,说明这个小块内容已经被上一个块拿到了)
|
||||||
chunks.push(lastText);
|
chunks.push(lastText);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -324,8 +335,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
|||||||
text,
|
text,
|
||||||
step: 0,
|
step: 0,
|
||||||
lastText: '',
|
lastText: '',
|
||||||
mdTitle: ''
|
parentTitle: ''
|
||||||
}).map((chunk) => chunk?.replaceAll(codeBlockMarker, '\n') || ''); // restore code block
|
}).map((chunk) => chunk?.replaceAll(codeBlockMarker, '\n')?.trim() || ''); // restore code block
|
||||||
|
|
||||||
const chars = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
|
const chars = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
|
||||||
|
|
||||||
|
@@ -1,8 +1,12 @@
|
|||||||
import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
|
import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
|
||||||
import { WorkerNameEnum, runWorker } from '../../worker/utils';
|
import { WorkerNameEnum, runWorker } from '../../worker/utils';
|
||||||
|
import { ImageType } from '../../worker/readFile/type';
|
||||||
|
|
||||||
export const htmlToMarkdown = async (html?: string | null) => {
|
export const htmlToMarkdown = async (html?: string | null) => {
|
||||||
const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
|
const md = await runWorker<{
|
||||||
|
rawText: string;
|
||||||
|
imageList: ImageType[];
|
||||||
|
}>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
|
||||||
|
|
||||||
return simpleMarkdownText(md);
|
return simpleMarkdownText(md.rawText);
|
||||||
};
|
};
|
||||||
|
@@ -34,7 +34,7 @@
|
|||||||
"pdfjs-dist": "4.4.168",
|
"pdfjs-dist": "4.4.168",
|
||||||
"pg": "^8.10.0",
|
"pg": "^8.10.0",
|
||||||
"request-ip": "^3.3.0",
|
"request-ip": "^3.3.0",
|
||||||
"tiktoken": "^1.0.15",
|
"tiktoken": "1.0.17",
|
||||||
"tunnel": "^0.0.6",
|
"tunnel": "^0.0.6",
|
||||||
"turndown": "^7.1.2"
|
"turndown": "^7.1.2"
|
||||||
},
|
},
|
||||||
|
@@ -178,11 +178,13 @@ export class WorkerPool<Props = Record<string, any>, Response = any> {
|
|||||||
|
|
||||||
// Worker error, terminate and delete it.(Un catch error)
|
// Worker error, terminate and delete it.(Un catch error)
|
||||||
worker.on('error', (err) => {
|
worker.on('error', (err) => {
|
||||||
addLog.warn('Worker error', { err });
|
console.log(err);
|
||||||
|
addLog.error('Worker error', err);
|
||||||
this.deleteWorker(workerId);
|
this.deleteWorker(workerId);
|
||||||
});
|
});
|
||||||
worker.on('messageerror', (err) => {
|
worker.on('messageerror', (err) => {
|
||||||
addLog.warn('Worker error', { err });
|
console.log(err);
|
||||||
|
addLog.error('Worker messageerror', err);
|
||||||
this.deleteWorker(workerId);
|
this.deleteWorker(workerId);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@@ -559,7 +559,7 @@
|
|||||||
"core.dataset.import.Link name placeholder": "Only supports static links. If the data is empty after uploading, the link may not be readable\nEach line one, up to 10 links at a time",
|
"core.dataset.import.Link name placeholder": "Only supports static links. If the data is empty after uploading, the link may not be readable\nEach line one, up to 10 links at a time",
|
||||||
"core.dataset.import.Local file": "Local File",
|
"core.dataset.import.Local file": "Local File",
|
||||||
"core.dataset.import.Local file desc": "Upload files in PDF, TXT, DOCX, etc. formats",
|
"core.dataset.import.Local file desc": "Upload files in PDF, TXT, DOCX, etc. formats",
|
||||||
"core.dataset.import.Preview chunks": "Preview Segments (up to 5 segments)",
|
"core.dataset.import.Preview chunks": "Preview Chunks (limit 15)",
|
||||||
"core.dataset.import.Preview raw text": "Preview Raw Text (up to 3000 characters)",
|
"core.dataset.import.Preview raw text": "Preview Raw Text (up to 3000 characters)",
|
||||||
"core.dataset.import.Process way": "Processing Method",
|
"core.dataset.import.Process way": "Processing Method",
|
||||||
"core.dataset.import.QA Estimated Price Tips": "Requires calling the file processing model, which consumes a lot of AI points: {{price}} points/1K tokens",
|
"core.dataset.import.QA Estimated Price Tips": "Requires calling the file processing model, which consumes a lot of AI points: {{price}} points/1K tokens",
|
||||||
@@ -1198,4 +1198,4 @@
|
|||||||
"verification": "Verification",
|
"verification": "Verification",
|
||||||
"xx_search_result": "{{key}} Search Results",
|
"xx_search_result": "{{key}} Search Results",
|
||||||
"yes": "Yes"
|
"yes": "Yes"
|
||||||
}
|
}
|
||||||
|
@@ -459,7 +459,7 @@
|
|||||||
"core.chat.response.module similarity": "相似度",
|
"core.chat.response.module similarity": "相似度",
|
||||||
"core.chat.response.module temperature": "温度",
|
"core.chat.response.module temperature": "温度",
|
||||||
"core.chat.response.module time": "运行时长",
|
"core.chat.response.module time": "运行时长",
|
||||||
"core.chat.response.module tokens": "AI Tokens 消耗",
|
"core.chat.response.module tokens": "AI Tokens总量",
|
||||||
"core.chat.response.plugin output": "插件输出值",
|
"core.chat.response.plugin output": "插件输出值",
|
||||||
"core.chat.response.search using reRank": "结果重排",
|
"core.chat.response.search using reRank": "结果重排",
|
||||||
"core.chat.response.text output": "文本输出",
|
"core.chat.response.text output": "文本输出",
|
||||||
@@ -565,7 +565,7 @@
|
|||||||
"core.dataset.import.Link name placeholder": "仅支持静态链接,如果上传后数据为空,可能该链接无法被读取\n每行一个,每次最多 10 个链接",
|
"core.dataset.import.Link name placeholder": "仅支持静态链接,如果上传后数据为空,可能该链接无法被读取\n每行一个,每次最多 10 个链接",
|
||||||
"core.dataset.import.Local file": "本地文件",
|
"core.dataset.import.Local file": "本地文件",
|
||||||
"core.dataset.import.Local file desc": "上传 PDF、TXT、DOCX 等格式的文件",
|
"core.dataset.import.Local file desc": "上传 PDF、TXT、DOCX 等格式的文件",
|
||||||
"core.dataset.import.Preview chunks": "预览分段(最多 5 段)",
|
"core.dataset.import.Preview chunks": "预览分段(最多 15 段)",
|
||||||
"core.dataset.import.Preview raw text": "预览源文本(最多 3000 字)",
|
"core.dataset.import.Preview raw text": "预览源文本(最多 3000 字)",
|
||||||
"core.dataset.import.Process way": "处理方式",
|
"core.dataset.import.Process way": "处理方式",
|
||||||
"core.dataset.import.QA Estimated Price Tips": "需调用文本理解模型,需要消耗较多 AI 积分:{{price}} 积分/1K tokens",
|
"core.dataset.import.QA Estimated Price Tips": "需调用文本理解模型,需要消耗较多 AI 积分:{{price}} 积分/1K tokens",
|
||||||
@@ -1207,4 +1207,4 @@
|
|||||||
"verification": "验证",
|
"verification": "验证",
|
||||||
"xx_search_result": "{{key}} 的搜索结果",
|
"xx_search_result": "{{key}} 的搜索结果",
|
||||||
"yes": "是"
|
"yes": "是"
|
||||||
}
|
}
|
||||||
|
17
pnpm-lock.yaml
generated
17
pnpm-lock.yaml
generated
@@ -224,7 +224,7 @@ importers:
|
|||||||
specifier: ^3.3.0
|
specifier: ^3.3.0
|
||||||
version: 3.3.0
|
version: 3.3.0
|
||||||
tiktoken:
|
tiktoken:
|
||||||
specifier: ^1.0.15
|
specifier: 1.0.17
|
||||||
version: 1.0.17
|
version: 1.0.17
|
||||||
tunnel:
|
tunnel:
|
||||||
specifier: ^0.0.6
|
specifier: ^0.0.6
|
||||||
@@ -560,7 +560,7 @@ importers:
|
|||||||
version: 1.77.8
|
version: 1.77.8
|
||||||
ts-jest:
|
ts-jest:
|
||||||
specifier: ^29.1.0
|
specifier: ^29.1.0
|
||||||
version: 29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0)(ts-node@10.9.2(@types/node@20.14.11)(typescript@5.5.3)))(typescript@5.5.3)
|
version: 29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0))(typescript@5.5.3)
|
||||||
use-context-selector:
|
use-context-selector:
|
||||||
specifier: ^1.4.4
|
specifier: ^1.4.4
|
||||||
version: 1.4.4(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(scheduler@0.23.2)
|
version: 1.4.4(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(scheduler@0.23.2)
|
||||||
@@ -659,8 +659,8 @@ importers:
|
|||||||
specifier: ^7.8.1
|
specifier: ^7.8.1
|
||||||
version: 7.8.1
|
version: 7.8.1
|
||||||
tiktoken:
|
tiktoken:
|
||||||
specifier: ^1.0.15
|
specifier: 1.0.17
|
||||||
version: 1.0.15
|
version: 1.0.17
|
||||||
devDependencies:
|
devDependencies:
|
||||||
'@nestjs/cli':
|
'@nestjs/cli':
|
||||||
specifier: ^10.0.0
|
specifier: ^10.0.0
|
||||||
@@ -700,7 +700,7 @@ importers:
|
|||||||
version: 6.3.4
|
version: 6.3.4
|
||||||
ts-jest:
|
ts-jest:
|
||||||
specifier: ^29.1.0
|
specifier: ^29.1.0
|
||||||
version: 29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0)(ts-node@10.9.2(@types/node@20.14.11)(typescript@5.5.3)))(typescript@5.5.3)
|
version: 29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0))(typescript@5.5.3)
|
||||||
ts-loader:
|
ts-loader:
|
||||||
specifier: ^9.4.3
|
specifier: ^9.4.3
|
||||||
version: 9.5.1(typescript@5.5.3)(webpack@5.92.1)
|
version: 9.5.1(typescript@5.5.3)(webpack@5.92.1)
|
||||||
@@ -8481,9 +8481,6 @@ packages:
|
|||||||
through@2.3.8:
|
through@2.3.8:
|
||||||
resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
|
resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
|
||||||
|
|
||||||
tiktoken@1.0.15:
|
|
||||||
resolution: {integrity: sha512-sCsrq/vMWUSEW29CJLNmPvWxlVp7yh2tlkAjpJltIKqp5CKf98ZNpdeHRmAlPVFlGEbswDc6SmI8vz64W/qErw==}
|
|
||||||
|
|
||||||
tiktoken@1.0.17:
|
tiktoken@1.0.17:
|
||||||
resolution: {integrity: sha512-UuFHqpy/DxOfNiC3otsqbx3oS6jr5uKdQhB/CvDEroZQbVHt+qAK+4JbIooabUWKU9g6PpsFylNu9Wcg4MxSGA==}
|
resolution: {integrity: sha512-UuFHqpy/DxOfNiC3otsqbx3oS6jr5uKdQhB/CvDEroZQbVHt+qAK+4JbIooabUWKU9g6PpsFylNu9Wcg4MxSGA==}
|
||||||
|
|
||||||
@@ -18892,8 +18889,6 @@ snapshots:
|
|||||||
|
|
||||||
through@2.3.8: {}
|
through@2.3.8: {}
|
||||||
|
|
||||||
tiktoken@1.0.15: {}
|
|
||||||
|
|
||||||
tiktoken@1.0.17: {}
|
tiktoken@1.0.17: {}
|
||||||
|
|
||||||
timezones-list@3.0.3: {}
|
timezones-list@3.0.3: {}
|
||||||
@@ -18961,7 +18956,7 @@ snapshots:
|
|||||||
|
|
||||||
ts-dedent@2.2.0: {}
|
ts-dedent@2.2.0: {}
|
||||||
|
|
||||||
ts-jest@29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0)(ts-node@10.9.2(@types/node@20.14.11)(typescript@5.5.3)))(typescript@5.5.3):
|
ts-jest@29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0))(typescript@5.5.3):
|
||||||
dependencies:
|
dependencies:
|
||||||
bs-logger: 0.2.6
|
bs-logger: 0.2.6
|
||||||
ejs: 3.1.10
|
ejs: 3.1.10
|
||||||
|
@@ -54,7 +54,6 @@ async function handler(
|
|||||||
}
|
}
|
||||||
);
|
);
|
||||||
dataLength += data.matchedCount;
|
dataLength += data.matchedCount;
|
||||||
console.log(data.matchedCount, '=-=-');
|
|
||||||
|
|
||||||
// 插入数据进入训练库
|
// 插入数据进入训练库
|
||||||
const max = global.systemEnv?.vectorMaxProcess || 10;
|
const max = global.systemEnv?.vectorMaxProcess || 10;
|
||||||
|
@@ -3,7 +3,7 @@ import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/
|
|||||||
import { authCert } from '@fastgpt/service/support/permission/auth/common';
|
import { authCert } from '@fastgpt/service/support/permission/auth/common';
|
||||||
import { NextAPI } from '@/service/middleware/entry';
|
import { NextAPI } from '@/service/middleware/entry';
|
||||||
import { ApiRequestProps } from '@fastgpt/service/type/next';
|
import { ApiRequestProps } from '@fastgpt/service/type/next';
|
||||||
import { OwnerPermissionVal, ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
|
import { OwnerPermissionVal } from '@fastgpt/global/support/permission/constant';
|
||||||
import { authFile } from '@fastgpt/service/support/permission/auth/file';
|
import { authFile } from '@fastgpt/service/support/permission/auth/file';
|
||||||
|
|
||||||
export type PostPreviewFilesChunksProps = {
|
export type PostPreviewFilesChunksProps = {
|
||||||
@@ -60,6 +60,6 @@ async function handler(
|
|||||||
overlapRatio,
|
overlapRatio,
|
||||||
customReg: customSplitChar ? [customSplitChar] : [],
|
customReg: customSplitChar ? [customSplitChar] : [],
|
||||||
isQAImport: isQAImport
|
isQAImport: isQAImport
|
||||||
}).slice(0, 5);
|
}).slice(0, 15);
|
||||||
}
|
}
|
||||||
export default NextAPI(handler);
|
export default NextAPI(handler);
|
||||||
|
@@ -111,7 +111,6 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
|
|||||||
|
|
||||||
const blob = new Blob(chunks, options);
|
const blob = new Blob(chunks, options);
|
||||||
const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
|
const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
|
||||||
console.log(options, filename, '=-=-');
|
|
||||||
formData.append('file', blob, filename);
|
formData.append('file', blob, filename);
|
||||||
formData.append(
|
formData.append(
|
||||||
'data',
|
'data',
|
||||||
|
@@ -27,7 +27,7 @@
|
|||||||
"fastify": "^4.27.0",
|
"fastify": "^4.27.0",
|
||||||
"dayjs": "^1.11.7",
|
"dayjs": "^1.11.7",
|
||||||
"isolated-vm": "^4.7.2",
|
"isolated-vm": "^4.7.2",
|
||||||
"tiktoken": "^1.0.15",
|
"tiktoken": "1.0.17",
|
||||||
"node-gyp": "^10.1.0",
|
"node-gyp": "^10.1.0",
|
||||||
"reflect-metadata": "^0.2.0",
|
"reflect-metadata": "^0.2.0",
|
||||||
"rxjs": "^7.8.1"
|
"rxjs": "^7.8.1"
|
||||||
|
Reference in New Issue
Block a user