diff --git a/docSite/content/zh-cn/docs/development/upgrading/4812.md b/docSite/content/zh-cn/docs/development/upgrading/4812.md
index 7ef68f6ef..6429ce8a6 100644
--- a/docSite/content/zh-cn/docs/development/upgrading/4812.md
+++ b/docSite/content/zh-cn/docs/development/upgrading/4812.md
@@ -65,3 +65,4 @@ curl --location --request POST 'https://{{host}}/api/admin/resetMilvus' \
 19. 修复 - 拥有多个循环节点时,错误运行。
 20. 修复 - 循环节点中修改变量,无法传递。
 21. 修复 - 非 stream 模式,嵌套子应用/插件执行时无法获取子应用响应。
+22. 修复 - 数据分块策略,同时将每个 Markdown 独立分块。
diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts
index f76bd2f49..c90f526b8 100644
--- a/packages/global/common/string/textSplitter.ts
+++ b/packages/global/common/string/textSplitter.ts
@@ -92,9 +92,9 @@ ${mdSplitString}
 };
 
 /*
-  1. 自定义分隔符:不需要重叠
-  2. Markdown 标题:不需要重叠;标题嵌套共享。
-  3. 特殊 markdown 语法:不需要重叠
+  1. 自定义分隔符:不需要重叠,不需要小块合并
+  2. Markdown 标题:不需要重叠;标题嵌套共享,不需要小块合并
+  3. 特殊 markdown 语法:不需要重叠,需要小块合并
   4. 段落:尽可能保证它是一个完整的段落。
   5. 标点分割:重叠
 */
@@ -118,10 +118,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
       reg: new RegExp(`(${replaceRegChars(text)})`, 'g'),
       maxLen: chunkLen * 1.4
     })),
-    { reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
-    { reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
-    { reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
-    { reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
+    { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
+    { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
+    { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
+    { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
     { reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
     { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
@@ -137,7 +137,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
 
   const customRegLen = customReg.length;
   const checkIsCustomStep = (step: number) => step < customRegLen;
   const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen;
-  const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
   const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
 
   // if use markdown title split, Separate record title
@@ -153,7 +152,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
 
     const isCustomStep = checkIsCustomStep(step);
     const isMarkdownSplit = checkIsMarkdownSplit(step);
-    const independentChunk = checkIndependentChunk(step);
 
     const { reg } = stepReges[step];
 
@@ -162,7 +160,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
         reg,
         (() => {
           if (isCustomStep) return splitMarker;
-          if (independentChunk) return `${splitMarker}$1`;
+          if (isMarkdownSplit) return `${splitMarker}$1`;
           return `$1${splitMarker}`;
         })()
       )
@@ -178,7 +176,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
           title: matchTitle
         };
       })
-      .filter((item) => item.text.trim());
+      .filter((item) => item.text?.trim());
   };
 
   /* Gets the overlap at the end of a text as the beginning of the next block */
@@ -214,15 +212,16 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     text = '',
     step,
     lastText,
-    mdTitle = ''
+    parentTitle = ''
   }: {
     text: string;
    step: number;
-    lastText: string;
-    mdTitle: string;
+    lastText: string; // 上一个分块末尾数据会通过这个参数传入。
+    parentTitle: string;
   }): string[] => {
-    const independentChunk = checkIndependentChunk(step);
+    const isMarkdownStep = checkIsMarkdownSplit(step);
     const isCustomStep = checkIsCustomStep(step);
+    const forbidConcat = isMarkdownStep || isCustomStep; // forbid=true时候,lastText肯定为空
 
     // oversize
     if (step >= stepReges.length) {
@@ -232,7 +231,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
       // use slice-chunkLen to split text
       const chunks: string[] = [];
       for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
-        chunks.push(`${mdTitle}${text.slice(i, i + chunkLen)}`);
+        chunks.push(`${parentTitle}${text.slice(i, i + chunkLen)}`);
       }
       return chunks;
     }
@@ -242,67 +241,78 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
     const minChunkLen = chunkLen * 0.7;
-    const miniChunkLen = 30;
 
     // console.log(splitTexts, stepReges[step].reg);
     const chunks: string[] = [];
     for (let i = 0; i < splitTexts.length; i++) {
       const item = splitTexts[i];
-      const currentTitle = `${mdTitle}${item.title}`;
+      const lastTextLen = lastText.length;
       const currentText = item.text;
       const currentTextLen = currentText.length;
-      const lastTextLen = lastText.length;
       const newText = lastText + currentText;
       const newTextLen = lastTextLen + currentTextLen;
 
       // newText is too large(now, The lastText must be smaller than chunkLen)
-      if (newTextLen > maxLen) {
+      if (newTextLen > maxLen || isMarkdownStep) {
         // lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
         if (lastTextLen > minChunkLen) {
-          chunks.push(`${currentTitle}${lastText}`);
-          lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
-          i--;
+          chunks.push(lastText);
+          lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
+
+          i--;
           continue;
         }
 
+        // 说明是新的文本块比较大,需要进一步拆分
+
         // split new Text, split chunks must will greater 1 (small lastText)
         const innerChunks = splitTextRecursively({
           text: newText,
           step: step + 1,
           lastText: '',
-          mdTitle: currentTitle
+          parentTitle: parentTitle + item.title
         });
         const lastChunk = innerChunks[innerChunks.length - 1];
+
+        if (!lastChunk) continue;
+
+        if (forbidConcat) {
+          chunks.push(
+            ...innerChunks.map(
+              (chunk) => (step === 3 + customRegLen ? `${parentTitle}${chunk}` : chunk) // 合并进 Markdown 分块时,需要补标题
+            )
+          );
+          continue;
+        }
+
         // last chunk is too small, concat it to lastText(next chunk start)
-        if (!independentChunk && lastChunk.length < minChunkLen) {
+        if (lastChunk.length < minChunkLen) {
           chunks.push(...innerChunks.slice(0, -1));
           lastText = lastChunk;
-        } else {
-          chunks.push(...innerChunks);
-          // compute new overlapText
-          lastText = getOneTextOverlapText({
-            text: lastChunk,
-            step
-          });
+          continue;
         }
+
+        // Last chunk is large enough
+        chunks.push(...innerChunks);
+        // compute new overlapText
+        lastText = getOneTextOverlapText({
+          text: lastChunk,
+          step
+        });
 
         continue;
       }
 
-      // size less than chunkLen, push text to last chunk. now, text definitely less than maxLen
-      lastText = newText;
+      // new text is small
 
-      // markdown paragraph block: Direct addition; If the chunk size reaches, add a chunk
-      if (
-        isCustomStep ||
-        (independentChunk && newTextLen > miniChunkLen) ||
-        newTextLen >= chunkLen
-      ) {
-        chunks.push(`${currentTitle}${lastText}`);
-
-        lastText = getOneTextOverlapText({ text: lastText, step });
+      // Not overlap
+      if (forbidConcat) {
+        chunks.push(`${parentTitle}${item.title}${item.text}`);
+        continue;
       }
+
+      lastText += item.text;
     }
 
     /* If the last chunk is independent, it needs to be push chunks. */
@@ -310,9 +320,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
       if (lastText.length < chunkLen * 0.4) {
         chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
       } else {
-        chunks.push(`${mdTitle}${lastText}`);
+        chunks.push(lastText);
       }
     } else if (lastText && chunks.length === 0) {
+      // 只分出一个很小的块,则直接追加到末尾(如果大于 1 个块,说明这个小块内容已经被上一个块拿到了)
       chunks.push(lastText);
     }
@@ -324,8 +335,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     text,
     step: 0,
     lastText: '',
-    mdTitle: ''
-  }).map((chunk) => chunk?.replaceAll(codeBlockMarker, '\n') || ''); // restore code block
+    parentTitle: ''
+  }).map((chunk) => chunk?.replaceAll(codeBlockMarker, '\n')?.trim() || ''); // restore code block
 
   const chars = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
diff --git a/packages/service/common/string/utils.ts b/packages/service/common/string/utils.ts
index cd83e2e56..1d9a906bd 100644
--- a/packages/service/common/string/utils.ts
+++ b/packages/service/common/string/utils.ts
@@ -1,8 +1,12 @@
 import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
 import { WorkerNameEnum, runWorker } from '../../worker/utils';
+import { ImageType } from '../../worker/readFile/type';
 
 export const htmlToMarkdown = async (html?: string | null) => {
-  const md = await runWorker(WorkerNameEnum.htmlStr2Md, { html: html || '' });
+  const md = await runWorker<{
+    rawText: string;
+    imageList: ImageType[];
+  }>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
 
-  return simpleMarkdownText(md);
+  return simpleMarkdownText(md.rawText);
 };
diff --git a/packages/service/package.json b/packages/service/package.json
index 291670a02..3555248b3 100644
--- a/packages/service/package.json
+++ b/packages/service/package.json
@@ -34,7 +34,7 @@
     "pdfjs-dist": "4.4.168",
     "pg": "^8.10.0",
     "request-ip": "^3.3.0",
-    "tiktoken": "^1.0.15",
+    "tiktoken": "1.0.17",
     "tunnel": "^0.0.6",
     "turndown": "^7.1.2"
   },
diff --git a/packages/service/worker/utils.ts b/packages/service/worker/utils.ts
index a5fd3b4ae..e87dc19fc 100644
--- a/packages/service/worker/utils.ts
+++ b/packages/service/worker/utils.ts
@@ -178,11 +178,13 @@ export class WorkerPool<Props = Record<string, any>, Response = any> {
 
     // Worker error, terminate and delete it.(Un catch error)
     worker.on('error', (err) => {
-      addLog.warn('Worker error', { err });
+      console.log(err);
+      addLog.error('Worker error', err);
       this.deleteWorker(workerId);
     });
     worker.on('messageerror', (err) => {
-      addLog.warn('Worker error', { err });
+      console.log(err);
+      addLog.error('Worker messageerror', err);
       this.deleteWorker(workerId);
     });
diff --git a/packages/web/i18n/en/common.json b/packages/web/i18n/en/common.json
index 2cd36dcfe..9adb2e1e6 100644
--- a/packages/web/i18n/en/common.json
+++ b/packages/web/i18n/en/common.json
@@ -559,7 +559,7 @@
   "core.dataset.import.Link name placeholder": "Only supports static links. If the data is empty after uploading, the link may not be readable\nEach line one, up to 10 links at a time",
   "core.dataset.import.Local file": "Local File",
   "core.dataset.import.Local file desc": "Upload files in PDF, TXT, DOCX, etc. formats",
-  "core.dataset.import.Preview chunks": "Preview Segments (up to 5 segments)",
+  "core.dataset.import.Preview chunks": "Preview Chunks (limit 15)",
   "core.dataset.import.Preview raw text": "Preview Raw Text (up to 3000 characters)",
   "core.dataset.import.Process way": "Processing Method",
   "core.dataset.import.QA Estimated Price Tips": "Requires calling the file processing model, which consumes a lot of AI points: {{price}} points/1K tokens",
@@ -1198,4 +1198,4 @@
   "verification": "Verification",
   "xx_search_result": "{{key}} Search Results",
   "yes": "Yes"
-}
\ No newline at end of file
+}
diff --git a/packages/web/i18n/zh/common.json b/packages/web/i18n/zh/common.json
index 7a546068c..539a45429 100644
--- a/packages/web/i18n/zh/common.json
+++ b/packages/web/i18n/zh/common.json
@@ -459,7 +459,7 @@
   "core.chat.response.module similarity": "相似度",
   "core.chat.response.module temperature": "温度",
   "core.chat.response.module time": "运行时长",
-  "core.chat.response.module tokens": "AI Tokens 消耗",
+  "core.chat.response.module tokens": "AI Tokens总量",
   "core.chat.response.plugin output": "插件输出值",
   "core.chat.response.search using reRank": "结果重排",
   "core.chat.response.text output": "文本输出",
@@ -565,7 +565,7 @@
   "core.dataset.import.Link name placeholder": "仅支持静态链接,如果上传后数据为空,可能该链接无法被读取\n每行一个,每次最多 10 个链接",
   "core.dataset.import.Local file": "本地文件",
   "core.dataset.import.Local file desc": "上传 PDF、TXT、DOCX 等格式的文件",
-  "core.dataset.import.Preview chunks": "预览分段(最多 5 段)",
+  "core.dataset.import.Preview chunks": "预览分段(最多 15 段)",
   "core.dataset.import.Preview raw text": "预览源文本(最多 3000 字)",
   "core.dataset.import.Process way": "处理方式",
   "core.dataset.import.QA Estimated Price Tips": "需调用文本理解模型,需要消耗较多 AI 积分:{{price}} 积分/1K tokens",
@@ -1207,4 +1207,4 @@
   "verification": "验证",
   "xx_search_result": "{{key}} 的搜索结果",
   "yes": "是"
-}
\ No newline at end of file
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index fb027ba41..b95fc1dc3 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -224,7 +224,7 @@ importers:
         specifier: ^3.3.0
         version: 3.3.0
       tiktoken:
-        specifier: ^1.0.15
+        specifier: 1.0.17
         version: 1.0.17
       tunnel:
         specifier: ^0.0.6
@@ -560,7 +560,7 @@ importers:
         version: 1.77.8
       ts-jest:
         specifier: ^29.1.0
-        version: 29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0)(ts-node@10.9.2(@types/node@20.14.11)(typescript@5.5.3)))(typescript@5.5.3)
+        version: 29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0))(typescript@5.5.3)
       use-context-selector:
         specifier: ^1.4.4
         version: 1.4.4(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(scheduler@0.23.2)
@@ -659,8 +659,8 @@ importers:
         specifier: ^7.8.1
         version: 7.8.1
       tiktoken:
-        specifier: ^1.0.15
-        version: 1.0.15
+        specifier: 1.0.17
+        version: 1.0.17
     devDependencies:
       '@nestjs/cli':
         specifier: ^10.0.0
@@ -700,7 +700,7 @@ importers:
         version: 6.3.4
       ts-jest:
         specifier: ^29.1.0
-        version: 29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0)(ts-node@10.9.2(@types/node@20.14.11)(typescript@5.5.3)))(typescript@5.5.3)
+        version: 29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0))(typescript@5.5.3)
       ts-loader:
         specifier: ^9.4.3
        version: 9.5.1(typescript@5.5.3)(webpack@5.92.1)
@@ -8481,9 +8481,6 @@ packages:
   through@2.3.8:
     resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
 
-  tiktoken@1.0.15:
-    resolution: {integrity: sha512-sCsrq/vMWUSEW29CJLNmPvWxlVp7yh2tlkAjpJltIKqp5CKf98ZNpdeHRmAlPVFlGEbswDc6SmI8vz64W/qErw==}
-
   tiktoken@1.0.17:
     resolution: {integrity: sha512-UuFHqpy/DxOfNiC3otsqbx3oS6jr5uKdQhB/CvDEroZQbVHt+qAK+4JbIooabUWKU9g6PpsFylNu9Wcg4MxSGA==}
 
@@ -18892,8 +18889,6 @@ snapshots:
   through@2.3.8: {}
 
-  tiktoken@1.0.15: {}
-
   tiktoken@1.0.17: {}
 
   timezones-list@3.0.3: {}
@@ -18961,7 +18956,7 @@ snapshots:
   ts-dedent@2.2.0: {}
 
-  ts-jest@29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0)(ts-node@10.9.2(@types/node@20.14.11)(typescript@5.5.3)))(typescript@5.5.3):
+  ts-jest@29.2.2(@babel/core@7.24.9)(@jest/transform@29.7.0)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.24.9))(jest@29.7.0(@types/node@20.14.11)(babel-plugin-macros@3.1.0))(typescript@5.5.3):
     dependencies:
       bs-logger: 0.2.6
       ejs: 3.1.10
diff --git a/projects/app/src/pages/api/admin/resetMilvus.ts b/projects/app/src/pages/api/admin/resetMilvus.ts
index 2e51ee97b..2d5e47b01 100644
--- a/projects/app/src/pages/api/admin/resetMilvus.ts
+++ b/projects/app/src/pages/api/admin/resetMilvus.ts
@@ -54,7 +54,6 @@ async function handler(
       }
     );
     dataLength += data.matchedCount;
-    console.log(data.matchedCount, '=-=-');
 
     // 插入数据进入训练库
     const max = global.systemEnv?.vectorMaxProcess || 10;
diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
index 4d9df172c..ef8ba01eb 100644
--- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
+++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
@@ -3,7 +3,7 @@ import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/
 import { authCert } from '@fastgpt/service/support/permission/auth/common';
 import { NextAPI } from '@/service/middleware/entry';
 import { ApiRequestProps } from '@fastgpt/service/type/next';
-import { OwnerPermissionVal, ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
+import { OwnerPermissionVal } from '@fastgpt/global/support/permission/constant';
 import { authFile } from '@fastgpt/service/support/permission/auth/file';
 
 export type PostPreviewFilesChunksProps = {
@@ -60,6 +60,6 @@ async function handler(
     overlapRatio,
     customReg: customSplitChar ? [customSplitChar] : [],
     isQAImport: isQAImport
-  }).slice(0, 5);
+  }).slice(0, 15);
 }
 export default NextAPI(handler);
diff --git a/projects/app/src/web/common/hooks/useSpeech.ts b/projects/app/src/web/common/hooks/useSpeech.ts
index f7cece150..04204cd53 100644
--- a/projects/app/src/web/common/hooks/useSpeech.ts
+++ b/projects/app/src/web/common/hooks/useSpeech.ts
@@ -111,7 +111,6 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
         const blob = new Blob(chunks, options);
         const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
 
-        console.log(options, filename, '=-=-');
         formData.append('file', blob, filename);
         formData.append(
           'data',
diff --git a/projects/sandbox/package.json b/projects/sandbox/package.json
index c7181fc86..70d5de089 100644
--- a/projects/sandbox/package.json
+++ b/projects/sandbox/package.json
@@ -27,7 +27,7 @@
     "fastify": "^4.27.0",
     "dayjs": "^1.11.7",
     "isolated-vm": "^4.7.2",
-    "tiktoken": "^1.0.15",
+    "tiktoken": "1.0.17",
     "node-gyp": "^10.1.0",
     "reflect-metadata": "^0.2.0",
     "rxjs": "^7.8.1"
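Editor's note (not part of the patch): the textSplitter.ts change above is the substance of changelog entry 22 — each Markdown heading now opens an independent chunk, nested headings share their ancestors' titles as a prefix, and the per-level size ceiling grows from chunkLen * 1.2 for `#` to chunkLen * 1.8 for `####`. The sketch below is a standalone, simplified TypeScript illustration of that title-sharing behaviour; the helper name and types are hypothetical and it is not the FastGPT `commonSplit` implementation.

```ts
type Chunk = { title: string; text: string };

// Hypothetical helper, illustration only — not the FastGPT commonSplit implementation.
// Every #..#### heading starts a new chunk; nested headings share ancestor titles.
export const splitByMarkdownHeadings = (rawText: string): Chunk[] => {
  const headingReg = /^(#{1,4})\s+(.+)$/; // same heading levels as the patched stepReges
  const chunks: Chunk[] = [];
  const titleStack: string[] = []; // e.g. ['# Guide', '## Install']
  let body: string[] = [];

  const flush = () => {
    const text = body.join('\n').trim();
    if (text) chunks.push({ title: titleStack.filter(Boolean).join('\n'), text });
    body = [];
  };

  for (const line of rawText.split('\n')) {
    const match = line.match(headingReg);
    if (match) {
      flush(); // a heading always closes the previous chunk
      const level = match[1].length;
      titleStack.length = level - 1; // drop deeper headings, keep shared ancestors
      titleStack[level - 1] = line;
    } else {
      body.push(line);
    }
  }
  flush();

  return chunks;
};

// Both child chunks carry the shared '# Guide' ancestor title as a prefix.
console.log(splitByMarkdownHeadings('# Guide\n## Install\nrun pnpm i\n## Usage\nstart dev'));
```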