From adbaa8b37bd167a0ec860a41cfc388459f45e148 Mon Sep 17 00:00:00 2001 From: archer <545436317@qq.com> Date: Mon, 24 Apr 2023 15:18:05 +0800 Subject: [PATCH] feat: use Tiktokenizer to count tokens --- .env.template | 1 + README.md | 3 +- next.config.js | 5 ++ package.json | 2 + pnpm-lock.yaml | 16 ++++ .../detail/components/SelectFileModal.tsx | 68 +++++++++------- src/utils/file.ts | 7 +- src/utils/tools.ts | 79 +++++++++++++++++++ 8 files changed, 149 insertions(+), 32 deletions(-) diff --git a/.env.template b/.env.template index b186b07e3..73369b178 100644 --- a/.env.template +++ b/.env.template @@ -3,6 +3,7 @@ AXIOS_PROXY_HOST=127.0.0.1 AXIOS_PROXY_PORT_FAST=7890 AXIOS_PROXY_PORT_NORMAL=7890 queueTask=1 +parentUrl=https://hostname/api/openapi/startEvents # email MY_MAIL=xxx@qq.com MAILE_CODE=xxx diff --git a/README.md b/README.md index 98821e2e6..66fb37849 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ AXIOS_PROXY_HOST=127.0.0.1 AXIOS_PROXY_PORT_FAST=7890 AXIOS_PROXY_PORT_NORMAL=7890 queueTask=1 +parentUrl=https://hostname/api/openapi/startEvents # email,参考 nodeMail 获取参数 MY_MAIL=xxx@qq.com MAILE_CODE=xxx @@ -196,7 +197,7 @@ services: - TOKEN_KEY=xxxx # 是否开启队列任务。 1-开启,0-关闭(请求parentUrl去执行任务,单机时直接填1) - queueTask=1 - - parentUrl=https://fastgpt.run/api/openapi/startEvents + - parentUrl=https://hostname/api/openapi/startEvents # db - MONGODB_URI=mongodb://username:passsword@0.0.0.0:27017/?authSource=admin - MONGODB_NAME=xxx diff --git a/next.config.js b/next.config.js index 8722ceb28..a7963c704 100644 --- a/next.config.js +++ b/next.config.js @@ -4,7 +4,12 @@ const nextConfig = { output: 'standalone', reactStrictMode: true, compress: true, + webpack(config) { + config.experiments = { + asyncWebAssembly: true, + layers: true + }; config.module.rules = config.module.rules.concat([ { test: /\.svg$/i, diff --git a/package.json b/package.json index 569e870de..b7641a674 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,7 @@ "@chakra-ui/icons": "^2.0.17", "@chakra-ui/react": "^2.5.1", "@chakra-ui/system": "^2.5.5", + "@dqbd/tiktoken": "^1.0.6", "@emotion/react": "^11.10.6", "@emotion/styled": "^11.10.6", "@next/font": "13.1.6", @@ -29,6 +30,7 @@ "formidable": "^2.1.1", "framer-motion": "^9.0.6", "gpt-token-utils": "^1.2.0", + "graphemer": "^1.4.0", "hyperdown": "^2.4.29", "immer": "^9.0.19", "jsonwebtoken": "^9.0.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 219348620..7a97f5c6f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -7,6 +7,7 @@ specifiers: '@chakra-ui/icons': ^2.0.17 '@chakra-ui/react': ^2.5.1 '@chakra-ui/system': ^2.5.5 + '@dqbd/tiktoken': ^1.0.6 '@emotion/react': ^11.10.6 '@emotion/styled': ^11.10.6 '@next/font': 13.1.6 @@ -33,6 +34,7 @@ specifiers: formidable: ^2.1.1 framer-motion: ^9.0.6 gpt-token-utils: ^1.2.0 + graphemer: ^1.4.0 husky: ^8.0.3 hyperdown: ^2.4.29 immer: ^9.0.19 @@ -72,6 +74,7 @@ dependencies: '@chakra-ui/icons': registry.npmmirror.com/@chakra-ui/icons/2.0.17_lze4h7kxffpjhokvtqbtrlfkmq '@chakra-ui/react': registry.npmmirror.com/@chakra-ui/react/2.5.1_e6pzu3hsaqmql4fl7jx73ckiym '@chakra-ui/system': registry.npmmirror.com/@chakra-ui/system/2.5.5_xqp3pgpqjlfxxa3zxu4zoc4fba + '@dqbd/tiktoken': registry.npmmirror.com/@dqbd/tiktoken/1.0.6 '@emotion/react': registry.npmmirror.com/@emotion/react/11.10.6_pmekkgnqduwlme35zpnqhenc34 '@emotion/styled': registry.npmmirror.com/@emotion/styled/11.10.6_oouaibmszuch5k64ms7uxp2aia '@next/font': registry.npmmirror.com/@next/font/13.1.6 @@ -84,6 +87,7 @@ dependencies: formidable: 
registry.npmmirror.com/formidable/2.1.1 framer-motion: registry.npmmirror.com/framer-motion/9.0.6_biqbaboplfbrettd7655fr4n2y gpt-token-utils: registry.npmmirror.com/gpt-token-utils/1.2.0 + graphemer: registry.npmmirror.com/graphemer/1.4.0 hyperdown: registry.npmmirror.com/hyperdown/2.4.29 immer: registry.npmmirror.com/immer/9.0.19 jsonwebtoken: registry.npmmirror.com/jsonwebtoken/9.0.0 @@ -4176,6 +4180,12 @@ packages: react: registry.npmmirror.com/react/18.2.0 dev: false + registry.npmmirror.com/@dqbd/tiktoken/1.0.6: + resolution: {integrity: sha512-umSdeZTy/SbPPKVuZKV/XKyFPmXSN145CcM3iHjBbmhlohBJg7vaDp4cPCW+xNlWL6L2U1sp7T2BD+di2sUKdA==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@dqbd/tiktoken/-/tiktoken-1.0.6.tgz} + name: '@dqbd/tiktoken' + version: 1.0.6 + dev: false + registry.npmmirror.com/@emotion/babel-plugin/11.10.6: resolution: {integrity: sha512-p2dAqtVrkhSa7xz1u/m9eHYdLi+en8NowrmXeF/dKtJpU8lCWli8RUAati7NcSl0afsBott48pdnANuD0wh9QQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@emotion/babel-plugin/-/babel-plugin-11.10.6.tgz} name: '@emotion/babel-plugin' @@ -7675,6 +7685,12 @@ packages: version: 1.0.4 dev: true + registry.npmmirror.com/graphemer/1.4.0: + resolution: {integrity: sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/graphemer/-/graphemer-1.4.0.tgz} + name: graphemer + version: 1.4.0 + dev: false + registry.npmmirror.com/has-bigints/1.0.2: resolution: {integrity: sha512-tSvCKtBr9lkF0Ex0aQiP9N+OpV4zi2r/Nee5VkRDbaqv35RLYMzbwQfFSZZH0kR+Rd6302UJZ2p/bJCEoR3VoQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/has-bigints/-/has-bigints-1.0.2.tgz} name: has-bigints diff --git a/src/pages/model/detail/components/SelectFileModal.tsx b/src/pages/model/detail/components/SelectFileModal.tsx index a2a120a97..3e62a29c5 100644 --- a/src/pages/model/detail/components/SelectFileModal.tsx +++ b/src/pages/model/detail/components/SelectFileModal.tsx @@ -14,7 +14,6 @@ import { } from '@chakra-ui/react'; import { useToast } from '@/hooks/useToast'; import { useSelectFile } from '@/hooks/useSelectFile'; -import { encode } from 'gpt-token-utils'; import { useConfirm } from '@/hooks/useConfirm'; import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file'; import { useMutation } from '@tanstack/react-query'; @@ -22,6 +21,7 @@ import { postModelDataSplitData } from '@/api/model'; import { formatPrice } from '@/utils/user'; import Radio from '@/components/Radio'; import { splitText } from '@/utils/file'; +import { countChatTokens } from '@/utils/tools'; const fileExtension = '.txt,.doc,.docx,.pdf,.md'; @@ -29,11 +29,11 @@ const modeMap = { qa: { maxLen: 2800, slideLen: 800, - price: 3, + price: 4, isPrompt: true }, subsection: { - maxLen: 1000, + maxLen: 800, slideLen: 300, price: 0.4, isPrompt: false @@ -55,19 +55,19 @@ const SelectFileModal = ({ const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true }); const [mode, setMode] = useState<'qa' | 'subsection'>('qa'); const [fileTextArr, setFileTextArr] = useState(['']); + const [splitRes, setSplitRes] = useState<{ tokens: number; chunks: string[] }>({ + tokens: 0, + chunks: [] + }); const { openConfirm, ConfirmChild } = useConfirm({ - content: '确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,任务讲被终止。' + content: `确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,未完成的任务会被直接清除。一共 ${ + 
splitRes.chunks.length + } 组,大约 ${splitRes.tokens} 个tokens, 约 ${formatPrice( + splitRes.tokens * modeMap[mode].price + )} 元` }); - const fileText = useMemo(() => { - const chunks = fileTextArr.map((item) => - splitText({ - text: item, - ...modeMap[mode] - }) - ); - return chunks.join(''); - }, [fileTextArr, mode]); + const fileText = useMemo(() => fileTextArr.join(''), [fileTextArr]); const onSelectFile = useCallback( async (e: File[]) => { @@ -106,18 +106,11 @@ const SelectFileModal = ({ const { mutate, isLoading } = useMutation({ mutationFn: async () => { - if (!fileText) return; - const chunks = fileTextArr - .map((item) => - splitText({ - text: item, - ...modeMap[mode] - }) - ) - .flat(); + if (splitRes.chunks.length === 0) return; + await postModelDataSplitData({ modelId, - chunks, + chunks: splitRes.chunks, prompt: `下面是"${prompt || '一段长文本'}"`, mode }); @@ -136,6 +129,28 @@ const SelectFileModal = ({ } }); + const onclickImport = useCallback(() => { + const chunks = fileTextArr + .map((item) => + splitText({ + text: item, + ...modeMap[mode] + }) + ) + .flat(); + // count tokens + const tokens = chunks.map((item) => + countChatTokens({ messages: [{ role: 'system', content: item }] }) + ); + + setSplitRes({ + tokens: tokens.reduce((sum, item) => sum + item, 0), + chunks + }); + + openConfirm(mutate)(); + }, [fileTextArr, mode, mutate, openConfirm]); + return ( @@ -152,10 +167,9 @@ const SelectFileModal = ({ justifyContent={'center'} fontSize={'sm'} > - + 支持 {fileExtension} 文件。模型会自动对文本进行 QA 拆分,需要较长训练时间,拆分需要消耗 - tokens,账号余额不足时,未拆分的数据会被删除。当前一共 {encode(fileText).length}{' '} - 个tokens,大约 {formatPrice(encode(fileText).length * modeMap[mode].price)}元 + tokens,账号余额不足时,未拆分的数据会被删除。 {/* 拆分模式 */} @@ -217,7 +231,7 @@ const SelectFileModal = ({ - diff --git a/src/utils/file.ts b/src/utils/file.ts index a6f8cb85b..ff1987818 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -1,6 +1,6 @@ import mammoth from 'mammoth'; import Papa from 'papaparse'; -import { encode } from 'gpt-token-utils'; +import { countChatTokens } from './tools'; /** * 读取 txt 文件内容 @@ -164,7 +164,7 @@ export const splitText = ({ const chunks: { sum: number; arr: string[] }[] = [{ sum: 0, arr: [] }]; for (let i = 0; i < textArr.length; i++) { - const tokenLen = encode(textArr[i]).length; + const tokenLen = countChatTokens({ messages: [{ role: 'system', content: textArr[i] }] }); chunks[chunks.length - 1].sum += tokenLen; chunks[chunks.length - 1].arr.push(textArr[i]); @@ -174,7 +174,7 @@ export const splitText = ({ const chunk: { sum: number; arr: string[] } = { sum: 0, arr: [] }; for (let j = chunks[chunks.length - 1].arr.length - 1; j >= 0; j--) { const chunkText = chunks[chunks.length - 1].arr[j]; - const tokenLen = encode(chunkText).length; + const tokenLen = countChatTokens({ messages: [{ role: 'system', content: chunkText }] }); chunk.sum += tokenLen; chunk.arr.unshift(chunkText); @@ -185,7 +185,6 @@ export const splitText = ({ chunks.push(chunk); } } - const result = chunks.map((item) => item.arr.join('')); return result; }; diff --git a/src/utils/tools.ts b/src/utils/tools.ts index 64d935402..2ba124d4c 100644 --- a/src/utils/tools.ts +++ b/src/utils/tools.ts @@ -1,5 +1,27 @@ import crypto from 'crypto'; import { useToast } from '@/hooks/useToast'; +import { encoding_for_model, type Tiktoken } from '@dqbd/tiktoken'; +import Graphemer from 'graphemer'; + +const textDecoder = new TextDecoder(); +const graphemer = new Graphemer(); +const encMap = { + 'gpt-3.5-turbo': encoding_for_model('gpt-3.5-turbo', { + '<|im_start|>': 
100264, + '<|im_end|>': 100265, + '<|im_sep|>': 100266 + }), + 'gpt-4': encoding_for_model('gpt-4', { + '<|im_start|>': 100264, + '<|im_end|>': 100265, + '<|im_sep|>': 100266 + }), + 'gpt-4-32k': encoding_for_model('gpt-4-32k', { + '<|im_start|>': 100264, + '<|im_end|>': 100265, + '<|im_sep|>': 100266 + }) +}; /** * copy text data @@ -51,3 +73,60 @@ export const Obj2Query = (obj: Record) => { } return queryParams.toString(); }; + +/* 格式化 chat 聊天内容 */ +function getChatGPTEncodingText( + messages: { role: 'system' | 'user' | 'assistant'; content: string; name?: string }[], + model: 'gpt-3.5-turbo' | 'gpt-4' | 'gpt-4-32k' +) { + const isGpt3 = model === 'gpt-3.5-turbo'; + + const msgSep = isGpt3 ? '\n' : ''; + const roleSep = isGpt3 ? '\n' : '<|im_sep|>'; + + return [ + messages + .map(({ name = '', role, content }) => { + return `<|im_start|>${name || role}${roleSep}${content}<|im_end|>`; + }) + .join(msgSep), + `<|im_start|>assistant${roleSep}` + ].join(msgSep); +} +function text2TokensLen(encoder: Tiktoken, inputText: string) { + const encoding = encoder.encode(inputText, 'all'); + const segments: { text: string; tokens: { id: number; idx: number }[] }[] = []; + + let byteAcc: number[] = []; + let tokenAcc: { id: number; idx: number }[] = []; + let inputGraphemes = graphemer.splitGraphemes(inputText); + + for (let idx = 0; idx < encoding.length; idx++) { + const token = encoding[idx]!; + byteAcc.push(...encoder.decode_single_token_bytes(token)); + tokenAcc.push({ id: token, idx }); + + const segmentText = textDecoder.decode(new Uint8Array(byteAcc)); + const graphemes = graphemer.splitGraphemes(segmentText); + + if (graphemes.every((item, idx) => inputGraphemes[idx] === item)) { + segments.push({ text: segmentText, tokens: tokenAcc }); + + byteAcc = []; + tokenAcc = []; + inputGraphemes = inputGraphemes.slice(graphemes.length); + } + } + + return segments.reduce((memo, i) => memo + i.tokens.length, 0) ?? 0; +} +export const countChatTokens = ({ + model = 'gpt-3.5-turbo', + messages +}: { + model?: 'gpt-4' | 'gpt-4-32k' | 'gpt-3.5-turbo'; + messages: { role: 'system' | 'user' | 'assistant'; content: string }[]; +}) => { + const text = getChatGPTEncodingText(messages, model); + return text2TokensLen(encMap[model], text); +};
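
A minimal usage sketch for the new countChatTokens helper introduced above; the import path follows the '@/utils/tools' module this patch extends, and the model name and messages are illustrative assumptions:

    import { countChatTokens } from '@/utils/tools';

    // Count tokens for a ChatML-formatted prompt, e.g. to estimate the cost
    // of a QA split task before submitting it (model and messages are
    // example values, not from the patch).
    const tokens = countChatTokens({
      model: 'gpt-3.5-turbo', // optional; defaults to 'gpt-3.5-turbo'
      messages: [
        { role: 'system', content: 'You are a helpful assistant.' },
        { role: 'user', content: 'Hello!' }
      ]
    });
    console.log(`prompt uses ~${tokens} tokens`);

Note that encMap builds each Tiktoken encoder once at module load with the ChatML special tokens (<|im_start|>, <|im_end|>, <|im_sep|>) registered, which is why encode(inputText, 'all') inside text2TokensLen can tokenize the markup produced by getChatGPTEncodingText.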