feat: use Tiktokenizer to count tokens

This commit is contained in:
archer
2023-04-24 15:18:05 +08:00
parent 29c95d24ae
commit adbaa8b37b
8 changed files with 149 additions and 32 deletions

View File

@@ -3,6 +3,7 @@ AXIOS_PROXY_HOST=127.0.0.1
AXIOS_PROXY_PORT_FAST=7890 AXIOS_PROXY_PORT_FAST=7890
AXIOS_PROXY_PORT_NORMAL=7890 AXIOS_PROXY_PORT_NORMAL=7890
queueTask=1 queueTask=1
parentUrl=https://hostname/api/openapi/startEvents
# email # email
MY_MAIL=xxx@qq.com MY_MAIL=xxx@qq.com
MAILE_CODE=xxx MAILE_CODE=xxx

View File

@@ -14,6 +14,7 @@ AXIOS_PROXY_HOST=127.0.0.1
AXIOS_PROXY_PORT_FAST=7890 AXIOS_PROXY_PORT_FAST=7890
AXIOS_PROXY_PORT_NORMAL=7890 AXIOS_PROXY_PORT_NORMAL=7890
queueTask=1 queueTask=1
parentUrl=https://hostname/api/openapi/startEvents
# email参考 nodeMail 获取参数 # email参考 nodeMail 获取参数
MY_MAIL=xxx@qq.com MY_MAIL=xxx@qq.com
MAILE_CODE=xxx MAILE_CODE=xxx
@@ -196,7 +197,7 @@ services:
- TOKEN_KEY=xxxx - TOKEN_KEY=xxxx
# 是否开启队列任务。 1-开启0-关闭请求parentUrl去执行任务,单机时直接填1 # 是否开启队列任务。 1-开启0-关闭请求parentUrl去执行任务,单机时直接填1
- queueTask=1 - queueTask=1
- parentUrl=https://fastgpt.run/api/openapi/startEvents - parentUrl=https://hostname/api/openapi/startEvents
# db # db
- MONGODB_URI=mongodb://username:passsword@0.0.0.0:27017/?authSource=admin - MONGODB_URI=mongodb://username:passsword@0.0.0.0:27017/?authSource=admin
- MONGODB_NAME=xxx - MONGODB_NAME=xxx

View File

@@ -4,7 +4,12 @@ const nextConfig = {
output: 'standalone', output: 'standalone',
reactStrictMode: true, reactStrictMode: true,
compress: true, compress: true,
webpack(config) { webpack(config) {
config.experiments = {
asyncWebAssembly: true,
layers: true
};
config.module.rules = config.module.rules.concat([ config.module.rules = config.module.rules.concat([
{ {
test: /\.svg$/i, test: /\.svg$/i,

View File

@@ -17,6 +17,7 @@
"@chakra-ui/icons": "^2.0.17", "@chakra-ui/icons": "^2.0.17",
"@chakra-ui/react": "^2.5.1", "@chakra-ui/react": "^2.5.1",
"@chakra-ui/system": "^2.5.5", "@chakra-ui/system": "^2.5.5",
"@dqbd/tiktoken": "^1.0.6",
"@emotion/react": "^11.10.6", "@emotion/react": "^11.10.6",
"@emotion/styled": "^11.10.6", "@emotion/styled": "^11.10.6",
"@next/font": "13.1.6", "@next/font": "13.1.6",
@@ -29,6 +30,7 @@
"formidable": "^2.1.1", "formidable": "^2.1.1",
"framer-motion": "^9.0.6", "framer-motion": "^9.0.6",
"gpt-token-utils": "^1.2.0", "gpt-token-utils": "^1.2.0",
"graphemer": "^1.4.0",
"hyperdown": "^2.4.29", "hyperdown": "^2.4.29",
"immer": "^9.0.19", "immer": "^9.0.19",
"jsonwebtoken": "^9.0.0", "jsonwebtoken": "^9.0.0",

16
pnpm-lock.yaml generated
View File

@@ -7,6 +7,7 @@ specifiers:
'@chakra-ui/icons': ^2.0.17 '@chakra-ui/icons': ^2.0.17
'@chakra-ui/react': ^2.5.1 '@chakra-ui/react': ^2.5.1
'@chakra-ui/system': ^2.5.5 '@chakra-ui/system': ^2.5.5
'@dqbd/tiktoken': ^1.0.6
'@emotion/react': ^11.10.6 '@emotion/react': ^11.10.6
'@emotion/styled': ^11.10.6 '@emotion/styled': ^11.10.6
'@next/font': 13.1.6 '@next/font': 13.1.6
@@ -33,6 +34,7 @@ specifiers:
formidable: ^2.1.1 formidable: ^2.1.1
framer-motion: ^9.0.6 framer-motion: ^9.0.6
gpt-token-utils: ^1.2.0 gpt-token-utils: ^1.2.0
graphemer: ^1.4.0
husky: ^8.0.3 husky: ^8.0.3
hyperdown: ^2.4.29 hyperdown: ^2.4.29
immer: ^9.0.19 immer: ^9.0.19
@@ -72,6 +74,7 @@ dependencies:
'@chakra-ui/icons': registry.npmmirror.com/@chakra-ui/icons/2.0.17_lze4h7kxffpjhokvtqbtrlfkmq '@chakra-ui/icons': registry.npmmirror.com/@chakra-ui/icons/2.0.17_lze4h7kxffpjhokvtqbtrlfkmq
'@chakra-ui/react': registry.npmmirror.com/@chakra-ui/react/2.5.1_e6pzu3hsaqmql4fl7jx73ckiym '@chakra-ui/react': registry.npmmirror.com/@chakra-ui/react/2.5.1_e6pzu3hsaqmql4fl7jx73ckiym
'@chakra-ui/system': registry.npmmirror.com/@chakra-ui/system/2.5.5_xqp3pgpqjlfxxa3zxu4zoc4fba '@chakra-ui/system': registry.npmmirror.com/@chakra-ui/system/2.5.5_xqp3pgpqjlfxxa3zxu4zoc4fba
'@dqbd/tiktoken': registry.npmmirror.com/@dqbd/tiktoken/1.0.6
'@emotion/react': registry.npmmirror.com/@emotion/react/11.10.6_pmekkgnqduwlme35zpnqhenc34 '@emotion/react': registry.npmmirror.com/@emotion/react/11.10.6_pmekkgnqduwlme35zpnqhenc34
'@emotion/styled': registry.npmmirror.com/@emotion/styled/11.10.6_oouaibmszuch5k64ms7uxp2aia '@emotion/styled': registry.npmmirror.com/@emotion/styled/11.10.6_oouaibmszuch5k64ms7uxp2aia
'@next/font': registry.npmmirror.com/@next/font/13.1.6 '@next/font': registry.npmmirror.com/@next/font/13.1.6
@@ -84,6 +87,7 @@ dependencies:
formidable: registry.npmmirror.com/formidable/2.1.1 formidable: registry.npmmirror.com/formidable/2.1.1
framer-motion: registry.npmmirror.com/framer-motion/9.0.6_biqbaboplfbrettd7655fr4n2y framer-motion: registry.npmmirror.com/framer-motion/9.0.6_biqbaboplfbrettd7655fr4n2y
gpt-token-utils: registry.npmmirror.com/gpt-token-utils/1.2.0 gpt-token-utils: registry.npmmirror.com/gpt-token-utils/1.2.0
graphemer: registry.npmmirror.com/graphemer/1.4.0
hyperdown: registry.npmmirror.com/hyperdown/2.4.29 hyperdown: registry.npmmirror.com/hyperdown/2.4.29
immer: registry.npmmirror.com/immer/9.0.19 immer: registry.npmmirror.com/immer/9.0.19
jsonwebtoken: registry.npmmirror.com/jsonwebtoken/9.0.0 jsonwebtoken: registry.npmmirror.com/jsonwebtoken/9.0.0
@@ -4176,6 +4180,12 @@ packages:
react: registry.npmmirror.com/react/18.2.0 react: registry.npmmirror.com/react/18.2.0
dev: false dev: false
registry.npmmirror.com/@dqbd/tiktoken/1.0.6:
resolution: {integrity: sha512-umSdeZTy/SbPPKVuZKV/XKyFPmXSN145CcM3iHjBbmhlohBJg7vaDp4cPCW+xNlWL6L2U1sp7T2BD+di2sUKdA==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@dqbd/tiktoken/-/tiktoken-1.0.6.tgz}
name: '@dqbd/tiktoken'
version: 1.0.6
dev: false
registry.npmmirror.com/@emotion/babel-plugin/11.10.6: registry.npmmirror.com/@emotion/babel-plugin/11.10.6:
resolution: {integrity: sha512-p2dAqtVrkhSa7xz1u/m9eHYdLi+en8NowrmXeF/dKtJpU8lCWli8RUAati7NcSl0afsBott48pdnANuD0wh9QQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@emotion/babel-plugin/-/babel-plugin-11.10.6.tgz} resolution: {integrity: sha512-p2dAqtVrkhSa7xz1u/m9eHYdLi+en8NowrmXeF/dKtJpU8lCWli8RUAati7NcSl0afsBott48pdnANuD0wh9QQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@emotion/babel-plugin/-/babel-plugin-11.10.6.tgz}
name: '@emotion/babel-plugin' name: '@emotion/babel-plugin'
@@ -7675,6 +7685,12 @@ packages:
version: 1.0.4 version: 1.0.4
dev: true dev: true
registry.npmmirror.com/graphemer/1.4.0:
resolution: {integrity: sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/graphemer/-/graphemer-1.4.0.tgz}
name: graphemer
version: 1.4.0
dev: false
registry.npmmirror.com/has-bigints/1.0.2: registry.npmmirror.com/has-bigints/1.0.2:
resolution: {integrity: sha512-tSvCKtBr9lkF0Ex0aQiP9N+OpV4zi2r/Nee5VkRDbaqv35RLYMzbwQfFSZZH0kR+Rd6302UJZ2p/bJCEoR3VoQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/has-bigints/-/has-bigints-1.0.2.tgz} resolution: {integrity: sha512-tSvCKtBr9lkF0Ex0aQiP9N+OpV4zi2r/Nee5VkRDbaqv35RLYMzbwQfFSZZH0kR+Rd6302UJZ2p/bJCEoR3VoQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/has-bigints/-/has-bigints-1.0.2.tgz}
name: has-bigints name: has-bigints

View File

@@ -14,7 +14,6 @@ import {
} from '@chakra-ui/react'; } from '@chakra-ui/react';
import { useToast } from '@/hooks/useToast'; import { useToast } from '@/hooks/useToast';
import { useSelectFile } from '@/hooks/useSelectFile'; import { useSelectFile } from '@/hooks/useSelectFile';
import { encode } from 'gpt-token-utils';
import { useConfirm } from '@/hooks/useConfirm'; import { useConfirm } from '@/hooks/useConfirm';
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file'; import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
import { useMutation } from '@tanstack/react-query'; import { useMutation } from '@tanstack/react-query';
@@ -22,6 +21,7 @@ import { postModelDataSplitData } from '@/api/model';
import { formatPrice } from '@/utils/user'; import { formatPrice } from '@/utils/user';
import Radio from '@/components/Radio'; import Radio from '@/components/Radio';
import { splitText } from '@/utils/file'; import { splitText } from '@/utils/file';
import { countChatTokens } from '@/utils/tools';
const fileExtension = '.txt,.doc,.docx,.pdf,.md'; const fileExtension = '.txt,.doc,.docx,.pdf,.md';
@@ -29,11 +29,11 @@ const modeMap = {
qa: { qa: {
maxLen: 2800, maxLen: 2800,
slideLen: 800, slideLen: 800,
price: 3, price: 4,
isPrompt: true isPrompt: true
}, },
subsection: { subsection: {
maxLen: 1000, maxLen: 800,
slideLen: 300, slideLen: 300,
price: 0.4, price: 0.4,
isPrompt: false isPrompt: false
@@ -55,19 +55,19 @@ const SelectFileModal = ({
const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true }); const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true });
const [mode, setMode] = useState<'qa' | 'subsection'>('qa'); const [mode, setMode] = useState<'qa' | 'subsection'>('qa');
const [fileTextArr, setFileTextArr] = useState<string[]>(['']); const [fileTextArr, setFileTextArr] = useState<string[]>(['']);
const [splitRes, setSplitRes] = useState<{ tokens: number; chunks: string[] }>({
tokens: 0,
chunks: []
});
const { openConfirm, ConfirmChild } = useConfirm({ const { openConfirm, ConfirmChild } = useConfirm({
content: '确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,任务讲被终止。' content: `确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,未完成的任务会被直接清除。一共 ${
splitRes.chunks.length
} 组,大约 ${splitRes.tokens} 个tokens, 约 ${formatPrice(
splitRes.tokens * modeMap[mode].price
)}`
}); });
const fileText = useMemo(() => { const fileText = useMemo(() => fileTextArr.join(''), [fileTextArr]);
const chunks = fileTextArr.map((item) =>
splitText({
text: item,
...modeMap[mode]
})
);
return chunks.join('');
}, [fileTextArr, mode]);
const onSelectFile = useCallback( const onSelectFile = useCallback(
async (e: File[]) => { async (e: File[]) => {
@@ -106,18 +106,11 @@ const SelectFileModal = ({
const { mutate, isLoading } = useMutation({ const { mutate, isLoading } = useMutation({
mutationFn: async () => { mutationFn: async () => {
if (!fileText) return; if (splitRes.chunks.length === 0) return;
const chunks = fileTextArr
.map((item) =>
splitText({
text: item,
...modeMap[mode]
})
)
.flat();
await postModelDataSplitData({ await postModelDataSplitData({
modelId, modelId,
chunks, chunks: splitRes.chunks,
prompt: `下面是"${prompt || '一段长文本'}"`, prompt: `下面是"${prompt || '一段长文本'}"`,
mode mode
}); });
@@ -136,6 +129,28 @@ const SelectFileModal = ({
} }
}); });
const onclickImport = useCallback(() => {
const chunks = fileTextArr
.map((item) =>
splitText({
text: item,
...modeMap[mode]
})
)
.flat();
// count tokens
const tokens = chunks.map((item) =>
countChatTokens({ messages: [{ role: 'system', content: item }] })
);
setSplitRes({
tokens: tokens.reduce((sum, item) => sum + item, 0),
chunks
});
openConfirm(mutate)();
}, [fileTextArr, mode, mutate, openConfirm]);
return ( return (
<Modal isOpen={true} onClose={onClose} isCentered> <Modal isOpen={true} onClose={onClose} isCentered>
<ModalOverlay /> <ModalOverlay />
@@ -152,10 +167,9 @@ const SelectFileModal = ({
justifyContent={'center'} justifyContent={'center'}
fontSize={'sm'} fontSize={'sm'}
> >
<Box mt={2} px={4} maxW={['100%']} textAlign={'justify'} color={'blackAlpha.600'}> <Box mt={2} px={5} maxW={['100%', '70%']} textAlign={'justify'} color={'blackAlpha.600'}>
{fileExtension} QA {fileExtension} QA
tokens {encode(fileText).length}{' '} tokens
tokens {formatPrice(encode(fileText).length * modeMap[mode].price)}
</Box> </Box>
{/* 拆分模式 */} {/* 拆分模式 */}
<Flex w={'100%'} px={5} alignItems={'center'} mt={4}> <Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
@@ -217,7 +231,7 @@ const SelectFileModal = ({
<Button variant={'outline'} colorScheme={'gray'} mr={3} onClick={onClose}> <Button variant={'outline'} colorScheme={'gray'} mr={3} onClick={onClose}>
</Button> </Button>
<Button isLoading={isLoading} isDisabled={fileText === ''} onClick={openConfirm(mutate)}> <Button isLoading={isLoading} isDisabled={fileText === ''} onClick={onclickImport}>
</Button> </Button>
</Flex> </Flex>

View File

@@ -1,6 +1,6 @@
import mammoth from 'mammoth'; import mammoth from 'mammoth';
import Papa from 'papaparse'; import Papa from 'papaparse';
import { encode } from 'gpt-token-utils'; import { countChatTokens } from './tools';
/** /**
* 读取 txt 文件内容 * 读取 txt 文件内容
@@ -164,7 +164,7 @@ export const splitText = ({
const chunks: { sum: number; arr: string[] }[] = [{ sum: 0, arr: [] }]; const chunks: { sum: number; arr: string[] }[] = [{ sum: 0, arr: [] }];
for (let i = 0; i < textArr.length; i++) { for (let i = 0; i < textArr.length; i++) {
const tokenLen = encode(textArr[i]).length; const tokenLen = countChatTokens({ messages: [{ role: 'system', content: textArr[i] }] });
chunks[chunks.length - 1].sum += tokenLen; chunks[chunks.length - 1].sum += tokenLen;
chunks[chunks.length - 1].arr.push(textArr[i]); chunks[chunks.length - 1].arr.push(textArr[i]);
@@ -174,7 +174,7 @@ export const splitText = ({
const chunk: { sum: number; arr: string[] } = { sum: 0, arr: [] }; const chunk: { sum: number; arr: string[] } = { sum: 0, arr: [] };
for (let j = chunks[chunks.length - 1].arr.length - 1; j >= 0; j--) { for (let j = chunks[chunks.length - 1].arr.length - 1; j >= 0; j--) {
const chunkText = chunks[chunks.length - 1].arr[j]; const chunkText = chunks[chunks.length - 1].arr[j];
const tokenLen = encode(chunkText).length; const tokenLen = countChatTokens({ messages: [{ role: 'system', content: chunkText }] });
chunk.sum += tokenLen; chunk.sum += tokenLen;
chunk.arr.unshift(chunkText); chunk.arr.unshift(chunkText);
@@ -185,7 +185,6 @@ export const splitText = ({
chunks.push(chunk); chunks.push(chunk);
} }
} }
const result = chunks.map((item) => item.arr.join('')); const result = chunks.map((item) => item.arr.join(''));
return result; return result;
}; };

View File

@@ -1,5 +1,27 @@
import crypto from 'crypto'; import crypto from 'crypto';
import { useToast } from '@/hooks/useToast'; import { useToast } from '@/hooks/useToast';
import { encoding_for_model, type Tiktoken } from '@dqbd/tiktoken';
import Graphemer from 'graphemer';
// Shared UTF-8 decoder and grapheme splitter, reused by every token count.
const textDecoder = new TextDecoder();
const graphemer = new Graphemer();

// ChatML marker tokens shared by every supported model. The original code
// repeated this literal three times; it is a single constant now so the ids
// cannot drift apart between models.
const chatSpecialTokens = {
  '<|im_start|>': 100264,
  '<|im_end|>': 100265,
  '<|im_sep|>': 100266
};

// One encoder instance per supported model, created once at module load and
// reused for all subsequent counts.
const encMap = {
  'gpt-3.5-turbo': encoding_for_model('gpt-3.5-turbo', chatSpecialTokens),
  'gpt-4': encoding_for_model('gpt-4', chatSpecialTokens),
  'gpt-4-32k': encoding_for_model('gpt-4-32k', chatSpecialTokens)
};
/** /**
* copy text data * copy text data
@@ -51,3 +73,60 @@ export const Obj2Query = (obj: Record<string, string | number>) => {
} }
return queryParams.toString(); return queryParams.toString();
}; };
/**
 * Serialize chat messages into the raw text that is tokenized for billing.
 *
 * Each message becomes `<|im_start|>{name-or-role}{roleSep}{content}<|im_end|>`,
 * messages are joined with a model-specific separator, and a trailing
 * `<|im_start|>assistant{roleSep}` primer is appended (the point where the
 * model's reply would begin).
 */
function getChatGPTEncodingText(
  messages: { role: 'system' | 'user' | 'assistant'; content: string; name?: string }[],
  model: 'gpt-3.5-turbo' | 'gpt-4' | 'gpt-4-32k'
) {
  // gpt-3.5-turbo separates with plain newlines; the gpt-4 family uses the
  // dedicated <|im_sep|> marker and no separator between messages.
  const roleSep = model === 'gpt-3.5-turbo' ? '\n' : '<|im_sep|>';
  const msgSep = model === 'gpt-3.5-turbo' ? '\n' : '';

  let history = '';
  for (let i = 0; i < messages.length; i++) {
    const { name = '', role, content } = messages[i];
    if (i > 0) history += msgSep;
    // An explicit message name takes precedence over the role label.
    history += `<|im_start|>${name || role}${roleSep}${content}<|im_end|>`;
  }
  return history + msgSep + `<|im_start|>assistant${roleSep}`;
}
/**
 * Count how many tokens `inputText` occupies under `encoder`.
 *
 * Tokens are grouped into grapheme-aligned segments: decoded bytes are
 * accumulated token by token until the accumulated text lines up exactly with
 * the input's grapheme boundaries, at which point the pending tokens are
 * flushed as one segment. The result is the total token count across all
 * completed segments.
 *
 * NOTE(review): trailing tokens whose bytes never complete a grapheme-aligned
 * match are never flushed and so are excluded from the count — confirm this
 * is intended rather than simply using `encoding.length`.
 */
function text2TokensLen(encoder: Tiktoken, inputText: string) {
  // 'all' allows special markers (<|im_start|> etc.) to encode as single tokens.
  const encoding = encoder.encode(inputText, 'all');
  const segments: { text: string; tokens: { id: number; idx: number }[] }[] = [];
  // Bytes and tokens accumulated since the last grapheme-aligned flush.
  let byteAcc: number[] = [];
  let tokenAcc: { id: number; idx: number }[] = [];
  // Graphemes of the input not yet consumed by a completed segment.
  let inputGraphemes = graphemer.splitGraphemes(inputText);
  for (let idx = 0; idx < encoding.length; idx++) {
    const token = encoding[idx]!;
    byteAcc.push(...encoder.decode_single_token_bytes(token));
    tokenAcc.push({ id: token, idx });
    // A token may end mid-codepoint, so decode everything accumulated so far
    // and only flush once it matches whole graphemes of the remaining input.
    const segmentText = textDecoder.decode(new Uint8Array(byteAcc));
    const graphemes = graphemer.splitGraphemes(segmentText);
    if (graphemes.every((item, idx) => inputGraphemes[idx] === item)) {
      segments.push({ text: segmentText, tokens: tokenAcc });
      byteAcc = [];
      tokenAcc = [];
      inputGraphemes = inputGraphemes.slice(graphemes.length);
    }
  }
  // `?? 0` is redundant: reduce with an initial value is never nullish.
  return segments.reduce((memo, i) => memo + i.tokens.length, 0) ?? 0;
}
/**
 * Count the tokens a chat completion request's messages would consume.
 *
 * Serializes the messages with the model-specific ChatML markers, then
 * measures the result with the encoder cached for that model in `encMap`.
 * Defaults to 'gpt-3.5-turbo' when no model is given.
 */
export const countChatTokens = ({
  model = 'gpt-3.5-turbo',
  messages
}: {
  model?: 'gpt-4' | 'gpt-4-32k' | 'gpt-3.5-turbo';
  messages: { role: 'system' | 'user' | 'assistant'; content: string }[];
}) => {
  const serialized = getChatGPTEncodingText(messages, model);
  const encoder = encMap[model];
  return text2TokensLen(encoder, serialized);
};