Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 21:13:50 +00:00)

Commit: feat: use Tiktokenizer to count tokens
Environment config samples: a parentUrl entry is added (two occurrences).

@@ -3,6 +3,7 @@ AXIOS_PROXY_HOST=127.0.0.1
 AXIOS_PROXY_PORT_FAST=7890
 AXIOS_PROXY_PORT_NORMAL=7890
 queueTask=1
+parentUrl=https://hostname/api/openapi/startEvents
 # email
 MY_MAIL=xxx@qq.com
 MAILE_CODE=xxx

@@ -14,6 +14,7 @@ AXIOS_PROXY_HOST=127.0.0.1
 AXIOS_PROXY_PORT_FAST=7890
 AXIOS_PROXY_PORT_NORMAL=7890
 queueTask=1
+parentUrl=https://hostname/api/openapi/startEvents
 # email,参考 nodeMail 获取参数
 MY_MAIL=xxx@qq.com
 MAILE_CODE=xxx
docker-compose services: the example parentUrl now points at a placeholder hostname instead of fastgpt.run.

@@ -196,7 +197,7 @@ services:
       - TOKEN_KEY=xxxx
       # 是否开启队列任务。 1-开启,0-关闭(请求parentUrl去执行任务,单机时直接填1)
       - queueTask=1
-      - parentUrl=https://fastgpt.run/api/openapi/startEvents
+      - parentUrl=https://hostname/api/openapi/startEvents
       # db
       - MONGODB_URI=mongodb://username:passsword@0.0.0.0:27017/?authSource=admin
       - MONGODB_NAME=xxx
Next.js config (const nextConfig): webpack's WebAssembly support is enabled.

@@ -4,7 +4,12 @@ const nextConfig = {
   output: 'standalone',
   reactStrictMode: true,
   compress: true,
+
   webpack(config) {
+    config.experiments = {
+      asyncWebAssembly: true,
+      layers: true
+    };
     config.module.rules = config.module.rules.concat([
       {
         test: /\.svg$/i,
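The asyncWebAssembly experiment is enabled here because @dqbd/tiktoken (added below) ships its BPE encoder as a WebAssembly module that Next.js must be able to bundle. As a hedged sketch of what the dependency provides, not part of this commit:

// Illustrative only — assumes @dqbd/tiktoken@^1.0.6 is installed.
import { encoding_for_model } from '@dqbd/tiktoken';

const enc = encoding_for_model('gpt-3.5-turbo');
const tokens = enc.encode('how many tokens is this sentence?'); // Uint32Array of token ids
console.log(tokens.length); // the token count used for pricing estimates
enc.free(); // release the WASM-side memory when done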
Package manifest: two new dependencies, @dqbd/tiktoken (the WASM tokenizer) and graphemer (a grapheme-cluster splitter).

@@ -17,6 +17,7 @@
     "@chakra-ui/icons": "^2.0.17",
     "@chakra-ui/react": "^2.5.1",
     "@chakra-ui/system": "^2.5.5",
+    "@dqbd/tiktoken": "^1.0.6",
     "@emotion/react": "^11.10.6",
     "@emotion/styled": "^11.10.6",
     "@next/font": "13.1.6",
@@ -29,6 +30,7 @@
     "formidable": "^2.1.1",
     "framer-motion": "^9.0.6",
     "gpt-token-utils": "^1.2.0",
+    "graphemer": "^1.4.0",
     "hyperdown": "^2.4.29",
     "immer": "^9.0.19",
     "jsonwebtoken": "^9.0.0",
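graphemer splits a string into user-perceived characters (grapheme clusters). It is used further down, in text2TokensLen, to decide when an accumulated run of token bytes decodes to complete characters, so multi-byte text such as Chinese or emoji is not counted mid-sequence. A hedged illustration of the API, not from this commit:

// Illustrative only. Graphemer groups code points into grapheme clusters.
import Graphemer from 'graphemer';

const splitter = new Graphemer();
splitter.splitGraphemes('e\u0301');     // ['é'] — base letter + combining accent stay together
splitter.splitGraphemes('你好').length; // 2 — one cluster per character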
pnpm-lock.yaml (generated, 16 changed lines):
@@ -7,6 +7,7 @@ specifiers:
   '@chakra-ui/icons': ^2.0.17
   '@chakra-ui/react': ^2.5.1
   '@chakra-ui/system': ^2.5.5
+  '@dqbd/tiktoken': ^1.0.6
   '@emotion/react': ^11.10.6
   '@emotion/styled': ^11.10.6
   '@next/font': 13.1.6
@@ -33,6 +34,7 @@ specifiers:
   formidable: ^2.1.1
   framer-motion: ^9.0.6
   gpt-token-utils: ^1.2.0
+  graphemer: ^1.4.0
   husky: ^8.0.3
   hyperdown: ^2.4.29
   immer: ^9.0.19
@@ -72,6 +74,7 @@ dependencies:
   '@chakra-ui/icons': registry.npmmirror.com/@chakra-ui/icons/2.0.17_lze4h7kxffpjhokvtqbtrlfkmq
   '@chakra-ui/react': registry.npmmirror.com/@chakra-ui/react/2.5.1_e6pzu3hsaqmql4fl7jx73ckiym
   '@chakra-ui/system': registry.npmmirror.com/@chakra-ui/system/2.5.5_xqp3pgpqjlfxxa3zxu4zoc4fba
+  '@dqbd/tiktoken': registry.npmmirror.com/@dqbd/tiktoken/1.0.6
   '@emotion/react': registry.npmmirror.com/@emotion/react/11.10.6_pmekkgnqduwlme35zpnqhenc34
   '@emotion/styled': registry.npmmirror.com/@emotion/styled/11.10.6_oouaibmszuch5k64ms7uxp2aia
   '@next/font': registry.npmmirror.com/@next/font/13.1.6
@@ -84,6 +87,7 @@ dependencies:
   formidable: registry.npmmirror.com/formidable/2.1.1
   framer-motion: registry.npmmirror.com/framer-motion/9.0.6_biqbaboplfbrettd7655fr4n2y
   gpt-token-utils: registry.npmmirror.com/gpt-token-utils/1.2.0
+  graphemer: registry.npmmirror.com/graphemer/1.4.0
   hyperdown: registry.npmmirror.com/hyperdown/2.4.29
   immer: registry.npmmirror.com/immer/9.0.19
   jsonwebtoken: registry.npmmirror.com/jsonwebtoken/9.0.0
@@ -4176,6 +4180,12 @@ packages:
       react: registry.npmmirror.com/react/18.2.0
     dev: false

+  registry.npmmirror.com/@dqbd/tiktoken/1.0.6:
+    resolution: {integrity: sha512-umSdeZTy/SbPPKVuZKV/XKyFPmXSN145CcM3iHjBbmhlohBJg7vaDp4cPCW+xNlWL6L2U1sp7T2BD+di2sUKdA==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@dqbd/tiktoken/-/tiktoken-1.0.6.tgz}
+    name: '@dqbd/tiktoken'
+    version: 1.0.6
+    dev: false
+
   registry.npmmirror.com/@emotion/babel-plugin/11.10.6:
     resolution: {integrity: sha512-p2dAqtVrkhSa7xz1u/m9eHYdLi+en8NowrmXeF/dKtJpU8lCWli8RUAati7NcSl0afsBott48pdnANuD0wh9QQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@emotion/babel-plugin/-/babel-plugin-11.10.6.tgz}
     name: '@emotion/babel-plugin'
@@ -7675,6 +7685,12 @@ packages:
     version: 1.0.4
     dev: true

+  registry.npmmirror.com/graphemer/1.4.0:
+    resolution: {integrity: sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/graphemer/-/graphemer-1.4.0.tgz}
+    name: graphemer
+    version: 1.4.0
+    dev: false
+
   registry.npmmirror.com/has-bigints/1.0.2:
     resolution: {integrity: sha512-tSvCKtBr9lkF0Ex0aQiP9N+OpV4zi2r/Nee5VkRDbaqv35RLYMzbwQfFSZZH0kR+Rd6302UJZ2p/bJCEoR3VoQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/has-bigints/-/has-bigints-1.0.2.tgz}
     name: has-bigints
SelectFileModal component: the gpt-token-utils encoder is replaced by the new countChatTokens helper.

@@ -14,7 +14,6 @@ import {
 } from '@chakra-ui/react';
 import { useToast } from '@/hooks/useToast';
 import { useSelectFile } from '@/hooks/useSelectFile';
-import { encode } from 'gpt-token-utils';
 import { useConfirm } from '@/hooks/useConfirm';
 import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
 import { useMutation } from '@tanstack/react-query';
@@ -22,6 +21,7 @@ import { postModelDataSplitData } from '@/api/model';
 import { formatPrice } from '@/utils/user';
 import Radio from '@/components/Radio';
 import { splitText } from '@/utils/file';
+import { countChatTokens } from '@/utils/tools';

 const fileExtension = '.txt,.doc,.docx,.pdf,.md';

@@ -29,11 +29,11 @@ const modeMap = {
   qa: {
     maxLen: 2800,
     slideLen: 800,
-    price: 3,
+    price: 4,
     isPrompt: true
   },
   subsection: {
-    maxLen: 1000,
+    maxLen: 800,
     slideLen: 300,
     price: 0.4,
     isPrompt: false
@@ -55,19 +55,19 @@ const SelectFileModal = ({
   const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true });
   const [mode, setMode] = useState<'qa' | 'subsection'>('qa');
   const [fileTextArr, setFileTextArr] = useState<string[]>(['']);
+  const [splitRes, setSplitRes] = useState<{ tokens: number; chunks: string[] }>({
+    tokens: 0,
+    chunks: []
+  });
   const { openConfirm, ConfirmChild } = useConfirm({
-    content: '确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,任务讲被终止。'
+    content: `确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,未完成的任务会被直接清除。一共 ${
+      splitRes.chunks.length
+    } 组,大约 ${splitRes.tokens} 个tokens, 约 ${formatPrice(
+      splitRes.tokens * modeMap[mode].price
+    )} 元`
   });

-  const fileText = useMemo(() => {
-    const chunks = fileTextArr.map((item) =>
-      splitText({
-        text: item,
-        ...modeMap[mode]
-      })
-    );
-    return chunks.join('');
-  }, [fileTextArr, mode]);
+  const fileText = useMemo(() => fileTextArr.join(''), [fileTextArr]);

   const onSelectFile = useCallback(
     async (e: File[]) => {
@@ -106,18 +106,11 @@ const SelectFileModal = ({

   const { mutate, isLoading } = useMutation({
     mutationFn: async () => {
-      if (!fileText) return;
-      const chunks = fileTextArr
-        .map((item) =>
-          splitText({
-            text: item,
-            ...modeMap[mode]
-          })
-        )
-        .flat();
+      if (splitRes.chunks.length === 0) return;
       await postModelDataSplitData({
         modelId,
-        chunks,
+        chunks: splitRes.chunks,
         prompt: `下面是"${prompt || '一段长文本'}"`,
         mode
       });
@@ -136,6 +129,28 @@ const SelectFileModal = ({
       }
     });

+  const onclickImport = useCallback(() => {
+    const chunks = fileTextArr
+      .map((item) =>
+        splitText({
+          text: item,
+          ...modeMap[mode]
+        })
+      )
+      .flat();
+    // count tokens
+    const tokens = chunks.map((item) =>
+      countChatTokens({ messages: [{ role: 'system', content: item }] })
+    );
+
+    setSplitRes({
+      tokens: tokens.reduce((sum, item) => sum + item, 0),
+      chunks
+    });
+
+    openConfirm(mutate)();
+  }, [fileTextArr, mode, mutate, openConfirm]);
+
   return (
     <Modal isOpen={true} onClose={onClose} isCentered>
       <ModalOverlay />
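With this change, splitting and token counting happen once, when the import button is clicked, instead of being recomputed from fileText on every render. Each chunk is counted as a single system message, so the estimate also includes the ChatML framing tokens that countChatTokens (added in @/utils/tools below) wraps around the content. A hedged sketch of the same computation in isolation:

// Illustrative only — mirrors what onclickImport does, outside the component.
import { countChatTokens } from '@/utils/tools';

const chunks = ['First chunk of the document…', 'Second chunk of the document…']; // hypothetical data
const tokens = chunks
  .map((chunk) => countChatTokens({ messages: [{ role: 'system', content: chunk }] }))
  .reduce((sum, n) => sum + n, 0); // total shown in the confirm dialog's cost estimate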
@@ -152,10 +167,9 @@ const SelectFileModal = ({
             justifyContent={'center'}
             fontSize={'sm'}
           >
-            <Box mt={2} px={4} maxW={['100%']} textAlign={'justify'} color={'blackAlpha.600'}>
+            <Box mt={2} px={5} maxW={['100%', '70%']} textAlign={'justify'} color={'blackAlpha.600'}>
               支持 {fileExtension} 文件。模型会自动对文本进行 QA 拆分,需要较长训练时间,拆分需要消耗
-              tokens,账号余额不足时,未拆分的数据会被删除。当前一共 {encode(fileText).length}{' '}
-              个tokens,大约 {formatPrice(encode(fileText).length * modeMap[mode].price)}元
+              tokens,账号余额不足时,未拆分的数据会被删除。
             </Box>
             {/* 拆分模式 */}
             <Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
@@ -217,7 +231,7 @@ const SelectFileModal = ({
           <Button variant={'outline'} colorScheme={'gray'} mr={3} onClick={onClose}>
             取消
           </Button>
-          <Button isLoading={isLoading} isDisabled={fileText === ''} onClick={openConfirm(mutate)}>
+          <Button isLoading={isLoading} isDisabled={fileText === ''} onClick={onclickImport}>
             确认导入
           </Button>
         </Flex>
File utilities (splitText): the per-segment token length now comes from countChatTokens.

@@ -1,6 +1,6 @@
 import mammoth from 'mammoth';
 import Papa from 'papaparse';
-import { encode } from 'gpt-token-utils';
+import { countChatTokens } from './tools';

 /**
  * 读取 txt 文件内容
@@ -164,7 +164,7 @@ export const splitText = ({
   const chunks: { sum: number; arr: string[] }[] = [{ sum: 0, arr: [] }];

   for (let i = 0; i < textArr.length; i++) {
-    const tokenLen = encode(textArr[i]).length;
+    const tokenLen = countChatTokens({ messages: [{ role: 'system', content: textArr[i] }] });
     chunks[chunks.length - 1].sum += tokenLen;
     chunks[chunks.length - 1].arr.push(textArr[i]);

@@ -174,7 +174,7 @@ export const splitText = ({
       const chunk: { sum: number; arr: string[] } = { sum: 0, arr: [] };
       for (let j = chunks[chunks.length - 1].arr.length - 1; j >= 0; j--) {
         const chunkText = chunks[chunks.length - 1].arr[j];
-        const tokenLen = encode(chunkText).length;
+        const tokenLen = countChatTokens({ messages: [{ role: 'system', content: chunkText }] });
         chunk.sum += tokenLen;
         chunk.arr.unshift(chunkText);

@@ -185,7 +185,6 @@ export const splitText = ({
       chunks.push(chunk);
     }
   }
-
   const result = chunks.map((item) => item.arr.join(''));
   return result;
 };
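splitText accumulates segments until the running token sum reaches maxLen, then starts the next chunk with roughly slideLen tokens of trailing overlap. Its full signature is not shown in this diff; the call shape below is inferred from modeMap and the call sites in SelectFileModal, so treat it as a hedged sketch only:

// Hedged usage sketch — parameter names taken from modeMap above, not from a full definition.
import { splitText } from '@/utils/file';

const longArticle = 'Some long document text…'; // hypothetical input
const chunks = splitText({
  text: longArticle,
  maxLen: 800,   // token budget per chunk (subsection mode after this commit)
  slideLen: 300, // overlap carried into the following chunk
  price: 0.4,
  isPrompt: false
}); // returns string[] — one entry per chunk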
Token-counting utilities (@/utils/tools): a tiktoken encoder is created once per model, with the ChatML markers registered as extra special tokens.

@@ -1,5 +1,27 @@
 import crypto from 'crypto';
 import { useToast } from '@/hooks/useToast';
+import { encoding_for_model, type Tiktoken } from '@dqbd/tiktoken';
+import Graphemer from 'graphemer';
+
+const textDecoder = new TextDecoder();
+const graphemer = new Graphemer();
+const encMap = {
+  'gpt-3.5-turbo': encoding_for_model('gpt-3.5-turbo', {
+    '<|im_start|>': 100264,
+    '<|im_end|>': 100265,
+    '<|im_sep|>': 100266
+  }),
+  'gpt-4': encoding_for_model('gpt-4', {
+    '<|im_start|>': 100264,
+    '<|im_end|>': 100265,
+    '<|im_sep|>': 100266
+  }),
+  'gpt-4-32k': encoding_for_model('gpt-4-32k', {
+    '<|im_start|>': 100264,
+    '<|im_end|>': 100265,
+    '<|im_sep|>': 100266
+  })
+};

 /**
  * copy text data
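Registering <|im_start|>, <|im_end|> and <|im_sep|> as extended special tokens lets the encoder emit exactly one token id per marker; without registration, tiktoken refuses to encode them as specials. A hedged illustration, not from the commit (encMap is module-local to utils/tools, so this only works inside that file):

// Illustrative only. The second argument of encode() allows special tokens;
// the code added below uses encode(text, 'all') for the same reason.
const enc = encMap['gpt-3.5-turbo'];
const ids = enc.encode('<|im_start|>user\nhi<|im_end|>', 'all');
// '<|im_start|>' and '<|im_end|>' each contribute a single id (100264 / 100265).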
@@ -51,3 +73,60 @@ export const Obj2Query = (obj: Record<string, string | number>) => {
   }
   return queryParams.toString();
 };
+
+/* 格式化 chat 聊天内容 */
+function getChatGPTEncodingText(
+  messages: { role: 'system' | 'user' | 'assistant'; content: string; name?: string }[],
+  model: 'gpt-3.5-turbo' | 'gpt-4' | 'gpt-4-32k'
+) {
+  const isGpt3 = model === 'gpt-3.5-turbo';
+
+  const msgSep = isGpt3 ? '\n' : '';
+  const roleSep = isGpt3 ? '\n' : '<|im_sep|>';
+
+  return [
+    messages
+      .map(({ name = '', role, content }) => {
+        return `<|im_start|>${name || role}${roleSep}${content}<|im_end|>`;
+      })
+      .join(msgSep),
+    `<|im_start|>assistant${roleSep}`
+  ].join(msgSep);
+}
+function text2TokensLen(encoder: Tiktoken, inputText: string) {
+  const encoding = encoder.encode(inputText, 'all');
+  const segments: { text: string; tokens: { id: number; idx: number }[] }[] = [];
+
+  let byteAcc: number[] = [];
+  let tokenAcc: { id: number; idx: number }[] = [];
+  let inputGraphemes = graphemer.splitGraphemes(inputText);
+
+  for (let idx = 0; idx < encoding.length; idx++) {
+    const token = encoding[idx]!;
+    byteAcc.push(...encoder.decode_single_token_bytes(token));
+    tokenAcc.push({ id: token, idx });
+
+    const segmentText = textDecoder.decode(new Uint8Array(byteAcc));
+    const graphemes = graphemer.splitGraphemes(segmentText);
+
+    if (graphemes.every((item, idx) => inputGraphemes[idx] === item)) {
+      segments.push({ text: segmentText, tokens: tokenAcc });
+
+      byteAcc = [];
+      tokenAcc = [];
+      inputGraphemes = inputGraphemes.slice(graphemes.length);
+    }
+  }
+
+  return segments.reduce((memo, i) => memo + i.tokens.length, 0) ?? 0;
+}
+export const countChatTokens = ({
+  model = 'gpt-3.5-turbo',
+  messages
+}: {
+  model?: 'gpt-4' | 'gpt-4-32k' | 'gpt-3.5-turbo';
+  messages: { role: 'system' | 'user' | 'assistant'; content: string }[];
+}) => {
+  const text = getChatGPTEncodingText(messages, model);
+  return text2TokensLen(encMap[model], text);
+};
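For reference, a hedged usage sketch of the new helper: messages are serialized in the ChatML-style format produced by getChatGPTEncodingText (including the trailing <|im_start|>assistant primer), and the resulting count is what the import dialog and splitText rely on.

// Illustrative only — not part of the diff.
import { countChatTokens } from '@/utils/tools';

const n = countChatTokens({
  model: 'gpt-3.5-turbo',
  messages: [
    { role: 'system', content: '你是一个助手' },          // system prompt
    { role: 'user', content: 'How many tokens is this?' } // user turn
  ]
});
// n counts the serialized conversation plus the ChatML framing tokens,
// so it is slightly larger than the raw content alone.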