diff --git a/client/package.json b/client/package.json
index 4515caca6..1042ac9c1 100644
--- a/client/package.json
+++ b/client/package.json
@@ -29,7 +29,6 @@
     "eventsource-parser": "^0.1.0",
     "formidable": "^2.1.1",
     "framer-motion": "^9.0.6",
-    "graphemer": "^1.4.0",
     "hyperdown": "^2.4.29",
     "immer": "^9.0.19",
     "jsonwebtoken": "^9.0.0",
diff --git a/client/pnpm-lock.yaml b/client/pnpm-lock.yaml
index 6ece7382d..2f6636958 100644
--- a/client/pnpm-lock.yaml
+++ b/client/pnpm-lock.yaml
@@ -65,9 +65,6 @@ dependencies:
   framer-motion:
     specifier: ^9.0.6
     version: registry.npmmirror.com/framer-motion@9.0.6(react-dom@18.2.0)(react@18.2.0)
-  graphemer:
-    specifier: ^1.4.0
-    version: registry.npmmirror.com/graphemer@1.4.0
   hyperdown:
     specifier: ^2.4.29
     version: registry.npmmirror.com/hyperdown@2.4.29
@@ -8013,12 +8010,6 @@ packages:
     version: 1.0.4
     dev: true
 
-  registry.npmmirror.com/graphemer@1.4.0:
-    resolution: {integrity: sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/graphemer/-/graphemer-1.4.0.tgz}
-    name: graphemer
-    version: 1.4.0
-    dev: false
-
   registry.npmmirror.com/has-bigints@1.0.2:
     resolution: {integrity: sha512-tSvCKtBr9lkF0Ex0aQiP9N+OpV4zi2r/Nee5VkRDbaqv35RLYMzbwQfFSZZH0kR+Rd6302UJZ2p/bJCEoR3VoQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/has-bigints/-/has-bigints-1.0.2.tgz}
     name: has-bigints
diff --git a/client/src/pages/kb/components/SelectFileModal.tsx b/client/src/pages/kb/components/SelectFileModal.tsx
index f1b7d636e..06c6c95b1 100644
--- a/client/src/pages/kb/components/SelectFileModal.tsx
+++ b/client/src/pages/kb/components/SelectFileModal.tsx
@@ -1,4 +1,4 @@
-import React, { useState, useCallback } from 'react';
+import React, { useState, useCallback, useRef } from 'react';
 import {
   Box,
   Flex,
@@ -24,24 +24,10 @@ import { TrainingModeEnum } from '@/constants/plugin';
 import { getErrText } from '@/utils/tools';
 import { ChatModelMap, OpenAiChatEnum, embeddingPrice } from '@/constants/model';
 import { formatPrice } from '@/utils/user';
+import MySlider from '@/components/Slider';
 
 const fileExtension = '.txt,.doc,.docx,.pdf,.md';
 
-const modeMap = {
-  [TrainingModeEnum.qa]: {
-    maxLen: 8000,
-    slideLen: 3000,
-    price: ChatModelMap[OpenAiChatEnum.GPT3516k].price,
-    isPrompt: true
-  },
-  [TrainingModeEnum.index]: {
-    maxLen: 1000,
-    slideLen: 500,
-    price: embeddingPrice,
-    isPrompt: false
-  }
-};
-
 const SelectFileModal = ({
   onClose,
   onSuccess,
@@ -51,6 +37,16 @@ const SelectFileModal = ({
   onSuccess: () => void;
   kbId: string;
 }) => {
+  const [modeMap, setModeMap] = useState({
+    [TrainingModeEnum.qa]: {
+      maxLen: 8000,
+      price: ChatModelMap[OpenAiChatEnum.GPT3516k].price
+    },
+    [TrainingModeEnum.index]: {
+      maxLen: 600,
+      price: embeddingPrice
+    }
+  });
   const [btnLoading, setBtnLoading] = useState(false);
   const { toast } = useToast();
   const [prompt, setPrompt] = useState('');
@@ -200,7 +196,7 @@ const SelectFileModal = ({
       });
     }
     setBtnLoading(false);
-  }, [files, mode, mutate, openConfirm, toast]);
+  }, [files, mode, modeMap, mutate, openConfirm, toast]);
 
   return (
@@ -244,19 +240,52 @@ const SelectFileModal = ({
           />
 
          {/* 内容介绍 */}
-          {modeMap[mode].isPrompt && (
-            <Flex>
-              <Box>
-                下面是
-              </Box>
-              <Input
-                value={prompt}
-                onChange={(e) => setPrompt(e.target.value)}
-                size={'sm'}
-              />
-            </Flex>
-          )}
+          <Box>
+            {mode === TrainingModeEnum.qa && (
+              <>
+                <Box>下面是</Box>
+                <Input value={prompt} onChange={(e) => setPrompt(e.target.value)} size={'sm'} />
+              </>
+            )}
+            {/* chunk size */}
+            {mode === TrainingModeEnum.index && (
+              <Flex>
+                <Box>段落长度</Box>
+                <MySlider
+                  value={modeMap[TrainingModeEnum.index].maxLen}
+                  onChange={(val) => {
+                    setModeMap((state) => ({
+                      ...state,
+                      [TrainingModeEnum.index]: {
+                        maxLen: val,
+                        price: embeddingPrice
+                      }
+                    }));
+                  }}
+                />
+              </Flex>
+            )}
+          </Box>
+
          {/* 文本内容 */}
          {files.slice(0, 100).map((item, i) => (
diff --git a/client/src/utils/file.ts b/client/src/utils/file.ts
index a20245053..c2504c082 100644
--- a/client/src/utils/file.ts
+++ b/client/src/utils/file.ts
@@ -148,15 +148,9 @@ export const fileDownload = ({
  * slideLen - The size of the before and after Text
  * maxLen > slideLen
  */
-export const splitText_token = ({
-  text,
-  maxLen,
-  slideLen
-}: {
-  text: string;
-  maxLen: number;
-  slideLen: number;
-}) => {
+export const splitText_token = ({ text, maxLen }: { text: string; maxLen: number }) => {
+  const slideLen = Math.floor(maxLen * 0.3);
+
   try {
     const enc = getOpenAiEncMap()[OpenAiChatEnum.GPT35];
     // filter empty text. encode sentence
diff --git a/client/src/utils/plugin/openai.ts b/client/src/utils/plugin/openai.ts
index 16125d6db..b39d9a3e5 100644
--- a/client/src/utils/plugin/openai.ts
+++ b/client/src/utils/plugin/openai.ts
@@ -1,68 +1,20 @@
-import { encoding_for_model, type Tiktoken } from '@dqbd/tiktoken';
+import { encoding_for_model } from '@dqbd/tiktoken';
 import type { ChatItemType } from '@/types/chat';
 import { ChatRoleEnum } from '@/constants/chat';
-import { type ChatCompletionRequestMessage, ChatCompletionRequestMessageRoleEnum } from 'openai';
+import { ChatCompletionRequestMessageRoleEnum } from 'openai';
 import { OpenAiChatEnum } from '@/constants/model';
-import Graphemer from 'graphemer';
 import axios from 'axios';
 import dayjs from 'dayjs';
 import type { MessageItemType } from '@/pages/api/openapi/v1/chat/completions';
 
-const textDecoder = new TextDecoder();
-const graphemer = new Graphemer();
-
 export const getOpenAiEncMap = () => {
-  if (typeof window !== 'undefined') {
-    window.OpenAiEncMap = window.OpenAiEncMap || {
-      [OpenAiChatEnum.GPT35]: encoding_for_model('gpt-3.5-turbo', {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-        '<|im_sep|>': 100266
-      }),
-      [OpenAiChatEnum.GPT3516k]: encoding_for_model('gpt-3.5-turbo', {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-        '<|im_sep|>': 100266
-      }),
-      [OpenAiChatEnum.GPT4]: encoding_for_model('gpt-4', {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-        '<|im_sep|>': 100266
-      }),
-      [OpenAiChatEnum.GPT432k]: encoding_for_model('gpt-4-32k', {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-        '<|im_sep|>': 100266
-      })
-    };
+  if (typeof window !== 'undefined' && window.OpenAiEncMap) {
     return window.OpenAiEncMap;
   }
-  if (typeof global !== 'undefined') {
-    global.OpenAiEncMap = global.OpenAiEncMap || {
-      [OpenAiChatEnum.GPT35]: encoding_for_model('gpt-3.5-turbo', {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-        '<|im_sep|>': 100266
-      }),
-      [OpenAiChatEnum.GPT3516k]: encoding_for_model('gpt-3.5-turbo', {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-        '<|im_sep|>': 100266
-      }),
-      [OpenAiChatEnum.GPT4]: encoding_for_model('gpt-4', {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-        '<|im_sep|>': 100266
-      }),
-      [OpenAiChatEnum.GPT432k]: encoding_for_model('gpt-4-32k', {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-        '<|im_sep|>': 100266
-      })
-    };
+  if (typeof global !== 'undefined' && global.OpenAiEncMap) {
     return global.OpenAiEncMap;
   }
-  return {
+  const enc = {
     [OpenAiChatEnum.GPT35]: encoding_for_model('gpt-3.5-turbo', {
       '<|im_start|>': 100264,
       '<|im_end|>': 100265,
       '<|im_sep|>': 100266
     }),
@@ -84,6 +36,15 @@ export const getOpenAiEncMap = () => {
       '<|im_sep|>': 100266
     })
   };
+
+  if (typeof window !== 'undefined') {
+    window.OpenAiEncMap = enc;
+  }
+  if (typeof global !== 'undefined') {
+    global.OpenAiEncMap = enc;
+  }
+
+  return enc;
 };
 
 export const adaptChatItem_openAI = ({
@@ -112,55 +73,18 @@ export function countOpenAIToken({
   messages: ChatItemType[];
   model: `${OpenAiChatEnum}`;
 }) {
-  function getChatGPTEncodingText(
-    messages: ChatCompletionRequestMessage[],
-    model: `${OpenAiChatEnum}`
-  ) {
-    const isGpt3 = model.startsWith('gpt-3.5-turbo');
-
-    const msgSep = isGpt3 ? '\n' : '';
-    const roleSep = isGpt3 ? '\n' : '<|im_sep|>';
-
-    return [
-      messages
-        .map(({ name = '', role, content }) => {
-          return `<|im_start|>${name || role}${roleSep}${content}<|im_end|>`;
-        })
-        .join(msgSep),
-      `<|im_start|>assistant${roleSep}`
-    ].join(msgSep);
-  }
-  function text2TokensLen(encoder: Tiktoken, inputText: string) {
-    const encoding = encoder.encode(inputText, 'all');
-    const segments: { text: string; tokens: { id: number; idx: number }[] }[] = [];
-
-    let byteAcc: number[] = [];
-    let tokenAcc: { id: number; idx: number }[] = [];
-    let inputGraphemes = graphemer.splitGraphemes(inputText);
-
-    for (let idx = 0; idx < encoding.length; idx++) {
-      const token = encoding[idx]!;
-      byteAcc.push(...encoder.decode_single_token_bytes(token));
-      tokenAcc.push({ id: token, idx });
-
-      const segmentText = textDecoder.decode(new Uint8Array(byteAcc));
-      const graphemes = graphemer.splitGraphemes(segmentText);
-
-      if (graphemes.every((item, idx) => inputGraphemes[idx] === item)) {
-        segments.push({ text: segmentText, tokens: tokenAcc });
-
-        byteAcc = [];
-        tokenAcc = [];
-        inputGraphemes = inputGraphemes.slice(graphemes.length);
-      }
-    }
-
-    return segments.reduce((memo, i) => memo + i.tokens.length, 0) ?? 0;
-  }
+  const diffVal = model.startsWith('gpt-3.5-turbo') ? 3 : 2;
 
   const adaptMessages = adaptChatItem_openAI({ messages, reserveId: true });
+  const token = adaptMessages.reduce((sum, item) => {
+    const text = `${item.role}\n${item.content}`;
+    const enc = getOpenAiEncMap()[model];
+    const encodeText = enc.encode(text);
+    const tokens = encodeText.length + diffVal;
+    return sum + tokens;
+  }, 0);
 
-  return text2TokensLen(getOpenAiEncMap()[model], getChatGPTEncodingText(adaptMessages, model));
+  return token;
 }
 
 export const openAiSliceTextByToken = ({
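
Note on the simplified counting in countOpenAIToken above: rather than replaying the full <|im_start|>/<|im_end|> chat template and realigning tokens against graphemes (which is why the graphemer dependency could be dropped), it encodes "role\ncontent" per message and adds a fixed per-message overhead: 3 tokens for gpt-3.5-turbo models, 2 otherwise. A minimal standalone sketch of the same idea, assuming only @dqbd/tiktoken is installed; estimateTokens, SimpleMessage, and the sample call are illustrative names, not part of this diff:

import { encoding_for_model } from '@dqbd/tiktoken';

type SimpleMessage = { role: string; content: string };

// Approximate prompt size: encode "role\ncontent" for each message, then add
// a fixed per-message overhead instead of rebuilding the exact chat template.
function estimateTokens(messages: SimpleMessage[], model: string): number {
  const perMessageOverhead = model.startsWith('gpt-3.5-turbo') ? 3 : 2;
  const enc = encoding_for_model('gpt-3.5-turbo');

  const total = messages.reduce(
    (sum, { role, content }) =>
      sum + enc.encode(`${role}\n${content}`).length + perMessageOverhead,
    0
  );

  enc.free(); // @dqbd/tiktoken encoders hold WASM memory; free them when done
  return total;
}

// e.g. estimateTokens([{ role: 'user', content: 'hello' }], 'gpt-3.5-turbo')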