From 7fe20ef041f03434ec97499e3ddbbf0467065c22 Mon Sep 17 00:00:00 2001 From: archer <545436317@qq.com> Date: Mon, 7 Aug 2023 10:59:31 +0800 Subject: [PATCH] perf: chunk filter --- client/src/components/Layout/navbar.tsx | 4 +- .../kb/detail/components/Import/Chunk.tsx | 11 ++-- .../pages/kb/detail/components/Import/QA.tsx | 17 +------ client/src/service/events/pushBill.ts | 16 ++++-- client/src/utils/file.ts | 50 +++++++++---------- 5 files changed, 47 insertions(+), 51 deletions(-) diff --git a/client/src/components/Layout/navbar.tsx b/client/src/components/Layout/navbar.tsx index 085986fa3..c52365fd4 100644 --- a/client/src/components/Layout/navbar.tsx +++ b/client/src/components/Layout/navbar.tsx @@ -118,9 +118,9 @@ const Navbar = ({ unread }: { unread: number }) => { } : { color: 'myGray.500', - backgroundColor: 'transparent' + backgroundColor: 'transparent', + onClick: () => router.push(item.link) })} - onClick={() => router.push(item.link)} > { 段落长度 @@ -269,7 +271,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => { flex={1} defaultValue={chunkLen} min={300} - max={1000} + max={2000} step={10} onChange={(e) => { setChunkLen(+e); @@ -294,10 +296,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => { - - {} - {price}元 - + {price}元 {showRePreview && ( diff --git a/client/src/pages/kb/detail/components/Import/QA.tsx b/client/src/pages/kb/detail/components/Import/QA.tsx index 59b6991ab..294313b7a 100644 --- a/client/src/pages/kb/detail/components/Import/QA.tsx +++ b/client/src/pages/kb/detail/components/Import/QA.tsx @@ -1,18 +1,5 @@ import React, { useState, useCallback, useMemo } from 'react'; -import { - Box, - Flex, - Button, - useTheme, - NumberInput, - NumberInputField, - NumberInputStepper, - NumberIncrementStepper, - NumberDecrementStepper, - Image, - Textarea, - Input -} from '@chakra-ui/react'; +import { Box, Flex, Button, useTheme, Image, Input } from '@chakra-ui/react'; import { useToast } from '@/hooks/useToast'; import { useConfirm } from '@/hooks/useConfirm'; import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file'; @@ -48,7 +35,7 @@ type FileItemType = { const QAImport = ({ kbId }: { kbId: string }) => { const model = qaModelList[0]?.model; const unitPrice = qaModelList[0]?.price || 3; - const chunkLen = qaModelList[0].maxToken / 2; + const chunkLen = qaModelList[0].maxToken * 0.45; const theme = useTheme(); const router = useRouter(); const { toast } = useToast(); diff --git a/client/src/service/events/pushBill.ts b/client/src/service/events/pushBill.ts index 709298ccb..e8680acf9 100644 --- a/client/src/service/events/pushBill.ts +++ b/client/src/service/events/pushBill.ts @@ -129,16 +129,26 @@ export const pushGenerateVectorBill = async ({ try { // 计算价格. 至少为1 - const unitPrice = global.vectorModels.find((item) => item.model === model)?.price || 0.2; + const vectorModel = + global.vectorModels.find((item) => item.model === model) || global.vectorModels[0]; + const unitPrice = vectorModel.price || 0.2; let total = unitPrice * tokenLen; total = total > 1 ? total : 1; // 插入 Bill 记录 const res = await Bill.create({ userId, - model, + model: vectorModel.model, appName: '索引生成', - total + total, + list: [ + { + moduleName: '索引生成', + amount: total, + model: vectorModel.model, + tokenLen + } + ] }); billId = res._id; diff --git a/client/src/utils/file.ts b/client/src/utils/file.ts index fec9e39eb..085621a7f 100644 --- a/client/src/utils/file.ts +++ b/client/src/utils/file.ts @@ -2,7 +2,6 @@ import mammoth from 'mammoth'; import Papa from 'papaparse'; import { getOpenAiEncMap } from './plugin/openai'; import { getErrText } from './tools'; -import { OpenAiChatEnum } from '@/constants/model'; import { uploadImg } from '@/api/system'; /** @@ -145,38 +144,39 @@ export const fileDownload = ({ /** * text split into chunks * maxLen - one chunk len. max: 3500 - * slideLen - The size of the before and after Text - * maxLen > slideLen + * overlapLen - The size of the before and after Text + * maxLen > overlapLen */ export const splitText_token = ({ text, maxLen }: { text: string; maxLen: number }) => { - const slideLen = Math.floor(maxLen * 0.3); + const overlapLen = Math.floor(maxLen * 0.3); // Overlap length try { - const enc = getOpenAiEncMap(); - // filter empty text. encode sentence - const encodeText = enc.encode(text); - + const splitTexts = text.split(/(?<=[。!?.!?])/g); const chunks: string[] = []; - let tokens = 0; - let startIndex = 0; - let endIndex = Math.min(startIndex + maxLen, encodeText.length); - let chunkEncodeArr = encodeText.slice(startIndex, endIndex); - - const decoder = new TextDecoder(); - - while (startIndex < encodeText.length) { - tokens += chunkEncodeArr.length; - chunks.push(decoder.decode(enc.decode(chunkEncodeArr))); - - startIndex += maxLen - slideLen; - endIndex = Math.min(startIndex + maxLen, encodeText.length); - chunkEncodeArr = encodeText.slice( - Math.min(encodeText.length - slideLen, startIndex), - endIndex - ); + let preChunk = ''; + let chunk = ''; + for (let i = 0; i < splitTexts.length; i++) { + const text = splitTexts[i]; + chunk += text; + if (chunk.length > maxLen - overlapLen) { + preChunk += text; + } + if (chunk.length >= maxLen) { + chunks.push(chunk); + chunk = preChunk; + preChunk = ''; + } } + if (chunk) { + chunks.push(chunk); + } + + const enc = getOpenAiEncMap(); + const encodeText = enc.encode(chunks.join('')); + const tokens = encodeText.length; + return { chunks, tokens