perf: chunk filter

2025-07-25 06:14:06 +00:00 · 2023-08-07 10:59:31 +08:00
parent 1964640d5c
commit 7fe20ef041
5 changed files with 47 additions and 51 deletions
--- a/client/src/components/Layout/navbar.tsx
+++ b/client/src/components/Layout/navbar.tsx
@@ -118,9 +118,9 @@ const Navbar = ({ unread }: { unread: number }) => {
                }
              : {
                  color: 'myGray.500',
-                  backgroundColor: 'transparent'
+                  backgroundColor: 'transparent',
+                  onClick: () => router.push(item.link)
                })}
-            onClick={() => router.push(item.link)}
          >
            <MyIcon
              name={
--- a/client/src/pages/kb/detail/components/Import/Chunk.tsx
+++ b/client/src/pages/kb/detail/components/Import/Chunk.tsx
@@ -258,7 +258,9 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
              <Box>
                段落长度
                <MyTooltip
-                  label={'基于 Gpt3.5 的 Token 计算方法进行分段。前后段落会有 30% 的内容重叠。'}
+                  label={
+                    '按结束标点符号进行分段。前后段落会有 30% 的内容重叠。\n中文文档建议不要超过800，英文不要超过1500'
+                  }
                  forceShow
                >
                  <QuestionOutlineIcon ml={1} />
@@ -269,7 +271,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
                flex={1}
                defaultValue={chunkLen}
                min={300}
-                max={1000}
+                max={2000}
                step={10}
                onChange={(e) => {
                  setChunkLen(+e);
@@ -294,10 +296,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
                  <QuestionOutlineIcon ml={1} />
                </MyTooltip>
              </Box>
-              <Box ml={4}>
-                {}
-                {price}元
-              </Box>
+              <Box ml={4}>{price}元</Box>
            </Flex>
            <Flex mt={3}>
              {showRePreview && (
--- a/client/src/pages/kb/detail/components/Import/QA.tsx
+++ b/client/src/pages/kb/detail/components/Import/QA.tsx
@@ -1,18 +1,5 @@
 import React, { useState, useCallback, useMemo } from 'react';
-import {
-  Box,
-  Flex,
-  Button,
-  useTheme,
-  NumberInput,
-  NumberInputField,
-  NumberInputStepper,
-  NumberIncrementStepper,
-  NumberDecrementStepper,
-  Image,
-  Textarea,
-  Input
-} from '@chakra-ui/react';
+import { Box, Flex, Button, useTheme, Image, Input } from '@chakra-ui/react';
 import { useToast } from '@/hooks/useToast';
 import { useConfirm } from '@/hooks/useConfirm';
 import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
@@ -48,7 +35,7 @@ type FileItemType = {
 const QAImport = ({ kbId }: { kbId: string }) => {
  const model = qaModelList[0]?.model;
  const unitPrice = qaModelList[0]?.price || 3;
-  const chunkLen = qaModelList[0].maxToken / 2;
+  const chunkLen = qaModelList[0].maxToken * 0.45;
  const theme = useTheme();
  const router = useRouter();
  const { toast } = useToast();
--- a/client/src/service/events/pushBill.ts
+++ b/client/src/service/events/pushBill.ts
@@ -129,16 +129,26 @@ export const pushGenerateVectorBill = async ({

    try {
      // 计算价格. 至少为1
-      const unitPrice = global.vectorModels.find((item) => item.model === model)?.price || 0.2;
+      const vectorModel =
+        global.vectorModels.find((item) => item.model === model) || global.vectorModels[0];
+      const unitPrice = vectorModel.price || 0.2;
      let total = unitPrice * tokenLen;
      total = total > 1 ? total : 1;

      // 插入 Bill 记录
      const res = await Bill.create({
        userId,
-        model,
+        model: vectorModel.model,
        appName: '索引生成',
-        total
+        total,
+        list: [
+          {
+            moduleName: '索引生成',
+            amount: total,
+            model: vectorModel.model,
+            tokenLen
+          }
+        ]
      });
      billId = res._id;

--- a/client/src/utils/file.ts
+++ b/client/src/utils/file.ts
@@ -2,7 +2,6 @@ import mammoth from 'mammoth';
 import Papa from 'papaparse';
 import { getOpenAiEncMap } from './plugin/openai';
 import { getErrText } from './tools';
-import { OpenAiChatEnum } from '@/constants/model';
 import { uploadImg } from '@/api/system';

 /**
@@ -145,38 +144,39 @@ export const fileDownload = ({
 /**
 * text split into chunks
 * maxLen - one chunk len. max: 3500
- * slideLen - The size of the before and after Text
- * maxLen > slideLen
+ * overlapLen - target length of the text shared between adjacent chunks (30% of maxLen)
+ * maxLen > overlapLen
 */
 export const splitText_token = ({ text, maxLen }: { text: string; maxLen: number }) => {
-  const slideLen = Math.floor(maxLen * 0.3);
+  const overlapLen = Math.floor(maxLen * 0.3); // Overlap length

  try {
-    const enc = getOpenAiEncMap();
-    // filter empty text. encode sentence
-    const encodeText = enc.encode(text);
-
+    const splitTexts = text.split(/(?<=[。！？.!?])/g);
    const chunks: string[] = [];
-    let tokens = 0;

-    let startIndex = 0;
-    let endIndex = Math.min(startIndex + maxLen, encodeText.length);
-    let chunkEncodeArr = encodeText.slice(startIndex, endIndex);
-
-    const decoder = new TextDecoder();
-
-    while (startIndex < encodeText.length) {
-      tokens += chunkEncodeArr.length;
-      chunks.push(decoder.decode(enc.decode(chunkEncodeArr)));
-
-      startIndex += maxLen - slideLen;
-      endIndex = Math.min(startIndex + maxLen, encodeText.length);
-      chunkEncodeArr = encodeText.slice(
-        Math.min(encodeText.length - slideLen, startIndex),
-        endIndex
-      );
+    let preChunk = '';
+    let chunk = '';
+    for (let i = 0; i < splitTexts.length; i++) {
+      const text = splitTexts[i];
+      chunk += text;
+      if (chunk.length > maxLen - overlapLen) {
+        preChunk += text;
+      }
+      if (chunk.length >= maxLen) {
+        chunks.push(chunk);
+        chunk = preChunk;
+        preChunk = '';
+      }
    }

+    if (chunk) {
+      chunks.push(chunk);
+    }
+
+    const enc = getOpenAiEncMap();
+    const encodeText = enc.encode(chunks.join(''));
+    const tokens = encodeText.length;
+
    return {
      chunks,
      tokens