perf: token split text

This commit is contained in:
archer
2023-04-30 22:35:47 +08:00
parent 39869bc4ea
commit 89a67ca9c0
8 changed files with 96 additions and 85 deletions

View File

@@ -1,6 +1,6 @@
import mammoth from 'mammoth';
import Papa from 'papaparse';
import { countChatTokens } from './tools';
import { getEncMap } from './tools';
/**
* 读取 txt 文件内容
@@ -145,7 +145,7 @@ export const fileDownload = ({
* slideLen - The size of the before and after Text
* maxLen > slideLen
*/
export const splitText = ({
export const splitText_token = ({
text,
maxLen,
slideLen
@@ -154,39 +154,32 @@ export const splitText = ({
maxLen: number;
slideLen: number;
}) => {
const textArr =
text.split(/(?<=[。!?\.!\?\n])/g)?.filter((item) => {
const text = item.replace(/(\\n)/g, '\n').trim();
if (text && text !== '\n') return true;
return false;
}) || [];
const enc = getEncMap()['gpt-3.5-turbo'];
// filter empty text. encode sentence
const encodeText = enc.encode(text);
const chunks: { sum: number; arr: string[] }[] = [{ sum: 0, arr: [] }];
const chunks: string[] = [];
let tokens = 0;
for (let i = 0; i < textArr.length; i++) {
const tokenLen = countChatTokens({ messages: [{ role: 'system', content: textArr[i] }] });
chunks[chunks.length - 1].sum += tokenLen;
chunks[chunks.length - 1].arr.push(textArr[i]);
let startIndex = 0;
let endIndex = Math.min(startIndex + maxLen, encodeText.length);
let chunkEncodeArr = encodeText.slice(startIndex, endIndex);
// current length is over maxLen. create new chunk
if (chunks[chunks.length - 1].sum + tokenLen >= maxLen) {
// get slide len text as the initial value
const chunk: { sum: number; arr: string[] } = { sum: 0, arr: [] };
for (let j = chunks[chunks.length - 1].arr.length - 1; j >= 0; j--) {
const chunkText = chunks[chunks.length - 1].arr[j];
const tokenLen = countChatTokens({ messages: [{ role: 'system', content: chunkText }] });
chunk.sum += tokenLen;
chunk.arr.unshift(chunkText);
const decoder = new TextDecoder();
if (chunk.sum >= slideLen) {
break;
}
}
chunks.push(chunk);
}
while (startIndex < encodeText.length) {
tokens += chunkEncodeArr.length;
chunks.push(decoder.decode(enc.decode(chunkEncodeArr)));
startIndex += maxLen - slideLen;
endIndex = Math.min(startIndex + maxLen, encodeText.length);
chunkEncodeArr = encodeText.slice(Math.min(encodeText.length - slideLen, startIndex), endIndex);
}
const result = chunks.map((item) => item.arr.join(''));
return result;
return {
chunks,
tokens
};
};
export const fileToBase64 = (file: File) => {

View File

@@ -7,7 +7,7 @@ import { ChatModelEnum } from '@/constants/model';
const textDecoder = new TextDecoder();
const graphemer = new Graphemer();
let encMap: Record<string, Tiktoken>;
const getEncMap = () => {
export const getEncMap = () => {
if (encMap) return encMap;
encMap = {
'gpt-3.5-turbo': encoding_for_model('gpt-3.5-turbo', {
@@ -136,3 +136,18 @@ export const countChatTokens = ({
const text = getChatGPTEncodingText(messages, model);
return text2TokensLen(getEncMap()[model], text);
};
/**
 * Truncate `text` so it contains at most `length` tokens for the given model.
 *
 * @param model - chat model whose tokenizer to use (defaults to 'gpt-3.5-turbo')
 * @param text - the raw text to truncate
 * @param length - maximum number of tokens to keep
 * @returns the decoded string rebuilt from the first `length` tokens
 */
export const sliceTextByToken = ({
  model = 'gpt-3.5-turbo',
  text,
  length
}: {
  model?: `${ChatModelEnum}`;
  text: string;
  length: number;
}) => {
  const encoder = getEncMap()[model];
  // Encode, keep only the leading `length` token ids, then decode back to text.
  const keptTokenIds = encoder.encode(text).slice(0, length);
  // tiktoken's decode returns UTF-8 bytes, so run them through a TextDecoder.
  return new TextDecoder().decode(encoder.decode(keptTokenIds));
};