v4.6-3 (#471)

2025-07-23 05:12:39 +00:00 · 2023-11-15 11:36:25 +08:00
parent 592e1a93a2
commit bfd8be5df0
181 changed files with 2499 additions and 1552 deletions
--- a/packages/global/common/string/textSplitter.ts
+++ b/packages/global/common/string/textSplitter.ts
@@ -0,0 +1,131 @@
+import { getErrText } from '../error/utils';
+import { countPromptTokens } from './tiktoken';
+
+/**
+ * Split text into chunks: markdown headings first, then blank lines, line breaks and sentence/clause punctuation.
+ * maxLen - target size of one chunk (max: 3500); it is compared against both text.length and countPromptTokens results.
+ * overlapLen - size of the text shared between neighbouring chunks. Defaults to 20% of maxLen.
+ * maxLen > overlapLen is expected; the slice fallback nevertheless always advances by at least one character.
+ * Returns { chunks, tokens } where tokens sums countPromptTokens(chunk, 'system'). Errors are rethrown as Error(getErrText(err)).
+ */
+export const splitText2Chunks = (props: { text: string; maxLen: number; overlapLen?: number }) => {
+  const { text = '', maxLen, overlapLen = Math.floor(maxLen * 0.2) } = props;
+  const tempMarker = 'SPLIT_HERE_SPLIT_HERE'; // inserted at every split point, then used as the split() separator
+
+  const stepReg: Record<number, RegExp> = {
+    0: /^(#\s[^\n]+\n)/gm, // steps 0-3: markdown headings; the newline is captured so it survives the replace
+    1: /^(##\s[^\n]+\n)/gm,
+    2: /^(###\s[^\n]+\n)/gm,
+    3: /^(####\s[^\n]+\n)/gm,
+
+    4: /(\n\n)/g, // steps 4-9: the marker is placed after the captured separator
+    5: /([\n])/g,
+    6: /([。]|\.\s)/g, // must capture: without group 1 the "$1" in the replacement is emitted literally
+    7: /([！？]|!\s|\?\s)/g,
+    8: /([；]|;\s)/g,
+    9: /([，]|,\s)/g
+  };
+
+  const splitTextRecursively = ({
+    text = '',
+    step,
+    lastChunk,
+    overlayChunk
+  }: {
+    text: string;
+    step: number;
+    lastChunk: string;
+    overlayChunk: string;
+  }) => {
+    if (text.length <= maxLen) {
+      return [text]; // NOTE(review): a non-empty lastChunk handed in by the caller is not returned here — confirm intended
+    }
+    const reg = stepReg[step];
+    const isMarkdownSplit = step < 4;
+
+    if (!reg) {
+      // no separator left: cut fixed-size slices; the stride is clamped to >= 1 so overlapLen >= maxLen cannot loop forever
+      const chunks: string[] = [];
+      let chunk = '';
+      for (let i = 0; i < text.length; i += Math.max(1, maxLen - overlapLen)) {
+        chunk = text.slice(i, i + maxLen);
+        chunks.push(chunk);
+      }
+      return chunks;
+    }
+
+    // split text by special char (marker before a heading, after any other separator)
+    const splitTexts = text
+      .replace(reg, isMarkdownSplit ? `${tempMarker}$1` : `$1${tempMarker}`)
+      .split(`${tempMarker}`)
+      .filter((part) => part);
+
+    let chunks: string[] = [];
+    for (let i = 0; i < splitTexts.length; i++) {
+      const text = splitTexts[i];
+      let chunkToken = countPromptTokens(lastChunk, '');
+      const textToken = countPromptTokens(text, '');
+
+      // next chunk is too large / new chunk is too large(The current chunk must be smaller than maxLen)
+      if (textToken >= maxLen || chunkToken + textToken > maxLen * 1.4) {
+        // last chunk is too large, push it to chunks, not add to next chunk
+        if (chunkToken > maxLen * 0.7) {
+          chunks.push(lastChunk);
+          lastChunk = '';
+          overlayChunk = '';
+        }
+        // chunk is small, hand it to the finer-grained split of this piece
+        const innerChunks = splitTextRecursively({
+          text,
+          step: step + 1,
+          lastChunk,
+          overlayChunk
+        });
+        if (innerChunks.length === 0) continue;
+        chunks = chunks.concat(innerChunks);
+        lastChunk = '';
+        overlayChunk = '';
+        continue;
+      }
+
+      // size less than maxLen, push text to last chunk
+      lastChunk += text;
+      chunkToken += textToken; // Definitely less than 1.4 * maxLen
+
+      // the chunk is within overlapLen of maxLen and this piece is small: also keep it as overlap for the next chunk
+      if (
+        overlapLen !== 0 &&
+        !isMarkdownSplit &&
+        chunkToken >= maxLen - overlapLen &&
+        textToken < overlapLen
+      ) {
+        overlayChunk += text;
+      }
+      if (chunkToken >= maxLen) {
+        chunks.push(lastChunk);
+        lastChunk = overlayChunk; // the next chunk starts with the collected overlap
+        overlayChunk = '';
+      }
+    }
+
+    /* Flush the remainder unless it is already the tail of the last chunk; it is also flushed when no chunk was emitted at all. */
+    if (lastChunk && !chunks[chunks.length - 1]?.endsWith(lastChunk)) {
+      chunks.push(lastChunk);
+    }
+
+    return chunks;
+  };
+
+  try {
+    const chunks = splitTextRecursively({ text, step: 0, lastChunk: '', overlayChunk: '' });
+
+    const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
+
+    return {
+      chunks,
+      tokens
+    };
+  } catch (err) {
+    throw new Error(getErrText(err));
+  }
+};
--- a/packages/global/common/string/tiktoken/cl100k_base.json
+++ b/packages/global/common/string/tiktoken/cl100k_base.json
--- a/packages/global/common/string/tiktoken/index.ts
+++ b/packages/global/common/string/tiktoken/index.ts
@@ -0,0 +1,84 @@
+/* Only the token of gpt-3.5-turbo is used */
+import type { ChatItemType } from '../../../core/chat/type';
+import { Tiktoken } from 'js-tiktoken/lite';
+import { adaptChat2GptMessages } from '../../../core/chat/adapt';
+import { ChatCompletionRequestMessageRoleEnum } from '../../../core/ai/constant';
+import encodingJson from './cl100k_base.json';
+
+/**
+ * init tikToken obj
+ * Returns the shared Tiktoken encoder. An instance already stored on `window` or `global` is
+ * reused; otherwise one is built from the imported cl100k_base.json data and stored on both.
+ */
+export function getTikTokenEnc() {
+  const hasWindow = typeof window !== 'undefined';
+  const hasGlobal = typeof global !== 'undefined';
+
+  // reuse a stored encoder; window is checked before global
+  const cached = (hasWindow && window.TikToken) || (hasGlobal && global.TikToken);
+  if (cached) return cached;
+
+  // nothing stored yet: build the encoder and publish it for later calls
+  const encoder = new Tiktoken(encodingJson);
+  if (hasWindow) window.TikToken = encoder;
+  if (hasGlobal) global.TikToken = encoder;
+
+  return encoder;
+}
+
+/* count one prompt tokens: the role is prepended on its own line before encoding */
+export function countPromptTokens(
+  prompt = '',
+  role: '' | `${ChatCompletionRequestMessageRoleEnum}` = ''
+) {
+  const encoder = getTikTokenEnc();
+  const content = `${role}\n${prompt}`;
+
+  try {
+    return encoder.encode(content).length + 3; // +3: estimated overhead of the role
+  } catch (error) {
+    return content.length; // encoding failed: fall back to the character count
+  }
+}
+
+/**
+ * count messages tokens
+ * The chat items are converted with adaptChat2GptMessages (reserveId: true) and the
+ * countPromptTokens result of every converted message is added up.
+ */
+export function countMessagesTokens({ messages }: { messages: ChatItemType[] }) {
+  const gptMessages = adaptChat2GptMessages({ messages, reserveId: true });
+
+  return gptMessages.reduce(
+    (total, message) => total + countPromptTokens(message.content, message.role),
+    0
+  );
+}
+
+/**
+ * slice messages from top to bottom by maxTokens
+ * Each adapted message's token count is deducted from the budget in order; iteration stops at the
+ * first message that leaves no budget. When nothing fits, the first message (if any) is returned.
+ */
+export function sliceMessagesTB({
+  messages,
+  maxTokens
+}: {
+  messages: ChatItemType[];
+  maxTokens: number;
+}) {
+  const gptMessages = adaptChat2GptMessages({ messages, reserveId: true });
+  const kept: ChatItemType[] = [];
+  let budget = maxTokens;
+
+  for (let index = 0; index < gptMessages.length; index++) {
+    const message = gptMessages[index];
+    budget -= countPromptTokens(message.content, message.role);
+
+    // stop once the budget is no longer positive (exactly 0 counts as exhausted)
+    if (!(budget > 0)) break;
+    kept.push(messages[index]);
+  }
+
+  return kept.length === 0 && messages[0] ? [messages[0]] : kept;
+}
--- a/packages/global/common/string/tiktoken/type.d.ts
+++ b/packages/global/common/string/tiktoken/type.d.ts
@@ -0,0 +1,5 @@
+import type { Tiktoken } from 'js-tiktoken';
+
+declare global {
+  var TikToken: Tiktoken; // global slot that getTikTokenEnc reads and writes as window.TikToken / global.TikToken
+}
--- a/packages/global/common/string/tools.ts
+++ b/packages/global/common/string/tools.ts
@@ -1,13 +1,15 @@
 import crypto from 'crypto';

+/* check string is a web link: true when str starts with http://, https://, ://, www. or / and the rest matches the pattern below; false for empty input */
 export function strIsLink(str?: string) {
  if (!str) return false;
  if (/^((http|https)?:\/\/|www\.|\/)[^\s/$.?#].[^\s]*$/i.test(str)) return true;
  return false;
 }

-export const hashStr = (psw: string) => {
-  return crypto.createHash('sha256').update(psw).digest('hex');
+/* hash string: returns the hex-encoded SHA-256 digest of str */
+export const hashStr = (str: string) => {
+  return crypto.createHash('sha256').update(str).digest('hex');
 };

 /* simple text, remove chinese space and extra \n */
@@ -20,3 +22,16 @@ export const simpleText = (text: string) => {

  return text;
 };
+
+/**
+ * replace {{variable}} to value: every `{{key}}` in text becomes String(obj[key]).
+ * Keys are matched literally and values inserted verbatim (no regex or `$`-pattern expansion); non string/number values are skipped.
+ */
+export function replaceVariable(text: string, obj: Record<string, string | number>) {
+  for (const key in obj) {
+    const val = obj[key];
+    if (!['string', 'number'].includes(typeof val)) continue;
+    text = text.split(`{{${key}}}`).join(String(val)); // split/join: literal match, verbatim insert
+  }
+  return text || '';
+}