mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00
v4.6-3 (#471)
This commit is contained in:
131
packages/global/common/string/textSplitter.ts
Normal file
131
packages/global/common/string/textSplitter.ts
Normal file
@@ -0,0 +1,131 @@
|
||||
import { getErrText } from '../error/utils';
|
||||
import { countPromptTokens } from './tiktoken';
|
||||
|
||||
/**
 * Split text into chunks, preferring natural boundaries over hard cuts.
 *
 * Separators are tried from coarsest to finest: markdown headings (# to ####),
 * blank lines, single newlines, sentence ends, "!"/"?", ";" and ",". Only when
 * none of them can break a piece down is it cut by plain character slicing.
 *
 * Sizes of the pieces produced by separators are measured with
 * `countPromptTokens`; the initial "already small enough" check and the
 * slicing fallback are measured in characters.
 *
 * @param props.text - The text to split.
 * @param props.maxLen - Target size of one chunk (the original note says "max: 3500";
 *   that limit is not enforced here).
 * @param props.overlapLen - Size of the text repeated between neighbouring chunks.
 *   Defaults to 20% of `maxLen` and is expected to be smaller than `maxLen`.
 *   Overlap is not applied to markdown-heading splits.
 * @returns `chunks` plus `tokens`, the sum of `countPromptTokens(chunk, 'system')`.
 * @throws Error carrying `getErrText(err)` when splitting or counting fails.
 */
export const splitText2Chunks = (props: { text: string; maxLen: number; overlapLen?: number }) => {
  const { text = '', maxLen, overlapLen = Math.floor(maxLen * 0.2) } = props;
  const tempMarker = 'SPLIT_HERE_SPLIT_HERE';

  // Every pattern wraps the WHOLE match in capture group 1, because the
  // replacement below re-emits the match through `$1`. Anything matched outside
  // the group would be deleted, and a pattern without a group would insert a
  // literal "$1" into the text.
  const stepReg: Record<number, RegExp> = {
    0: /^(#\s[^\n]+\n)/gm,
    1: /^(##\s[^\n]+\n)/gm,
    2: /^(###\s[^\n]+\n)/gm,
    3: /^(####\s[^\n]+\n)/gm,

    4: /(\n\n)/g,
    5: /([\n])/g,
    // Chinese full stop, or a ". " that directly follows a letter
    // (so "1. item" or "3.5 " do not count as sentence ends).
    6: /([。]|[a-zA-Z]\.\s)/g,
    7: /([!?]|!\s|\?\s)/g,
    8: /([;]|;\s)/g,
    9: /([,]|,\s)/g
  };

  const splitTextRecursively = ({
    text = '',
    step,
    lastChunk,
    overlayChunk
  }: {
    text: string;
    step: number;
    lastChunk: string;
    overlayChunk: string;
  }): string[] => {
    if (text.length <= maxLen) {
      // Keep the pending text handed over by the caller instead of dropping it.
      return [lastChunk + text];
    }
    const reg = stepReg[step];
    const isMarkdownSplit = step < 4;

    if (!reg) {
      // No separator left: cut by character count. The pending text from the
      // caller is sliced together with this piece so nothing is lost.
      const source = lastChunk + text;
      // An overlap outside (0, maxLen) would make the window stall or skip text.
      const overlap = overlapLen > 0 && overlapLen < maxLen ? overlapLen : 0;
      const stride = Math.max(1, maxLen - overlap);
      const chunks: string[] = [];
      for (let i = 0; i < source.length; i += stride) {
        chunks.push(source.slice(i, i + maxLen));
      }
      return chunks;
    }

    // Mark the split points. Headings start a new piece (marker goes before the
    // match); every other separator ends the current piece (marker goes after).
    const splitTexts = text
      .replace(reg, isMarkdownSplit ? `${tempMarker}$1` : `$1${tempMarker}`)
      .split(tempMarker)
      .filter((part) => part);

    let chunks: string[] = [];
    for (const part of splitTexts) {
      let chunkToken = countPromptTokens(lastChunk, '');
      const textToken = countPromptTokens(part, '');

      // This piece alone is too large, or appending it would overshoot the
      // tolerated chunk size (1.4 * maxLen): split it with the next finer separator.
      if (textToken >= maxLen || chunkToken + textToken > maxLen * 1.4) {
        // The pending chunk is big enough to stand alone: emit it now.
        if (chunkToken > maxLen * 0.7) {
          chunks.push(lastChunk);
          lastChunk = '';
          overlayChunk = '';
        }
        // Otherwise the small pending chunk is handed to the recursion, which
        // prepends it to the pieces it produces.
        const innerChunks = splitTextRecursively({
          text: part,
          step: step + 1,
          lastChunk,
          overlayChunk
        });
        if (innerChunks.length === 0) continue;
        chunks = chunks.concat(innerChunks);
        lastChunk = '';
        overlayChunk = '';
        continue;
      }

      // Small enough: append to the pending chunk.
      lastChunk += part;
      chunkToken += textToken; // stays below 1.4 * maxLen because of the check above

      // Near the end of the chunk, remember small pieces so they can be repeated
      // at the start of the next chunk.
      if (
        overlapLen !== 0 &&
        !isMarkdownSplit &&
        chunkToken >= maxLen - overlapLen &&
        textToken < overlapLen
      ) {
        overlayChunk += part;
      }
      if (chunkToken >= maxLen) {
        chunks.push(lastChunk);
        lastChunk = overlayChunk;
        overlayChunk = '';
      }
    }

    // Emit what is still pending, unless it is only the overlap copy of the
    // chunk that was just emitted. This also covers the case where nothing has
    // been emitted yet.
    const tail = chunks[chunks.length - 1];
    if (lastChunk && (tail === undefined || !tail.endsWith(lastChunk))) {
      chunks.push(lastChunk);
    }

    return chunks;
  };

  try {
    const chunks = splitTextRecursively({ text, step: 0, lastChunk: '', overlayChunk: '' });

    const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);

    return {
      chunks,
      tokens
    };
  } catch (err) {
    throw new Error(getErrText(err));
  }
};
|
11
packages/global/common/string/tiktoken/cl100k_base.json
Normal file
11
packages/global/common/string/tiktoken/cl100k_base.json
Normal file
File diff suppressed because one or more lines are too long
84
packages/global/common/string/tiktoken/index.ts
Normal file
84
packages/global/common/string/tiktoken/index.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
/* Token counting here uses a single encoding (cl100k_base, the gpt-3.5-turbo one) for every model */
|
||||
import type { ChatItemType } from '../../../core/chat/type';
|
||||
import { Tiktoken } from 'js-tiktoken/lite';
|
||||
import { adaptChat2GptMessages } from '../../../core/chat/adapt';
|
||||
import { ChatCompletionRequestMessageRoleEnum } from '../../../core/ai/constant';
|
||||
import encodingJson from './cl100k_base.json';
|
||||
|
||||
/*
  Return the shared tiktoken encoder.
  An instance already stored on `window` or `global` is reused; otherwise a new
  one is built from the bundled cl100k_base data and stored on whichever of the
  two objects exists, so later calls skip the construction.
*/
export function getTikTokenEnc() {
  const hasWindow = typeof window !== 'undefined';
  const hasGlobal = typeof global !== 'undefined';

  // Reuse a cached encoder, looking at window first and global second.
  if (hasWindow && window.TikToken) return window.TikToken;
  if (hasGlobal && global.TikToken) return global.TikToken;

  const encoder = new Tiktoken(encodingJson);

  // Cache the new encoder on every host object that is available.
  if (hasWindow) window.TikToken = encoder;
  if (hasGlobal) global.TikToken = encoder;

  return encoder;
}
|
||||
|
||||
/*
  Count the tokens of a single prompt.
  The role and the prompt are joined with a newline and encoded together, and a
  fixed 3 tokens are added as an estimate for the role overhead. If encoding
  throws, the character length of the joined text is returned instead.
*/
export function countPromptTokens(
  prompt = '',
  role: '' | `${ChatCompletionRequestMessageRoleEnum}` = ''
) {
  const roleOverhead = 3; // estimated extra tokens for the role
  const encoder = getTikTokenEnc();
  const payload = `${role}\n${prompt}`;

  let tokenCount: number;
  try {
    tokenCount = encoder.encode(payload).length + roleOverhead;
  } catch (error) {
    // Best-effort fallback: approximate by character count.
    tokenCount = payload.length;
  }
  return tokenCount;
}
|
||||
|
||||
/*
  Count the tokens of a list of chat messages.
  The messages are first converted with adaptChat2GptMessages, then the
  per-message counts from countPromptTokens(content, role) are summed.
*/
export function countMessagesTokens({ messages }: { messages: ChatItemType[] }) {
  const adaptMessages = adaptChat2GptMessages({ messages, reserveId: true });

  return adaptMessages.reduce(
    (total, item) => total + countPromptTokens(item.content, item.role),
    0
  );
}
|
||||
|
||||
/*
  Keep messages from the top of the list while they fit into maxTokens.
  Each message's token count is subtracted from the budget; a message is kept
  only while the remaining budget is still above zero, and the walk stops at the
  first message that does not fit. If nothing fits, the first message (when
  there is one) is returned on its own.
*/
export function sliceMessagesTB({
  messages,
  maxTokens
}: {
  messages: ChatItemType[];
  maxTokens: number;
}) {
  const adaptMessages = adaptChat2GptMessages({ messages, reserveId: true });
  const kept: ChatItemType[] = [];
  let budget = maxTokens;

  for (let idx = 0; idx < adaptMessages.length; idx += 1) {
    const { content, role } = adaptMessages[idx];
    budget -= countPromptTokens(content, role);

    // Stop as soon as the budget is used up.
    if (!(budget > 0)) break;

    // Return the original message, not the adapted one.
    kept.push(messages[idx]);
  }

  if (kept.length === 0 && messages[0]) {
    return [messages[0]];
  }
  return kept;
}
|
5
packages/global/common/string/tiktoken/type.d.ts
vendored
Normal file
5
packages/global/common/string/tiktoken/type.d.ts
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
import type { Tiktoken } from 'js-tiktoken';
|
||||
|
||||
declare global {
|
||||
var TikToken: Tiktoken;
|
||||
}
|
@@ -1,13 +1,15 @@
|
||||
import crypto from 'crypto';
|
||||
|
||||
/*
  Tell whether a string looks like a web link: it must start with an optional
  http/https scheme followed by "://", or with "www.", or with "/", and must not
  contain whitespace. Empty or missing input is not a link.
*/
export function strIsLink(str?: string) {
  const linkPattern = /^((http|https)?:\/\/|www\.|\/)[^\s/$.?#].[^\s]*$/i;
  return !!str && linkPattern.test(str);
}
|
||||
|
||||
/* Hash a string with SHA-256 and return the digest as lowercase hex. */
export const hashStr = (str: string) => {
  return crypto.createHash('sha256').update(str).digest('hex');
};
|
||||
|
||||
/* simple text, remove chinese space and extra \n */
|
||||
@@ -20,3 +22,16 @@ export const simpleText = (text: string) => {
|
||||
|
||||
return text;
|
||||
};
|
||||
|
||||
/*
  Replace every {{key}} placeholder in text with the matching value from obj.
  - Keys are matched literally, so characters such as ".", "$" or "(" in a key
    have no special meaning.
  - Values are inserted literally, so sequences such as "$&" or "$1" in a value
    are not expanded.
  - Entries whose value is neither a string nor a number are skipped.
  - Keys are applied one after another, so a placeholder introduced by an
    earlier value can still be filled by a later key.
  Returns '' when text is empty or missing.
*/
export function replaceVariable(text: string, obj: Record<string, string | number>) {
  let result = text || '';

  for (const [key, val] of Object.entries(obj ?? {})) {
    if (typeof val !== 'string' && typeof val !== 'number') continue;

    // split/join performs a plain-text "replace all" without regex or
    // replacement-pattern semantics.
    result = result.split(`{{${key}}}`).join(String(val));
  }

  return result;
}
|
||||
|
Reference in New Issue
Block a user