mirror of https://github.com/labring/FastGPT.git
synced 2025-07-27 08:25:07 +00:00
perf: token split text
@@ -1,6 +1,6 @@
 import mammoth from 'mammoth';
 import Papa from 'papaparse';
-import { countChatTokens } from './tools';
+import { getEncMap } from './tools';
 
 /**
  * Read the contents of a txt file
@@ -145,7 +145,7 @@ export const fileDownload = ({
  * slideLen - The size of the text overlapped before and after each chunk
  * maxLen > slideLen
  */
-export const splitText = ({
+export const splitText_token = ({
   text,
   maxLen,
   slideLen
@@ -154,39 +154,32 @@ export const splitText = ({
   maxLen: number;
   slideLen: number;
 }) => {
-  const textArr =
-    text.split(/(?<=[。!?\.!\?\n])/g)?.filter((item) => {
-      const text = item.replace(/(\\n)/g, '\n').trim();
-      if (text && text !== '\n') return true;
-      return false;
-    }) || [];
-
-  const chunks: { sum: number; arr: string[] }[] = [{ sum: 0, arr: [] }];
-
-  for (let i = 0; i < textArr.length; i++) {
-    const tokenLen = countChatTokens({ messages: [{ role: 'system', content: textArr[i] }] });
-    chunks[chunks.length - 1].sum += tokenLen;
-    chunks[chunks.length - 1].arr.push(textArr[i]);
-
-    // current length is over maxLen. create new chunk
-    if (chunks[chunks.length - 1].sum + tokenLen >= maxLen) {
-      // get slide len text as the initial value
-      const chunk: { sum: number; arr: string[] } = { sum: 0, arr: [] };
-      for (let j = chunks[chunks.length - 1].arr.length - 1; j >= 0; j--) {
-        const chunkText = chunks[chunks.length - 1].arr[j];
-        const tokenLen = countChatTokens({ messages: [{ role: 'system', content: chunkText }] });
-        chunk.sum += tokenLen;
-        chunk.arr.unshift(chunkText);
-
-        if (chunk.sum >= slideLen) {
-          break;
-        }
-      }
-      chunks.push(chunk);
-    }
-  }
-  const result = chunks.map((item) => item.arr.join(''));
-  return result;
+  const enc = getEncMap()['gpt-3.5-turbo'];
+  // filter empty text. encode sentence
+  const encodeText = enc.encode(text);
+
+  const chunks: string[] = [];
+  let tokens = 0;
+
+  let startIndex = 0;
+  let endIndex = Math.min(startIndex + maxLen, encodeText.length);
+  let chunkEncodeArr = encodeText.slice(startIndex, endIndex);
+
+  const decoder = new TextDecoder();
+
+  while (startIndex < encodeText.length) {
+    tokens += chunkEncodeArr.length;
+    chunks.push(decoder.decode(enc.decode(chunkEncodeArr)));
+
+    startIndex += maxLen - slideLen;
+    endIndex = Math.min(startIndex + maxLen, encodeText.length);
+    chunkEncodeArr = encodeText.slice(Math.min(encodeText.length - slideLen, startIndex), endIndex);
+  }
+
+  return {
+    chunks,
+    tokens
+  };
 };
 
 export const fileToBase64 = (file: File) => {
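For context: the rewritten splitText_token swaps sentence-level accumulation for a token-level sliding window. The text is encoded once, then decoded back in fixed windows of up to maxLen tokens, with each step advancing maxLen - slideLen tokens so consecutive chunks share a slideLen-token overlap. A minimal, dependency-free sketch of that stride arithmetic, using a plain number array in place of tiktoken output (slideTokenWindows is an illustrative name, not part of the commit; it assumes maxLen > slideLen and at least slideLen tokens of input):

  // Sliding windows over a token array: each window holds up to maxLen
  // tokens and overlaps the previous one by slideLen tokens.
  const slideTokenWindows = (tokens: number[], maxLen: number, slideLen: number): number[][] => {
    const windows: number[][] = [];
    let startIndex = 0;
    while (startIndex < tokens.length) {
      const endIndex = Math.min(startIndex + maxLen, tokens.length);
      // Clamp the start (as the committed loop does) so the final window
      // still reaches back slideLen tokens from the end of the text.
      windows.push(tokens.slice(Math.min(tokens.length - slideLen, startIndex), endIndex));
      startIndex += maxLen - slideLen; // the stride that creates the overlap
    }
    return windows;
  };

  // 10 tokens, maxLen = 4, slideLen = 1 -> [0..3], [3..6], [6..9], [9]
  console.log(slideTokenWindows([...Array(10).keys()], 4, 1));

As in the committed loop, the last iteration can emit a short tail window made mostly of overlap, and the returned tokens count sums the window lengths, so overlapping tokens are counted once per chunk they appear in.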
@@ -7,7 +7,7 @@ import { ChatModelEnum } from '@/constants/model';
 const textDecoder = new TextDecoder();
 const graphemer = new Graphemer();
 let encMap: Record<string, Tiktoken>;
-const getEncMap = () => {
+export const getEncMap = () => {
   if (encMap) return encMap;
   encMap = {
     'gpt-3.5-turbo': encoding_for_model('gpt-3.5-turbo', {
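Exporting getEncMap lets other modules, such as the rewritten splitText_token in the first file above, share the lazily built, cached Tiktoken instances instead of constructing a fresh encoder per call. A hypothetical consumer, reusing the './tools' import path that the commit itself uses:

  import { getEncMap } from './tools'; // same path the first file's hunk imports from

  // Grab the cached gpt-3.5-turbo encoder; encode() is the same call the
  // new splitText_token makes, used here only to count tokens.
  const enc = getEncMap()['gpt-3.5-turbo'];
  const tokenCount = enc.encode('hello world').length;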
@@ -136,3 +136,18 @@ export const countChatTokens = ({
   const text = getChatGPTEncodingText(messages, model);
   return text2TokensLen(getEncMap()[model], text);
 };
+
+export const sliceTextByToken = ({
+  model = 'gpt-3.5-turbo',
+  text,
+  length
+}: {
+  model?: `${ChatModelEnum}`;
+  text: string;
+  length: number;
+}) => {
+  const enc = getEncMap()[model];
+  const encodeText = enc.encode(text);
+  const decoder = new TextDecoder();
+  return decoder.decode(enc.decode(encodeText.slice(0, length)));
+};
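The new sliceTextByToken helper truncates a string to a token budget by round-tripping through the tokenizer: encode, keep the first length tokens, decode back to text, so the cut always lands on a token boundary. A hypothetical call, assuming the same './tools' path and with the 2000-token budget chosen purely for illustration:

  import { sliceTextByToken } from './tools'; // illustrative path

  const longText = 'FastGPT splits and truncates text by tokens. '.repeat(500);

  // Keep at most 2000 tokens; a word at the edge may be clipped,
  // but no single token is ever split in half.
  const truncated = sliceTextByToken({
    model: 'gpt-3.5-turbo',
    text: longText,
    length: 2000
  });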