Files
FastGPT/projects/app/src/global/common/string/tools.ts
2023-10-30 13:26:42 +08:00

109 lines
2.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { getErrText } from '@fastgpt/global/common/error/utils';
import { countPromptTokens } from '@/global/common/tiktoken';
/*
replace {{variable}} to value
*/
export function replaceVariable(text: string, obj: Record<string, string | number>) {
for (const key in obj) {
const val = obj[key];
if (!['string', 'number'].includes(typeof val)) continue;
text = text.replace(new RegExp(`{{(${key})}}`, 'g'), String(val));
}
return text || '';
}
/**
* text split into chunks
* maxLen - one chunk len. max: 3500
* overlapLen - The size of the before and after Text
* maxLen > overlapLen
*/
export const splitText2Chunks = ({ text = '', maxLen }: { text: string; maxLen: number }) => {
const overlapLen = Math.floor(maxLen * 0.15); // Overlap length
const tempMarker = 'SPLIT_HERE_SPLIT_HERE';
const stepReg: Record<number, RegExp> = {
0: /(\n\n)/g,
1: /([\n])/g,
2: /[。]|(?!<[^a-zA-Z])\.\s/g,
3: /([]|!\s|\?\s)/g,
4: /([]|;\s)/g,
5: /([]|,\s)/g
};
const splitTextRecursively = ({ text = '', step }: { text: string; step: number }) => {
if (text.length <= maxLen) {
return [text];
}
const reg = stepReg[step];
if (!reg) {
// use slice-maxLen to split text
const chunks: string[] = [];
let chunk = '';
for (let i = 0; i < text.length; i += maxLen - overlapLen) {
chunk = text.slice(i, i + maxLen);
chunks.push(chunk);
}
return chunks;
}
// split text by delimiters
const splitTexts = text
.replace(reg, `$1${tempMarker}`)
.split(`${tempMarker}`)
.filter((part) => part);
let chunks: string[] = [];
let preChunk = '';
let chunk = '';
for (let i = 0; i < splitTexts.length; i++) {
let text = splitTexts[i];
// chunk over size
if (text.length > maxLen) {
const innerChunks = splitTextRecursively({ text, step: step + 1 });
if (innerChunks.length === 0) continue;
// If the last chunk is too small, it is merged into the next chunk
if (innerChunks[innerChunks.length - 1].length <= maxLen * 0.5) {
text = innerChunks.pop() || '';
chunks = chunks.concat(innerChunks);
} else {
chunks = chunks.concat(innerChunks);
continue;
}
}
chunk += text;
// size over lapLen, push it to next chunk
if (chunk.length > maxLen - overlapLen) {
preChunk += text;
}
if (chunk.length >= maxLen) {
chunks.push(chunk);
chunk = preChunk;
preChunk = '';
}
}
if (chunk && !chunks[chunks.length - 1].endsWith(chunk)) {
chunks.push(chunk);
}
return chunks;
};
try {
const chunks = splitTextRecursively({ text, step: 0 });
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
return {
chunks,
tokens
};
} catch (err) {
throw new Error(getErrText(err));
}
};