This commit is contained in:
Archer
2023-11-15 11:36:25 +08:00
committed by GitHub
parent 592e1a93a2
commit bfd8be5df0
181 changed files with 2499 additions and 1552 deletions

View File

@@ -1,108 +0,0 @@
import { getErrText } from '@fastgpt/global/common/error/utils';
import { countPromptTokens } from '@/global/common/tiktoken';
/*
  Replace each {{variable}} placeholder in `text` with its value
*/
export function replaceVariable(text: string, obj: Record<string, string | number>) {
for (const key in obj) {
const val = obj[key];
if (!['string', 'number'].includes(typeof val)) continue;
text = text.replace(new RegExp(`{{(${key})}}`, 'g'), String(val));
}
return text || '';
}
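// Minimal usage sketch (hypothetical values, not from this commit):
//   replaceVariable('Hello {{name}}', { name: 'Tom' }) // => 'Hello Tom'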
/**
 * Split text into chunks.
 * maxLen - maximum length of one chunk (capped at 3500)
 * overlapLen - length of the overlap shared between adjacent chunks
 * Invariant: maxLen > overlapLen
 */
export const splitText2Chunks = ({ text = '', maxLen }: { text: string; maxLen: number }) => {
const overlapLen = Math.floor(maxLen * 0.15); // Overlap length
const tempMarker = 'SPLIT_HERE_SPLIT_HERE';
const stepReg: Record<number, RegExp> = {
0: /(\n\n)/g,
1: /([\n])/g,
2: /([。]|(?<![^a-zA-Z])\.\s)/g,
3: /([！？]|!\s|\?\s)/g,
4: /([；]|;\s)/g,
5: /([，]|,\s)/g
};
const splitTextRecursively = ({ text = '', step }: { text: string; step: number }) => {
if (text.length <= maxLen) {
return [text];
}
const reg = stepReg[step];
if (!reg) {
// no delimiter left: fall back to fixed-size slices with overlap
const chunks: string[] = [];
let chunk = '';
for (let i = 0; i < text.length; i += maxLen - overlapLen) {
chunk = text.slice(i, i + maxLen);
chunks.push(chunk);
}
return chunks;
}
// split text by delimiters
const splitTexts = text
.replace(reg, `$1${tempMarker}`)
.split(`${tempMarker}`)
.filter((part) => part);
let chunks: string[] = [];
let preChunk = '';
let chunk = '';
for (let i = 0; i < splitTexts.length; i++) {
let text = splitTexts[i];
// chunk over size
if (text.length > maxLen) {
const innerChunks = splitTextRecursively({ text, step: step + 1 });
if (innerChunks.length === 0) continue;
// if the last inner chunk is small, carry it forward and merge it with the following text
if (innerChunks[innerChunks.length - 1].length <= maxLen * 0.5) {
text = innerChunks.pop() || '';
chunks = chunks.concat(innerChunks);
} else {
chunks = chunks.concat(innerChunks);
continue;
}
}
chunk += text;
// inside the overlap window: also record this piece as the next chunk's prefix
if (chunk.length > maxLen - overlapLen) {
preChunk += text;
}
if (chunk.length >= maxLen) {
chunks.push(chunk);
chunk = preChunk;
preChunk = '';
}
}
// push the trailing piece unless it is already the suffix of the last chunk
if (chunk && !chunks[chunks.length - 1]?.endsWith(chunk)) {
chunks.push(chunk);
}
return chunks;
};
try {
const chunks = splitTextRecursively({ text, step: 0 });
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
return {
chunks,
tokens
};
} catch (err) {
throw new Error(getErrText(err));
}
};
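For orientation, a minimal sketch of how this removed helper was typically called; the maxLen value below is illustrative, not taken from this commit:

const { chunks, tokens } = splitText2Chunks({ text: rawText, maxLen: 500 });
// chunks: pieces of at most maxLen chars, neighbours sharing ~15% overlap
// tokens: total gpt-3.5-turbo token count summed over all chunks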

File diff suppressed because one or more lines are too long

View File

@@ -1,95 +0,0 @@
/* Only the gpt-3.5-turbo tokenizer (cl100k_base) is used */
import type { ChatItemType } from '@fastgpt/global/core/chat/type';
import { Tiktoken } from 'js-tiktoken/lite';
import { adaptChat2GptMessages } from '@/utils/common/adapt/message';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constant';
import encodingJson from './cl100k_base.json';
/* Initialize the Tiktoken encoder, reusing a cached instance when available */
export function getTikTokenEnc() {
if (typeof window !== 'undefined' && window.TikToken) {
return window.TikToken;
}
if (typeof global !== 'undefined' && global.TikToken) {
return global.TikToken;
}
const enc = new Tiktoken(encodingJson);
if (typeof window !== 'undefined') {
window.TikToken = enc;
}
if (typeof global !== 'undefined') {
global.TikToken = enc;
}
return enc;
}
/* Count the tokens of a single prompt */
export function countPromptTokens(
prompt = '',
role: '' | `${ChatCompletionRequestMessageRoleEnum}` = ''
) {
const enc = getTikTokenEnc();
const text = `${role}\n${prompt}`;
try {
const encodeText = enc.encode(text);
return encodeText.length + 3; // +3 approximates the extra tokens added for the role
} catch (error) {
return text.length;
}
}
/* Count tokens across a list of messages */
export function countMessagesTokens({ messages }: { messages: ChatItemType[] }) {
const adaptMessages = adaptChat2GptMessages({ messages, reserveId: true });
let totalTokens = 0;
for (let i = 0; i < adaptMessages.length; i++) {
const item = adaptMessages[i];
const tokens = countPromptTokens(item.content, item.role);
totalTokens += tokens;
}
return totalTokens;
}
export function sliceTextByTokens({ text, length }: { text: string; length: number }) {
const enc = getTikTokenEnc();
try {
const encodeText = enc.encode(text);
return enc.decode(encodeText.slice(0, length));
} catch (error) {
return text.slice(0, length);
}
}
/* Slice messages from top to bottom, keeping the total within maxTokens */
export function sliceMessagesTB({
messages,
maxTokens
}: {
messages: ChatItemType[];
maxTokens: number;
}) {
const adaptMessages = adaptChat2GptMessages({ messages, reserveId: true });
let reduceTokens = maxTokens;
let result: ChatItemType[] = [];
for (let i = 0; i < adaptMessages.length; i++) {
const item = adaptMessages[i];
const tokens = countPromptTokens(item.content, item.role);
reduceTokens -= tokens;
if (reduceTokens > 0) {
result.push(messages[i]);
} else {
break;
}
}
return result.length === 0 && messages[0] ? [messages[0]] : result;
}
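A hedged usage sketch of the two history helpers above; the history value and its loader are hypothetical:

const history: ChatItemType[] = loadChatHistory(); // hypothetical loader
const total = countMessagesTokens({ messages: history });
const trimmed = sliceMessagesTB({ messages: history, maxTokens: 2000 });
// trimmed keeps messages from the top until the token budget is spent,
// falling back to just the first message when nothing fits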

View File

@@ -2,7 +2,7 @@ import { DatasetCollectionTypeEnum, DatasetTypeEnum } from '@fastgpt/global/core
import type { RequestPaging } from '@/types';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import type { SearchTestItemType } from '@/types/core/dataset';
import { DatasetChunkItemType, UploadChunkItemType } from '@fastgpt/global/core/dataset/type';
import { UploadChunkItemType } from '@fastgpt/global/core/dataset/type';
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type';
import { PermissionTypeEnum } from '@fastgpt/global/support/permission/constant';
@@ -10,19 +10,11 @@ import { PermissionTypeEnum } from '@fastgpt/global/support/permission/constant'
export type DatasetUpdateParams = {
id: string;
parentId?: string;
tags?: string;
tags?: string[];
name?: string;
avatar?: string;
permission?: `${PermissionTypeEnum}`;
};
export type CreateDatasetParams = {
parentId?: string;
name: string;
tags: string;
avatar: string;
vectorModel?: string;
type: `${DatasetTypeEnum}`;
};
export type SearchTestProps = {
datasetId: string;
@@ -54,20 +46,6 @@ export type UpdateDatasetCollectionParams = {
};
/* ==== data ===== */
export type SetOneDatasetDataProps = {
id?: string;
collectionId: string;
q?: string; // embedding content
a?: string; // bonus content
};
export type PushDataProps = {
collectionId: string;
data: DatasetChunkItemType[];
mode: `${TrainingModeEnum}`;
prompt?: string;
billId?: string;
};
export type GetDatasetDataListProps = RequestPaging & {
searchText?: string;
collectionId: string;

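A hypothetical call shape reflecting the tags change above (tags?: string becomes tags?: string[]); all values are placeholders:

const params: DatasetUpdateParams = {
  id: 'datasetId',
  name: 'My dataset',
  tags: ['doc', 'faq'] // previously a single comma-separated string
};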
View File

@@ -0,0 +1,35 @@
import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type';
/* ================= dataset ===================== */
export type CreateDatasetParams = {
parentId?: string;
name: string;
tags: string;
avatar: string;
vectorModel?: string;
type: `${DatasetTypeEnum}`;
};
/* ================= collection ===================== */
/* ================= data ===================== */
export type InsertOneDatasetDataProps = PushDatasetDataChunkProps & {
collectionId: string;
};
export type PushDatasetDataProps = {
collectionId: string;
data: PushDatasetDataChunkProps[];
mode: `${TrainingModeEnum}`;
prompt?: string;
billId?: string;
};
export type UpdateDatasetDataProps = {
id: string;
q?: string; // embedding content
a?: string; // bonus content
indexes: (Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string; // pg data id
})[];
};
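A sketch of a payload matching the new PushDatasetDataProps; the mode value and the data fields are assumptions based on the imported types, not taken from this commit:

const payload: PushDatasetDataProps = {
  collectionId: 'collectionId', // placeholder
  mode: 'chunk',                // assumed member of TrainingModeEnum
  data: [{ q: 'embedding content', a: 'bonus content' }]
};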

View File

@@ -1,5 +1,8 @@
import { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type';
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import {
DatasetCollectionSchemaType,
DatasetDataSchemaType
} from '@fastgpt/global/core/dataset/type.d';
/* ================= dataset ===================== */
@@ -11,7 +14,7 @@ export type DatasetCollectionsListItemType = {
name: string;
type: DatasetCollectionSchemaType['type'];
updateTime: Date;
dataAmount?: number;
dataAmount: number;
trainingAmount: number;
metadata: DatasetCollectionSchemaType['metadata'];
canWrite: boolean;
@@ -19,7 +22,10 @@ export type DatasetCollectionsListItemType = {
/* ================= data ===================== */
export type DatasetDataListItemType = {
id: string;
_id: string;
datasetId: string;
collectionId: string;
q: string; // embedding content
a: string; // bonus content
indexes: DatasetDataSchemaType['indexes'];
};
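A hypothetical object satisfying the updated list item type; the hunk renders both id and _id without +/- markers, so both are shown here and all values are placeholders:

const item: DatasetDataListItemType = {
  id: 'dataId',
  _id: 'dataId',
  datasetId: 'datasetId',
  collectionId: 'collectionId',
  q: 'embedding content',
  a: 'bonus content',
  indexes: [] // DatasetDataSchemaType['indexes']
};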