This commit is contained in:
Archer
2023-11-15 11:36:25 +08:00
committed by GitHub
parent 592e1a93a2
commit bfd8be5df0
181 changed files with 2499 additions and 1552 deletions

View File

@@ -0,0 +1,131 @@
import { getErrText } from '../error/utils';
import { countPromptTokens } from './tiktoken';
/**
* text split into chunks
* maxLen - one chunk len. max: 3500
* overlapLen - The size of the before and after Text
* maxLen > overlapLen
* markdown
*/
export const splitText2Chunks = (props: { text: string; maxLen: number; overlapLen?: number }) => {
const { text = '', maxLen, overlapLen = Math.floor(maxLen * 0.2) } = props;
const tempMarker = 'SPLIT_HERE_SPLIT_HERE';
const stepReg: Record<number, RegExp> = {
0: /^(#\s[^\n]+)\n/gm,
1: /^(##\s[^\n]+)\n/gm,
2: /^(###\s[^\n]+)\n/gm,
3: /^(####\s[^\n]+)\n/gm,
4: /(\n\n)/g,
5: /([\n])/g,
6: /[。]|(?!<[^a-zA-Z])\.\s/g,
7: /([]|!\s|\?\s)/g,
8: /([]|;\s)/g,
9: /([]|,\s)/g
};
const splitTextRecursively = ({
text = '',
step,
lastChunk,
overlayChunk
}: {
text: string;
step: number;
lastChunk: string;
overlayChunk: string;
}) => {
if (text.length <= maxLen) {
return [text];
}
const reg = stepReg[step];
const isMarkdownSplit = step < 4;
if (!reg) {
// use slice-maxLen to split text
const chunks: string[] = [];
let chunk = '';
for (let i = 0; i < text.length; i += maxLen - overlapLen) {
chunk = text.slice(i, i + maxLen);
chunks.push(chunk);
}
return chunks;
}
// split text by special char
const splitTexts = text
.replace(reg, isMarkdownSplit ? `${tempMarker}$1` : `$1${tempMarker}`)
.split(`${tempMarker}`)
.filter((part) => part);
let chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
let text = splitTexts[i];
let chunkToken = countPromptTokens(lastChunk, '');
const textToken = countPromptTokens(text, '');
// next chunk is too large / new chunk is too large(The current chunk must be smaller than maxLen)
if (textToken >= maxLen || chunkToken + textToken > maxLen * 1.4) {
// last chunk is too large, push it to chunks, not add to next chunk
if (chunkToken > maxLen * 0.7) {
chunks.push(lastChunk);
lastChunk = '';
overlayChunk = '';
}
// chunk is small, insert to next chunks
const innerChunks = splitTextRecursively({
text,
step: step + 1,
lastChunk,
overlayChunk
});
if (innerChunks.length === 0) continue;
chunks = chunks.concat(innerChunks);
lastChunk = '';
overlayChunk = '';
continue;
}
// size less than maxLen, push text to last chunk
lastChunk += text;
chunkToken += textToken; // Definitely less than 1.4 * maxLen
// size over lapLen, push it to next chunk
if (
overlapLen !== 0 &&
!isMarkdownSplit &&
chunkToken >= maxLen - overlapLen &&
textToken < overlapLen
) {
overlayChunk += text;
}
if (chunkToken >= maxLen) {
chunks.push(lastChunk);
lastChunk = overlayChunk;
overlayChunk = '';
}
}
/* If the last chunk is independent, it needs to be push chunks. */
if (lastChunk && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastChunk)) {
chunks.push(lastChunk);
}
return chunks;
};
try {
const chunks = splitTextRecursively({ text, step: 0, lastChunk: '', overlayChunk: '' });
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
return {
chunks,
tokens
};
} catch (err) {
throw new Error(getErrText(err));
}
};

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,84 @@
/* Only the token of gpt-3.5-turbo is used */
import type { ChatItemType } from '../../../core/chat/type';
import { Tiktoken } from 'js-tiktoken/lite';
import { adaptChat2GptMessages } from '../../../core/chat/adapt';
import { ChatCompletionRequestMessageRoleEnum } from '../../../core/ai/constant';
import encodingJson from './cl100k_base.json';
/* init tikToken obj */
export function getTikTokenEnc() {
if (typeof window !== 'undefined' && window.TikToken) {
return window.TikToken;
}
if (typeof global !== 'undefined' && global.TikToken) {
return global.TikToken;
}
const enc = new Tiktoken(encodingJson);
if (typeof window !== 'undefined') {
window.TikToken = enc;
}
if (typeof global !== 'undefined') {
global.TikToken = enc;
}
return enc;
}
/* count one prompt tokens */
export function countPromptTokens(
prompt = '',
role: '' | `${ChatCompletionRequestMessageRoleEnum}` = ''
) {
const enc = getTikTokenEnc();
const text = `${role}\n${prompt}`;
try {
const encodeText = enc.encode(text);
return encodeText.length + 3; // 补充 role 估算值
} catch (error) {
return text.length;
}
}
/* count messages tokens */
export function countMessagesTokens({ messages }: { messages: ChatItemType[] }) {
const adaptMessages = adaptChat2GptMessages({ messages, reserveId: true });
let totalTokens = 0;
for (let i = 0; i < adaptMessages.length; i++) {
const item = adaptMessages[i];
const tokens = countPromptTokens(item.content, item.role);
totalTokens += tokens;
}
return totalTokens;
}
/* slice messages from top to bottom by maxTokens */
export function sliceMessagesTB({
messages,
maxTokens
}: {
messages: ChatItemType[];
maxTokens: number;
}) {
const adaptMessages = adaptChat2GptMessages({ messages, reserveId: true });
let reduceTokens = maxTokens;
let result: ChatItemType[] = [];
for (let i = 0; i < adaptMessages.length; i++) {
const item = adaptMessages[i];
const tokens = countPromptTokens(item.content, item.role);
reduceTokens -= tokens;
if (reduceTokens > 0) {
result.push(messages[i]);
} else {
break;
}
}
return result.length === 0 && messages[0] ? [messages[0]] : result;
}

View File

@@ -0,0 +1,5 @@
import type { Tiktoken } from 'js-tiktoken';
declare global {
var TikToken: Tiktoken;
}

View File

@@ -1,13 +1,15 @@
import crypto from 'crypto';
/* check string is a web link */
export function strIsLink(str?: string) {
if (!str) return false;
if (/^((http|https)?:\/\/|www\.|\/)[^\s/$.?#].[^\s]*$/i.test(str)) return true;
return false;
}
export const hashStr = (psw: string) => {
return crypto.createHash('sha256').update(psw).digest('hex');
/* hash string */
export const hashStr = (str: string) => {
return crypto.createHash('sha256').update(str).digest('hex');
};
/* simple text, remove chinese space and extra \n */
@@ -20,3 +22,16 @@ export const simpleText = (text: string) => {
return text;
};
/*
replace {{variable}} to value
*/
export function replaceVariable(text: string, obj: Record<string, string | number>) {
for (const key in obj) {
const val = obj[key];
if (!['string', 'number'].includes(typeof val)) continue;
text = text.replace(new RegExp(`{{(${key})}}`, 'g'), String(val));
}
return text || '';
}

5
packages/global/core/ai/api.d.ts vendored Normal file
View File

@@ -0,0 +1,5 @@
export type PostReRankProps = {
query: string;
inputs: { id: string; text: string }[];
};
export type PostReRankResponse = { id: string; score: number }[];

View File

@@ -0,0 +1,40 @@
import type { ChatItemType } from '../../core/chat/type.d';
import { ChatRoleEnum } from '../../core/chat/constants';
import { ChatCompletionRequestMessageRoleEnum } from '../../core/ai/constant';
import type { ChatMessageItemType } from '../../core/ai/type.d';
const chat2Message = {
[ChatRoleEnum.AI]: ChatCompletionRequestMessageRoleEnum.Assistant,
[ChatRoleEnum.Human]: ChatCompletionRequestMessageRoleEnum.User,
[ChatRoleEnum.System]: ChatCompletionRequestMessageRoleEnum.System,
[ChatRoleEnum.Function]: ChatCompletionRequestMessageRoleEnum.Function,
[ChatRoleEnum.Tool]: ChatCompletionRequestMessageRoleEnum.Tool
};
const message2Chat = {
[ChatCompletionRequestMessageRoleEnum.System]: ChatRoleEnum.System,
[ChatCompletionRequestMessageRoleEnum.User]: ChatRoleEnum.Human,
[ChatCompletionRequestMessageRoleEnum.Assistant]: ChatRoleEnum.AI,
[ChatCompletionRequestMessageRoleEnum.Function]: ChatRoleEnum.Function,
[ChatCompletionRequestMessageRoleEnum.Tool]: ChatRoleEnum.Tool
};
export function adaptRole_Chat2Message(role: `${ChatRoleEnum}`) {
return chat2Message[role];
}
export function adaptRole_Message2Chat(role: `${ChatCompletionRequestMessageRoleEnum}`) {
return message2Chat[role];
}
export const adaptChat2GptMessages = ({
messages,
reserveId
}: {
messages: ChatItemType[];
reserveId: boolean;
}): ChatMessageItemType[] => {
return messages.map((item) => ({
...(reserveId && { dataId: item.dataId }),
role: chat2Message[item.obj],
content: item.value || ''
}));
};

20
packages/global/core/dataset/api.d.ts vendored Normal file
View File

@@ -0,0 +1,20 @@
import { DatasetDataIndexItemType } from './type';
/* ================= dataset ===================== */
/* ================= collection ===================== */
/* ================= data ===================== */
export type PgSearchRawType = {
id: string;
team_id: string;
tmb_id: string;
collection_id: string;
data_id: string;
score: number;
};
export type PushDatasetDataChunkProps = {
q: string; // embedding content
a?: string; // bonus content
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
};

View File

@@ -36,29 +36,54 @@ export const DatasetCollectionTypeMap = {
}
};
export enum TrainingModeEnum {
'qa' = 'qa',
'index' = 'index'
export enum DatasetDataIndexTypeEnum {
chunk = 'chunk',
qa = 'qa',
summary = 'summary',
hypothetical = 'hypothetical',
custom = 'custom'
}
export const TrainingTypeMap = {
[TrainingModeEnum.qa]: 'qa',
[TrainingModeEnum.index]: 'index'
};
export enum DatasetSpecialIdEnum {
manual = 'manual',
mark = 'mark'
}
export const datasetSpecialIdMap = {
[DatasetSpecialIdEnum.manual]: {
name: 'kb.Manual Data',
sourceName: 'kb.Manual Input'
export const DatasetDataIndexTypeMap = {
[DatasetDataIndexTypeEnum.chunk]: {
name: 'dataset.data.indexes.chunk'
},
[DatasetSpecialIdEnum.mark]: {
name: 'kb.Mark Data',
sourceName: 'kb.Manual Mark'
[DatasetDataIndexTypeEnum.summary]: {
name: 'dataset.data.indexes.summary'
},
[DatasetDataIndexTypeEnum.hypothetical]: {
name: 'dataset.data.indexes.hypothetical'
},
[DatasetDataIndexTypeEnum.qa]: {
name: 'dataset.data.indexes.qa'
},
[DatasetDataIndexTypeEnum.custom]: {
name: 'dataset.data.indexes.custom'
}
};
export const datasetSpecialIds: string[] = [DatasetSpecialIdEnum.manual, DatasetSpecialIdEnum.mark];
export enum TrainingModeEnum {
'chunk' = 'chunk',
'qa' = 'qa'
// 'hypothetical' = 'hypothetical',
// 'summary' = 'summary',
// 'multipleIndex' = 'multipleIndex'
}
export const TrainingTypeMap = {
[TrainingModeEnum.chunk]: {
name: 'chunk'
},
[TrainingModeEnum.qa]: {
name: 'qa'
}
// [TrainingModeEnum.hypothetical]: {
// name: 'hypothetical'
// },
// [TrainingModeEnum.summary]: {
// name: 'summary'
// },
// [TrainingModeEnum.multipleIndex]: {
// name: 'multipleIndex'
// }
};
export const FolderAvatarSrc = '/imgs/files/folder.svg';

View File

@@ -0,0 +1,27 @@
import type { DatasetDataIndexItemType, DatasetDataSchemaType } from './type';
export type CreateDatasetDataProps = {
teamId: string;
tmbId: string;
datasetId: string;
collectionId: string;
q: string;
a?: string;
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
};
export type UpdateDatasetDataProps = {
dataId: string;
q?: string;
a?: string;
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string; // pg data id
})[];
};
export type PatchIndexesProps = {
type: 'create' | 'update' | 'delete';
index: Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string;
};
};

View File

@@ -1,6 +1,14 @@
import type { VectorModelItemType } from '../../core/ai/model.d';
import { PermissionTypeEnum } from '../../support/permission/constant';
import { DatasetCollectionTypeEnum, DatasetTypeEnum, TrainingModeEnum } from './constant';
import { PushDatasetDataChunkProps } from './api';
import {
DatasetCollectionTypeEnum,
DatasetDataIndexTypeEnum,
DatasetTypeEnum,
TrainingModeEnum
} from './constant';
/* schema */
export type DatasetSchemaType = {
_id: string;
parentId: string;
@@ -33,13 +41,33 @@ export type DatasetCollectionSchemaType = {
};
};
export type DatasetDataIndexItemType = {
defaultIndex: boolean;
dataId: string; // pg data id
type: `${DatasetDataIndexTypeEnum}`;
text: string;
};
export type DatasetDataSchemaType = {
_id: string;
userId: string;
teamId: string;
tmbId: string;
datasetId: string;
collectionId: string;
datasetId: string;
collectionId: string;
q: string; // large chunks or question
a: string; // answer or custom content
indexes: DatasetDataIndexItemType[];
};
export type DatasetTrainingSchemaType = {
_id: string;
userId: string;
teamId: string;
tmbId: string;
datasetId: string;
datasetCollectionId: string;
collectionId: string;
billId: string;
expireAt: Date;
lockTime: Date;
@@ -48,6 +76,7 @@ export type DatasetTrainingSchemaType = {
prompt: string;
q: string;
a: string;
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
};
export type CollectionWithDatasetType = Omit<DatasetCollectionSchemaType, 'datasetId'> & {
@@ -55,41 +84,31 @@ export type CollectionWithDatasetType = Omit<DatasetCollectionSchemaType, 'datas
};
/* ================= dataset ===================== */
/* ================= collection ===================== */
export type DatasetCollectionItemType = DatasetCollectionSchemaType & {
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel'> & {
vectorModel: VectorModelItemType;
isOwner: boolean;
canWrite: boolean;
};
/* ================= collection ===================== */
export type DatasetCollectionItemType = CollectionWithDatasetType & {
canWrite: boolean;
sourceName: string;
sourceId?: string;
};
/* ================= data ===================== */
export type PgRawDataItemType = {
id: string;
q: string;
a: string;
team_id: string;
tmb_id: string;
dataset_id: string;
collection_id: string;
};
export type PgDataItemType = {
id: string;
q: string;
a: string;
teamId: string;
tmbId: string;
datasetId: string;
collectionId: string;
};
export type DatasetChunkItemType = {
q: string;
a: string;
};
export type DatasetDataItemType = DatasetChunkItemType & {
export type DatasetDataItemType = {
id: string;
datasetId: string;
collectionId: string;
sourceName: string;
sourceId?: string;
q: string;
a: string;
indexes: DatasetDataIndexItemType[];
isOwner: boolean;
canWrite: boolean;
};
/* --------------- file ---------------------- */
@@ -109,9 +128,6 @@ export type DatasetFileSchema = {
};
/* ============= search =============== */
export type SearchDataResultItemType = PgRawDataItemType & {
score: number;
};
export type SearchDataResponseItemType = DatasetDataItemType & {
score: number;
};

View File

@@ -1,4 +1,4 @@
import { DatasetCollectionTypeEnum } from './constant';
import { DatasetCollectionTypeEnum, DatasetDataIndexTypeEnum } from './constant';
import { getFileIcon } from '../../common/file/icon';
import { strIsLink } from '../../common/string/tools';
@@ -44,3 +44,14 @@ export function getSourceNameIcon({
}
return '/imgs/files/collection.svg';
}
export function getDefaultIndex(props?: { q?: string; a?: string; dataId?: string }) {
const { q = '', a, dataId } = props || {};
const qaStr = `${q}\n${a}`.trim();
return {
defaultIndex: true,
type: a ? DatasetDataIndexTypeEnum.qa : DatasetDataIndexTypeEnum.chunk,
text: a ? qaStr : q,
dataId
};
}

View File

@@ -0,0 +1,3 @@
import { VectorModelItemType } from '../ai/model.d';
export type SelectedDatasetType = { datasetId: string; vectorModel: VectorModelItemType }[];

View File

@@ -6,7 +6,8 @@
"timezones-list": "^3.0.2",
"dayjs": "^1.11.7",
"encoding": "^0.1.13",
"openai": "^4.16.1"
"openai": "^4.16.1",
"js-tiktoken": "^1.0.7"
},
"devDependencies": {
"@types/node": "^20.8.5"

View File

@@ -24,7 +24,8 @@ export const TeamMemberRoleMap = {
export enum TeamMemberStatusEnum {
waiting = 'waiting',
active = 'active',
reject = 'reject'
reject = 'reject',
leave = 'leave'
}
export const TeamMemberStatusMap = {
[TeamMemberStatusEnum.waiting]: {
@@ -38,5 +39,10 @@ export const TeamMemberStatusMap = {
[TeamMemberStatusEnum.reject]: {
label: 'user.team.member.reject',
color: 'red.600'
},
[TeamMemberStatusEnum.leave]: {
label: 'user.team.member.leave',
color: 'red.600'
}
};
export const leaveStatus = { $ne: TeamMemberStatusEnum.leave };

View File

@@ -37,4 +37,7 @@ export type UpdateInviteProps = {
tmbId: string;
status: TeamMemberSchema['status'];
};
export type InviteMemberResponse = Record<'invite' | 'inValid' | 'inTeam', string[]>;
export type InviteMemberResponse = Record<
'invite' | 'inValid' | 'inTeam',
{ username: string; userId: string }[]
>;

View File

@@ -16,6 +16,7 @@ export type TeamMemberSchema = {
teamId: string;
userId: string;
createTime: Date;
name: string;
role: `${TeamMemberRoleEnum}`;
status: `${TeamMemberStatusEnum}`;
defaultTeam: boolean;
@@ -25,6 +26,7 @@ export type TeamItemType = {
userId: string;
teamId: string;
teamName: string;
memberName: string;
avatar: string;
balance: number;
tmbId: string;
@@ -39,7 +41,7 @@ export type TeamMemberItemType = {
userId: string;
tmbId: string;
teamId: string;
memberUsername: string;
memberName: string;
avatar: string;
role: `${TeamMemberRoleEnum}`;
status: `${TeamMemberStatusEnum}`;

View File

@@ -15,7 +15,7 @@ export type BillSchema = CreateBillProps & {
export type BillItemType = {
id: string;
username: string;
memberName: string;
time: Date;
appName: string;
source: BillSchema['source'];