v4.6-3 (#471)

2025-07-23 05:12:39 +00:00 · 2023-11-15 11:36:25 +08:00
parent 592e1a93a2
commit bfd8be5df0
181 changed files with 2499 additions and 1552 deletions
--- a/packages/global/common/string/textSplitter.ts
+++ b/packages/global/common/string/textSplitter.ts
@@ -0,0 +1,131 @@
+import { getErrText } from '../error/utils';
+import { countPromptTokens } from './tiktoken';
+
+/**
+ * Split text into chunks, cutting at natural boundaries where possible.
+ *
+ * Separators are tried from coarsest to finest: markdown headings (# to ####), blank
+ * lines, single newlines, sentence punctuation, then clause punctuation. A piece that is
+ * still too long after the last separator is cut into fixed-size, overlapping slices.
+ *
+ * @param props.text - text to split (defaults to '').
+ * @param props.maxLen - target size of one chunk. It is compared with `text.length` for
+ *   the early exit and the slice fallback, and with `countPromptTokens` results when
+ *   pieces are merged. The original note recommends at most 3500; this is not enforced.
+ * @param props.overlapLen - amount of trailing text repeated at the start of the next
+ *   chunk. Defaults to 20% of maxLen and is expected to be smaller than maxLen.
+ * @returns `chunks` (the pieces, in order) and `tokens` (sum of
+ *   `countPromptTokens(chunk, 'system')` over all chunks).
+ * @throws Error whose message is `getErrText(err)` for any failure while splitting.
+ */
+export const splitText2Chunks = (props: { text: string; maxLen: number; overlapLen?: number }) => {
+  const { text = '', maxLen, overlapLen = Math.floor(maxLen * 0.2) } = props;
+  const tempMarker = 'SPLIT_HERE_SPLIT_HERE';
+
+  // Every pattern must expose the separator as capture group 1: the replacement templates
+  // below re-insert it with `$1`, and a pattern without a group would emit a literal "$1".
+  // NOTE(review): the heading patterns match the trailing "\n" outside the group, so that
+  // newline is not re-inserted after a heading — confirm this is intended.
+  const stepReg: Record<number, RegExp> = {
+    0: /^(#\s[^\n]+)\n/gm,
+    1: /^(##\s[^\n]+)\n/gm,
+    2: /^(###\s[^\n]+)\n/gm,
+    3: /^(####\s[^\n]+)\n/gm,
+
+    4: /(\n\n)/g,
+    5: /([\n])/g,
+    // Sentence end: full-width "。" or ". ". The previous `(?!<[^a-zA-Z])` prefix was a
+    // mistyped lookbehind that never affected matching, so the matched set is unchanged.
+    6: /([。]|\.\s)/g,
+    7: /([！？]|!\s|\?\s)/g,
+    8: /([；]|;\s)/g,
+    9: /([，]|,\s)/g
+  };
+
+  const splitTextRecursively = ({
+    text = '',
+    step,
+    lastChunk,
+    overlayChunk
+  }: {
+    text: string;
+    step: number;
+    lastChunk: string;
+    overlayChunk: string;
+  }) => {
+    // NOTE(review): a pending `lastChunk` handed in by the caller is not included when this
+    // early return or the slice fallback below is taken — confirm whether it should be flushed.
+    if (text.length <= maxLen) {
+      return [text];
+    }
+    const reg = stepReg[step];
+    const isMarkdownSplit = step < 4;
+
+    if (!reg) {
+      // No separator left: cut fixed-size slices that overlap by overlapLen.
+      // The stride is clamped to 1 so that overlapLen >= maxLen cannot loop forever.
+      const sliceStride = Math.max(1, maxLen - overlapLen);
+      const chunks: string[] = [];
+      for (let i = 0; i < text.length; i += sliceStride) {
+        chunks.push(text.slice(i, i + maxLen));
+      }
+      return chunks;
+    }
+
+    // Mark every separator, then split on the marker. Headings start a new piece (marker
+    // goes before them); every other separator ends the current piece (marker goes after).
+    const splitTexts = text
+      .replace(reg, isMarkdownSplit ? `${tempMarker}$1` : `$1${tempMarker}`)
+      .split(`${tempMarker}`)
+      .filter((part) => part);
+
+    let chunks: string[] = [];
+    for (let i = 0; i < splitTexts.length; i++) {
+      const part = splitTexts[i];
+      let chunkToken = countPromptTokens(lastChunk, '');
+      const textToken = countPromptTokens(part, '');
+
+      // This piece alone is too large, or appending it would overshoot the pending chunk.
+      if (textToken >= maxLen || chunkToken + textToken > maxLen * 1.4) {
+        // The pending chunk is already big enough: emit it on its own.
+        if (chunkToken > maxLen * 0.7) {
+          chunks.push(lastChunk);
+          lastChunk = '';
+          overlayChunk = '';
+        }
+        // Otherwise the small pending chunk is passed down as the prefix of the finer split.
+        const innerChunks = splitTextRecursively({
+          text: part,
+          step: step + 1,
+          lastChunk,
+          overlayChunk
+        });
+        if (innerChunks.length === 0) continue;
+        chunks = chunks.concat(innerChunks);
+        lastChunk = '';
+        overlayChunk = '';
+        continue;
+      }
+
+      // The piece fits: append it to the pending chunk.
+      lastChunk += part;
+      chunkToken += textToken; // Definitely less than 1.4 * maxLen
+
+      // Close to the limit: remember small trailing pieces as the overlap for the next chunk.
+      if (
+        overlapLen !== 0 &&
+        !isMarkdownSplit &&
+        chunkToken >= maxLen - overlapLen &&
+        textToken < overlapLen
+      ) {
+        overlayChunk += part;
+      }
+      if (chunkToken >= maxLen) {
+        chunks.push(lastChunk);
+        lastChunk = overlayChunk;
+        overlayChunk = '';
+      }
+    }
+
+    // Flush whatever is still pending. It is skipped only when it is pure overlap, i.e. the
+    // previously emitted chunk already ends with it. When nothing has been emitted yet the
+    // pending text is the only copy of the content and must be kept.
+    const tailChunk = chunks[chunks.length - 1];
+    if (lastChunk && (tailChunk === undefined || !tailChunk.endsWith(lastChunk))) {
+      chunks.push(lastChunk);
+    }
+
+    return chunks;
+  };
+
+  try {
+    const chunks = splitTextRecursively({ text, step: 0, lastChunk: '', overlayChunk: '' });
+
+    const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
+
+    return {
+      chunks,
+      tokens
+    };
+  } catch (err) {
+    throw new Error(getErrText(err));
+  }
+};
--- a/packages/global/common/string/tiktoken/cl100k_base.json
+++ b/packages/global/common/string/tiktoken/cl100k_base.json
--- a/packages/global/common/string/tiktoken/index.ts
+++ b/packages/global/common/string/tiktoken/index.ts
@@ -0,0 +1,84 @@
+/* Only the token of gpt-3.5-turbo is used */
+import type { ChatItemType } from '../../../core/chat/type';
+import { Tiktoken } from 'js-tiktoken/lite';
+import { adaptChat2GptMessages } from '../../../core/chat/adapt';
+import { ChatCompletionRequestMessageRoleEnum } from '../../../core/ai/constant';
+import encodingJson from './cl100k_base.json';
+
+/**
+ * Return the shared Tiktoken encoder, building it on first use.
+ * An instance already stored on `window` or `global` is reused; a newly built one is
+ * stored on whichever of the two objects exist.
+ */
+export function getTikTokenEnc() {
+  const hasWindow = typeof window !== 'undefined';
+  const hasGlobal = typeof global !== 'undefined';
+
+  if (hasWindow && window.TikToken) return window.TikToken;
+  if (hasGlobal && global.TikToken) return global.TikToken;
+
+  const encoder = new Tiktoken(encodingJson);
+  if (hasWindow) window.TikToken = encoder;
+  if (hasGlobal) global.TikToken = encoder;
+
+  return encoder;
+}
+
+/**
+ * Count the tokens of a single prompt.
+ * The role and the prompt are joined with a newline before encoding, and 3 is added to
+ * the encoded length as an estimate for the role overhead. If encoding throws, the
+ * character length of the joined text is returned instead.
+ */
+export function countPromptTokens(
+  prompt = '',
+  role: '' | `${ChatCompletionRequestMessageRoleEnum}` = ''
+) {
+  // Obtained outside the try block on purpose: a failure to build the encoder propagates.
+  const encoder = getTikTokenEnc();
+  const input = `${role}\n${prompt}`;
+
+  let encodedLength: number;
+  try {
+    encodedLength = encoder.encode(input).length;
+  } catch (error) {
+    return input.length;
+  }
+  return encodedLength + 3; // role overhead estimate
+}
+
+/**
+ * Sum `countPromptTokens` over every message, after converting the chat items with
+ * `adaptChat2GptMessages`.
+ */
+export function countMessagesTokens({ messages }: { messages: ChatItemType[] }) {
+  return adaptChat2GptMessages({ messages, reserveId: true }).reduce(
+    (total, message) => total + countPromptTokens(message.content, message.role),
+    0
+  );
+}
+
+/**
+ * Keep messages from the start of the list while they fit into `maxTokens`.
+ * Walking stops at the first message that uses up the remaining budget (budget <= 0 after
+ * subtracting it). If no message fits, the first message is still returned when present.
+ */
+export function sliceMessagesTB({
+  messages,
+  maxTokens
+}: {
+  messages: ChatItemType[];
+  maxTokens: number;
+}) {
+  const adapted = adaptChat2GptMessages({ messages, reserveId: true });
+  const kept: ChatItemType[] = [];
+  let budget = maxTokens;
+
+  for (let index = 0; index < adapted.length; index += 1) {
+    const current = adapted[index];
+    budget -= countPromptTokens(current.content, current.role);
+
+    if (budget <= 0) break;
+    kept.push(messages[index]);
+  }
+
+  if (kept.length === 0 && messages[0]) {
+    return [messages[0]];
+  }
+  return kept;
+}
--- a/packages/global/common/string/tiktoken/type.d.ts
+++ b/packages/global/common/string/tiktoken/type.d.ts
@@ -0,0 +1,5 @@
+import type { Tiktoken } from 'js-tiktoken';
+
+// Ambient declaration of the global slot that holds a Tiktoken instance.
+declare global {
+  // Declared with `var` so the name is typed as a property of the global object.
+  var TikToken: Tiktoken;
+}
--- a/packages/global/common/string/tools.ts
+++ b/packages/global/common/string/tools.ts
@@ -1,13 +1,15 @@
 import crypto from 'crypto';

+/* check string is a web link */
 export function strIsLink(str?: string) {
  if (!str) return false;
  if (/^((http|https)?:\/\/|www\.|\/)[^\s/$.?#].[^\s]*$/i.test(str)) return true;
  return false;
 }

-export const hashStr = (psw: string) => {
-  return crypto.createHash('sha256').update(psw).digest('hex');
+/* hash string */
+export const hashStr = (str: string) => {
+  return crypto.createHash('sha256').update(str).digest('hex');
 };

 /* simple text, remove chinese space and extra \n */
@@ -20,3 +22,16 @@ export const simpleText = (text: string) => {

  return text;
 };
+
+/**
+ * Replace every `{{key}}` placeholder in `text` with the matching value from `obj`.
+ *
+ * Keys are matched literally and values are inserted verbatim. Entries whose value is
+ * neither a string nor a number are ignored.
+ *
+ * @param text - template containing `{{key}}` placeholders.
+ * @param obj - placeholder name to replacement value.
+ * @returns the text with all known placeholders replaced ('' when `text` is falsy).
+ */
+export function replaceVariable(text: string, obj: Record<string, string | number>) {
+  for (const key in obj) {
+    const val = obj[key];
+    if (!['string', 'number'].includes(typeof val)) continue;
+
+    // Escape regex metacharacters so keys such as "a.b" or "user(id)" are matched literally
+    // instead of being interpreted as (possibly invalid) pattern syntax.
+    const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+    const replacement = String(val);
+
+    // A function replacer inserts the value as-is; a string replacement would expand
+    // sequences such as "$&" or "$1" that happen to occur inside the value.
+    text = text.replace(new RegExp(`\\{\\{(${escapedKey})\\}\\}`, 'g'), () => replacement);
+  }
+  return text || '';
+}
--- a/packages/global/core/ai/api.d.ts
+++ b/packages/global/core/ai/api.d.ts
@@ -0,0 +1,5 @@
+/** Input of a re-rank call: the query and the candidate texts, each identified by an id. */
+export type PostReRankProps = {
+  query: string;
+  inputs: { id: string; text: string }[];
+};
+/** Result of a re-rank call: a list of id/score pairs. */
+export type PostReRankResponse = { id: string; score: number }[];
--- a/packages/global/core/chat/adapt.ts
+++ b/packages/global/core/chat/adapt.ts
@@ -0,0 +1,40 @@
+import type { ChatItemType } from '../../core/chat/type.d';
+import { ChatRoleEnum } from '../../core/chat/constants';
+import { ChatCompletionRequestMessageRoleEnum } from '../../core/ai/constant';
+import type { ChatMessageItemType } from '../../core/ai/type.d';
+
+/* Lookup table: ChatRoleEnum value -> ChatCompletionRequestMessageRoleEnum value */
+const chat2Message = {
+  [ChatRoleEnum.AI]: ChatCompletionRequestMessageRoleEnum.Assistant,
+  [ChatRoleEnum.Human]: ChatCompletionRequestMessageRoleEnum.User,
+  [ChatRoleEnum.System]: ChatCompletionRequestMessageRoleEnum.System,
+  [ChatRoleEnum.Function]: ChatCompletionRequestMessageRoleEnum.Function,
+  [ChatRoleEnum.Tool]: ChatCompletionRequestMessageRoleEnum.Tool
+};
+/* Inverse lookup table: ChatCompletionRequestMessageRoleEnum value -> ChatRoleEnum value */
+const message2Chat = {
+  [ChatCompletionRequestMessageRoleEnum.System]: ChatRoleEnum.System,
+  [ChatCompletionRequestMessageRoleEnum.User]: ChatRoleEnum.Human,
+  [ChatCompletionRequestMessageRoleEnum.Assistant]: ChatRoleEnum.AI,
+  [ChatCompletionRequestMessageRoleEnum.Function]: ChatRoleEnum.Function,
+  [ChatCompletionRequestMessageRoleEnum.Tool]: ChatRoleEnum.Tool
+};
+
+/* Convert a chat role to the message role, using the chat2Message table. */
+export function adaptRole_Chat2Message(role: `${ChatRoleEnum}`) {
+  return chat2Message[role];
+}
+/* Convert a message role to the chat role, using the message2Chat table. */
+export function adaptRole_Message2Chat(role: `${ChatCompletionRequestMessageRoleEnum}`) {
+  return message2Chat[role];
+}
+
+/**
+ * Convert chat items into message items.
+ * The role is mapped through chat2Message and a missing/empty value becomes ''.
+ * The `dataId` key is copied to the result only when `reserveId` is truthy.
+ */
+export const adaptChat2GptMessages = ({
+  messages,
+  reserveId
+}: {
+  messages: ChatItemType[];
+  reserveId: boolean;
+}): ChatMessageItemType[] => {
+  return messages.map((item) => {
+    const base = {
+      role: chat2Message[item.obj],
+      content: item.value || ''
+    };
+    // dataId is spread first so the key order matches: dataId, role, content.
+    return reserveId ? { dataId: item.dataId, ...base } : base;
+  });
+};
--- a/packages/global/core/dataset/api.d.ts
+++ b/packages/global/core/dataset/api.d.ts
@@ -0,0 +1,20 @@
+import { DatasetDataIndexItemType } from './type';
+
+/* ================= dataset ===================== */
+
+/* ================= collection ===================== */
+
+/* ================= data ===================== */
+/* One raw search row with snake_case fields (presumably as returned by the pg store — confirm). */
+export type PgSearchRawType = {
+  id: string;
+  team_id: string;
+  tmb_id: string;
+  collection_id: string;
+  data_id: string;
+  score: number;
+};
+/* One chunk to push into a dataset: main content, optional extra content, optional indexes. */
+export type PushDatasetDataChunkProps = {
+  q: string; // embedding content
+  a?: string; // bonus content
+  // index entries without the dataId field
+  indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
+};
--- a/packages/global/core/dataset/constant.ts
+++ b/packages/global/core/dataset/constant.ts
@@ -36,29 +36,54 @@ export const DatasetCollectionTypeMap = {
  }
 };

-export enum TrainingModeEnum {
-  'qa' = 'qa',
-  'index' = 'index'
+export enum DatasetDataIndexTypeEnum {
+  chunk = 'chunk',
+  qa = 'qa',
+  summary = 'summary',
+  hypothetical = 'hypothetical',
+  custom = 'custom'
 }
-export const TrainingTypeMap = {
-  [TrainingModeEnum.qa]: 'qa',
-  [TrainingModeEnum.index]: 'index'
-};
-
-export enum DatasetSpecialIdEnum {
-  manual = 'manual',
-  mark = 'mark'
-}
-export const datasetSpecialIdMap = {
-  [DatasetSpecialIdEnum.manual]: {
-    name: 'kb.Manual Data',
-    sourceName: 'kb.Manual Input'
+export const DatasetDataIndexTypeMap = {
+  [DatasetDataIndexTypeEnum.chunk]: {
+    name: 'dataset.data.indexes.chunk'
  },
-  [DatasetSpecialIdEnum.mark]: {
-    name: 'kb.Mark Data',
-    sourceName: 'kb.Manual Mark'
+  [DatasetDataIndexTypeEnum.summary]: {
+    name: 'dataset.data.indexes.summary'
+  },
+  [DatasetDataIndexTypeEnum.hypothetical]: {
+    name: 'dataset.data.indexes.hypothetical'
+  },
+  [DatasetDataIndexTypeEnum.qa]: {
+    name: 'dataset.data.indexes.qa'
+  },
+  [DatasetDataIndexTypeEnum.custom]: {
+    name: 'dataset.data.indexes.custom'
  }
 };
-export const datasetSpecialIds: string[] = [DatasetSpecialIdEnum.manual, DatasetSpecialIdEnum.mark];
+
+/* Training modes. Only chunk and qa are active; the commented members are currently disabled. */
+export enum TrainingModeEnum {
+  'chunk' = 'chunk',
+  'qa' = 'qa'
+  // 'hypothetical' = 'hypothetical',
+  // 'summary' = 'summary',
+  // 'multipleIndex' = 'multipleIndex'
+}
+/* Per-mode metadata, keyed by TrainingModeEnum; each entry carries a `name`. */
+export const TrainingTypeMap = {
+  [TrainingModeEnum.chunk]: {
+    name: 'chunk'
+  },
+  [TrainingModeEnum.qa]: {
+    name: 'qa'
+  }
+  // Entries for the disabled modes, kept commented out alongside the enum members.
+  // [TrainingModeEnum.hypothetical]: {
+  //   name: 'hypothetical'
+  // },
+  // [TrainingModeEnum.summary]: {
+  //   name: 'summary'
+  // },
+  // [TrainingModeEnum.multipleIndex]: {
+  //   name: 'multipleIndex'
+  // }
+};

 export const FolderAvatarSrc = '/imgs/files/folder.svg';
--- a/packages/global/core/dataset/controller.d.ts
+++ b/packages/global/core/dataset/controller.d.ts
@@ -0,0 +1,27 @@
+import type { DatasetDataIndexItemType, DatasetDataSchemaType } from './type';
+
+/* Props for creating one dataset data item: owner ids, target dataset/collection and content. */
+export type CreateDatasetDataProps = {
+  teamId: string;
+  tmbId: string;
+  datasetId: string;
+  collectionId: string;
+  q: string;
+  a?: string;
+  // index entries without the dataId field
+  indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
+};
+
+/* Props for updating one dataset data item; every content field is optional. */
+export type UpdateDatasetDataProps = {
+  dataId: string;
+  q?: string;
+  a?: string;
+  // Unlike on create, an index entry may carry its dataId here.
+  indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & {
+    dataId?: string; // pg data id
+  })[];
+};
+
+/* One index change: the operation type plus the index entry it applies to. */
+export type PatchIndexesProps = {
+  type: 'create' | 'update' | 'delete';
+  // dataId is optional on the entry
+  index: Omit<DatasetDataIndexItemType, 'dataId'> & {
+    dataId?: string;
+  };
+};
--- a/packages/global/core/dataset/type.d.ts
+++ b/packages/global/core/dataset/type.d.ts
@@ -1,6 +1,14 @@
+import type { VectorModelItemType } from '../../core/ai/model.d';
 import { PermissionTypeEnum } from '../../support/permission/constant';
-import { DatasetCollectionTypeEnum, DatasetTypeEnum, TrainingModeEnum } from './constant';
+import { PushDatasetDataChunkProps } from './api';
+import {
+  DatasetCollectionTypeEnum,
+  DatasetDataIndexTypeEnum,
+  DatasetTypeEnum,
+  TrainingModeEnum
+} from './constant';

+/* schema */
 export type DatasetSchemaType = {
  _id: string;
  parentId: string;
@@ -33,13 +41,33 @@ export type DatasetCollectionSchemaType = {
  };
 };

+/* One index entry of a dataset data item. */
+export type DatasetDataIndexItemType = {
+  defaultIndex: boolean;
+  dataId: string; // pg data id
+  type: `${DatasetDataIndexTypeEnum}`; // one of the DatasetDataIndexTypeEnum values
+  text: string;
+};
+/* Stored shape of one dataset data item: owner ids, location, content and its indexes. */
+export type DatasetDataSchemaType = {
+  _id: string;
+  userId: string;
+  teamId: string;
+  tmbId: string;
+  // datasetId / collectionId are declared once; repeating a member in a type literal is a
+  // duplicate-identifier compile error.
+  datasetId: string;
+  collectionId: string;
+  q: string; // large chunks or question
+  a: string; // answer or custom content
+  indexes: DatasetDataIndexItemType[];
+};
+
 export type DatasetTrainingSchemaType = {
  _id: string;
  userId: string;
  teamId: string;
  tmbId: string;
  datasetId: string;
-  datasetCollectionId: string;
+  collectionId: string;
  billId: string;
  expireAt: Date;
  lockTime: Date;
@@ -48,6 +76,7 @@ export type DatasetTrainingSchemaType = {
  prompt: string;
  q: string;
  a: string;
+  indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
 };

 export type CollectionWithDatasetType = Omit<DatasetCollectionSchemaType, 'datasetId'> & {
@@ -55,41 +84,31 @@ export type CollectionWithDatasetType = Omit<DatasetCollectionSchemaType, 'datas
 };

 /* ================= dataset ===================== */
-
-/* ================= collection ===================== */
-export type DatasetCollectionItemType = DatasetCollectionSchemaType & {
+export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel'> & {
+  vectorModel: VectorModelItemType;
+  isOwner: boolean;
  canWrite: boolean;
 };

+/* ================= collection ===================== */
+/* CollectionWithDatasetType extended with a write flag and source information. */
+export type DatasetCollectionItemType = CollectionWithDatasetType & {
+  canWrite: boolean;
+  sourceName: string;
+  sourceId?: string;
+};
+
 /* ================= data ===================== */
-export type PgRawDataItemType = {
-  id: string;
-  q: string;
-  a: string;
-  team_id: string;
-  tmb_id: string;
-  dataset_id: string;
-  collection_id: string;
-};
-export type PgDataItemType = {
-  id: string;
-  q: string;
-  a: string;
-  teamId: string;
-  tmbId: string;
-  datasetId: string;
-  collectionId: string;
-};
-export type DatasetChunkItemType = {
-  q: string;
-  a: string;
-};
-export type DatasetDataItemType = DatasetChunkItemType & {
+export type DatasetDataItemType = {
  id: string;
  datasetId: string;
  collectionId: string;
  sourceName: string;
  sourceId?: string;
+  q: string;
+  a: string;
+  indexes: DatasetDataIndexItemType[];
+  isOwner: boolean;
+  canWrite: boolean;
 };

 /* --------------- file ---------------------- */
@@ -109,9 +128,6 @@ export type DatasetFileSchema = {
 };

 /* ============= search =============== */
-export type SearchDataResultItemType = PgRawDataItemType & {
-  score: number;
-};
 export type SearchDataResponseItemType = DatasetDataItemType & {
  score: number;
 };
--- a/packages/global/core/dataset/utils.ts
+++ b/packages/global/core/dataset/utils.ts
@@ -1,4 +1,4 @@
-import { DatasetCollectionTypeEnum } from './constant';
+import { DatasetCollectionTypeEnum, DatasetDataIndexTypeEnum } from './constant';
 import { getFileIcon } from '../../common/file/icon';
 import { strIsLink } from '../../common/string/tools';

@@ -44,3 +44,14 @@ export function getSourceNameIcon({
  }
  return '/imgs/files/collection.svg';
 }
+
+/**
+ * Build the default index entry for a data item.
+ * With a truthy `a` the entry has type qa and text `q + "\n" + a` (trimmed); otherwise it
+ * has type chunk and text `q`. `q` defaults to '' and `dataId` is passed through as given.
+ */
+export function getDefaultIndex(props?: { q?: string; a?: string; dataId?: string }) {
+  const { q = '', a, dataId } = props || {};
+
+  let type = DatasetDataIndexTypeEnum.chunk;
+  let text = q;
+  if (a) {
+    type = DatasetDataIndexTypeEnum.qa;
+    text = `${q}\n${a}`.trim();
+  }
+
+  return {
+    defaultIndex: true,
+    type,
+    text,
+    dataId
+  };
+}
--- a/packages/global/core/module/api.d.ts
+++ b/packages/global/core/module/api.d.ts
@@ -0,0 +1,3 @@
+import { VectorModelItemType } from '../ai/model.d';
+
+/* List of selected datasets: each entry pairs a dataset id with its vector model. */
+export type SelectedDatasetType = { datasetId: string; vectorModel: VectorModelItemType }[];
--- a/packages/global/package.json
+++ b/packages/global/package.json
@@ -6,7 +6,8 @@
    "timezones-list": "^3.0.2",
    "dayjs": "^1.11.7",
    "encoding": "^0.1.13",
-    "openai": "^4.16.1"
+    "openai": "^4.16.1",
+    "js-tiktoken": "^1.0.7"
  },
  "devDependencies": {
    "@types/node": "^20.8.5"
--- a/packages/global/support/user/team/constant.ts
+++ b/packages/global/support/user/team/constant.ts
@@ -24,7 +24,8 @@ export const TeamMemberRoleMap = {
 export enum TeamMemberStatusEnum {
  waiting = 'waiting',
  active = 'active',
-  reject = 'reject'
+  reject = 'reject',
+  leave = 'leave'
 }
 export const TeamMemberStatusMap = {
  [TeamMemberStatusEnum.waiting]: {
@@ -38,5 +39,10 @@ export const TeamMemberStatusMap = {
  [TeamMemberStatusEnum.reject]: {
    label: 'user.team.member.reject',
    color: 'red.600'
+  },
+  [TeamMemberStatusEnum.leave]: {
+    label: 'user.team.member.leave',
+    color: 'red.600'
  }
 };
+/* "Status is not leave" condition (presumably a MongoDB-style `$ne` filter — confirm against callers). */
+export const leaveStatus = { $ne: TeamMemberStatusEnum.leave };
--- a/packages/global/support/user/team/controller.d.ts
+++ b/packages/global/support/user/team/controller.d.ts
@@ -37,4 +37,7 @@ export type UpdateInviteProps = {
  tmbId: string;
  status: TeamMemberSchema['status'];
 };
-export type InviteMemberResponse = Record<'invite' | 'inValid' | 'inTeam', string[]>;
+/* Invite result: for each of the three outcome keys, the affected users as username/userId pairs. */
+export type InviteMemberResponse = Record<
+  'invite' | 'inValid' | 'inTeam',
+  { username: string; userId: string }[]
+>;
--- a/packages/global/support/user/team/type.d.ts
+++ b/packages/global/support/user/team/type.d.ts
@@ -16,6 +16,7 @@ export type TeamMemberSchema = {
  teamId: string;
  userId: string;
  createTime: Date;
+  name: string;
  role: `${TeamMemberRoleEnum}`;
  status: `${TeamMemberStatusEnum}`;
  defaultTeam: boolean;
@@ -25,6 +26,7 @@ export type TeamItemType = {
  userId: string;
  teamId: string;
  teamName: string;
+  memberName: string;
  avatar: string;
  balance: number;
  tmbId: string;
@@ -39,7 +41,7 @@ export type TeamMemberItemType = {
  userId: string;
  tmbId: string;
  teamId: string;
-  memberUsername: string;
+  memberName: string;
  avatar: string;
  role: `${TeamMemberRoleEnum}`;
  status: `${TeamMemberStatusEnum}`;
--- a/packages/global/support/wallet/bill/type.d.ts
+++ b/packages/global/support/wallet/bill/type.d.ts
@@ -15,7 +15,7 @@ export type BillSchema = CreateBillProps & {

 export type BillItemType = {
  id: string;
-  username: string;
+  memberName: string;
  time: Date;
  appName: string;
  source: BillSchema['source'];