This commit is contained in:
Archer
2023-11-15 11:36:25 +08:00
committed by GitHub
parent 592e1a93a2
commit bfd8be5df0
181 changed files with 2499 additions and 1552 deletions

View File

@@ -1,108 +0,0 @@
import { getErrText } from '@fastgpt/global/common/error/utils';
import { countPromptTokens } from '@/global/common/tiktoken';
/*
  Replace each {{variable}} placeholder in `text` with its value
*/
export function replaceVariable(text: string, obj: Record<string, string | number>) {
for (const key in obj) {
const val = obj[key];
if (!['string', 'number'].includes(typeof val)) continue;
text = text.replace(new RegExp(`{{(${key})}}`, 'g'), String(val));
}
return text || '';
}
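// Minimal usage sketch (hypothetical values, not from this commit):
//   replaceVariable('Hello {{name}}', { name: 'Tom' }) // => 'Hello Tom'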
/**
 * Split text into chunks.
 * maxLen - maximum length of one chunk (capped at 3500)
 * overlapLen - length of the overlap shared between adjacent chunks
 * Invariant: maxLen > overlapLen
 */
export const splitText2Chunks = ({ text = '', maxLen }: { text: string; maxLen: number }) => {
const overlapLen = Math.floor(maxLen * 0.15); // Overlap length
const tempMarker = 'SPLIT_HERE_SPLIT_HERE';
const stepReg: Record<number, RegExp> = {
0: /(\n\n)/g,
1: /([\n])/g,
2: /([。]|(?<![^a-zA-Z])\.\s)/g,
3: /([！？]|!\s|\?\s)/g,
4: /([；]|;\s)/g,
5: /([，]|,\s)/g
};
const splitTextRecursively = ({ text = '', step }: { text: string; step: number }) => {
if (text.length <= maxLen) {
return [text];
}
const reg = stepReg[step];
if (!reg) {
// no delimiter left: fall back to fixed-size slices with overlap
const chunks: string[] = [];
let chunk = '';
for (let i = 0; i < text.length; i += maxLen - overlapLen) {
chunk = text.slice(i, i + maxLen);
chunks.push(chunk);
}
return chunks;
}
// split text by delimiters
const splitTexts = text
.replace(reg, `$1${tempMarker}`)
.split(`${tempMarker}`)
.filter((part) => part);
let chunks: string[] = [];
let preChunk = '';
let chunk = '';
for (let i = 0; i < splitTexts.length; i++) {
let text = splitTexts[i];
// chunk over size
if (text.length > maxLen) {
const innerChunks = splitTextRecursively({ text, step: step + 1 });
if (innerChunks.length === 0) continue;
// if the last inner chunk is small, carry it forward and merge it with the following text
if (innerChunks[innerChunks.length - 1].length <= maxLen * 0.5) {
text = innerChunks.pop() || '';
chunks = chunks.concat(innerChunks);
} else {
chunks = chunks.concat(innerChunks);
continue;
}
}
chunk += text;
// inside the overlap window: also record this piece as the next chunk's prefix
if (chunk.length > maxLen - overlapLen) {
preChunk += text;
}
if (chunk.length >= maxLen) {
chunks.push(chunk);
chunk = preChunk;
preChunk = '';
}
}
// push the trailing piece unless it is already the suffix of the last chunk
if (chunk && !chunks[chunks.length - 1]?.endsWith(chunk)) {
chunks.push(chunk);
}
return chunks;
};
try {
const chunks = splitTextRecursively({ text, step: 0 });
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
return {
chunks,
tokens
};
} catch (err) {
throw new Error(getErrText(err));
}
};
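For orientation, a minimal sketch of how this removed helper was typically called; the maxLen value below is illustrative, not taken from this commit:

const { chunks, tokens } = splitText2Chunks({ text: rawText, maxLen: 500 });
// chunks: pieces of at most maxLen chars, neighbours sharing ~15% overlap
// tokens: total gpt-3.5-turbo token count summed over all chunks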

File diff suppressed because one or more lines are too long

View File

@@ -1,95 +0,0 @@
/* Only the gpt-3.5-turbo tokenizer (cl100k_base) is used */
import type { ChatItemType } from '@fastgpt/global/core/chat/type';
import { Tiktoken } from 'js-tiktoken/lite';
import { adaptChat2GptMessages } from '@/utils/common/adapt/message';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constant';
import encodingJson from './cl100k_base.json';
/* Initialize the Tiktoken encoder, reusing a cached instance when available */
export function getTikTokenEnc() {
if (typeof window !== 'undefined' && window.TikToken) {
return window.TikToken;
}
if (typeof global !== 'undefined' && global.TikToken) {
return global.TikToken;
}
const enc = new Tiktoken(encodingJson);
if (typeof window !== 'undefined') {
window.TikToken = enc;
}
if (typeof global !== 'undefined') {
global.TikToken = enc;
}
return enc;
}
/* Count the tokens of a single prompt */
export function countPromptTokens(
prompt = '',
role: '' | `${ChatCompletionRequestMessageRoleEnum}` = ''
) {
const enc = getTikTokenEnc();
const text = `${role}\n${prompt}`;
try {
const encodeText = enc.encode(text);
return encodeText.length + 3; // +3 approximates the extra tokens added for the role
} catch (error) {
return text.length;
}
}
/* Count tokens across a list of messages */
export function countMessagesTokens({ messages }: { messages: ChatItemType[] }) {
const adaptMessages = adaptChat2GptMessages({ messages, reserveId: true });
let totalTokens = 0;
for (let i = 0; i < adaptMessages.length; i++) {
const item = adaptMessages[i];
const tokens = countPromptTokens(item.content, item.role);
totalTokens += tokens;
}
return totalTokens;
}
export function sliceTextByTokens({ text, length }: { text: string; length: number }) {
const enc = getTikTokenEnc();
try {
const encodeText = enc.encode(text);
return enc.decode(encodeText.slice(0, length));
} catch (error) {
return text.slice(0, length);
}
}
/* Slice messages from top to bottom, keeping the total within maxTokens */
export function sliceMessagesTB({
messages,
maxTokens
}: {
messages: ChatItemType[];
maxTokens: number;
}) {
const adaptMessages = adaptChat2GptMessages({ messages, reserveId: true });
let reduceTokens = maxTokens;
let result: ChatItemType[] = [];
for (let i = 0; i < adaptMessages.length; i++) {
const item = adaptMessages[i];
const tokens = countPromptTokens(item.content, item.role);
reduceTokens -= tokens;
if (reduceTokens > 0) {
result.push(messages[i]);
} else {
break;
}
}
return result.length === 0 && messages[0] ? [messages[0]] : result;
}
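A hedged usage sketch of the two history helpers above; the history value and its loader are hypothetical:

const history: ChatItemType[] = loadChatHistory(); // hypothetical loader
const total = countMessagesTokens({ messages: history });
const trimmed = sliceMessagesTB({ messages: history, maxTokens: 2000 });
// trimmed keeps messages from the top until the token budget is spent,
// falling back to just the first message when nothing fits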

View File

@@ -2,7 +2,7 @@ import { DatasetCollectionTypeEnum, DatasetTypeEnum } from '@fastgpt/global/core
import type { RequestPaging } from '@/types';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import type { SearchTestItemType } from '@/types/core/dataset';
import { DatasetChunkItemType, UploadChunkItemType } from '@fastgpt/global/core/dataset/type';
import { UploadChunkItemType } from '@fastgpt/global/core/dataset/type';
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type';
import { PermissionTypeEnum } from '@fastgpt/global/support/permission/constant';
@@ -10,19 +10,11 @@ import { PermissionTypeEnum } from '@fastgpt/global/support/permission/constant'
export type DatasetUpdateParams = {
id: string;
parentId?: string;
tags?: string;
tags?: string[];
name?: string;
avatar?: string;
permission?: `${PermissionTypeEnum}`;
};
export type CreateDatasetParams = {
parentId?: string;
name: string;
tags: string;
avatar: string;
vectorModel?: string;
type: `${DatasetTypeEnum}`;
};
export type SearchTestProps = {
datasetId: string;
@@ -54,20 +46,6 @@ export type UpdateDatasetCollectionParams = {
};
/* ==== data ===== */
export type SetOneDatasetDataProps = {
id?: string;
collectionId: string;
q?: string; // embedding content
a?: string; // bonus content
};
export type PushDataProps = {
collectionId: string;
data: DatasetChunkItemType[];
mode: `${TrainingModeEnum}`;
prompt?: string;
billId?: string;
};
export type GetDatasetDataListProps = RequestPaging & {
searchText?: string;
collectionId: string;

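A hypothetical call shape reflecting the tags change above (tags?: string becomes tags?: string[]); all values are placeholders:

const params: DatasetUpdateParams = {
  id: 'datasetId',
  name: 'My dataset',
  tags: ['doc', 'faq'] // previously a single comma-separated string
};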
View File

@@ -0,0 +1,35 @@
import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type';
/* ================= dataset ===================== */
export type CreateDatasetParams = {
parentId?: string;
name: string;
tags: string;
avatar: string;
vectorModel?: string;
type: `${DatasetTypeEnum}`;
};
/* ================= collection ===================== */
/* ================= data ===================== */
export type InsertOneDatasetDataProps = PushDatasetDataChunkProps & {
collectionId: string;
};
export type PushDatasetDataProps = {
collectionId: string;
data: PushDatasetDataChunkProps[];
mode: `${TrainingModeEnum}`;
prompt?: string;
billId?: string;
};
export type UpdateDatasetDataProps = {
id: string;
q?: string; // embedding content
a?: string; // bonus content
indexes: (Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string; // pg data id
})[];
};
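A sketch of a payload matching the new PushDatasetDataProps; the mode value and the data fields are assumptions based on the imported types, not taken from this commit:

const payload: PushDatasetDataProps = {
  collectionId: 'collectionId', // placeholder
  mode: 'chunk',                // assumed member of TrainingModeEnum
  data: [{ q: 'embedding content', a: 'bonus content' }]
};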

View File

@@ -1,5 +1,8 @@
import { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type';
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import {
DatasetCollectionSchemaType,
DatasetDataSchemaType
} from '@fastgpt/global/core/dataset/type.d';
/* ================= dataset ===================== */
@@ -11,7 +14,7 @@ export type DatasetCollectionsListItemType = {
name: string;
type: DatasetCollectionSchemaType['type'];
updateTime: Date;
dataAmount?: number;
dataAmount: number;
trainingAmount: number;
metadata: DatasetCollectionSchemaType['metadata'];
canWrite: boolean;
@@ -19,7 +22,10 @@ export type DatasetCollectionsListItemType = {
/* ================= data ===================== */
export type DatasetDataListItemType = {
id: string;
_id: string;
datasetId: string;
collectionId: string;
q: string; // embedding content
a: string; // bonus content
indexes: DatasetDataSchemaType['indexes'];
};
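A hypothetical object satisfying the updated list item type; the hunk renders both id and _id without +/- markers, so both are shown here and all values are placeholders:

const item: DatasetDataListItemType = {
  id: 'dataId',
  _id: 'dataId',
  datasetId: 'datasetId',
  collectionId: 'collectionId',
  q: 'embedding content',
  a: 'bonus content',
  indexes: [] // DatasetDataSchemaType['indexes']
};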