4.6.4-alpha (#582)

Archer
2023-12-08 15:01:11 +08:00
committed by GitHub
parent 54d52d8d25
commit b58249fc3a
66 changed files with 962 additions and 527 deletions

View File

@@ -0,0 +1,24 @@
import { ErrType } from '../errorCode';
/* dataset: 507000 */
const startCode = 507000;
export enum CommonErrEnum {
fileNotFound = 'fileNotFound'
}
const datasetErr = [
{
statusText: CommonErrEnum.fileNotFound,
message: 'error.fileNotFound'
}
];
export default datasetErr.reduce((acc, cur, index) => {
return {
...acc,
[cur.statusText]: {
code: startCode + index,
statusText: cur.statusText,
message: cur.message,
data: null
}
};
}, {} as ErrType<`${CommonErrEnum}`>);
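
A minimal usage sketch for the map this reducer produces (the lookup below is illustrative and not part of the commit):

import commonErr, { CommonErrEnum } from './code/common';

// Each entry carries a numeric code offset from startCode, plus the
// statusText and an i18n message key.
const notFound = commonErr[CommonErrEnum.fileNotFound];
console.log(notFound.code); // 507000 (startCode + index 0)
console.log(notFound.message); // 'error.fileNotFound'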

View File

@@ -13,23 +13,23 @@ export enum DatasetErrEnum {
const datasetErr = [
{
statusText: DatasetErrEnum.unAuthDataset,
message: 'No permission to operate this dataset'
message: 'core.dataset.error.unAuthDataset'
},
{
statusText: DatasetErrEnum.unAuthDatasetCollection,
message: 'No permission to operate this collection'
message: 'core.dataset.error.unAuthDatasetCollection'
},
{
statusText: DatasetErrEnum.unAuthDatasetData,
message: 'No permission to operate this data'
message: 'core.dataset.error.unAuthDatasetData'
},
{
statusText: DatasetErrEnum.unAuthDatasetFile,
message: 'No permission to operate this file'
message: 'core.dataset.error.unAuthDatasetFile'
},
{
statusText: DatasetErrEnum.unCreateCollection,
message: 'No permission to create a collection'
message: 'core.dataset.error.unCreateCollection'
},
{
statusText: DatasetErrEnum.unLinkCollection,

View File

@@ -6,6 +6,7 @@ import pluginErr from './code/plugin';
import outLinkErr from './code/outLink';
import teamErr from './code/team';
import userErr from './code/user';
import commonErr from './code/common';
export const ERROR_CODE: { [key: number]: string } = {
400: 'Request failed',
@@ -96,5 +97,6 @@ export const ERROR_RESPONSE: Record<
...outLinkErr,
...teamErr,
...userErr,
...pluginErr
...pluginErr,
...commonErr
};
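
With commonErr spread in, the new fileNotFound entry resolves through the shared table; a small illustrative lookup (assuming the merged record shape built by the reducers above):

const item = ERROR_RESPONSE[CommonErrEnum.fileNotFound];
// -> { code: 507000, statusText: 'fileNotFound', message: 'error.fileNotFound', data: null }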

View File

@@ -1,3 +1,10 @@
export type UploadImgProps = {
base64Img: string;
expiredTime?: Date;
metadata?: Record<string, any>;
shareId?: string;
};
export type UrlFetchParams = {
urlList: string[];
selector?: string;
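
A sketch of the extended UploadImgProps, e.g. tagging an image with the GridFS file it was extracted from so it can be cleaned up later (the metadata.fileId key is inferred from delImgByFileIdList below; IDs are placeholders):

const props: UploadImgProps = {
  base64Img: 'data:image/png;base64,...',
  expiredTime: new Date(Date.now() + 7 * 24 * 60 * 60 * 1000), // optional TTL
  metadata: { fileId: '<fileId>' } // enables cleanup via delImgByFileIdList
};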

View File

@@ -49,7 +49,14 @@ export const cheerioToHtml = ({
}
});
return $(selector || 'body').html();
const html = $(selector || 'body')
.map((item, dom) => {
return $(dom).html();
})
.get()
.join('\n');
return html;
};
export const urlsFetch = async ({
urlList,
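
The old implementation returned only the first element matched by the selector; .map().get().join('\n') keeps every match. A standalone cheerio sketch of the pattern (sample HTML is illustrative):

import * as cheerio from 'cheerio';

const $ = cheerio.load('<div class="a">1</div><div class="a">2</div>');
// Collect the inner HTML of every matched node, not just the first.
const html = $('.a')
  .map((i, dom) => $(dom).html())
  .get()
  .join('\n');
// html === '1\n2'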

View File

@@ -26,10 +26,14 @@ export const simpleMarkdownText = (rawText: string) => {
rawText = rawText.replace(/\\\\n/g, '\\n');
// Remove headings and code blocks front spaces
['####', '###', '##', '#', '```', '~~~'].forEach((item) => {
['####', '###', '##', '#', '```', '~~~'].forEach((item, i) => {
const isMarkdown = i <= 3;
const reg = new RegExp(`\\n\\s*${item}`, 'g');
if (reg.test(rawText)) {
rawText = rawText.replace(new RegExp(`\\n\\s*(${item})`, 'g'), '\n$1');
rawText = rawText.replace(
new RegExp(`(\\n)\\s*(${item})`, 'g'),
isMarkdown ? '\n$1$2' : '$1$2'
);
}
});
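
To illustrate the new branch (assuming the loop above): markdown headings keep a blank line via the captured $1, while code fences only lose their leading indent:

const input = 'para\n   ## Heading\n   ```\ncode\n   ```';
// After the loop:
// 'para\n\n## Heading\n```\ncode\n```'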

View File

@@ -12,12 +12,13 @@ export const splitText2Chunks = (props: {
text: string;
chunkLen: number;
overlapRatio?: number;
customReg?: string[];
}): {
chunks: string[];
tokens: number;
overlapRatio?: number;
} => {
let { text = '', chunkLen, overlapRatio = 0.2 } = props;
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkLen * overlapRatio);
@@ -29,22 +30,29 @@ export const splitText2Chunks = (props: {
// The larger maxLen is, the less likely the next sentence is to trigger a split
const stepReges: { reg: RegExp; maxLen: number }[] = [
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /([\n](`))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?![\*\-|>0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n])/g, maxLen: chunkLen * 1.4 },
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([！]|!\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([？]|\?\s)/g, maxLen: chunkLen * 1.6 },
{ reg: /([；]|;\s)/g, maxLen: chunkLen * 1.8 },
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([！]|!\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([？]|\?\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([；]|;\s)/g, maxLen: chunkLen * 1.6 },
{ reg: /([，]|,\s)/g, maxLen: chunkLen * 2 }
];
const customRegLen = customReg.length;
const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen;
const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
// If using markdown title split, record the title separately
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
@@ -55,11 +63,13 @@ export const splitText2Chunks = (props: {
}
];
}
const isMarkdownSplit = step <= 3;
const isMarkdownSplit = checkIsMarkdownSplit(step);
const independentChunk = checkIndependentChunk(step);
const { reg } = stepReges[step];
const splitTexts = text
.replace(reg, isMarkdownSplit ? `${splitMarker}$1` : `$1${splitMarker}`)
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
.split(`${splitMarker}`)
.filter((part) => part.trim());
@@ -76,7 +86,7 @@ export const splitText2Chunks = (props: {
};
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
const forbidOverlap = step <= 6;
const forbidOverlap = checkForbidOverlap(step);
const maxOverlapLen = chunkLen * 0.4;
// step >= stepReges.length: Do not overlap incomplete sentences
@@ -114,7 +124,8 @@ export const splitText2Chunks = (props: {
lastText: string;
mdTitle: string;
}): string[] => {
const isMarkdownSplit = step <= 3;
const independentChunk = checkIndependentChunk(step);
const isCustomStep = checkIsCustomStep(step);
// mini text
if (text.length <= chunkLen) {
@@ -134,12 +145,13 @@ export const splitText2Chunks = (props: {
return chunks;
}
const { maxLen } = stepReges[step];
const minChunkLen = chunkLen * 0.7;
// split text by special char
const splitTexts = getSplitTexts({ text, step });
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
const minChunkLen = chunkLen * 0.7;
const miniChunkLen = 30;
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i];
@@ -170,8 +182,8 @@ export const splitText2Chunks = (props: {
mdTitle: currentTitle
});
const lastChunk = innerChunks[innerChunks.length - 1];
// last chunk is too small, concat it to lastText
if (!isMarkdownSplit && lastChunk.length < minChunkLen) {
// last chunk is too small, concat it to lastText (the start of the next chunk)
if (!independentChunk && lastChunk.length < minChunkLen) {
chunks.push(...innerChunks.slice(0, -1));
lastText = lastChunk;
} else {
@@ -189,10 +201,14 @@ export const splitText2Chunks = (props: {
lastText = newText;
// markdown paragraph block: add directly; if the chunk size is reached, push a chunk
if (isMarkdownSplit || newTextLen >= chunkLen) {
if (
isCustomStep ||
(independentChunk && newTextLen > miniChunkLen) ||
newTextLen >= chunkLen
) {
chunks.push(`${currentTitle}${lastText}`);
lastText = isMarkdownSplit ? '' : getOneTextOverlapText({ text: lastText, step });
lastText = getOneTextOverlapText({ text: lastText, step });
}
}
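
A hedged usage sketch of the new customReg option (input is illustrative; note the pattern is interpolated into a character class, so single-character markers behave most predictably):

const doc = 'intro§section one§section two';
const { chunks, tokens } = splitText2Chunks({
  text: doc,
  chunkLen: 512,
  customReg: ['§'] // tried first, before the markdown-title and newline steps
});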

View File

@@ -24,7 +24,7 @@ export const getDefaultAppForm = (templateId = 'fastgpt-universal'): AppSimpleEd
dataset: {
datasets: [],
similarity: 0.4,
limit: 5,
limit: 1500,
searchEmptyText: '',
searchMode: DatasetSearchModeEnum.embedding
},
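
Note: the default limit jumps from 5 to 1500 because it is now measured in tokens rather than in record count, matching the DatasetSearchModule template change below.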

View File

@@ -55,3 +55,5 @@ export const LOGO_ICON = `/icon/logo.svg`;
export const IMG_BLOCK_KEY = 'img-block';
export const FILE_BLOCK_KEY = 'file-block';
export const MARKDOWN_QUOTE_SIGN = 'QUOTE SIGN';

View File

@@ -54,17 +54,10 @@ export const DatasetSearchModule: FlowModuleTemplateType = {
{
key: ModuleInputKeyEnum.datasetLimit,
type: FlowNodeInputTypeEnum.hidden,
label: 'Single search limit',
description: 'Take at most n records as references for this question',
value: 5,
label: 'Reference limit',
description: 'Maximum number of tokens for a single search. Chinese: ~1 character = 1.7 tokens; English: ~1 character = 1 token',
value: 1500,
valueType: ModuleDataTypeEnum.number,
min: 1,
max: 20,
step: 1,
markList: [
{ label: '1', value: 1 },
{ label: '20', value: 20 }
],
showTargetInApp: false,
showTargetInPlugin: false
},

View File

@@ -3,6 +3,7 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { delImgByFileIdList } from '../image/controller';
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
return connectionMongo.connection.db.collection(`${bucket}.files`);
@@ -69,24 +70,65 @@ export async function getFileById({
_id: new Types.ObjectId(fileId)
});
if (!file) {
return Promise.reject('File not found');
}
// if (!file) {
// return Promise.reject('File not found');
// }
return file;
return file || undefined;
}
export async function delFileById({
export async function delFileByFileIdList({
bucketName,
fileId
fileIdList,
retry = 3
}: {
bucketName: `${BucketNameEnum}`;
fileId: string;
fileIdList: string[];
retry?: number;
}): Promise<any> {
try {
const bucket = getGridBucket(bucketName);
await Promise.all(fileIdList.map((id) => bucket.delete(new Types.ObjectId(id))));
} catch (error) {
if (retry > 0) {
return delFileByFileIdList({ bucketName, fileIdList, retry: retry - 1 });
}
}
}
// delete files by metadata (datasetId)
export async function delFileByMetadata({
bucketName,
datasetId
}: {
bucketName: `${BucketNameEnum}`;
datasetId?: string;
}) {
const bucket = getGridBucket(bucketName);
await bucket.delete(new Types.ObjectId(fileId));
return true;
const files = await bucket
.find(
{
...(datasetId && { 'metadata.datasetId': datasetId })
},
{
projection: {
_id: 1
}
}
)
.toArray();
const idList = files.map((item) => String(item._id));
// delete img
await delImgByFileIdList(idList);
// delete file
await delFileByFileIdList({
bucketName,
fileIdList: idList
});
}
export async function getDownloadStream({

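A hedged sketch of the two new deletion entry points (IDs are placeholders):

// Remove every dataset file, and the images extracted from them:
await delFileByMetadata({
  bucketName: BucketNameEnum.dataset,
  datasetId: '<datasetId>'
});

// Or remove a known list of files, with up to 3 retries on failure:
await delFileByFileIdList({
  bucketName: BucketNameEnum.dataset,
  fileIdList: ['<fileId1>', '<fileId2>']
});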
View File

@@ -1,3 +1,4 @@
import { UploadImgProps } from '@fastgpt/global/common/file/api';
import { imageBaseUrl } from './constant';
import { MongoImage } from './schema';
@@ -9,11 +10,10 @@ export const maxImgSize = 1024 * 1024 * 12;
export async function uploadMongoImg({
base64Img,
teamId,
expiredTime
}: {
base64Img: string;
expiredTime,
metadata
}: UploadImgProps & {
teamId: string;
expiredTime?: Date;
}) {
if (base64Img.length > maxImgSize) {
return Promise.reject('Image too large');
@@ -24,7 +24,8 @@ export async function uploadMongoImg({
const { _id } = await MongoImage.create({
teamId,
binary: Buffer.from(base64Data, 'base64'),
expiredTime
expiredTime: expiredTime,
metadata
});
return getMongoImgUrl(String(_id));
@@ -37,3 +38,9 @@ export async function readMongoImg({ id }: { id: string }) {
}
return data?.binary;
}
export async function delImgByFileIdList(fileIds: string[]) {
return MongoImage.deleteMany({
'metadata.fileId': { $in: fileIds.map((item) => String(item)) }
});
}
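
delImgByFileIdList pairs with the new metadata field on uploadMongoImg; a sketch of tagging an image at upload time so it is removed together with its parent file (placeholder IDs):

const url = await uploadMongoImg({
  teamId: '<teamId>',
  base64Img: 'data:image/png;base64,...',
  metadata: { fileId: '<fileId>' }
});

// Later, clearing the file also clears its images:
await delImgByFileIdList(['<fileId>']);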

View File

@@ -5,13 +5,17 @@ const { Schema, model, models } = connectionMongo;
const ImageSchema = new Schema({
teamId: {
type: Schema.Types.ObjectId,
ref: TeamCollectionName
ref: TeamCollectionName,
required: true
},
binary: {
type: Buffer
},
expiredTime: {
type: Date
},
metadata: {
type: Object
}
});
@@ -21,7 +25,7 @@ try {
console.log(error);
}
export const MongoImage: Model<{ teamId: string; binary: Buffer }> =
export const MongoImage: Model<{ teamId: string; binary: Buffer; metadata?: Record<string, any> }> =
models['image'] || model('image', ImageSchema);
MongoImage.syncIndexes();

View File

@@ -82,7 +82,7 @@ export const sseErrRes = (res: NextApiResponse, error: any) => {
} else if (error?.response?.data?.error?.message) {
msg = error?.response?.data?.error?.message;
} else if (error?.error?.message) {
msg = error?.error?.message;
msg = `${error?.error?.code} ${error?.error?.message}`;
}
addLog.error(`sse error: ${msg}`, error);

View File

@@ -1,11 +1,11 @@
import { MongoDatasetData } from './schema';
import { deletePgDataById } from './pg';
import { MongoDatasetTraining } from '../training/schema';
import { delFileById } from '../../../common/file/gridfs/controller';
import { delFileByFileIdList, delFileByMetadata } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { MongoDatasetCollection } from '../collection/schema';
import { delDatasetFiles } from '../file/controller';
import { delay } from '@fastgpt/global/common/system/utils';
import { delImgByFileIdList } from '../../../common/file/image/controller';
/* delete all data by datasetIds */
export async function delDatasetRelevantData({ datasetIds }: { datasetIds: string[] }) {
@@ -17,9 +17,11 @@ export async function delDatasetRelevantData({ datasetIds }: { datasetIds: strin
});
// delete related files
await Promise.all(datasetIds.map((id) => delDatasetFiles({ datasetId: id })));
await Promise.all(
datasetIds.map((id) => delFileByMetadata({ bucketName: BucketNameEnum.dataset, datasetId: id }))
);
await delay(1000);
await delay(500);
// delete pg data
await deletePgDataById(`dataset_id IN ('${datasetIds.join("','")}')`);
@@ -49,17 +51,16 @@ export async function delCollectionRelevantData({
collectionId: { $in: collectionIds }
});
// delete file
await Promise.all(
filterFileIds.map((fileId) => {
return delFileById({
bucketName: BucketNameEnum.dataset,
fileId
});
// delete file and imgs
await Promise.all([
delImgByFileIdList(filterFileIds),
delFileByFileIdList({
bucketName: BucketNameEnum.dataset,
fileIdList: filterFileIds
})
);
]);
await delay(1000);
await delay(500);
// delete pg data
await deletePgDataById(`collection_id IN ('${collectionIds.join("','")}')`);

View File

@@ -1,9 +0,0 @@
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { getGFSCollection } from '../../../common/file/gridfs/controller';
export async function delDatasetFiles({ datasetId }: { datasetId: string }) {
const db = getGFSCollection(BucketNameEnum.dataset);
await db.deleteMany({
'metadata.datasetId': String(datasetId)
});
}
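
This removed helper deleted documents from the {bucket}.files collection directly, which leaves the corresponding {bucket}.chunks entries (and any extracted images) behind; it is superseded by delFileByMetadata above, which deletes through the GridFS bucket API and also calls delImgByFileIdList.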

View File

@@ -12,7 +12,7 @@ export const authCert = async (props: AuthModeType) => {
canWrite: true
};
};
export async function authCertAndShareId({
export async function authCertOrShareId({
shareId,
...props
}: AuthModeType & { shareId?: string }) {

View File

@@ -14,6 +14,7 @@ import {
import { getFileById } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { getTeamInfoByTmbId } from '../../user/team/controller';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
export async function authDatasetByTmbId({
teamId,
@@ -167,6 +168,10 @@ export async function authDatasetFile({
const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId });
if (!file) {
return Promise.reject(CommonErrEnum.fileNotFound);
}
if (file.metadata.teamId !== teamId) {
return Promise.reject(DatasetErrEnum.unAuthDatasetFile);
}
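
A sketch of how the tightened check surfaces to a caller (the surrounding handler is an assumption; the rejected status texts map to codes via ERROR_RESPONSE):

try {
  await authDatasetFile({ fileId /* ...other auth props */ });
} catch (err) {
  // CommonErrEnum.fileNotFound       -> file no longer exists (new in this commit)
  // DatasetErrEnum.unAuthDatasetFile -> team does not own the file
}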