This commit is contained in:
Archer
2023-12-27 11:07:39 +08:00
committed by GitHub
parent 86286efb54
commit 759a2330e6
182 changed files with 3099 additions and 81685 deletions

View File

@@ -0,0 +1,62 @@
/* read a PDF file to plain text */
import * as pdfjsLib from 'pdfjs-dist';
export const readPdfFile = async ({ pdf }: { pdf: string | URL | ArrayBuffer }) => {
  // pdf.js parses documents in a web worker; point it at the bundled worker script.
  pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';

  // Shape of one text item returned by pdf.js page.getTextContent().
  type TokenType = {
    str: string;
    dir: string;
    width: number;
    height: number;
    transform: number[]; // pdf.js transform matrix; index 5 is the token's y position on the page
    fontName: string;
    hasEOL: boolean;
  };

  // Read one page: drop header/footer tokens by vertical position, then
  // flatten the remaining tokens to text, adding '\n' at paragraph ends.
  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();
    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.07; // assume the header sits in the top 7% of the page
    const footerThreshold = pageHeight * 0.93; // assume the footer sits in the bottom 7% of the page
    const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      // Keep tokens without position info, plus tokens strictly between header and footer.
      return (
        !token.transform ||
        (token.transform[5] > headerThreshold && token.transform[5] < footerThreshold)
      );
    });

    // Merge empty-string tokens into the previous token: carry the empty
    // token's 'hasEOL' flag backwards so end-of-line info survives removal.
    for (let i = 0; i < pageTexts.length; i++) {
      const item = pageTexts[i];
      if (item.str === '' && pageTexts[i - 1]) {
        pageTexts[i - 1].hasEOL = item.hasEOL;
        pageTexts.splice(i, 1);
        i--;
      }
    }

    page.cleanup();

    return pageTexts
      .map((token) => {
        // A paragraph ends when the token has an EOL and finishes with
        // sentence-ending punctuation (CJK or ASCII) or a line break.
        const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  const doc = await pdfjsLib.getDocument(pdf).promise;
  // Kick off all pages concurrently; Promise.all keeps page order in the result.
  const pageTextPromises = [];
  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
    pageTextPromises.push(readPDFPage(doc, pageNo));
  }
  const pageTexts = await Promise.all(pageTextPromises);
  return pageTexts.join('');
};

View File

@@ -34,3 +34,41 @@ export const simpleMarkdownText = (rawText: string) => {
return rawText.trim();
};
/**
 * Format markdown:
 * 1. Upload every inline base64 image and replace it with the returned URL.
 * 2. Trim whitespace after image tags, then run the common markdown cleanup.
 *
 * @param rawText markdown that may contain `data:image/...;base64,...` images
 * @param uploadImgController uploads one base64 data URL, resolves to its hosted URL
 * @returns cleaned markdown with images re-hosted
 */
export const uploadMarkdownBase64 = async ({
  rawText,
  uploadImgController
}: {
  rawText: string;
  uploadImgController: (base64: string) => Promise<string>;
}) => {
  // Match each base64 data URL. Non-greedy mime match so two images on the
  // same line are matched separately instead of being swallowed into one
  // bogus match by a greedy '.*'.
  const base64Regex = /data:image\/.*?;base64,([^\)]+)/g;
  const base64Arr = rawText.match(base64Regex) || [];

  // Upload all images in parallel; each callback replaces its own occurrence.
  await Promise.all(
    base64Arr.map(async (base64Img) => {
      try {
        const str = await uploadImgController(base64Img);
        rawText = rawText.replace(base64Img, str);
      } catch (error) {
        // Upload failed: drop the data URL and remove the now-empty image tag.
        rawText = rawText.replace(base64Img, '');
        rawText = rawText.replace(/!\[.*\]\(\)/g, '');
      }
    })
  );

  // Remove whitespace after image tags. Replace unconditionally: calling
  // .test() on a /g regex mutates its lastIndex and is a classic pitfall;
  // replace() is already a no-op when there is no match.
  rawText = rawText.replace(/(!\[.*\]\(.*\))\s*/g, '$1');

  return simpleMarkdownText(rawText);
};

View File

@@ -31,7 +31,7 @@ export const splitText2Chunks = (props: {
// The larger maxLen is, the next sentence is less likely to trigger splitting
const stepReges: { reg: RegExp; maxLen: number }[] = [
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
...customReg.map((text) => ({ reg: new RegExp(`(${text})`, 'g'), maxLen: chunkLen * 1.4 })),
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
@@ -64,13 +64,22 @@ export const splitText2Chunks = (props: {
}
];
}
const isCustomSteep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step);
const independentChunk = checkIndependentChunk(step);
const { reg } = stepReges[step];
const splitTexts = text
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
.replace(
reg,
(() => {
if (isCustomSteep) return splitMarker;
if (independentChunk) return `${splitMarker}$1`;
return `$1${splitMarker}`;
})()
)
.split(`${splitMarker}`)
.filter((part) => part.trim());
@@ -128,11 +137,6 @@ export const splitText2Chunks = (props: {
const independentChunk = checkIndependentChunk(step);
const isCustomStep = checkIsCustomStep(step);
// mini text
if (text.length <= chunkLen) {
return [text];
}
// oversize
if (step >= stepReges.length) {
if (text.length < chunkLen * 3) {
@@ -221,6 +225,8 @@ export const splitText2Chunks = (props: {
} else {
chunks.push(`${mdTitle}${lastText}`);
}
} else if (lastText && chunks.length === 0) {
chunks.push(lastText);
}
return chunks;

View File

@@ -1,4 +1,29 @@
export type FeConfigsType = {
import type {
ChatModelItemType,
FunctionModelItemType,
LLMModelItemType,
VectorModelItemType,
AudioSpeechModels,
WhisperModelType,
ReRankModelItemType
} from '../../../core/ai/model.d';
/* FastGPT main config file shape: front-end flags, system env, and model lists. */
export type FastGPTConfigFileType = {
  feConfigs: FastGPTFeConfigsType;
  systemEnv: SystemEnvType;
  chatModels: ChatModelItemType[];
  qaModels: LLMModelItemType[];
  cqModels: FunctionModelItemType[];
  extractModels: FunctionModelItemType[];
  qgModels: LLMModelItemType[];
  vectorModels: VectorModelItemType[];
  reRankModels: ReRankModelItemType[];
  // NOTE(review): the import above brings in 'AudioSpeechModels', but this
  // field references 'AudioSpeechModelType' — confirm which name is actually
  // exported from core/ai/model.d; as written this relies on a global type.
  audioSpeechModels: AudioSpeechModelType[];
  whisperModel: WhisperModelType;
};
export type FastGPTFeConfigsType = {
show_emptyChat?: boolean;
show_register?: boolean;
show_appStore?: boolean;
@@ -34,6 +59,6 @@ export type SystemEnvType = {
};
declare global {
var feConfigs: FeConfigsType;
var feConfigs: FastGPTFeConfigsType;
var systemEnv: SystemEnvType;
}

View File

@@ -24,6 +24,7 @@ export type VectorModelItemType = {
defaultToken: number;
price: number;
maxToken: number;
weight: number;
};
export type ReRankModelItemType = {

View File

@@ -16,6 +16,7 @@ export const defaultVectorModels: VectorModelItemType[] = [
name: 'Embedding-2',
price: 0,
defaultToken: 500,
maxToken: 3000
maxToken: 3000,
weight: 100
}
];

View File

@@ -89,6 +89,7 @@ export type DatasetTrainingSchemaType = {
q: string;
a: string;
chunkIndex: number;
weight: number;
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
};

View File

@@ -36,10 +36,11 @@ export const ContextExtractModule: FlowModuleTemplateType = {
type: FlowNodeInputTypeEnum.textarea,
valueType: ModuleIOValueTypeEnum.string,
label: '提取要求描述',
description: '给AI一些对应的背景知识或要求描述引导AI更好的完成任务',
description:
'给AI一些对应的背景知识或要求描述引导AI更好的完成任务。\n该输入框可使用全局变量。',
required: true,
placeholder:
'例如: \n1. 你是一个实验室预约助手,你的任务是帮助用户预约实验室。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
'例如: \n1. 当前时间为: {{cTime}}。你是一个实验室预约助手,你的任务是帮助用户预约实验室,从文本中获取对应的预约信息。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
showTargetInApp: true,
showTargetInPlugin: true
},

View File

@@ -2,11 +2,12 @@
"name": "@fastgpt/global",
"version": "1.0.0",
"dependencies": {
"axios": "^1.5.1",
"dayjs": "^1.11.7",
"openai": "4.23.0",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"axios": "^1.5.1",
"openai": "4.23.0",
"pdfjs-dist": "^4.0.269",
"timezones-list": "^3.0.2"
},
"devDependencies": {

View File

@@ -20,12 +20,12 @@ export async function connectMongo({
console.log('mongo start connect');
try {
mongoose.set('strictQuery', true);
const maxConnecting = Math.max(20, Number(process.env.DB_MAX_LINK || 20));
const maxConnecting = Math.max(30, Number(process.env.DB_MAX_LINK || 20));
await mongoose.connect(process.env.MONGODB_URI as string, {
bufferCommands: true,
maxConnecting: maxConnecting,
maxPoolSize: maxConnecting,
minPoolSize: Math.max(5, Math.round(Number(process.env.DB_MAX_LINK || 5) * 0.1)),
minPoolSize: 20,
connectTimeoutMS: 60000,
waitQueueTimeoutMS: 60000,
socketTimeoutMS: 60000,

View File

@@ -9,7 +9,8 @@ export const connectPg = async (): Promise<Pool> => {
global.pgClient = new Pool({
connectionString: process.env.PG_URL,
max: Number(process.env.DB_MAX_LINK || 5),
max: Number(process.env.DB_MAX_LINK || 20),
min: 10,
keepAlive: true,
idleTimeoutMillis: 60000,
connectionTimeoutMillis: 20000

View File

@@ -1,15 +1,15 @@
import { SystemConfigsTypeEnum } from '@fastgpt/global/common/system/config/constants';
import { MongoSystemConfigs } from './schema';
import { FeConfigsType } from '@fastgpt/global/common/system/types';
import { FastGPTConfigFileType } from '@fastgpt/global/common/system/types';
export const getFastGPTFeConfig = async () => {
export const getFastGPTConfigFromDB = async () => {
const res = await MongoSystemConfigs.findOne({
type: SystemConfigsTypeEnum.fastgpt
}).sort({
createTime: -1
});
const config: FeConfigsType = res?.value?.FeConfig || {};
const config = res?.value || {};
return config;
return config as Omit<FastGPTConfigFileType, 'systemEnv'>;
};

View File

@@ -22,7 +22,6 @@ const systemConfigSchema = new Schema({
});
try {
systemConfigSchema.index({ createTime: -1 }, { expireAfterSeconds: 90 * 24 * 60 * 60 });
systemConfigSchema.index({ type: 1 });
} catch (error) {
console.log(error);

View File

@@ -79,6 +79,10 @@ const TrainingDataSchema = new Schema({
type: Number,
default: 0
},
weight: {
type: Number,
default: 0
},
indexes: {
type: [
{

View File

@@ -56,7 +56,7 @@ export async function parseHeaderCert({
async function authCookieToken(cookie?: string, token?: string) {
// 获取 cookie
const cookies = Cookie.parse(cookie || '');
const cookieToken = cookies.token || token;
const cookieToken = token || cookies.token;
if (!cookieToken) {
return Promise.reject(ERROR_ENUM.unAuthorization);
@@ -127,7 +127,7 @@ export async function parseHeaderCert({
authType: AuthUserTypeEnum.apikey
};
}
if (authToken && (cookie || token)) {
if (authToken && (token || cookie)) {
// user token(from fastgpt web)
const res = await authCookieToken(cookie, token);
return {
@@ -182,7 +182,7 @@ export async function parseHeaderCert({
export const setCookie = (res: NextApiResponse, token: string) => {
res.setHeader(
'Set-Cookie',
`token=${token}; Path=/; HttpOnly; Max-Age=604800; Samesite=None; Secure;`
`token=${token}; Path=/; HttpOnly; Max-Age=604800; Samesite=Strict; Secure;`
);
};
/* clear cookie */

View File

@@ -0,0 +1,66 @@
// Options for client-side image compression before upload.
export type CompressImgProps = {
  maxW?: number; // max output width in px
  maxH?: number; // max output height in px
  maxSize?: number; // max accepted data-URL length after compression
};
/**
 * Compress a base64 image in the browser and upload it.
 *
 * Scales the image down (preserving aspect ratio) so it fits inside
 * maxW x maxH, re-encodes it through a canvas, rejects if still too large,
 * then hands the data URL to uploadController.
 *
 * @param base64Img source image as a data URL
 * @param maxW maximum output width in px
 * @param maxH maximum output height in px
 * @param maxSize rejection threshold on the compressed data-URL length
 * @param uploadController uploads the compressed data URL, resolves to its hosted src
 * @returns the uploaded image src
 */
export const compressBase64ImgAndUpload = ({
  base64Img,
  maxW = 1080,
  maxH = 1080,
  maxSize = 1024 * 500, // 500KB (the previous comment said 300kb, which was wrong)
  uploadController
}: CompressImgProps & {
  base64Img: string;
  uploadController: (base64: string) => Promise<string>;
}) => {
  return new Promise<string>((resolve, reject) => {
    // Recover the mime type from the data-URL prefix; default to jpeg.
    const fileType =
      /^data:([a-zA-Z0-9]+\/[a-zA-Z0-9-.+]+).*,/.exec(base64Img)?.[1] || 'image/jpeg';

    const img = new Image();
    img.src = base64Img;
    img.onload = async () => {
      // Uniform scale so BOTH dimensions fit their limits (never upscale).
      // The old branch-on-orientation logic could leave one side over its
      // limit when maxW !== maxH; a single factor handles every case and is
      // identical to the old behavior when maxW === maxH.
      const scale = Math.min(1, maxW / img.width, maxH / img.height);
      const width = img.width * scale;
      const height = img.height * scale;

      const canvas = document.createElement('canvas');
      canvas.width = width;
      canvas.height = height;
      const ctx = canvas.getContext('2d');
      if (!ctx) {
        return reject('压缩图片异常');
      }
      ctx.drawImage(img, 0, 0, width, height);
      const compressedDataUrl = canvas.toDataURL(fileType, 1);
      // Drop the scratch canvas element.
      canvas.remove();

      if (compressedDataUrl.length > maxSize) {
        return reject('图片太大了');
      }

      try {
        const src = await uploadController(compressedDataUrl);
        resolve(src);
      } catch (error) {
        reject(error);
      }
    };
    img.onerror = reject;
  });
};

View File

@@ -0,0 +1,53 @@
import { uploadMarkdownBase64 } from '@fastgpt/global/common/string/markdown';
import { htmlStr2Md } from '../string/markdown';
/**
* read file raw text
*/
/**
 * Read a browser File object as plain text.
 *
 * Resolves with the file content; rejects with 'Read file error' when the
 * FileReader fails, or with the thrown error if setup itself throws.
 *
 * @param file the File to read (FileReader.readAsText decodes as UTF-8 by default)
 */
export const readFileRawText = (file: File) => {
  // Promise<string> generic instead of annotating the resolve parameter —
  // same runtime behavior, idiomatic TypeScript.
  return new Promise<string>((resolve, reject) => {
    try {
      const reader = new FileReader();
      reader.onload = () => {
        resolve(reader.result as string);
      };
      reader.onerror = (err) => {
        console.log('error txt read:', err);
        reject('Read file error');
      };
      reader.readAsText(file);
    } catch (error) {
      reject(error);
    }
  });
};
/**
 * Read a markdown File and re-host any inline base64 images it contains.
 *
 * @param file markdown file to read
 * @param uploadImgController uploads one base64 data URL, resolves to its hosted URL
 * @returns the processed markdown text
 */
export const readMdFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController: (base64: string) => Promise<string>;
}) => {
  const markdown = await readFileRawText(file);
  return uploadMarkdownBase64({ rawText: markdown, uploadImgController });
};
/**
 * Read an HTML File, convert it to markdown, and re-host any inline base64
 * images the conversion produced.
 *
 * @param file HTML file to read
 * @param uploadImgController uploads one base64 data URL, resolves to its hosted URL
 * @returns the processed markdown text
 */
export const readHtmlFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController: (base64: string) => Promise<string>;
}) => {
  const html = await readFileRawText(file);
  const markdown = htmlStr2Md(html);
  return uploadMarkdownBase64({ rawText: markdown, uploadImgController });
};

View File

@@ -2,6 +2,7 @@
"name": "@fastgpt/web",
"version": "1.0.0",
"dependencies": {
"@fastgpt/global": "workspace:*",
"joplin-turndown-plugin-gfm": "^1.0.12",
"turndown": "^7.1.2"
},