Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -1,31 +0,0 @@
/**
 * Run a synchronous function, retrying on throw.
 * Makes up to `retry + 1` total attempts; rethrows the last error when all fail.
 */
export const retryRun = <T>(fn: () => T, retry = 2): T => {
  let remaining = retry;
  for (;;) {
    try {
      return fn();
    } catch (error) {
      // Out of attempts: surface the error from the final try.
      if (remaining <= 0) throw error;
      remaining--;
    }
  }
};
/**
 * Run `fn` over every element of `arr` with at most `batchSize` concurrent
 * executions. NOTE: consumes (mutates) `arr` via shift(), matching the
 * original contract. Results are collected in completion order.
 *
 * Bug fix: the previous `if (data)` check made each worker stop at the first
 * falsy item (0, '', false, null), silently dropping it and any work that
 * worker would have done. We now test queue length instead of item truthiness.
 */
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
  const batchArr = new Array(batchSize).fill(null);
  const result: any[] = [];

  const batchFn = async () => {
    // Safe: length is checked synchronously and shift() happens before any
    // await, so concurrent workers never double-consume an item.
    while (arr.length > 0) {
      const data = arr.shift() as T;
      result.push(await fn(data));
    }
  };

  await Promise.all(
    batchArr.map(async () => {
      await batchFn();
    })
  );
  return result;
};

View File

@@ -1,4 +1,4 @@
import { batchRun } from '../fn/utils';
import { batchRun } from '../system/utils';
import { getNanoid, simpleText } from './tools';
import type { ImageType } from '../../../service/worker/readFile/type';
@@ -37,6 +37,80 @@ export const simpleMarkdownText = (rawText: string) => {
return rawText.trim();
};
/**
 * Convert every <table>...</table> section inside `content` to a GFM markdown
 * table. Handles colspan/rowspan by writing the cell text into the anchor
 * position and marking spanned positions with the placeholder '^^' (rendered
 * as a blank markdown cell). On any parse error the original HTML table is
 * returned untouched.
 *
 * Fixes over the previous version:
 * - rows with attributes (e.g. <tr align="center">) are now matched;
 * - header cells written as <th> are now recognized alongside <td>;
 * - parseInt is given an explicit radix;
 * - cell-occupancy checks use `!== undefined` so empty-string cells are not
 *   treated as vacant and overwritten;
 * - the inner cell variable no longer shadows the `content` parameter.
 */
export const htmlTable2Md = (content: string): string => {
  return content.replace(/<table>[\s\S]*?<\/table>/g, (htmlTable) => {
    try {
      // Collapse newlines + indentation so the row/cell regexes see one line.
      const cleanHtml = htmlTable.replace(/\n\s*/g, '');
      // Allow attributes on <tr>.
      const rows = cleanHtml.match(/<tr[^>]*>(.*?)<\/tr>/g);
      if (!rows) return htmlTable;

      // tableData[row][col]: cell text, '^^' for a spanned position, ' ' pad.
      const tableData: string[][] = [];
      let maxColumns = 0;

      rows.forEach((row, rowIndex) => {
        if (!tableData[rowIndex]) {
          tableData[rowIndex] = [];
        }
        let colIndex = 0;
        // Accept both <td> and <th> cells.
        const cells = row.match(/<t[dh].*?>(.*?)<\/t[dh]>/g) || [];

        cells.forEach((cell) => {
          // Skip columns already claimed by an earlier rowspan/colspan.
          while (tableData[rowIndex][colIndex] !== undefined) {
            colIndex++;
          }

          const colspan = parseInt(cell.match(/colspan="(\d+)"/)?.[1] || '1', 10);
          const rowspan = parseInt(cell.match(/rowspan="(\d+)"/)?.[1] || '1', 10);
          const cellText = cell.replace(/<t[dh].*?>|<\/t[dh]>/g, '').trim();

          // Anchor cell gets the text; the rest of the span gets '^^'.
          for (let i = 0; i < rowspan; i++) {
            for (let j = 0; j < colspan; j++) {
              if (!tableData[rowIndex + i]) {
                tableData[rowIndex + i] = [];
              }
              tableData[rowIndex + i][colIndex + j] = i === 0 && j === 0 ? cellText : '^^';
            }
          }

          colIndex += colspan;
          maxColumns = Math.max(maxColumns, colIndex);
        });

        // Pad unclaimed columns so the row is rectangular so far.
        for (let i = 0; i < maxColumns; i++) {
          if (tableData[rowIndex][i] === undefined) {
            tableData[rowIndex][i] = ' ';
          }
        }
      });

      const chunks: string[] = [];

      // First row becomes the markdown header; '^^'/empty render as blanks.
      const headerCells = tableData[0]
        .slice(0, maxColumns)
        .map((cell) => (cell === '^^' ? ' ' : cell || ' '));
      chunks.push('| ' + headerCells.join(' | ') + ' |');
      chunks.push('| ' + Array(headerCells.length).fill('---').join(' | ') + ' |');

      tableData.slice(1).forEach((row) => {
        const paddedRow = row
          .slice(0, maxColumns)
          .map((cell) => (cell === '^^' ? ' ' : cell || ' '));
        while (paddedRow.length < maxColumns) {
          paddedRow.push(' ');
        }
        chunks.push('| ' + paddedRow.join(' | ') + ' |');
      });

      return chunks.join('\n');
    } catch (error) {
      // Best-effort conversion: keep the original HTML on failure.
      return htmlTable;
    }
  });
};
/**
* format markdown
* 1. upload base64

View File

@@ -43,10 +43,14 @@ export type FastGPTConfigFileType = {
export type FastGPTFeConfigsType = {
show_workorder?: boolean;
show_emptyChat?: boolean;
isPlus?: boolean;
register_method?: ['email' | 'phone' | 'sync'];
login_method?: ['email' | 'phone']; // Attention: login method is different from oauth
find_password_method?: ['email' | 'phone'];
bind_notification_method?: ['email' | 'phone'];
googleClientVerKey?: string;
show_emptyChat?: boolean;
show_appStore?: boolean;
show_git?: boolean;
show_pay?: boolean;
@@ -57,15 +61,19 @@ export type FastGPTFeConfigsType = {
show_aiproxy?: boolean;
concatMd?: string;
concatMd?: string;
docUrl?: string;
openAPIDocUrl?: string;
systemPluginCourseUrl?: string;
appTemplateCourse?: string;
customApiDomain?: string;
customSharePageDomain?: string;
systemTitle?: string;
systemDescription?: string;
googleClientVerKey?: string;
isPlus?: boolean;
scripts?: { [key: string]: string }[];
favicon?: string;
sso?: {
icon?: string;
title?: string;
@@ -91,13 +99,14 @@ export type FastGPTFeConfigsType = {
exportDatasetLimitMinutes?: number;
websiteSyncLimitMinuted?: number;
};
scripts?: { [key: string]: string }[];
favicon?: string;
customApiDomain?: string;
customSharePageDomain?: string;
uploadFileMaxAmount?: number;
uploadFileMaxSize?: number;
// Compute by systemEnv.customPdfParse
showCustomPdfParse?: boolean;
customPdfParsePrice?: number;
lafEnv?: string;
navbarItems?: NavbarItemType[];
externalProviderWorkflowVariables?: ExternalProviderWorkflowVarType[];
@@ -107,9 +116,18 @@ export type SystemEnvType = {
openapiPrefix?: string;
vectorMaxProcess: number;
qaMaxProcess: number;
vlmMaxProcess: number;
pgHNSWEfSearch: number;
tokenWorkers: number; // token count max worker
oneapiUrl?: string;
chatApiKey?: string;
customPdfParse?: {
url?: string;
key?: string;
doc2xKey?: string;
price?: number; // n points/1 page
};
};

View File

@@ -16,3 +16,24 @@ export const retryFn = async <T>(fn: () => Promise<T>, retryTimes = 3): Promise<
return Promise.reject(error);
}
};
/**
 * Run `fn` over every element of `arr` with at most `batchSize` concurrent
 * executions. NOTE: consumes (mutates) `arr` via shift(), matching the
 * original contract. Results are collected in completion order.
 *
 * Bug fix: the previous `if (data)` check made each worker stop at the first
 * falsy item (0, '', false, null), silently dropping it and any work that
 * worker would have done. We now test queue length instead of item truthiness.
 */
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
  const batchArr = new Array(batchSize).fill(null);
  const result: any[] = [];

  const batchFn = async () => {
    // Safe: length is checked synchronously and shift() happens before any
    // await, so concurrent workers never double-consume an item.
    while (arr.length > 0) {
      const data = arr.shift() as T;
      result.push(await fn(data));
    }
  };

  await Promise.all(
    batchArr.map(async () => {
      await batchFn();
    })
  );
  return result;
};

View File

@@ -22,7 +22,7 @@ export const defaultQAModels: LLMModelItemType[] = [
maxTemperature: 1.2,
charsPointsPrice: 0,
censor: false,
vision: false,
vision: true,
datasetProcess: true,
toolChoice: true,
functionCall: false,
@@ -59,10 +59,17 @@ export const defaultSTTModels: STTModelType[] = [
export const getModelFromList = (
modelList: { provider: ModelProviderIdType; name: string; model: string }[],
model: string
) => {
):
| {
avatar: string;
provider: ModelProviderIdType;
name: string;
model: string;
}
| undefined => {
const modelData = modelList.find((item) => item.model === model) ?? modelList[0];
if (!modelData) {
throw new Error('No Key model is configured');
return;
}
const provider = getModelProvider(modelData.provider);
return {

View File

@@ -1,5 +1,5 @@
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
import type { LLMModelItemType } from '../ai/model.d';
import { ParentIdType } from 'common/parentFolder/type';
@@ -10,9 +10,11 @@ export type DatasetUpdateBody = {
name?: string;
avatar?: string;
intro?: string;
agentModel?: LLMModelItemType;
status?: DatasetSchemaType['status'];
agentModel?: string;
vlmModel?: string;
websiteConfig?: DatasetSchemaType['websiteConfig'];
externalReadUrl?: DatasetSchemaType['externalReadUrl'];
defaultPermission?: DatasetSchemaType['defaultPermission'];
@@ -27,7 +29,10 @@ export type DatasetUpdateBody = {
/* ================= collection ===================== */
export type DatasetCollectionChunkMetadataType = {
parentId?: string;
trainingType?: TrainingModeEnum;
customPdfParse?: boolean;
trainingType?: DatasetCollectionDataProcessModeEnum;
imageIndex?: boolean;
autoIndexes?: boolean;
chunkSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
@@ -131,9 +136,15 @@ export type PostWebsiteSyncParams = {
export type PushDatasetDataProps = {
collectionId: string;
data: PushDatasetDataChunkProps[];
trainingMode: TrainingModeEnum;
trainingType?: DatasetCollectionDataProcessModeEnum;
autoIndexes?: boolean;
imageIndex?: boolean;
prompt?: string;
billId?: string;
// Deprecated: kept for backward compatibility; use trainingType instead
trainingMode?: DatasetCollectionDataProcessModeEnum;
};
export type PushDatasetDataResponse = {
insertLen: number;

View File

@@ -1,4 +1,4 @@
import { DatasetCollectionTypeEnum, TrainingModeEnum, TrainingTypeMap } from '../constants';
import { DatasetCollectionTypeEnum } from '../constants';
import { DatasetCollectionSchemaType } from '../type';
export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType) => {
@@ -16,9 +16,3 @@ export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType
export const checkCollectionIsFolder = (type: DatasetCollectionTypeEnum) => {
return type === DatasetCollectionTypeEnum.folder || type === DatasetCollectionTypeEnum.virtual;
};
/** Resolve the i18n label for a training mode; '' when the mode is missing or unmapped. */
export const getTrainingTypeLabel = (type?: TrainingModeEnum) => {
  const entry = type ? TrainingTypeMap[type] : undefined;
  return entry ? entry.label : '';
};

View File

@@ -109,6 +109,26 @@ export const DatasetCollectionSyncResultMap = {
}
};
// How a dataset collection's raw text is processed into training data.
export enum DatasetCollectionDataProcessModeEnum {
  chunk = 'chunk', // split text into chunks
  qa = 'qa', // generate question/answer pairs
  auto = 'auto' // abandoned legacy mode — presumably kept so stored values still resolve; confirm before removal
}
// UI metadata (i18n label + tooltip keys) for each data-process mode.
export const DatasetCollectionDataProcessModeMap = {
  [DatasetCollectionDataProcessModeEnum.chunk]: {
    label: i18nT('common:core.dataset.training.Chunk mode'),
    tooltip: i18nT('common:core.dataset.import.Chunk Split Tip')
  },
  [DatasetCollectionDataProcessModeEnum.qa]: {
    label: i18nT('common:core.dataset.training.QA mode'),
    tooltip: i18nT('common:core.dataset.import.QA Import Tip')
  },
  // 'auto' is marked abandoned in the enum, but existing collections may
  // still carry it, so it keeps display data.
  [DatasetCollectionDataProcessModeEnum.auto]: {
    label: i18nT('common:core.dataset.training.Auto mode'),
    tooltip: i18nT('common:core.dataset.training.Auto mode Tip')
  }
};
/* ------------ data -------------- */
/* ------------ training -------------- */
@@ -124,28 +144,11 @@ export enum ImportDataSourceEnum {
export enum TrainingModeEnum {
chunk = 'chunk',
qa = 'qa',
auto = 'auto',
qa = 'qa'
image = 'image'
}
export const TrainingTypeMap = {
[TrainingModeEnum.chunk]: {
label: i18nT('common:core.dataset.training.Chunk mode'),
tooltip: i18nT('common:core.dataset.import.Chunk Split Tip'),
openSource: true
},
[TrainingModeEnum.auto]: {
label: i18nT('common:core.dataset.training.Auto mode'),
tooltip: i18nT('common:core.dataset.training.Auto mode Tip'),
openSource: false
},
[TrainingModeEnum.qa]: {
label: i18nT('common:core.dataset.training.QA mode'),
tooltip: i18nT('common:core.dataset.import.QA Import Tip'),
openSource: true
}
};
/* ------------ search -------------- */
export enum DatasetSearchModeEnum {
embedding = 'embedding',

View File

@@ -20,9 +20,22 @@ export type UpdateDatasetDataProps = {
})[];
};
export type PatchIndexesProps = {
type: 'create' | 'update' | 'delete' | 'unChange';
index: Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string;
};
};
// Discriminated union on `type`: only 'create' may omit `dataId` (the index
// does not exist in storage yet); 'update'/'delete'/'unChange' operate on an
// existing index and therefore require the full DatasetDataIndexItemType.
export type PatchIndexesProps =
  | {
      type: 'create';
      index: Omit<DatasetDataIndexItemType, 'dataId'> & {
        dataId?: string;
      };
    }
  | {
      type: 'update';
      index: DatasetDataIndexItemType;
    }
  | {
      type: 'delete';
      index: DatasetDataIndexItemType;
    }
  | {
      type: 'unChange';
      index: DatasetDataIndexItemType;
    };

View File

@@ -0,0 +1,42 @@
import { i18nT } from '../../../../web/i18n/utils';
// Origin/category of a dataset data index entry.
export enum DatasetDataIndexTypeEnum {
  default = 'default', // auto-built from the record's q/a text
  custom = 'custom', // user-supplied index text
  summary = 'summary', // presumably an LLM-generated summary index — confirm
  question = 'question', // presumably an LLM-generated question index — confirm
  image = 'image' // index derived from image parsing
}
// Display metadata (i18n label + badge color) for every index type.
// NOTE(review): `label` is typed `any` because i18nT's return type is not
// visible here — consider tightening it once the i18nT signature is known.
export const DatasetDataIndexMap: Record<
  `${DatasetDataIndexTypeEnum}`,
  {
    label: any;
    color: string;
  }
> = {
  [DatasetDataIndexTypeEnum.default]: {
    label: i18nT('dataset:data_index_default'),
    color: 'gray'
  },
  [DatasetDataIndexTypeEnum.custom]: {
    label: i18nT('dataset:data_index_custom'),
    color: 'blue'
  },
  [DatasetDataIndexTypeEnum.summary]: {
    label: i18nT('dataset:data_index_summary'),
    color: 'green'
  },
  [DatasetDataIndexTypeEnum.question]: {
    label: i18nT('dataset:data_index_question'),
    color: 'red'
  },
  [DatasetDataIndexTypeEnum.image]: {
    label: i18nT('dataset:data_index_image'),
    color: 'purple'
  }
};
// Fallback metadata used when an index type has no explicit map entry.
export const defaultDatasetIndexData = DatasetDataIndexMap[DatasetDataIndexTypeEnum.custom];

/** Look up display metadata for an index type, falling back to the default. */
export const getDatasetIndexMapData = (type: `${DatasetDataIndexTypeEnum}`) =>
  DatasetDataIndexMap[type] ?? defaultDatasetIndexData;

View File

@@ -0,0 +1,20 @@
import { PushDatasetDataChunkProps } from '../api';
import { TrainingModeEnum } from '../constants';
// Payload for pushing parsed dataset chunks onto the training queue.
export type PushDataToTrainingQueueProps = {
  teamId: string;
  tmbId: string; // team member id
  datasetId: string;
  collectionId: string;
  mode?: TrainingModeEnum; // training queue mode for the pushed chunks
  data: PushDatasetDataChunkProps[]; // the chunks to enqueue
  prompt?: string; // presumably a custom prompt for QA generation — confirm
  // Model identifiers (string ids, resolved server-side).
  agentModel: string;
  vectorModel: string;
  vlmModel?: string; // presumably the vision-language model for image indexing — confirm
  billId?: string; // usage/billing record to attribute this work to
  // NOTE(review): `ClientSession` is not imported anywhere in this file —
  // presumably mongoose's ClientSession; add the import or this won't resolve.
  session?: ClientSession;
};

View File

@@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
import { PermissionTypeEnum } from '../../support/permission/constant';
import { PushDatasetDataChunkProps } from './api';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum,
DatasetStatusEnum,
DatasetTypeEnum,
@@ -12,6 +13,7 @@ import { DatasetPermission } from '../../support/permission/dataset/controller';
import { Permission } from '../../support/permission/controller';
import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
import { SourceMemberType } from 'support/user/type';
import { DatasetDataIndexTypeEnum } from './data/constants';
export type DatasetSchemaType = {
_id: string;
@@ -23,11 +25,14 @@ export type DatasetSchemaType = {
avatar: string;
name: string;
vectorModel: string;
agentModel: string;
intro: string;
type: `${DatasetTypeEnum}`;
status: `${DatasetStatusEnum}`;
vectorModel: string;
agentModel: string;
vlmModel?: string;
websiteConfig?: {
url: string;
selector: string;
@@ -52,26 +57,22 @@ export type DatasetCollectionSchemaType = {
parentId?: string;
name: string;
type: DatasetCollectionTypeEnum;
createTime: Date;
updateTime: Date;
forbid?: boolean;
trainingType: TrainingModeEnum;
chunkSize: number;
chunkSplitter?: string;
qaPrompt?: string;
ocrParse?: boolean;
tags?: string[];
createTime: Date;
updateTime: Date;
// Status
forbid?: boolean;
nextSyncTime?: Date;
// Collection metadata
fileId?: string; // local file id
rawLink?: string; // link url
externalFileId?: string; //external file id
apiFileId?: string; // api file id
externalFileUrl?: string; // external import url
nextSyncTime?: Date;
rawTextLength?: number;
hashRawText?: string;
metadata?: {
@@ -80,6 +81,16 @@ export type DatasetCollectionSchemaType = {
[key: string]: any;
};
// Parse settings
customPdfParse?: boolean;
// Chunk settings
autoIndexes?: boolean;
imageIndex?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSize: number;
chunkSplitter?: string;
qaPrompt?: string;
};
export type DatasetCollectionTagsSchemaType = {
@@ -90,7 +101,7 @@ export type DatasetCollectionTagsSchemaType = {
};
export type DatasetDataIndexItemType = {
defaultIndex: boolean;
type: `${DatasetDataIndexTypeEnum}`;
dataId: string; // pg data id
text: string;
};
@@ -141,6 +152,7 @@ export type DatasetTrainingSchemaType = {
chunkIndex: number;
weight: number;
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
retryCount: number;
};
export type CollectionWithDatasetType = DatasetCollectionSchemaType & {
@@ -169,9 +181,10 @@ export type DatasetListItemType = {
sourceMember?: SourceMemberType;
};
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel'> & {
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel' | 'vlmModel'> & {
vectorModel: EmbeddingModelItemType;
agentModel: LLMModelItemType;
vlmModel?: LLMModelItemType;
permission: DatasetPermission;
};

View File

@@ -1,6 +1,7 @@
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { getFileIcon } from '../../common/file/icon';
import { strIsLink } from '../../common/string/tools';
import { DatasetDataIndexTypeEnum } from './data/constants';
export function getCollectionIcon(
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
@@ -38,14 +39,23 @@ export function getSourceNameIcon({
}
/* get dataset data default index */
export function getDefaultIndex(props?: { q?: string; a?: string; dataId?: string }) {
const { q = '', a, dataId } = props || {};
const qaStr = `${q}\n${a}`.trim();
return {
defaultIndex: true,
text: a ? qaStr : q,
dataId
};
/**
 * Build the default index list for a data record: always one entry for `q`
 * (empty string when absent), plus a second entry when a truthy `a` exists.
 */
export function getDefaultIndex(props?: { q?: string; a?: string }) {
  const { q = '', a } = props || {};
  const indexes = [
    {
      text: q,
      type: DatasetDataIndexTypeEnum.default
    }
  ];
  if (a) {
    indexes.push({
      text: a,
      type: DatasetDataIndexTypeEnum.default
    });
  }
  return indexes;
}
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {

View File

@@ -10,7 +10,8 @@ export enum UsageSourceEnum {
wecom = 'wecom',
feishu = 'feishu',
dingtalk = 'dingtalk',
official_account = 'official_account'
official_account = 'official_account',
pdfParse = 'pdfParse'
}
export const UsageSourceMap = {
@@ -43,5 +44,8 @@ export const UsageSourceMap = {
},
[UsageSourceEnum.dingtalk]: {
label: i18nT('account_usage:dingtalk')
},
[UsageSourceEnum.pdfParse]: {
label: i18nT('account_usage:pdf_parse')
}
};

View File

@@ -7,6 +7,7 @@ export type UsageListItemCountType = {
outputTokens?: number;
charsLength?: number;
duration?: number;
pages?: number;
// deprecated
tokens?: number;