Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -186,20 +186,25 @@ export async function getDownloadStream({
export const readFileContentFromMongo = async ({
teamId,
tmbId,
bucketName,
fileId,
isQAImport = false
isQAImport = false,
customPdfParse = false
}: {
teamId: string;
tmbId: string;
bucketName: `${BucketNameEnum}`;
fileId: string;
isQAImport?: boolean;
customPdfParse?: boolean;
}): Promise<{
rawText: string;
filename: string;
}> => {
const bufferId = `${fileId}-${customPdfParse}`;
// read buffer
const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: fileId }, undefined, {
const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: bufferId }, undefined, {
...readFromSecondary
}).lean();
if (fileBuffer) {
@@ -227,9 +232,11 @@ export const readFileContentFromMongo = async ({
// Get raw text
const { rawText } = await readRawContentByFileBuffer({
customPdfParse,
extension,
isQAImport,
teamId,
tmbId,
buffer: fileBuffers,
encoding,
metadata: {
@@ -240,7 +247,7 @@ export const readFileContentFromMongo = async ({
// < 14M
if (fileBuffers.length < 14 * 1024 * 1024 && rawText.trim()) {
MongoRawTextBuffer.create({
sourceId: fileId,
sourceId: bufferId,
rawText,
metadata: {
filename: file.filename
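The raw-text buffer is now keyed on both the file id and the parse mode, so the same file parsed with and without the custom PDF parser gets two independent cache entries instead of one stale one. A minimal sketch of the idea; the in-memory cache map and the parseFile callback are hypothetical stand-ins for MongoRawTextBuffer and the real parser dispatch:

const cache = new Map<string, string>();

async function readCachedRawText(
  fileId: string,
  customPdfParse: boolean,
  parseFile: (fileId: string, customPdfParse: boolean) => Promise<string>
): Promise<string> {
  // Same file, different parser => different cache entry
  const bufferId = `${fileId}-${customPdfParse}`;
  const hit = cache.get(bufferId);
  if (hit !== undefined) return hit;

  const rawText = await parseFile(fileId, customPdfParse);
  cache.set(bufferId, rawText);
  return rawText;
}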

View File

@@ -0,0 +1,27 @@
import axios from 'axios';
import { addLog } from '../../system/log';
import { serverRequestBaseUrl } from '../../api/serverRequest';
import { getFileContentTypeFromHeader, guessBase64ImageType } from '../utils';
export const getImageBase64 = async (url: string) => {
addLog.debug(`Load image to base64: ${url}`);
try {
const response = await axios.get(url, {
baseURL: serverRequestBaseUrl,
responseType: 'arraybuffer',
proxy: false
});
const base64 = Buffer.from(response.data, 'binary').toString('base64');
const imageType =
getFileContentTypeFromHeader(response.headers['content-type']) ||
guessBase64ImageType(base64);
return `data:${imageType};base64,${base64}`;
} catch (error) {
addLog.debug(`Load image to base64 failed: ${url}`);
console.log(error);
return Promise.reject(error);
}
};
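A usage sketch for the helper above: the resolved value is a complete data URL, so it can be assigned directly to an image_url content part (the request path here is illustrative):

const dataUrl = await getImageBase64('/api/system/img/xxxx.png');
// dataUrl looks like: data:image/png;base64,iVBORw0KGgo...

const imagePart = {
  type: 'image_url',
  image_url: { url: dataUrl }
};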

View File

@@ -1,18 +1,23 @@
import { uploadMongoImg } from '../image/controller';
import FormData from 'form-data';
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import fs from 'fs';
import type { ReadFileResponse } from '../../../worker/readFile/type';
import type { ImageType, ReadFileResponse } from '../../../worker/readFile/type';
import axios from 'axios';
import { addLog } from '../../system/log';
import { batchRun } from '@fastgpt/global/common/fn/utils';
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { batchRun } from '@fastgpt/global/common/system/utils';
import { htmlTable2Md, matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { delay } from '@fastgpt/global/common/system/utils';
import { getNanoid } from '@fastgpt/global/common/string/tools';
export type readRawTextByLocalFileParams = {
teamId: string;
tmbId: string;
path: string;
encoding: string;
customPdfParse?: boolean;
metadata?: Record<string, any>;
};
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
@@ -22,46 +27,51 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam
const buffer = await fs.promises.readFile(path);
const { rawText } = await readRawContentByFileBuffer({
return readRawContentByFileBuffer({
extension,
isQAImport: false,
customPdfParse: params.customPdfParse,
teamId: params.teamId,
tmbId: params.tmbId,
encoding: params.encoding,
buffer,
metadata: params.metadata
});
return {
rawText
};
};
export const readRawContentByFileBuffer = async ({
extension,
isQAImport,
teamId,
tmbId,
extension,
buffer,
encoding,
metadata
metadata,
customPdfParse = false,
isQAImport = false
}: {
isQAImport?: boolean;
extension: string;
teamId: string;
tmbId: string;
extension: string;
buffer: Buffer;
encoding: string;
metadata?: Record<string, any>;
}) => {
// Custom read file service
const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
const readFileFromCustomService = async (): Promise<ReadFileResponse | undefined> => {
if (
!customReadfileUrl ||
!customReadFileExtension ||
!customReadFileExtension.includes(extension)
)
return;
customPdfParse?: boolean;
isQAImport: boolean;
}): Promise<ReadFileResponse> => {
const systemParse = () =>
runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
extension,
encoding,
buffer,
teamId
});
const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
const url = global.systemEnv.customPdfParse?.url;
const token = global.systemEnv.customPdfParse?.key;
if (!url) return systemParse();
const start = Date.now();
addLog.info('Parsing files from an external service');
@@ -70,20 +80,18 @@ export const readRawContentByFileBuffer = async ({
data.append('file', buffer, {
filename: `file.${extension}`
});
data.append('extension', extension);
data.append('ocr', ocrParse);
const { data: response } = await axios.post<{
success: boolean;
message: string;
data: {
page: number;
markdown: string;
duration: number;
};
}>(customReadfileUrl, data, {
}>(url, data, {
timeout: 600000,
headers: {
...data.getHeaders()
...data.getHeaders(),
Authorization: token ? `Bearer ${token}` : undefined
}
});
@@ -92,21 +100,208 @@ export const readRawContentByFileBuffer = async ({
const rawText = response.data.markdown;
const { text, imageList } = matchMdImgTextAndUpload(rawText);
createPdfParseUsage({
teamId,
tmbId,
pages: response.data.page
});
return {
rawText: text,
formatText: rawText,
imageList
};
};
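The custom service is expected to accept a multipart upload (field name "file", optional Bearer token) and answer with { success, message, data: { page, markdown, duration } }. A minimal sketch of a compatible endpoint, assuming Express and multer; the route path is an assumption, and any stack that honors the same contract works:

import express from 'express';
import multer from 'multer';

const app = express();
const upload = multer(); // in-memory storage; req.file.buffer holds the PDF

app.post('/v1/parse/file', upload.single('file'), (req, res) => {
  // Optionally verify req.headers.authorization against the configured key.
  const markdown = '# parsed content'; // placeholder for the real parser output
  res.json({
    success: true,
    message: '',
    data: { page: 1, markdown, duration: 0 }
  });
});

app.listen(7231);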
const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
if (!doc2xKey) return systemParse();
let { rawText, formatText, imageList } =
(await readFileFromCustomService()) ||
(await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
extension,
encoding,
buffer,
teamId
}));
const parseTextImage = async (text: string) => {
// Extract image links and convert to base64
const imageList: { id: string; url: string }[] = [];
const processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
const id = getNanoid();
imageList.push({
id,
url
});
return `![](${id})`;
});
let resultImageList: ImageType[] = [];
await Promise.all(
imageList.map(async (item) => {
try {
const response = await axios.get(item.url, { responseType: 'arraybuffer' });
const mime = response.headers['content-type'] || 'image/jpeg';
const base64 = response.data.toString('base64');
resultImageList.push({
uuid: item.id,
mime,
base64
});
} catch (error) {
addLog.warn(`Failed to get image from ${item.url}: ${getErrText(error)}`);
}
})
);
return {
text: processedText,
imageList: resultImageList
};
};
let startTime = Date.now();
// 1. Get pre-upload URL first
const { data: preupload_data } = await axios
.post<{ code: string; data: { uid: string; url: string } }>(
'https://v2.doc2x.noedgeai.com/api/v2/parse/preupload',
null,
{
headers: {
Authorization: `Bearer ${doc2xKey}`
}
}
)
.catch((error) => {
return Promise.reject(
`[Pre-upload Error] Failed to get pre-upload URL: ${getErrText(error)}`
);
});
if (preupload_data?.code !== 'success') {
return Promise.reject(`Failed to get pre-upload URL: ${JSON.stringify(preupload_data)}`);
}
const upload_url = preupload_data.data.url;
const uid = preupload_data.data.uid;
// 2. Upload file to pre-signed URL with binary stream
const blob = new Blob([buffer], { type: 'application/pdf' });
const response = await axios
.put(upload_url, blob, {
headers: {
'Content-Type': 'application/pdf'
}
})
.catch((error) => {
return Promise.reject(`[Upload Error] Failed to upload file: ${getErrText(error)}`);
});
if (response.status !== 200) {
return Promise.reject(`Upload failed with status ${response.status}: ${response.statusText}`);
}
await delay(5000);
addLog.debug(`Uploaded file to Doc2x, uid: ${uid}`);
// 3. Get the result by uid
const checkResult = async (retry = 30) => {
if (retry <= 0) {
return Promise.reject(
`[Parse Timeout Error] Failed to get result (uid: ${uid}): Process timeout`
);
}
try {
const { data: result_data } = await axios
.get<{
code: string;
data: {
progress: number;
status: 'processing' | 'failed' | 'success';
result: {
pages: {
md: string;
}[];
};
};
}>(`https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`, {
headers: {
Authorization: `Bearer ${doc2xKey}`
}
})
.catch((error) => {
return Promise.reject(
`[Parse Status Error] Failed to get parse status: ${getErrText(error)}`
);
});
// Error
if (!['ok', 'success'].includes(result_data.code)) {
return Promise.reject(
`Failed to get result (uid: ${uid}): ${JSON.stringify(result_data)}`
);
}
// Process
if (['ready', 'processing'].includes(result_data.data.status)) {
addLog.debug(`Waiting for the result, uid: ${uid}`);
await delay(5000);
return checkResult(retry - 1);
}
// Finish
if (result_data.data.status === 'success') {
const result = result_data.data.result.pages
.map((page) => page.md)
.join('\n')
// Do some post-processing
.replace(/\\[\(\)]/g, '$')
.replace(/\\[\[\]]/g, '$$')
.replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '![img]($1)')
.replace(/<!-- Media -->/g, '')
.replace(/<!-- Footnote -->/g, '')
.replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$')
.replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}');
const { text, imageList } = await parseTextImage(htmlTable2Md(result));
return {
pages: result_data.data.result.pages.length,
text,
imageList
};
}
return checkResult(retry - 1);
} catch (error) {
if (retry > 1) {
await delay(100);
return checkResult(retry - 1);
}
return Promise.reject(error);
}
};
const { pages, text, imageList } = await checkResult();
createPdfParseUsage({
teamId,
tmbId,
pages
});
addLog.info(`Doc2x parse success, time: ${Date.now() - startTime}ms`);
return {
rawText: text,
formatText: text,
imageList
};
};
// PDF parse dispatcher: prefer the custom service, then Doc2x, then the system worker
const pdfParseFn = async (): Promise<ReadFileResponse> => {
if (!customPdfParse) return systemParse();
if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();
return systemParse();
};
let { rawText, formatText, imageList } = await (async () => {
if (extension === 'pdf') {
return await pdfParseFn();
}
return await systemParse();
})();
// markdown data format
if (imageList) {
@@ -142,5 +337,5 @@ export const readRawContentByFileBuffer = async ({
}
}
return { rawText };
return { rawText, formatText, imageList };
};
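checkResult above polls the Doc2x status endpoint recursively with a retry budget of 30 and a 5 s delay between attempts. The same pattern, extracted as a small generic helper; a sketch, not part of the commit:

async function pollUntil<T>(
  check: () => Promise<T | undefined>, // resolves undefined while still processing
  retries = 30,
  intervalMs = 5000
): Promise<T> {
  for (let i = 0; i < retries; i++) {
    const result = await check();
    if (result !== undefined) return result;
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
  return Promise.reject('Process timeout');
}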

View File

@@ -10,6 +10,11 @@ export const SERVICE_LOCAL_HOST =
export const initFastGPTConfig = (config?: FastGPTConfigFileType) => {
if (!config) return;
// Special config computed
config.feConfigs.showCustomPdfParse =
!!config.systemEnv.customPdfParse?.url || !!config.systemEnv.customPdfParse?.doc2xKey;
config.feConfigs.customPdfParsePrice = config.systemEnv.customPdfParse?.price || 0;
global.feConfigs = config.feConfigs;
global.systemEnv = config.systemEnv;
global.subPlans = config.subPlans;
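Both flags computed here come from the customPdfParse block of the system config; a sketch of the expected shape, with field names taken from this commit and values purely illustrative:

const systemEnv = {
  customPdfParse: {
    url: 'https://pdf-parser.example.com/v1/parse/file', // self-hosted parse service
    key: 'sk-xxx', // optional Bearer token sent to that service
    doc2xKey: 'doc2x-xxx', // alternative: use Doc2x instead of a self-hosted service
    price: 0 // points charged per parsed page
  }
};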

View File

@@ -13,6 +13,11 @@ export const getDatasetModel = (model?: string) => {
?.find((item) => item.model === model || item.name === model) ?? getDefaultLLMModel()
);
};
export const getVlmModel = (model?: string) => {
return Array.from(global.llmModelMap.values())
?.filter((item) => item.vision)
?.find((item) => item.model === model || item.name === model);
};
export const getDefaultEmbeddingModel = () => global?.systemDefaultModel.embedding!;
export const getEmbeddingModel = (model?: string) => {
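getVlmModel returns the first vision-capable model whose model id or display name matches. A usage sketch; the model id is illustrative:

const vlm = getVlmModel('gpt-4o');
if (!vlm) {
  throw new Error('No matching vision model configured');
}
// vlm.model is what gets stored on the dataset as vlmModel
// and resolved again when the image training queue runs.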

View File

@@ -9,10 +9,9 @@ import type {
} from '@fastgpt/global/core/ai/type.d';
import axios from 'axios';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import { getFileContentTypeFromHeader, guessBase64ImageType } from '../../common/file/utils';
import { serverRequestBaseUrl } from '../../common/api/serverRequest';
import { i18nT } from '../../../web/i18n/utils';
import { addLog } from '../../common/system/log';
import { getImageBase64 } from '../../common/file/image/utils';
export const filterGPTMessageByMaxContext = async ({
messages = [],
@@ -166,25 +165,13 @@ export const loadRequestMessages = async ({
try {
// If imgUrl is a local path, load image from local, and set url to base64
if (imgUrl.startsWith('/') || process.env.MULTIPLE_DATA_TO_BASE64 === 'true') {
addLog.debug('Load image from local server', {
baseUrl: serverRequestBaseUrl,
requestUrl: imgUrl
});
const response = await axios.get(imgUrl, {
baseURL: serverRequestBaseUrl,
responseType: 'arraybuffer',
proxy: false
});
const base64 = Buffer.from(response.data, 'binary').toString('base64');
const imageType =
getFileContentTypeFromHeader(response.headers['content-type']) ||
guessBase64ImageType(base64);
const base64 = await getImageBase64(imgUrl);
return {
...item,
image_url: {
...item.image_url,
url: `data:${imageType};base64,${base64}`
url: base64
}
};
}
@@ -223,7 +210,8 @@ export const loadRequestMessages = async ({
await Promise.all(
content.map(async (item) => {
if (item.type === 'text') {
if (item.text) return parseStringWithImages(item.text);
// If it is an array, there is no need to parse images
if (item.text) return item;
return;
}
if (item.type === 'file_url') return; // LLM not support file_url

View File

@@ -108,7 +108,15 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
return formattedFiles;
};
const getFileContent = async ({ teamId, apiFileId }: { teamId: string; apiFileId: string }) => {
const getFileContent = async ({
teamId,
tmbId,
apiFileId
}: {
teamId: string;
tmbId: string;
apiFileId: string;
}) => {
const data = await request<APIFileContentResponse>(
`/v1/file/content`,
{ id: apiFileId },
@@ -123,6 +131,7 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
if (previewUrl) {
const rawText = await readFileRawTextByUrl({
teamId,
tmbId,
url: previewUrl,
relatedId: apiFileId
});

View File

@@ -1,6 +1,6 @@
import {
DatasetCollectionTypeEnum,
TrainingModeEnum
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
@@ -19,13 +19,14 @@ import { predictDataLimitLength } from '../../../../global/core/dataset/utils';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { createTrainingUsage } from '../../../support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { pushDataListToTrainingQueue } from '../training/controller';
import { MongoImage } from '../../../common/file/image/schema';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { addDays } from 'date-fns';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { delay, retryFn } from '@fastgpt/global/common/system/utils';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils';
export const createCollectionAndInsertData = async ({
dataset,
@@ -33,6 +34,7 @@ export const createCollectionAndInsertData = async ({
relatedId,
createCollectionParams,
isQAImport = false,
billId,
session
}: {
dataset: DatasetSchemaType;
@@ -41,13 +43,21 @@ export const createCollectionAndInsertData = async ({
createCollectionParams: CreateOneCollectionParams;
isQAImport?: boolean;
billId?: string;
session?: ClientSession;
}) => {
// Adapter 4.9.0
if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) {
createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk;
createCollectionParams.autoIndexes = true;
}
const teamId = createCollectionParams.teamId;
const tmbId = createCollectionParams.tmbId;
// Chunk split params
const trainingType = createCollectionParams.trainingType || TrainingModeEnum.chunk;
const chunkSize = createCollectionParams.chunkSize;
const trainingType =
createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
const chunkSize = createCollectionParams.chunkSize || 512;
const chunkSplitter = createCollectionParams.chunkSplitter;
const qaPrompt = createCollectionParams.qaPrompt;
const usageName = createCollectionParams.name;
@@ -56,7 +66,7 @@ export const createCollectionAndInsertData = async ({
const chunks = rawText2Chunks({
rawText,
chunkLen: chunkSize,
overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [],
isQAImport
});
@@ -64,7 +74,14 @@ export const createCollectionAndInsertData = async ({
// 2. auth limit
await checkDatasetLimit({
teamId,
insertLen: predictDataLimitLength(trainingType, chunks)
insertLen: predictDataLimitLength(
getTrainingModeByCollection({
trainingType,
autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex
}),
chunks
)
});
const fn = async (session: ClientSession) => {
@@ -89,15 +106,20 @@ export const createCollectionAndInsertData = async ({
});
// 4. create training bill
const { billId } = await createTrainingUsage({
teamId,
tmbId,
appName: usageName,
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name,
session
});
const trainingBillId = await (async () => {
if (billId) return billId;
const { billId: newBillId } = await createTrainingUsage({
teamId,
tmbId,
appName: usageName,
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name,
vllmModel: getVlmModel(dataset.vlmModel)?.name,
session
});
return newBillId;
})();
// 5. insert to training queue
const insertResults = await pushDataListToTrainingQueue({
@@ -107,9 +129,14 @@ export const createCollectionAndInsertData = async ({
collectionId,
agentModel: dataset.agentModel,
vectorModel: dataset.vectorModel,
trainingMode: trainingType,
vlmModel: dataset.vlmModel,
mode: getTrainingModeByCollection({
trainingType,
autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex
}),
prompt: qaPrompt,
billId,
billId: trainingBillId,
data: chunks.map((item, index) => ({
...item,
chunkIndex: index
@@ -161,10 +188,15 @@ export async function createOneCollection({
datasetId,
type,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
createTime,
updateTime,
hashRawText,
rawTextLength,
metadata = {},
tags,
nextSyncTime,
fileId,
rawLink,
@@ -172,15 +204,18 @@ export async function createOneCollection({
externalFileUrl,
apiFileId,
hashRawText,
rawTextLength,
metadata = {},
session,
tags,
// Parse settings
customPdfParse,
imageIndex,
createTime,
updateTime,
nextSyncTime
// Chunk settings
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
autoIndexes,
chunkSize = 512,
chunkSplitter,
qaPrompt,
session
}: CreateOneCollectionParams) {
// Create collection tags
const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -196,25 +231,31 @@ export async function createOneCollection({
name,
type,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
rawTextLength,
hashRawText,
tags: collectionTags,
metadata,
createTime,
updateTime,
nextSyncTime,
...(fileId ? { fileId } : {}),
...(rawLink ? { rawLink } : {}),
...(externalFileId ? { externalFileId } : {}),
...(externalFileUrl ? { externalFileUrl } : {}),
...(apiFileId ? { apiFileId } : {}),
rawTextLength,
hashRawText,
tags: collectionTags,
// Parse settings
customPdfParse,
imageIndex,
createTime,
updateTime,
nextSyncTime
// Chunk settings
trainingType,
autoIndexes,
chunkSize,
chunkSplitter,
qaPrompt
}
],
{ session, ordered: true }

View File

@@ -1,7 +1,10 @@
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
import {
DatasetCollectionTypeMap,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
@@ -31,6 +34,8 @@ const DatasetCollectionSchema = new Schema({
ref: DatasetCollectionName,
required: true
},
// Basic info
type: {
type: String,
enum: Object.keys(DatasetCollectionTypeMap),
@@ -40,6 +45,11 @@ const DatasetCollectionSchema = new Schema({
type: String,
required: true
},
tags: {
type: [String],
default: []
},
createTime: {
type: Date,
default: () => new Date()
@@ -48,33 +58,8 @@ const DatasetCollectionSchema = new Schema({
type: Date,
default: () => new Date()
},
forbid: {
type: Boolean,
default: false
},
// chunk filed
trainingType: {
type: String,
enum: Object.keys(TrainingTypeMap)
},
chunkSize: {
type: Number,
required: true
},
chunkSplitter: {
type: String
},
qaPrompt: {
type: String
},
ocrParse: Boolean,
tags: {
type: [String],
default: []
},
// Metadata
// local file collection
fileId: {
type: Schema.Types.ObjectId,
@@ -82,22 +67,39 @@ const DatasetCollectionSchema = new Schema({
},
// web link collection
rawLink: String,
// api collection
// Api collection
apiFileId: String,
// external collection
// external collection(Abandoned)
externalFileId: String,
externalFileUrl: String, // external import url
// next sync time
nextSyncTime: Date,
// metadata
rawTextLength: Number,
hashRawText: String,
metadata: {
type: Object,
default: {}
}
},
forbid: Boolean,
// next sync time
nextSyncTime: Date,
// Parse settings
customPdfParse: Boolean,
// Chunk settings
imageIndex: Boolean,
autoIndexes: Boolean,
trainingType: {
type: String,
enum: Object.values(DatasetCollectionDataProcessModeEnum)
},
chunkSize: {
type: Number,
required: true
},
chunkSplitter: String,
qaPrompt: String
});
DatasetCollectionSchema.virtual('dataset', {

View File

@@ -2,12 +2,17 @@ import { MongoDatasetCollection } from './schema';
import { ClientSession } from '../../../common/mongo';
import { MongoDatasetCollectionTags } from '../tag/schema';
import { readFromSecondary } from '../../../common/mongo/utils';
import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
import {
CollectionWithDatasetType,
DatasetCollectionSchemaType
} from '@fastgpt/global/core/dataset/type';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionSyncResultEnum,
DatasetCollectionTypeEnum,
DatasetSourceReadTypeEnum,
DatasetTypeEnum
DatasetTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import { readDatasetSourceRawText } from '../read';
@@ -160,6 +165,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
})();
const rawText = await readDatasetSourceRawText({
teamId: collection.teamId,
tmbId: collection.tmbId,
...sourceReadType
});
@@ -220,3 +226,24 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
return DatasetCollectionSyncResultEnum.success;
};
/*
QA: standalone process
Chunk: Image Index -> Auto index -> chunk index
*/
export const getTrainingModeByCollection = (collection: {
trainingType: DatasetCollectionSchemaType['trainingType'];
autoIndexes?: DatasetCollectionSchemaType['autoIndexes'];
imageIndex?: DatasetCollectionSchemaType['imageIndex'];
}) => {
if (collection.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return TrainingModeEnum.qa;
}
if (collection.imageIndex && global.feConfigs?.isPlus) {
return TrainingModeEnum.image;
}
if (collection.autoIndexes && global.feConfigs?.isPlus) {
return TrainingModeEnum.auto;
}
return TrainingModeEnum.chunk;
};
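Resolution follows the priority in the comment above: QA short-circuits everything, then image index, then auto index, then plain chunking, with the two plus-only modes gated on global.feConfigs?.isPlus. For example (collection literal illustrative):

const mode = getTrainingModeByCollection({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  autoIndexes: true,
  imageIndex: true
});
// With isPlus enabled this resolves to TrainingModeEnum.image:
// imageIndex outranks autoIndexes, which outranks the chunk fallback.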

View File

@@ -7,6 +7,7 @@ import {
} from '@fastgpt/global/support/user/team/constant';
import { DatasetCollectionName } from '../schema';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
export const DatasetDataCollectionName = 'dataset_datas';
@@ -42,10 +43,16 @@ const DatasetDataSchema = new Schema({
indexes: {
type: [
{
// Abandon
defaultIndex: {
type: Boolean,
default: false
},
type: {
type: String,
enum: Object.values(DatasetDataIndexTypeEnum),
default: DatasetDataIndexTypeEnum.custom
},
dataId: {
type: String,
required: true
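Each stored index entry now carries a type from DatasetDataIndexTypeEnum (defaulting to custom) alongside its dataId, replacing the abandoned defaultIndex boolean. A sketch of one entry under the new shape; the dataId value is illustrative, and the text field is assumed from the rest of the schema, which this hunk truncates:

const indexItem = {
  type: DatasetDataIndexTypeEnum.custom,
  dataId: '65f0c1d2e3a4b5c6d7e8f9a0',
  text: 'the chunk text that was embedded for this index'
};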

View File

@@ -13,11 +13,15 @@ import { POST } from '../../common/api/plusRequest';
export const readFileRawTextByUrl = async ({
teamId,
tmbId,
url,
customPdfParse,
relatedId
}: {
teamId: string;
tmbId: string;
url: string;
customPdfParse?: boolean;
relatedId: string; // externalFileId / apiFileId
}) => {
const response = await axios({
@@ -30,8 +34,11 @@ export const readFileRawTextByUrl = async ({
const buffer = Buffer.from(response.data, 'binary');
const { rawText } = await readRawContentByFileBuffer({
customPdfParse,
isQAImport: false,
extension,
teamId,
tmbId,
buffer,
encoding: 'utf-8',
metadata: {
@@ -49,6 +56,7 @@ export const readFileRawTextByUrl = async ({
*/
export const readDatasetSourceRawText = async ({
teamId,
tmbId,
type,
sourceId,
isQAImport,
@@ -56,11 +64,14 @@ export const readDatasetSourceRawText = async ({
externalFileId,
apiServer,
feishuServer,
yuqueServer
yuqueServer,
customPdfParse
}: {
teamId: string;
tmbId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
customPdfParse?: boolean;
isQAImport?: boolean; // csv data
selector?: string; // link selector
@@ -72,9 +83,11 @@ export const readDatasetSourceRawText = async ({
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const { rawText } = await readFileContentFromMongo({
teamId,
tmbId,
bucketName: BucketNameEnum.dataset,
fileId: sourceId,
isQAImport
isQAImport,
customPdfParse
});
return rawText;
} else if (type === DatasetSourceReadTypeEnum.link) {
@@ -88,8 +101,10 @@ export const readDatasetSourceRawText = async ({
if (!externalFileId) return Promise.reject('FileId not found');
const rawText = await readFileRawTextByUrl({
teamId,
tmbId,
url: sourceId,
relatedId: externalFileId
relatedId: externalFileId,
customPdfParse
});
return rawText;
} else if (type === DatasetSourceReadTypeEnum.apiFile) {
@@ -98,7 +113,8 @@ export const readDatasetSourceRawText = async ({
feishuServer,
yuqueServer,
apiFileId: sourceId,
teamId
teamId,
tmbId
});
return rawText;
}
@@ -110,16 +126,18 @@ export const readApiServerFileContent = async ({
feishuServer,
yuqueServer,
apiFileId,
teamId
teamId,
tmbId
}: {
apiServer?: APIFileServer;
feishuServer?: FeishuServer;
yuqueServer?: YuqueServer;
apiFileId: string;
teamId: string;
tmbId: string;
}) => {
if (apiServer) {
return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, apiFileId });
return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, tmbId, apiFileId });
}
if (feishuServer || yuqueServer) {

View File

@@ -67,6 +67,7 @@ const DatasetSchema = new Schema({
required: true,
default: 'gpt-4o-mini'
},
vlmModel: String,
intro: {
type: String,
default: ''

View File

@@ -1,16 +1,16 @@
import { MongoDatasetTraining } from './schema';
import type {
PushDatasetDataChunkProps,
PushDatasetDataProps,
PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { ClientSession } from '../../../common/mongo';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { getCollectionWithDataset } from '../controller';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try {
@@ -28,20 +28,17 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
export const pushDataListToTrainingQueueByCollectionId = async ({
collectionId,
...props
}: {
teamId: string;
tmbId: string;
session?: ClientSession;
} & PushDatasetDataProps) => {
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
const {
dataset: { _id: datasetId, agentModel, vectorModel }
dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
} = await getCollectionWithDataset(collectionId);
return pushDataListToTrainingQueue({
...props,
datasetId,
collectionId,
vectorModel,
agentModel,
vectorModel
vlmModel
});
};
@@ -52,30 +49,30 @@ export async function pushDataListToTrainingQueue({
collectionId,
agentModel,
vectorModel,
vlmModel,
data,
prompt,
billId,
trainingMode = TrainingModeEnum.chunk,
mode = TrainingModeEnum.chunk,
session
}: {
teamId: string;
tmbId: string;
datasetId: string;
agentModel: string;
vectorModel: string;
session?: ClientSession;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
if (mode !== TrainingModeEnum.image) return mode;
// Check whether the content contains images in ![](xxx) format
const text = data.q + data.a || '';
const regex = /!\[\]\((.*?)\)/g;
const match = text.match(regex);
if (match) {
return TrainingModeEnum.image;
}
return TrainingModeEnum.chunk;
};
const { model, maxToken, weight } = await (async () => {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(`File model ${agentModel} is invalid`);
}
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Vector model ${vectorModel} is invalid`);
}
if (trainingMode === TrainingModeEnum.chunk) {
if (mode === TrainingModeEnum.chunk) {
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Vector model ${vectorModel} is invalid`);
}
return {
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model,
@@ -83,7 +80,11 @@ export async function pushDataListToTrainingQueue({
};
}
if (trainingMode === TrainingModeEnum.qa || trainingMode === TrainingModeEnum.auto) {
if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(`File model ${agentModel} is invalid`);
}
return {
maxToken: agentModelData.maxContext * 0.8,
model: agentModelData.model,
@@ -91,8 +92,24 @@ export async function pushDataListToTrainingQueue({
};
}
return Promise.reject(`Training mode "${trainingMode}" is invalid`);
if (mode === TrainingModeEnum.image) {
const vllmModelData = getVlmModel(vlmModel);
if (!vllmModelData) {
return Promise.reject(`Vlm model ${vlmModel} is invalid`);
}
return {
maxToken: vllmModelData.maxContext * 0.8,
model: vllmModelData.model,
weight: 0
};
}
return Promise.reject(`Training mode "${mode}" is invalid`);
})();
// Filter redundant params
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
prompt = undefined;
}
// filter repeat or equal content
const set = new Set();
@@ -158,7 +175,7 @@ export async function pushDataListToTrainingQueue({
datasetId,
collectionId,
billId,
mode: trainingMode,
mode: getImageChunkMode(item, mode),
prompt,
model,
q: item.q,
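getImageChunkMode only ever downgrades: items queued in image mode that contain no ![](...) markers have nothing for the vision model to do, so they fall back to chunk mode, while every other mode passes through untouched. For example (data literals illustrative):

const withImage = { q: 'See ![](64a1b2c3) for details', a: '' };
const textOnly = { q: 'Plain text chunk', a: '' };

getImageChunkMode(withImage, TrainingModeEnum.image); // image
getImageChunkMode(textOnly, TrainingModeEnum.image); // chunk: no image markers
getImageChunkMode(textOnly, TrainingModeEnum.chunk); // chunk: passed through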

View File

@@ -1,14 +1,15 @@
/* Model knowledge base */
import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema } = connectionMongo;
import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import { TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
export const DatasetTrainingCollectionName = 'dataset_trainings';
@@ -25,7 +26,6 @@ const TrainingDataSchema = new Schema({
},
datasetId: {
type: Schema.Types.ObjectId,
ref: DatasetCollectionName,
required: true
},
collectionId: {
@@ -33,15 +33,13 @@ const TrainingDataSchema = new Schema({
ref: DatasetColCollectionName,
required: true
},
billId: {
// concat bill
type: String
},
billId: String,
mode: {
type: String,
enum: Object.keys(TrainingTypeMap),
enum: Object.values(TrainingModeEnum),
required: true
},
expireAt: {
// It will be deleted after 7 days
type: Date,
@@ -88,6 +86,10 @@ const TrainingDataSchema = new Schema({
indexes: {
type: [
{
type: {
type: String,
enum: Object.values(DatasetDataIndexTypeEnum)
},
text: {
type: String,
required: true
@@ -98,6 +100,19 @@ const TrainingDataSchema = new Schema({
}
});
TrainingDataSchema.virtual('dataset', {
ref: DatasetCollectionName,
localField: 'datasetId',
foreignField: '_id',
justOne: true
});
TrainingDataSchema.virtual('collection', {
ref: DatasetColCollectionName,
localField: 'collectionId',
foreignField: '_id',
justOne: true
});
try {
// lock training data(teamId); delete training data
TrainingDataSchema.index({ teamId: 1, datasetId: 1 });

View File

@@ -1,6 +1,7 @@
import { NodeOutputKeyEnum } from '@fastgpt/global/core/workflow/constants';
import { DispatchNodeResponseKeyEnum } from '@fastgpt/global/core/workflow/runtime/constants';
import type {
ChatDispatchProps,
DispatchNodeResultType,
RuntimeNodeItemType
} from '@fastgpt/global/core/workflow/runtime/type';
@@ -46,7 +47,7 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
query,
requestOrigin,
chatConfig,
runningAppInfo: { teamId },
runningUserInfo,
externalProvider,
params: {
model,
@@ -99,10 +100,10 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
const globalFiles = chatValue2RuntimePrompt(query).files;
const { documentQuoteText, userFiles } = await getMultiInput({
runningUserInfo,
histories: chatHistories,
requestOrigin,
maxFiles: chatConfig?.fileSelectConfig?.maxFiles || 20,
teamId,
fileLinks,
inputFiles: globalFiles,
hasReadFilesTool
@@ -289,19 +290,19 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
};
const getMultiInput = async ({
runningUserInfo,
histories,
fileLinks,
requestOrigin,
maxFiles,
teamId,
inputFiles,
hasReadFilesTool
}: {
runningUserInfo: ChatDispatchProps['runningUserInfo'];
histories: ChatItemType[];
fileLinks?: string[];
requestOrigin?: string;
maxFiles: number;
teamId: string;
inputFiles: UserChatItemValueItemType['file'][];
hasReadFilesTool: boolean;
}) => {
@@ -329,7 +330,8 @@ const getMultiInput = async ({
urls,
requestOrigin,
maxFiles,
teamId
teamId: runningUserInfo.teamId,
tmbId: runningUserInfo.tmbId
});
return {

View File

@@ -11,7 +11,10 @@ import { formatModelChars2Points } from '../../../../support/wallet/usage/utils'
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
import { postTextCensor } from '../../../../common/api/requestPlusApi';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import type { DispatchNodeResultType } from '@fastgpt/global/core/workflow/runtime/type';
import type {
ChatDispatchProps,
DispatchNodeResultType
} from '@fastgpt/global/core/workflow/runtime/type';
import { countGptMessagesTokens } from '../../../../common/string/tiktoken/index';
import {
chats2GPTMessages,
@@ -69,7 +72,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
histories,
node: { name },
query,
runningAppInfo: { teamId },
runningUserInfo,
workflowStreamResponse,
chatConfig,
params: {
@@ -121,7 +124,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
stringQuoteText,
requestOrigin,
maxFiles: chatConfig?.fileSelectConfig?.maxFiles || 20,
teamId
runningUserInfo
})
]);
@@ -355,7 +358,7 @@ async function getMultiInput({
stringQuoteText,
requestOrigin,
maxFiles,
teamId
runningUserInfo
}: {
histories: ChatItemType[];
inputFiles: UserChatItemValueItemType['file'][];
@@ -363,7 +366,7 @@ async function getMultiInput({
stringQuoteText?: string; // file quote
requestOrigin?: string;
maxFiles: number;
teamId: string;
runningUserInfo: ChatDispatchProps['runningUserInfo'];
}) {
// Adaptation for legacy versions ====>
if (stringQuoteText) {
@@ -400,7 +403,8 @@ async function getMultiInput({
urls,
requestOrigin,
maxFiles,
teamId
teamId: runningUserInfo.teamId,
tmbId: runningUserInfo.tmbId
});
return {

View File

@@ -45,7 +45,7 @@ ${content.slice(0, 100)}${content.length > 100 ? '......' : ''}
export const dispatchReadFiles = async (props: Props): Promise<Response> => {
const {
requestOrigin,
runningAppInfo: { teamId },
runningUserInfo: { teamId, tmbId },
histories,
chatConfig,
node: { version },
@@ -61,7 +61,8 @@ export const dispatchReadFiles = async (props: Props): Promise<Response> => {
urls: [...fileUrlList, ...filesFromHistories],
requestOrigin,
maxFiles,
teamId
teamId,
tmbId
});
return {
@@ -105,12 +106,14 @@ export const getFileContentFromLinks = async ({
urls,
requestOrigin,
maxFiles,
teamId
teamId,
tmbId
}: {
urls: string[];
requestOrigin?: string;
maxFiles: number;
teamId: string;
tmbId: string;
}) => {
const parseUrlList = urls
// Remove invalid urls
@@ -205,6 +208,7 @@ export const getFileContentFromLinks = async ({
extension,
isQAImport: false,
teamId,
tmbId,
buffer,
encoding
});

View File

@@ -117,14 +117,16 @@ export const createTrainingUsage = async ({
billSource,
vectorModel,
agentModel,
vllmModel,
session
}: {
teamId: string;
tmbId: string;
appName: string;
billSource: UsageSourceEnum;
vectorModel: string;
agentModel: string;
vectorModel?: string;
agentModel?: string;
vllmModel?: string;
session?: ClientSession;
}) => {
const [{ _id }] = await MongoUsage.create(
@@ -136,27 +138,46 @@ export const createTrainingUsage = async ({
source: billSource,
totalPoints: 0,
list: [
{
moduleName: i18nT('common:support.wallet.moduleName.index'),
model: vectorModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
},
{
moduleName: i18nT('common:support.wallet.moduleName.qa'),
model: agentModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
},
{
moduleName: i18nT('common:core.dataset.training.Auto mode'),
model: agentModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
}
...(vectorModel
? [
{
moduleName: i18nT('account_usage:embedding_index'),
model: vectorModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
}
]
: []),
...(agentModel
? [
{
moduleName: i18nT('account_usage:qa'),
model: agentModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
},
{
moduleName: i18nT('account_usage:auto_index'),
model: agentModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
}
]
: []),
...(vllmModel
? [
{
moduleName: i18nT('account_usage:image_parse'),
model: vllmModel,
amount: 0,
inputTokens: 0,
outputTokens: 0
}
]
: [])
]
}
],
@@ -165,3 +186,31 @@ export const createTrainingUsage = async ({
return { billId: String(_id) };
};
export const createPdfParseUsage = async ({
teamId,
tmbId,
pages
}: {
teamId: string;
tmbId: string;
pages: number;
}) => {
const unitPrice = global.systemEnv?.customPdfParse?.price || 0;
const totalPoints = pages * unitPrice;
createUsage({
teamId,
tmbId,
appName: i18nT('account_usage:pdf_enhanced_parse'),
totalPoints,
source: UsageSourceEnum.pdfParse,
list: [
{
moduleName: i18nT('account_usage:pdf_enhanced_parse'),
amount: totalPoints,
pages
}
]
});
};
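Billing for the enhanced parser is linear in page count: totalPoints = pages * price, with price read from systemEnv.customPdfParse.price. A usage sketch with illustrative ids:

createPdfParseUsage({
  teamId: '65f0c1d2e3a4b5c6d7e8f9a0',
  tmbId: '65f0c1d2e3a4b5c6d7e8f9a1',
  pages: 200
});
// With customPdfParse.price = 0.1, this records 200 * 0.1 = 20 points.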

View File

@@ -9,7 +9,7 @@ import { readXlsxRawText } from './extension/xlsx';
import { readCsvRawText } from './extension/csv';
parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
const readRawContentByFileBuffer = async (params: ReadRawTextByBuffer) => {
const read = async (params: ReadRawTextByBuffer) => {
switch (params.extension) {
case 'txt':
case 'md':
@@ -41,7 +41,7 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
try {
parentPort?.postMessage({
type: 'success',
data: await readRawContentByFileBuffer(newProps)
data: await read(newProps)
});
} catch (error) {
console.log(error);