Add image index and pdf parse (#3956)

* feat: think tag parse
* feat: parse think tag test
* feat: pdf parse ux
* feat: doc2x parse
* perf: rewrite training mode setting
* feat: image parse queue
* perf: image index
* feat: image parse process
* feat: add init sh
* fix: ts
2025-10-18 01:16:01 +00:00 · 2025-03-03 23:08:29 +08:00
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions
--- a/packages/service/common/file/gridfs/controller.ts
+++ b/packages/service/common/file/gridfs/controller.ts
@@ -186,20 +186,25 @@ export async function getDownloadStream({

 export const readFileContentFromMongo = async ({
  teamId,
+  tmbId,
  bucketName,
  fileId,
-  isQAImport = false
+  isQAImport = false,
+  customPdfParse = false
 }: {
  teamId: string;
+  tmbId: string;
  bucketName: `${BucketNameEnum}`;
  fileId: string;
  isQAImport?: boolean;
+  customPdfParse?: boolean;
 }): Promise<{
  rawText: string;
  filename: string;
 }> => {
+  const bufferId = `${fileId}-${customPdfParse}`;
  // read buffer
-  const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: fileId }, undefined, {
+  const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: bufferId }, undefined, {
    ...readFromSecondary
  }).lean();
  if (fileBuffer) {
@@ -227,9 +232,11 @@ export const readFileContentFromMongo = async ({

  // Get raw text
  const { rawText } = await readRawContentByFileBuffer({
+    customPdfParse,
    extension,
    isQAImport,
    teamId,
+    tmbId,
    buffer: fileBuffers,
    encoding,
    metadata: {
@@ -240,7 +247,7 @@ export const readFileContentFromMongo = async ({
  // < 14M
  if (fileBuffers.length < 14 * 1024 * 1024 && rawText.trim()) {
    MongoRawTextBuffer.create({
-      sourceId: fileId,
+      sourceId: bufferId,
      rawText,
      metadata: {
        filename: file.filename