Feat: pptx and xlsx loader (#1118)

* perf: plan tip * perf: upload size controller * feat: add image ttl index * feat: new upload file ux * remove file * feat: support read pptx * feat: support xlsx * fix: rerank docker flie
2025-10-16 08:01:18 +00:00 · 2024-04-01 19:01:26 +08:00
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions
--- a/packages/service/common/buffer/rawText/schema.ts
+++ b/packages/service/common/buffer/rawText/schema.ts
@@ -0,0 +1,33 @@
+import { connectionMongo, type Model } from '../../mongo';
+const { Schema, model, models } = connectionMongo;
+import { RawTextBufferSchemaType } from './type';
+
+export const collectionName = 'buffer.rawText';
+
+const RawTextBufferSchema = new Schema({
+  sourceId: {
+    type: String,
+    required: true
+  },
+  rawText: {
+    type: String,
+    default: ''
+  },
+  createTime: {
+    type: Date,
+    default: () => new Date()
+  },
+  metadata: Object
+});
+
+try {
+  RawTextBufferSchema.index({ sourceId: 1 });
+  //  20 minutes
+  RawTextBufferSchema.index({ createTime: 1 }, { expireAfterSeconds: 20 * 60 });
+} catch (error) {
+  console.log(error);
+}
+
+export const MongoRwaTextBuffer: Model<RawTextBufferSchemaType> =
+  models[collectionName] || model(collectionName, RawTextBufferSchema);
+MongoRwaTextBuffer.syncIndexes();
--- a/packages/service/common/buffer/rawText/type.d.ts
+++ b/packages/service/common/buffer/rawText/type.d.ts
@@ -0,0 +1,8 @@
+export type RawTextBufferSchemaType = {
+  sourceId: string;
+  rawText: string;
+  createTime: Date;
+  metadata?: {
+    filename: string;
+  };
+};
--- a/packages/service/common/buffer/tts/schema.ts
+++ b/packages/service/common/buffer/tts/schema.ts
@@ -2,7 +2,7 @@ import { connectionMongo, type Model } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { TTSBufferSchemaType } from './type.d';

-export const collectionName = 'ttsbuffers';
+export const collectionName = 'buffer.tts';

 const TTSBufferSchema = new Schema({
  bufferId: {
--- a/packages/service/common/file/gridfs/controller.ts
+++ b/packages/service/common/file/gridfs/controller.ts
@@ -4,6 +4,18 @@ import fsp from 'fs/promises';
 import fs from 'fs';
 import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
 import { MongoFileSchema } from './schema';
+import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
+import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { readFileRawText } from '../read/rawText';
+import { ReadFileByBufferParams } from '../read/type';
+import { readMarkdown } from '../read/markdown';
+import { readHtmlRawText } from '../read/html';
+import { readPdfFile } from '../read/pdf';
+import { readWordFile } from '../read/word';
+import { readCsvRawText } from '../read/csv';
+import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
+import { readPptxRawText } from '../read/pptx';
+import { readXlsxRawText } from '../read/xlsx';

 export function getGFSCollection(bucket: `${BucketNameEnum}`) {
  MongoFileSchema;
@@ -111,3 +123,139 @@ export async function getDownloadStream({

  return bucket.openDownloadStream(new Types.ObjectId(fileId));
 }
+
+export const readFileEncode = async ({
+  bucketName,
+  fileId
+}: {
+  bucketName: `${BucketNameEnum}`;
+  fileId: string;
+}) => {
+  const encodeStream = await getDownloadStream({ bucketName, fileId });
+  let buffers: Buffer = Buffer.from([]);
+  for await (const chunk of encodeStream) {
+    buffers = Buffer.concat([buffers, chunk]);
+    if (buffers.length > 10) {
+      encodeStream.abort();
+      break;
+    }
+  }
+
+  const encoding = detectFileEncoding(buffers);
+
+  return encoding as BufferEncoding;
+};
+
+export const readFileContent = async ({
+  teamId,
+  bucketName,
+  fileId,
+  csvFormat = false
+}: {
+  teamId: string;
+  bucketName: `${BucketNameEnum}`;
+  fileId: string;
+  csvFormat?: boolean;
+}): Promise<{
+  rawText: string;
+  filename: string;
+}> => {
+  // read buffer
+  const fileBuffer = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean();
+  if (fileBuffer) {
+    return {
+      rawText: fileBuffer.rawText,
+      filename: fileBuffer.metadata?.filename || ''
+    };
+  }
+
+  const [file, encoding, fileStream] = await Promise.all([
+    getFileById({ bucketName, fileId }),
+    readFileEncode({ bucketName, fileId }),
+    getDownloadStream({ bucketName, fileId })
+  ]);
+
+  if (!file) {
+    return Promise.reject(CommonErrEnum.fileNotFound);
+  }
+
+  const extension = file?.filename?.split('.')?.pop()?.toLowerCase() || '';
+
+  const fileBuffers = await (() => {
+    return new Promise<Buffer>((resolve, reject) => {
+      let buffers = Buffer.from([]);
+      fileStream.on('data', (chunk) => {
+        buffers = Buffer.concat([buffers, chunk]);
+      });
+      fileStream.on('end', () => {
+        resolve(buffers);
+      });
+      fileStream.on('error', (err) => {
+        reject(err);
+      });
+    });
+  })();
+
+  const params: ReadFileByBufferParams = {
+    teamId,
+    buffer: fileBuffers,
+    encoding,
+    metadata: {
+      relatedId: fileId
+    }
+  };
+
+  const { rawText } = await (async () => {
+    switch (extension) {
+      case 'txt':
+        return readFileRawText(params);
+      case 'md':
+        return readMarkdown(params);
+      case 'html':
+        return readHtmlRawText(params);
+      case 'pdf':
+        return readPdfFile(params);
+      case 'docx':
+        return readWordFile(params);
+      case 'pptx':
+        return readPptxRawText(params);
+      case 'xlsx':
+        const xlsxResult = await readXlsxRawText(params);
+        if (csvFormat) {
+          return {
+            rawText: xlsxResult.formatText || ''
+          };
+        }
+        return {
+          rawText: xlsxResult.rawText
+        };
+      case 'csv':
+        const csvResult = await readCsvRawText(params);
+        if (csvFormat) {
+          return {
+            rawText: csvResult.formatText || ''
+          };
+        }
+        return {
+          rawText: csvResult.rawText
+        };
+      default:
+        return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
+    }
+  })();
+
+  if (rawText.trim()) {
+    await MongoRwaTextBuffer.create({
+      sourceId: fileId,
+      rawText,
+      metadata: {
+        filename: file.filename
+      }
+    });
+  }
+
+  return {
+    rawText,
+    filename: file.filename
+  };
+};
--- a/packages/service/common/file/image/controller.ts
+++ b/packages/service/common/file/image/controller.ts
@@ -14,7 +14,6 @@ export async function uploadMongoImg({
  teamId,
  expiredTime,
  metadata,
-
  shareId
 }: UploadImgProps & {
  teamId: string;
@@ -30,9 +29,8 @@ export async function uploadMongoImg({
    type,
    teamId,
    binary,
-    expiredTime: expiredTime,
+    expiredTime,
    metadata,
-
    shareId
  });

--- a/packages/service/common/file/image/schema.ts
+++ b/packages/service/common/file/image/schema.ts
@@ -25,13 +25,13 @@ const ImageSchema = new Schema({
    enum: Object.keys(mongoImageTypeMap),
    required: true
  },
-
  metadata: {
    type: Object
  }
 });

 try {
+  // tts expired
  ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
  ImageSchema.index({ type: 1 });
  ImageSchema.index({ createTime: 1 });
--- a/packages/service/common/file/read/csv.ts
+++ b/packages/service/common/file/read/csv.ts
@@ -0,0 +1,21 @@
+import Papa from 'papaparse';
+import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
+import { readFileRawText } from './rawText';
+
+// 加载源文件内容
+export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
+  const { rawText } = readFileRawText(params);
+
+  const csvArr = Papa.parse(rawText).data as string[][];
+
+  const header = csvArr[0];
+
+  const formatText = header
+    ? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
+    : '';
+
+  return {
+    rawText,
+    formatText
+  };
+};
--- a/packages/service/common/file/read/html.ts
+++ b/packages/service/common/file/read/html.ts
@@ -0,0 +1,23 @@
+import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
+import { initMarkdownText } from './utils';
+import { htmlToMarkdown } from '../../string/markdown';
+import { readFileRawText } from './rawText';
+
+export const readHtmlRawText = async (
+  params: ReadFileByBufferParams
+): Promise<ReadFileResponse> => {
+  const { teamId, metadata } = params;
+  const { rawText: html } = readFileRawText(params);
+
+  const md = await htmlToMarkdown(html);
+
+  const rawText = await initMarkdownText({
+    teamId,
+    md,
+    metadata
+  });
+
+  return {
+    rawText
+  };
+};
--- a/packages/service/common/file/read/markdown.ts
+++ b/packages/service/common/file/read/markdown.ts
@@ -0,0 +1,18 @@
+import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
+import { initMarkdownText } from './utils';
+import { readFileRawText } from './rawText';
+
+export const readMarkdown = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
+  const { teamId, metadata } = params;
+  const { rawText: md } = readFileRawText(params);
+
+  const rawText = await initMarkdownText({
+    teamId,
+    md,
+    metadata
+  });
+
+  return {
+    rawText
+  };
+};
--- a/packages/service/common/file/read/parseOffice.ts
+++ b/packages/service/common/file/read/parseOffice.ts
@@ -0,0 +1,119 @@
+import { getNanoid } from '@fastgpt/global/common/string/tools';
+import fs from 'fs';
+import decompress from 'decompress';
+import { DOMParser } from '@xmldom/xmldom';
+import { clearDirFiles } from '../utils';
+import { addLog } from '../../system/log';
+
+const DEFAULTDECOMPRESSSUBLOCATION = '/tmp';
+
+function getNewFileName(ext: string) {
+  return `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}.${ext}`;
+}
+
+const parseString = (xml: string) => {
+  let parser = new DOMParser();
+  return parser.parseFromString(xml, 'text/xml');
+};
+
+const parsePowerPoint = async ({
+  filepath,
+  decompressPath,
+  encoding
+}: {
+  filepath: string;
+  decompressPath: string;
+  encoding: BufferEncoding;
+}) => {
+  // Files regex that hold our content of interest
+  const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+.xml/g;
+  const slidesRegex = /ppt\/slides\/slide\d+.xml/g;
+
+  /** The decompress location which contains the filename in it */
+
+  const files = await decompress(filepath, decompressPath, {
+    filter: (x) => !!x.path.match(allFilesRegex)
+  });
+
+  // Verify if atleast the slides xml files exist in the extracted files list.
+  if (
+    files.length == 0 ||
+    !files.map((file) => file.path).some((filename) => filename.match(slidesRegex))
+  ) {
+    return Promise.reject('解析 PPT 失败');
+  }
+
+  // Returning an array of all the xml contents read using fs.readFileSync
+  const xmlContentArray = files.map((file) =>
+    fs.readFileSync(`${decompressPath}/${file.path}`, encoding)
+  );
+
+  let responseArr: string[] = [];
+
+  xmlContentArray.forEach((xmlContent) => {
+    /** Find text nodes with a:p tags */
+    const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName('a:p');
+
+    /** Store all the text content to respond */
+    responseArr.push(
+      Array.from(xmlParagraphNodesList)
+        // Filter paragraph nodes than do not have any text nodes which are identifiable by a:t tag
+        .filter((paragraphNode) => paragraphNode.getElementsByTagName('a:t').length != 0)
+        .map((paragraphNode) => {
+          /** Find text nodes with a:t tags */
+          const xmlTextNodeList = paragraphNode.getElementsByTagName('a:t');
+          return Array.from(xmlTextNodeList)
+            .filter((textNode) => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)
+            .map((textNode) => textNode.childNodes[0].nodeValue)
+            .join('');
+        })
+        .join('\n')
+    );
+  });
+
+  return responseArr.join('\n');
+};
+
+export const parseOffice = async ({
+  buffer,
+  encoding,
+  extension
+}: {
+  buffer: Buffer;
+  encoding: BufferEncoding;
+  extension: string;
+}) => {
+  // Prepare file for processing
+  // create temp file subdirectory if it does not exist
+  if (!fs.existsSync(DEFAULTDECOMPRESSSUBLOCATION)) {
+    fs.mkdirSync(DEFAULTDECOMPRESSSUBLOCATION, { recursive: true });
+  }
+
+  // temp file name
+  const filepath = getNewFileName(extension);
+  const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}`;
+  //   const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/test`;
+
+  // write new file
+  fs.writeFileSync(filepath, buffer, {
+    encoding
+  });
+
+  const text = await (async () => {
+    try {
+      switch (extension) {
+        case 'pptx':
+          return parsePowerPoint({ filepath, decompressPath, encoding });
+        default:
+          return Promise.reject('只能读取 .pptx 文件');
+      }
+    } catch (error) {
+      addLog.error(`Load ppt error`, { error });
+    }
+    return '';
+  })();
+
+  fs.unlinkSync(filepath);
+  clearDirFiles(decompressPath);
+  return text;
+};
--- a/packages/service/common/file/read/pdf.ts
+++ b/packages/service/common/file/read/pdf.ts
@@ -0,0 +1,71 @@
+import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
+// @ts-ignore
+import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
+import { ReadFileByBufferParams, ReadFileResponse } from './type';
+
+type TokenType = {
+  str: string;
+  dir: string;
+  width: number;
+  height: number;
+  transform: number[];
+  fontName: string;
+  hasEOL: boolean;
+};
+
+export const readPdfFile = async ({
+  buffer
+}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
+  const readPDFPage = async (doc: any, pageNo: number) => {
+    const page = await doc.getPage(pageNo);
+    const tokenizedText = await page.getTextContent();
+
+    const viewport = page.getViewport({ scale: 1 });
+    const pageHeight = viewport.height;
+    const headerThreshold = pageHeight * 0.95;
+    const footerThreshold = pageHeight * 0.05;
+
+    const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
+      return (
+        !token.transform ||
+        (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
+      );
+    });
+
+    // concat empty string 'hasEOL'
+    for (let i = 0; i < pageTexts.length; i++) {
+      const item = pageTexts[i];
+      if (item.str === '' && pageTexts[i - 1]) {
+        pageTexts[i - 1].hasEOL = item.hasEOL;
+        pageTexts.splice(i, 1);
+        i--;
+      }
+    }
+
+    page.cleanup();
+
+    return pageTexts
+      .map((token) => {
+        const paragraphEnd = token.hasEOL && /([。？！.?!\n\r]|(\r\n))$/.test(token.str);
+
+        return paragraphEnd ? `${token.str}\n` : token.str;
+      })
+      .join('');
+  };
+
+  const loadingTask = pdfjs.getDocument(buffer.buffer);
+  const doc = await loadingTask.promise;
+
+  const pageTextPromises = [];
+  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
+    pageTextPromises.push(readPDFPage(doc, pageNo));
+  }
+  const pageTexts = await Promise.all(pageTextPromises);
+
+  loadingTask.destroy();
+
+  return {
+    rawText: pageTexts.join(''),
+    metadata: {}
+  };
+};
--- a/packages/service/common/file/read/pptx.ts
+++ b/packages/service/common/file/read/pptx.ts
@@ -0,0 +1,14 @@
+import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
+// import { parseOfficeAsync } from 'officeparser';
+import { parseOffice } from './parseOffice';
+
+export const readPptxRawText = async ({
+  buffer,
+  encoding
+}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
+  const result = await parseOffice({ buffer, encoding, extension: 'pptx' });
+
+  return {
+    rawText: result
+  };
+};
--- a/packages/service/common/file/read/rawText.ts
+++ b/packages/service/common/file/read/rawText.ts
@@ -0,0 +1,10 @@
+import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
+
+// 加载源文件内容
+export const readFileRawText = ({ buffer, encoding }: ReadFileByBufferParams): ReadFileResponse => {
+  const content = buffer.toString(encoding);
+
+  return {
+    rawText: content
+  };
+};
--- a/packages/service/common/file/read/type.d.ts
+++ b/packages/service/common/file/read/type.d.ts
@@ -0,0 +1,12 @@
+export type ReadFileByBufferParams = {
+  teamId: string;
+  buffer: Buffer;
+  encoding: BufferEncoding;
+  metadata?: Record<string, any>;
+};
+
+export type ReadFileResponse = {
+  rawText: string;
+  formatText?: string;
+  metadata?: Record<string, any>;
+};
--- a/packages/service/common/file/read/utils.ts
+++ b/packages/service/common/file/read/utils.ts
@@ -0,0 +1,25 @@
+import { markdownProcess } from '@fastgpt/global/common/string/markdown';
+import { uploadMongoImg } from '../image/controller';
+import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
+import { addHours } from 'date-fns';
+
+export const initMarkdownText = ({
+  teamId,
+  md,
+  metadata
+}: {
+  md: string;
+  teamId: string;
+  metadata?: Record<string, any>;
+}) =>
+  markdownProcess({
+    rawText: md,
+    uploadImgController: (base64Img) =>
+      uploadMongoImg({
+        type: MongoImageTypeEnum.collectionImage,
+        base64Img,
+        teamId,
+        metadata,
+        expiredTime: addHours(new Date(), 2)
+      })
+  });
--- a/packages/service/common/file/read/word.ts
+++ b/packages/service/common/file/read/word.ts
@@ -0,0 +1,35 @@
+import mammoth from 'mammoth';
+import { htmlToMarkdown } from '../../string/markdown';
+import { ReadFileByBufferParams, ReadFileResponse } from './type';
+import { initMarkdownText } from './utils';
+
+/**
+ * read docx to markdown
+ */
+export const readWordFile = async ({
+  teamId,
+  buffer,
+  metadata = {}
+}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
+  try {
+    const { value: html } = await mammoth.convertToHtml({
+      buffer
+    });
+
+    const md = await htmlToMarkdown(html);
+
+    const rawText = await initMarkdownText({
+      teamId,
+      md,
+      metadata
+    });
+
+    return {
+      rawText,
+      metadata: {}
+    };
+  } catch (error) {
+    console.log('error doc read:', error);
+    return Promise.reject('Can not read doc file, please convert to PDF');
+  }
+};
--- a/packages/service/common/file/read/xlsx.ts
+++ b/packages/service/common/file/read/xlsx.ts
@@ -0,0 +1,45 @@
+import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
+import xlsx from 'node-xlsx';
+import Papa from 'papaparse';
+
+export const readXlsxRawText = async ({
+  buffer
+}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
+  const result = xlsx.parse(buffer, {
+    skipHidden: false,
+    defval: ''
+  });
+
+  const format2Csv = result.map(({ name, data }) => {
+    return {
+      title: `#${name}`,
+      csvText: data.map((item) => item.join(',')).join('\n')
+    };
+  });
+
+  const rawText = format2Csv.map((item) => item.csvText).join('\n');
+  const formatText = format2Csv
+    .map((item) => {
+      const csvArr = Papa.parse(item.csvText).data as string[][];
+      const header = csvArr[0];
+
+      const formatText = header
+        ? csvArr
+            .map((item) =>
+              item
+                .map((item, i) => (item ? `${header[i]}:${item}` : ''))
+                .filter(Boolean)
+                .join('\n')
+            )
+            .join('\n')
+        : '';
+
+      return `${item.title}\n${formatText}`;
+    })
+    .join('\n');
+
+  return {
+    rawText: rawText,
+    formatText
+  };
+};
--- a/packages/service/common/file/utils.ts
+++ b/packages/service/common/file/utils.ts
@@ -35,13 +35,8 @@ export const clearDirFiles = (dirPath: string) => {
    return;
  }

-  fs.readdirSync(dirPath).forEach((file) => {
-    const curPath = `${dirPath}/${file}`;
-    if (fs.lstatSync(curPath).isDirectory()) {
-      clearDirFiles(curPath);
-    } else {
-      fs.unlinkSync(curPath);
-    }
+  fs.rmdirSync(dirPath, {
+    recursive: true
  });
 };

--- a/packages/service/core/dataset/collection/controller.ts
+++ b/packages/service/core/dataset/collection/controller.ts
@@ -9,7 +9,6 @@ import {
  DatasetCollectionSchemaType
 } from '@fastgpt/global/core/dataset/type';
 import { MongoDatasetTraining } from '../training/schema';
-import { delay } from '@fastgpt/global/common/system/utils';
 import { MongoDatasetData } from '../data/schema';
 import { delImgByRelatedId } from '../../../common/file/image/controller';
 import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';
--- a/packages/service/core/dataset/training/constants.ts
+++ b/packages/service/core/dataset/training/constants.ts
@@ -0,0 +1,6 @@
+export enum ImportDataSourceEnum {
+  fileLocal = 'fileLocal',
+  fileLink = 'fileLink',
+  fileCustom = 'fileCustom',
+  tableLocal = 'tableLocal'
+}
--- a/packages/service/core/dataset/training/controller.ts
+++ b/packages/service/core/dataset/training/controller.ts
@@ -1,14 +1,16 @@
-import { delay } from '@fastgpt/global/common/system/utils';
 import { MongoDatasetTraining } from './schema';
 import type {
  PushDatasetDataChunkProps,
  PushDatasetDataProps,
  PushDatasetDataResponse
 } from '@fastgpt/global/core/dataset/api.d';
-import { getCollectionWithDataset } from '../controller';
 import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { simpleText } from '@fastgpt/global/common/string/tools';
 import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
+import { ClientSession } from '../../../common/mongo';
+import { getLLMModel, getVectorModel } from '../../ai/model';
+import { addLog } from '../../../common/system/log';
+import { getCollectionWithDataset } from '../controller';

 export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
  try {
@@ -23,31 +25,52 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
  } catch (error) {}
 };

-export async function pushDataListToTrainingQueue({
-  teamId,
-  tmbId,
+export const pushDataListToTrainingQueueByCollectionId = async ({
  collectionId,
-  data,
-  prompt,
-  billId,
-  trainingMode = TrainingModeEnum.chunk
+  ...props
 }: {
  teamId: string;
  tmbId: string;
-} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
-  const vectorModelList = global.vectorModels;
-  const datasetModelList = global.llmModels;
-
+  session?: ClientSession;
+} & PushDatasetDataProps) => {
  const {
-    datasetId: { _id: datasetId, vectorModel, agentModel }
+    datasetId: { _id: datasetId, agentModel, vectorModel }
  } = await getCollectionWithDataset(collectionId);
+  return pushDataListToTrainingQueue({
+    ...props,
+    datasetId,
+    collectionId,
+    agentModel,
+    vectorModel
+  });
+};

+export async function pushDataListToTrainingQueue({
+  teamId,
+  tmbId,
+  datasetId,
+  collectionId,
+  agentModel,
+  vectorModel,
+  data,
+  prompt,
+  billId,
+  trainingMode = TrainingModeEnum.chunk,
+  session
+}: {
+  teamId: string;
+  tmbId: string;
+  datasetId: string;
+  agentModel: string;
+  vectorModel: string;
+  session?: ClientSession;
+} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
  const checkModelValid = async () => {
-    const agentModelData = datasetModelList?.find((item) => item.model === agentModel);
+    const agentModelData = getLLMModel(agentModel);
    if (!agentModelData) {
      return Promise.reject(`File model ${agentModel} is inValid`);
    }
-    const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel);
+    const vectorModelData = getVectorModel(vectorModel);
    if (!vectorModelData) {
      return Promise.reject(`Vector model ${vectorModel} is inValid`);
    }
@@ -124,52 +147,43 @@ export async function pushDataListToTrainingQueue({
  });

  // insert data to db
-  const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
-    try {
-      const results = await MongoDatasetTraining.insertMany(
-        dataList.map((item, i) => ({
-          teamId,
-          tmbId,
-          datasetId,
-          collectionId,
-          billId,
-          mode: trainingMode,
-          prompt,
-          model,
-          q: item.q,
-          a: item.a,
-          chunkIndex: item.chunkIndex ?? 0,
-          weight: weight ?? 0,
-          indexes: item.indexes
-        }))
-      );
-      await delay(500);
-      return results.length;
-    } catch (error) {
-      if (retry > 0) {
-        await delay(500);
-        return insertData(dataList, retry - 1);
-      }
-      return Promise.reject(error);
-    }
-  };
+  const insertLen = filterResult.success.length;
+  const failedDocuments: PushDatasetDataChunkProps[] = [];

-  let insertLen = 0;
-  const chunkSize = 50;
-  const chunkList = filterResult.success.reduce(
-    (acc, cur) => {
-      const lastChunk = acc[acc.length - 1];
-      if (lastChunk.length < chunkSize) {
-        lastChunk.push(cur);
-      } else {
-        acc.push([cur]);
+  // 使用 insertMany 批量插入
+  try {
+    await MongoDatasetTraining.insertMany(
+      filterResult.success.map((item) => ({
+        teamId,
+        tmbId,
+        datasetId,
+        collectionId,
+        billId,
+        mode: trainingMode,
+        prompt,
+        model,
+        q: item.q,
+        a: item.a,
+        chunkIndex: item.chunkIndex ?? 0,
+        weight: weight ?? 0,
+        indexes: item.indexes
+      })),
+      {
+        session
      }
-      return acc;
-    },
-    [[]] as PushDatasetDataChunkProps[][]
-  );
-  for await (const chunks of chunkList) {
-    insertLen += await insertData(chunks);
+    );
+  } catch (error: any) {
+    addLog.error(`Insert error`, error);
+    // 如果有错误，将失败的文档添加到失败列表中
+    error.writeErrors.forEach((writeError: any) => {
+      failedDocuments.push(data[writeError.index]);
+    });
+    console.log('failed', failedDocuments);
+  }
+
+  // 对于失败的文档，尝试单独插入
+  for await (const item of failedDocuments) {
+    await MongoDatasetTraining.create(item);
  }

  delete filterResult.success;
--- a/packages/service/core/dataset/training/utils.ts
+++ b/packages/service/core/dataset/training/utils.ts
@@ -2,6 +2,7 @@ import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
 import { addLog } from '../../../common/system/log';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { MongoDatasetTraining } from './schema';
+import Papa from 'papaparse';

 export const checkInvalidChunkAndLock = async ({
  err,
@@ -39,3 +40,18 @@ export const checkInvalidChunkAndLock = async ({
  }
  return false;
 };
+
+export const parseCsvTable2Chunks = (rawText: string) => {
+  const csvArr = Papa.parse(rawText).data as string[][];
+
+  const chunks = csvArr
+    .map((item) => ({
+      q: item[0] || '',
+      a: item[1] || ''
+    }))
+    .filter((item) => item.q || item.a);
+
+  return {
+    chunks
+  };
+};
--- a/packages/service/package.json
+++ b/packages/service/package.json
@@ -4,27 +4,36 @@
  "dependencies": {
    "@fastgpt/global": "workspace:*",
    "@node-rs/jieba": "1.10.0",
+    "@xmldom/xmldom": "^0.8.10",
    "axios": "^1.5.1",
    "cheerio": "1.0.0-rc.12",
    "cookie": "^0.5.0",
    "date-fns": "2.30.0",
    "dayjs": "^1.11.7",
+    "decompress": "^4.2.1",
    "encoding": "^0.1.13",
+    "file-type": "^19.0.0",
    "json5": "^2.2.3",
    "jsonwebtoken": "^9.0.2",
+    "mammoth": "^1.6.0",
    "mongoose": "^7.0.2",
    "multer": "1.4.5-lts.1",
    "next": "13.5.2",
    "nextjs-cors": "^2.1.2",
    "node-cron": "^3.0.3",
+    "node-xlsx": "^0.23.0",
+    "papaparse": "5.4.1",
+    "pdfjs-dist": "4.0.269",
    "pg": "^8.10.0",
    "tunnel": "^0.0.6"
  },
  "devDependencies": {
    "@types/cookie": "^0.5.2",
+    "@types/decompress": "^4.2.7",
    "@types/jsonwebtoken": "^9.0.3",
    "@types/multer": "^1.4.10",
    "@types/node-cron": "^3.0.11",
+    "@types/papaparse": "5.3.7",
    "@types/pg": "^8.6.6",
    "@types/tunnel": "^0.0.4"
  }
--- a/packages/service/support/permission/auth/file.ts
+++ b/packages/service/support/permission/auth/file.ts
@@ -0,0 +1,42 @@
+import { AuthResponseType } from '@fastgpt/global/support/permission/type';
+import { AuthModeType } from '../type';
+import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
+import { parseHeaderCert } from '../controller';
+import { getFileById } from '../../../common/file/gridfs/controller';
+import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
+import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+
+export async function authFile({
+  fileId,
+  per = 'owner',
+  ...props
+}: AuthModeType & {
+  fileId: string;
+}): Promise<
+  AuthResponseType & {
+    file: DatasetFileSchema;
+  }
+> {
+  const authRes = await parseHeaderCert(props);
+  const { teamId, tmbId } = authRes;
+
+  const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId });
+
+  if (!file) {
+    return Promise.reject(CommonErrEnum.fileNotFound);
+  }
+
+  if (file.metadata?.teamId !== teamId) {
+    return Promise.reject(CommonErrEnum.unAuthFile);
+  }
+  if (per === 'owner' && file.metadata?.tmbId !== tmbId) {
+    return Promise.reject(CommonErrEnum.unAuthFile);
+  }
+
+  return {
+    ...authRes,
+    isOwner: per === 'owner',
+    canWrite: per === 'owner',
+    file
+  };
+}