diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts index 8c9d5b872..fbfc59c97 100644 --- a/packages/global/core/dataset/type.d.ts +++ b/packages/global/core/dataset/type.d.ts @@ -51,6 +51,7 @@ export type DatasetCollectionSchemaType = { chunkSize: number; chunkSplitter?: string; qaPrompt?: string; + ocrParse?: boolean; tags?: string[]; diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index afd85b0ea..bdcd5f2ce 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -2,11 +2,14 @@ import { markdownProcess } from '@fastgpt/global/common/string/markdown'; import { uploadMongoImg } from '../image/controller'; import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants'; import { addHours } from 'date-fns'; +import FormData from 'form-data'; import { WorkerNameEnum, runWorker } from '../../../worker/utils'; import fs from 'fs'; import { detectFileEncoding } from '@fastgpt/global/common/file/tools'; import type { ReadFileResponse } from '../../../worker/readFile/type'; +import axios from 'axios'; +import { addLog } from '../../system/log'; export type readRawTextByLocalFileParams = { teamId: string; @@ -51,15 +54,7 @@ export const readRawContentByFileBuffer = async ({ metadata?: Record; }) => { // Upload image in markdown - const matchMdImgTextAndUpload = ({ - teamId, - md, - metadata - }: { - md: string; - teamId: string; - metadata?: Record; - }) => + const matchMdImgTextAndUpload = ({ teamId, md }: { md: string; teamId: string }) => markdownProcess({ rawText: md, uploadImgController: (base64Img) => @@ -72,18 +67,61 @@ export const readRawContentByFileBuffer = async ({ }) }); - let { rawText, formatText } = await runWorker(WorkerNameEnum.readFile, { - extension, - encoding, - buffer - }); + /* If a custom read-file service is configured (CUSTOM_READ_FILE_URL) and CUSTOM_READ_FILE_EXTENSION covers this extension, parse the file with it; otherwise fall back to the readFile worker */ + const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL; + const customReadFileExtension = 
process.env.CUSTOM_READ_FILE_EXTENSION || ''; + const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false'; + const readFileFromCustomService = async (): Promise => { + if ( + !customReadfileUrl || + !customReadFileExtension || + !customReadFileExtension.includes(extension) + ) + return; + + addLog.info('Use custom read file service'); + + const data = new FormData(); + data.append('file', buffer, { + filename: `file.${extension}` + }); + data.append('extension', extension); + data.append('ocr', ocrParse); + const { data: response } = await axios.post<{ + success: boolean; + message: string; + data: { + page: number; + markdown: string; + }; + }>(customReadfileUrl, data, { + timeout: 600000, + headers: { + ...data.getHeaders() + } + }); + + const rawText = response.data.markdown; + + return { + rawText, + formatText: rawText + }; + }; + + let { rawText, formatText } = + (await readFileFromCustomService()) || + (await runWorker(WorkerNameEnum.readFile, { + extension, + encoding, + buffer + })); // markdown data format - if (['md', 'html', 'docx'].includes(extension)) { + if (['md', 'html', 'docx', ...customReadFileExtension.split(',')].includes(extension)) { rawText = await matchMdImgTextAndUpload({ teamId: teamId, - md: rawText, - metadata: metadata + md: rawText }); } diff --git a/packages/service/core/dataset/collection/schema.ts b/packages/service/core/dataset/collection/schema.ts index beb158180..9b6246c2b 100644 --- a/packages/service/core/dataset/collection/schema.ts +++ b/packages/service/core/dataset/collection/schema.ts @@ -68,6 +68,7 @@ const DatasetCollectionSchema = new Schema({ qaPrompt: { type: String }, + ocrParse: Boolean, tags: { type: [String], diff --git a/projects/app/src/web/common/file/api.ts b/projects/app/src/web/common/file/api.ts index e7607c72f..32a899699 100644 --- a/projects/app/src/web/common/file/api.ts +++ b/projects/app/src/web/common/file/api.ts @@ -24,4 +24,6 @@ export const getPreviewFileContent = (data: PreviewContextProps) => 
POST<{ previewContent: string; totalLength: number; - }>('/common/file/previewContent', data); + }>('/common/file/previewContent', data, { + timeout: 600000 + }); diff --git a/projects/app/src/web/core/dataset/api.ts b/projects/app/src/web/core/dataset/api.ts index 704081823..c881601c2 100644 --- a/projects/app/src/web/core/dataset/api.ts +++ b/projects/app/src/web/core/dataset/api.ts @@ -98,7 +98,7 @@ export const postDatasetCollection = (data: CreateDatasetCollectionParams) => POST(`/core/dataset/collection/create`, data); export const postCreateDatasetFileCollection = (data: FileIdCreateDatasetCollectionParams) => POST<{ collectionId: string }>(`/core/dataset/collection/create/fileId`, data, { - timeout: 120000 + timeout: 360000 }); export const postCreateDatasetLinkCollection = (data: LinkCreateDatasetCollectionParams) => POST<{ collectionId: string }>(`/core/dataset/collection/create/link`, data); @@ -106,13 +106,13 @@ export const postCreateDatasetTextCollection = (data: TextCreateDatasetCollectio POST<{ collectionId: string }>(`/core/dataset/collection/create/text`, data); export const postCreateDatasetCsvTableCollection = (data: CsvTableCreateDatasetCollectionParams) => POST<{ collectionId: string }>(`/core/dataset/collection/create/csvTable`, data, { - timeout: 120000 + timeout: 360000 }); export const postCreateDatasetExternalFileCollection = ( data: ExternalFileCreateDatasetCollectionParams ) => POST<{ collectionId: string }>(`/proApi/core/dataset/collection/create/externalFileUrl`, data, { - timeout: 120000 + timeout: 360000 }); export const putDatasetCollectionById = (data: UpdateDatasetCollectionParams) =>