feat: custom read file service (#2548)

This commit is contained in:
Archer
2024-08-28 11:35:06 +08:00
committed by GitHub
parent bebf565c06
commit 52cbfeace3
5 changed files with 63 additions and 21 deletions

View File

@@ -51,6 +51,7 @@ export type DatasetCollectionSchemaType = {
chunkSize: number;
chunkSplitter?: string;
qaPrompt?: string;
ocrParse?: boolean;
tags?: string[];

View File

@@ -2,11 +2,14 @@ import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { uploadMongoImg } from '../image/controller';
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
import { addHours } from 'date-fns';
import FormData from 'form-data';
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import fs from 'fs';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import type { ReadFileResponse } from '../../../worker/readFile/type';
import axios from 'axios';
import { addLog } from '../../system/log';
export type readRawTextByLocalFileParams = {
teamId: string;
@@ -51,15 +54,7 @@ export const readRawContentByFileBuffer = async ({
metadata?: Record<string, any>;
}) => {
// Upload image in markdown
const matchMdImgTextAndUpload = ({
teamId,
md,
metadata
}: {
md: string;
teamId: string;
metadata?: Record<string, any>;
}) =>
const matchMdImgTextAndUpload = ({ teamId, md }: { md: string; teamId: string }) =>
markdownProcess({
rawText: md,
uploadImgController: (base64Img) =>
@@ -72,18 +67,61 @@ export const readRawContentByFileBuffer = async ({
})
});
let { rawText, formatText } = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
extension,
encoding,
buffer
});
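/* If a custom read file service is configured and it handles this file extension, parse the file with it; otherwise fall back to the built-in readFile worker */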
// URL of the external parsing service; the custom service is skipped when this is unset.
const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
// Comma-separated list of file extensions that should be sent to the custom service.
const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
// Forwarded verbatim as the `ocr` form field; defaults to the string 'false'.
const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';

/**
 * Parse the current file buffer with the external read file service.
 *
 * @returns The markdown returned by the service as both `rawText` and `formatText`,
 *   or `undefined` when the service is not configured or does not handle this
 *   extension (so the caller can fall back to another parser).
 * @throws Error when the service answers with `success: false` or without a
 *   markdown string; axios errors (network failure, timeout, non-2xx) propagate as-is.
 */
const readFileFromCustomService = async (): Promise<ReadFileResponse | undefined> => {
  if (!customReadfileUrl || !customReadFileExtension) return;

  // Match whole list entries. A substring test on the raw env string would let
  // "doc" match a list that only contains "docx", and an empty extension match anything.
  const supportedExtensions = customReadFileExtension
    .split(',')
    .map((item) => item.trim())
    .filter(Boolean);
  if (!extension || !supportedExtensions.includes(extension)) return;

  addLog.info('Use custom read file service');

  // multipart/form-data body: the file itself plus parsing options.
  const data = new FormData();
  data.append('file', buffer, {
    filename: `file.${extension}`
  });
  data.append('extension', extension);
  data.append('ocr', ocrParse);

  const { data: response } = await axios.post<{
    success: boolean;
    message: string;
    data: {
      page: number;
      markdown: string;
    };
  }>(customReadfileUrl, data, {
    // 10 minutes.
    timeout: 600000,
    headers: {
      // Content-Type with the multipart boundary generated by form-data.
      ...data.getHeaders()
    }
  });

  // Fail with a readable error instead of a TypeError or an undefined rawText downstream.
  if (response?.success === false || typeof response?.data?.markdown !== 'string') {
    throw new Error(response?.message || 'Custom read file service returned an invalid response');
  }

  const rawText = response.data.markdown;

  return {
    rawText,
    formatText: rawText
  };
};
// Prefer the result of the custom read file service; when it returns undefined
// (a falsy value), parse the buffer with the readFile worker instead.
let { rawText, formatText } =
(await readFileFromCustomService()) ||
(await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
extension,
encoding,
buffer
}));
// markdown data format
if (['md', 'html', 'docx'].includes(extension)) {
if (['md', 'html', 'docx', ...customReadFileExtension.split(',')].includes(extension)) {
rawText = await matchMdImgTextAndUpload({
teamId: teamId,
md: rawText,
metadata: metadata
md: rawText
});
}

View File

@@ -68,6 +68,7 @@ const DatasetCollectionSchema = new Schema({
qaPrompt: {
type: String
},
ocrParse: Boolean,
tags: {
type: [String],

View File

@@ -24,4 +24,6 @@ export const getPreviewFileContent = (data: PreviewContextProps) =>
POST<{
previewContent: string;
totalLength: number;
}>('/common/file/previewContent', data);
}>('/common/file/previewContent', data, {
timeout: 600000
});

View File

@@ -98,7 +98,7 @@ export const postDatasetCollection = (data: CreateDatasetCollectionParams) =>
POST<string>(`/core/dataset/collection/create`, data);
export const postCreateDatasetFileCollection = (data: FileIdCreateDatasetCollectionParams) =>
POST<{ collectionId: string }>(`/core/dataset/collection/create/fileId`, data, {
timeout: 120000
timeout: 360000
});
export const postCreateDatasetLinkCollection = (data: LinkCreateDatasetCollectionParams) =>
POST<{ collectionId: string }>(`/core/dataset/collection/create/link`, data);
@@ -106,13 +106,13 @@ export const postCreateDatasetTextCollection = (data: TextCreateDatasetCollectio
POST<{ collectionId: string }>(`/core/dataset/collection/create/text`, data);
export const postCreateDatasetCsvTableCollection = (data: CsvTableCreateDatasetCollectionParams) =>
POST<{ collectionId: string }>(`/core/dataset/collection/create/csvTable`, data, {
timeout: 120000
timeout: 360000
});
export const postCreateDatasetExternalFileCollection = (
data: ExternalFileCreateDatasetCollectionParams
) =>
POST<{ collectionId: string }>(`/proApi/core/dataset/collection/create/externalFileUrl`, data, {
timeout: 120000
timeout: 360000
});
export const putDatasetCollectionById = (data: UpdateDatasetCollectionParams) =>