mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
feat: custom read file service (#2548)
This commit is contained in:
1
packages/global/core/dataset/type.d.ts
vendored
1
packages/global/core/dataset/type.d.ts
vendored
@@ -51,6 +51,7 @@ export type DatasetCollectionSchemaType = {
|
||||
chunkSize: number;
|
||||
chunkSplitter?: string;
|
||||
qaPrompt?: string;
|
||||
ocrParse?: boolean;
|
||||
|
||||
tags?: string[];
|
||||
|
||||
|
@@ -2,11 +2,14 @@ import { markdownProcess } from '@fastgpt/global/common/string/markdown';
|
||||
import { uploadMongoImg } from '../image/controller';
|
||||
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
|
||||
import { addHours } from 'date-fns';
|
||||
import FormData from 'form-data';
|
||||
|
||||
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
||||
import fs from 'fs';
|
||||
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
||||
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
||||
import axios from 'axios';
|
||||
import { addLog } from '../../system/log';
|
||||
|
||||
export type readRawTextByLocalFileParams = {
|
||||
teamId: string;
|
||||
@@ -51,15 +54,7 @@ export const readRawContentByFileBuffer = async ({
|
||||
metadata?: Record<string, any>;
|
||||
}) => {
|
||||
// Upload image in markdown
|
||||
const matchMdImgTextAndUpload = ({
|
||||
teamId,
|
||||
md,
|
||||
metadata
|
||||
}: {
|
||||
md: string;
|
||||
teamId: string;
|
||||
metadata?: Record<string, any>;
|
||||
}) =>
|
||||
const matchMdImgTextAndUpload = ({ teamId, md }: { md: string; teamId: string }) =>
|
||||
markdownProcess({
|
||||
rawText: md,
|
||||
uploadImgController: (base64Img) =>
|
||||
@@ -72,18 +67,61 @@ export const readRawContentByFileBuffer = async ({
|
||||
})
|
||||
});
|
||||
|
||||
let { rawText, formatText } = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
||||
extension,
|
||||
encoding,
|
||||
buffer
|
||||
});
|
||||
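/* If a custom read file service is configured for this extension, use it; otherwise fall back to the built-in readFile worker */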
|
||||
const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
|
||||
const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
|
||||
const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
|
||||
/**
 * Parse the current file through the externally configured read-file service.
 *
 * Resolves to `undefined` when no service URL is configured or when the file
 * extension is not one of the comma-separated entries of
 * CUSTOM_READ_FILE_EXTENSION, so the caller can fall back to another reader.
 *
 * @returns The markdown returned by the service, used as both `rawText` and
 *   `formatText`, or `undefined` when the service does not apply.
 * @throws If the HTTP request fails or times out, or if the service reports
 *   failure / returns no markdown string.
 */
const readFileFromCustomService = async (): Promise<ReadFileResponse | undefined> => {
  if (!customReadfileUrl || !customReadFileExtension) return;

  // Compare against whole list entries. A substring test on the raw string
  // would let "doc" (or an empty extension) match a configured "docx".
  const supportedExtensions = customReadFileExtension
    .split(',')
    .map((item) => item.trim())
    .filter(Boolean);
  if (!supportedExtensions.includes(extension)) return;

  addLog.info('Use custom read file service');

  // Multipart payload: the file itself plus the extension and the OCR switch.
  const data = new FormData();
  data.append('file', buffer, {
    filename: `file.${extension}`
  });
  data.append('extension', extension);
  data.append('ocr', ocrParse);

  const { data: response } = await axios.post<{
    success: boolean;
    message: string;
    data: {
      page: number;
      markdown: string;
    };
  }>(customReadfileUrl, data, {
    timeout: 600000,
    headers: {
      // Carries the multipart boundary generated by form-data.
      ...data.getHeaders()
    }
  });

  // Surface a failed parse explicitly instead of crashing on a missing
  // `data` field or handing a non-string on as the file content.
  const markdown = response?.data?.markdown;
  if (response?.success === false || typeof markdown !== 'string') {
    throw new Error(
      `Custom read file service failed: ${response?.message || 'invalid response'}`
    );
  }

  return {
    rawText: markdown,
    formatText: markdown
  };
};
|
||||
|
||||
let { rawText, formatText } =
|
||||
(await readFileFromCustomService()) ||
|
||||
(await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
||||
extension,
|
||||
encoding,
|
||||
buffer
|
||||
}));
|
||||
|
||||
// markdown data format
|
||||
if (['md', 'html', 'docx'].includes(extension)) {
|
||||
if (['md', 'html', 'docx', ...customReadFileExtension.split(',')].includes(extension)) {
|
||||
rawText = await matchMdImgTextAndUpload({
|
||||
teamId: teamId,
|
||||
md: rawText,
|
||||
metadata: metadata
|
||||
md: rawText
|
||||
});
|
||||
}
|
||||
|
||||
|
@@ -68,6 +68,7 @@ const DatasetCollectionSchema = new Schema({
|
||||
qaPrompt: {
|
||||
type: String
|
||||
},
|
||||
ocrParse: Boolean,
|
||||
|
||||
tags: {
|
||||
type: [String],
|
||||
|
@@ -24,4 +24,6 @@ export const getPreviewFileContent = (data: PreviewContextProps) =>
|
||||
POST<{
|
||||
previewContent: string;
|
||||
totalLength: number;
|
||||
}>('/common/file/previewContent', data);
|
||||
}>('/common/file/previewContent', data, {
|
||||
timeout: 600000
|
||||
});
|
||||
|
@@ -98,7 +98,7 @@ export const postDatasetCollection = (data: CreateDatasetCollectionParams) =>
|
||||
POST<string>(`/core/dataset/collection/create`, data);
|
||||
export const postCreateDatasetFileCollection = (data: FileIdCreateDatasetCollectionParams) =>
|
||||
POST<{ collectionId: string }>(`/core/dataset/collection/create/fileId`, data, {
|
||||
timeout: 120000
|
||||
timeout: 360000
|
||||
});
|
||||
export const postCreateDatasetLinkCollection = (data: LinkCreateDatasetCollectionParams) =>
|
||||
POST<{ collectionId: string }>(`/core/dataset/collection/create/link`, data);
|
||||
@@ -106,13 +106,13 @@ export const postCreateDatasetTextCollection = (data: TextCreateDatasetCollectio
|
||||
POST<{ collectionId: string }>(`/core/dataset/collection/create/text`, data);
|
||||
export const postCreateDatasetCsvTableCollection = (data: CsvTableCreateDatasetCollectionParams) =>
|
||||
POST<{ collectionId: string }>(`/core/dataset/collection/create/csvTable`, data, {
|
||||
timeout: 120000
|
||||
timeout: 360000
|
||||
});
|
||||
export const postCreateDatasetExternalFileCollection = (
|
||||
data: ExternalFileCreateDatasetCollectionParams
|
||||
) =>
|
||||
POST<{ collectionId: string }>(`/proApi/core/dataset/collection/create/externalFileUrl`, data, {
|
||||
timeout: 120000
|
||||
timeout: 360000
|
||||
});
|
||||
|
||||
export const putDatasetCollectionById = (data: UpdateDatasetCollectionParams) =>
|
||||
|
Reference in New Issue
Block a user