mirror of
https://github.com/labring/FastGPT.git
synced 2025-10-21 03:10:50 +00:00
feat: custom read file service (#2548)
This commit is contained in:
1
packages/global/core/dataset/type.d.ts
vendored
1
packages/global/core/dataset/type.d.ts
vendored
@@ -51,6 +51,7 @@ export type DatasetCollectionSchemaType = {
|
|||||||
chunkSize: number;
|
chunkSize: number;
|
||||||
chunkSplitter?: string;
|
chunkSplitter?: string;
|
||||||
qaPrompt?: string;
|
qaPrompt?: string;
|
||||||
|
ocrParse?: boolean;
|
||||||
|
|
||||||
tags?: string[];
|
tags?: string[];
|
||||||
|
|
||||||
|
@@ -2,11 +2,14 @@ import { markdownProcess } from '@fastgpt/global/common/string/markdown';
|
|||||||
import { uploadMongoImg } from '../image/controller';
|
import { uploadMongoImg } from '../image/controller';
|
||||||
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
|
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
|
||||||
import { addHours } from 'date-fns';
|
import { addHours } from 'date-fns';
|
||||||
|
import FormData from 'form-data';
|
||||||
|
|
||||||
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
||||||
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
||||||
|
import axios from 'axios';
|
||||||
|
import { addLog } from '../../system/log';
|
||||||
|
|
||||||
export type readRawTextByLocalFileParams = {
|
export type readRawTextByLocalFileParams = {
|
||||||
teamId: string;
|
teamId: string;
|
||||||
@@ -51,15 +54,7 @@ export const readRawContentByFileBuffer = async ({
|
|||||||
metadata?: Record<string, any>;
|
metadata?: Record<string, any>;
|
||||||
}) => {
|
}) => {
|
||||||
// Upload image in markdown
|
// Upload image in markdown
|
||||||
const matchMdImgTextAndUpload = ({
|
const matchMdImgTextAndUpload = ({ teamId, md }: { md: string; teamId: string }) =>
|
||||||
teamId,
|
|
||||||
md,
|
|
||||||
metadata
|
|
||||||
}: {
|
|
||||||
md: string;
|
|
||||||
teamId: string;
|
|
||||||
metadata?: Record<string, any>;
|
|
||||||
}) =>
|
|
||||||
markdownProcess({
|
markdownProcess({
|
||||||
rawText: md,
|
rawText: md,
|
||||||
uploadImgController: (base64Img) =>
|
uploadImgController: (base64Img) =>
|
||||||
@@ -72,18 +67,61 @@ export const readRawContentByFileBuffer = async ({
|
|||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
let { rawText, formatText } = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
/* If */
|
||||||
|
const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
|
||||||
|
const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
|
||||||
|
const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
|
||||||
|
const readFileFromCustomService = async (): Promise<ReadFileResponse | undefined> => {
|
||||||
|
if (
|
||||||
|
!customReadfileUrl ||
|
||||||
|
!customReadFileExtension ||
|
||||||
|
!customReadFileExtension.includes(extension)
|
||||||
|
)
|
||||||
|
return;
|
||||||
|
|
||||||
|
addLog.info('Use custom read file service');
|
||||||
|
|
||||||
|
const data = new FormData();
|
||||||
|
data.append('file', buffer, {
|
||||||
|
filename: `file.${extension}`
|
||||||
|
});
|
||||||
|
data.append('extension', extension);
|
||||||
|
data.append('ocr', ocrParse);
|
||||||
|
const { data: response } = await axios.post<{
|
||||||
|
success: boolean;
|
||||||
|
message: string;
|
||||||
|
data: {
|
||||||
|
page: number;
|
||||||
|
markdown: string;
|
||||||
|
};
|
||||||
|
}>(customReadfileUrl, data, {
|
||||||
|
timeout: 600000,
|
||||||
|
headers: {
|
||||||
|
...data.getHeaders()
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
const rawText = response.data.markdown;
|
||||||
|
|
||||||
|
return {
|
||||||
|
rawText,
|
||||||
|
formatText: rawText
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
let { rawText, formatText } =
|
||||||
|
(await readFileFromCustomService()) ||
|
||||||
|
(await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
||||||
extension,
|
extension,
|
||||||
encoding,
|
encoding,
|
||||||
buffer
|
buffer
|
||||||
});
|
}));
|
||||||
|
|
||||||
// markdown data format
|
// markdown data format
|
||||||
if (['md', 'html', 'docx'].includes(extension)) {
|
if (['md', 'html', 'docx', ...customReadFileExtension.split(',')].includes(extension)) {
|
||||||
rawText = await matchMdImgTextAndUpload({
|
rawText = await matchMdImgTextAndUpload({
|
||||||
teamId: teamId,
|
teamId: teamId,
|
||||||
md: rawText,
|
md: rawText
|
||||||
metadata: metadata
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -68,6 +68,7 @@ const DatasetCollectionSchema = new Schema({
|
|||||||
qaPrompt: {
|
qaPrompt: {
|
||||||
type: String
|
type: String
|
||||||
},
|
},
|
||||||
|
ocrParse: Boolean,
|
||||||
|
|
||||||
tags: {
|
tags: {
|
||||||
type: [String],
|
type: [String],
|
||||||
|
@@ -24,4 +24,6 @@ export const getPreviewFileContent = (data: PreviewContextProps) =>
|
|||||||
POST<{
|
POST<{
|
||||||
previewContent: string;
|
previewContent: string;
|
||||||
totalLength: number;
|
totalLength: number;
|
||||||
}>('/common/file/previewContent', data);
|
}>('/common/file/previewContent', data, {
|
||||||
|
timeout: 600000
|
||||||
|
});
|
||||||
|
@@ -98,7 +98,7 @@ export const postDatasetCollection = (data: CreateDatasetCollectionParams) =>
|
|||||||
POST<string>(`/core/dataset/collection/create`, data);
|
POST<string>(`/core/dataset/collection/create`, data);
|
||||||
export const postCreateDatasetFileCollection = (data: FileIdCreateDatasetCollectionParams) =>
|
export const postCreateDatasetFileCollection = (data: FileIdCreateDatasetCollectionParams) =>
|
||||||
POST<{ collectionId: string }>(`/core/dataset/collection/create/fileId`, data, {
|
POST<{ collectionId: string }>(`/core/dataset/collection/create/fileId`, data, {
|
||||||
timeout: 120000
|
timeout: 360000
|
||||||
});
|
});
|
||||||
export const postCreateDatasetLinkCollection = (data: LinkCreateDatasetCollectionParams) =>
|
export const postCreateDatasetLinkCollection = (data: LinkCreateDatasetCollectionParams) =>
|
||||||
POST<{ collectionId: string }>(`/core/dataset/collection/create/link`, data);
|
POST<{ collectionId: string }>(`/core/dataset/collection/create/link`, data);
|
||||||
@@ -106,13 +106,13 @@ export const postCreateDatasetTextCollection = (data: TextCreateDatasetCollectio
|
|||||||
POST<{ collectionId: string }>(`/core/dataset/collection/create/text`, data);
|
POST<{ collectionId: string }>(`/core/dataset/collection/create/text`, data);
|
||||||
export const postCreateDatasetCsvTableCollection = (data: CsvTableCreateDatasetCollectionParams) =>
|
export const postCreateDatasetCsvTableCollection = (data: CsvTableCreateDatasetCollectionParams) =>
|
||||||
POST<{ collectionId: string }>(`/core/dataset/collection/create/csvTable`, data, {
|
POST<{ collectionId: string }>(`/core/dataset/collection/create/csvTable`, data, {
|
||||||
timeout: 120000
|
timeout: 360000
|
||||||
});
|
});
|
||||||
export const postCreateDatasetExternalFileCollection = (
|
export const postCreateDatasetExternalFileCollection = (
|
||||||
data: ExternalFileCreateDatasetCollectionParams
|
data: ExternalFileCreateDatasetCollectionParams
|
||||||
) =>
|
) =>
|
||||||
POST<{ collectionId: string }>(`/proApi/core/dataset/collection/create/externalFileUrl`, data, {
|
POST<{ collectionId: string }>(`/proApi/core/dataset/collection/create/externalFileUrl`, data, {
|
||||||
timeout: 120000
|
timeout: 360000
|
||||||
});
|
});
|
||||||
|
|
||||||
export const putDatasetCollectionById = (data: UpdateDatasetCollectionParams) =>
|
export const putDatasetCollectionById = (data: UpdateDatasetCollectionParams) =>
|
||||||
|
Reference in New Issue
Block a user