External dataset (#1519)

* perf: local file create collection

* rename middleware

* perf: remove code

* feat: next14

* feat: external file dataset

* collection tags field

* external file dataset doc

* fix: ts
This commit is contained in:
Archer
2024-05-17 16:44:15 +08:00
committed by GitHub
parent 2d1ec9b3ad
commit 67c52992d7
102 changed files with 1839 additions and 1282 deletions

View File

@@ -7,7 +7,7 @@ import { MongoFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
import { readFileRawContent } from '../read/utils';
import { readRawContentByFileBuffer } from '../read/utils';
import { PassThrough } from 'stream';
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
@@ -196,7 +196,7 @@ export const readFileContentFromMongo = async ({
});
})();
const { rawText } = await readFileRawContent({
const { rawText } = await readRawContentByFileBuffer({
extension,
isQAImport,
teamId,

View File

@@ -1,11 +1,12 @@
import { markdownProcess, simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { uploadMongoImg } from '../image/controller';
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
import { addHours } from 'date-fns';
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import fs from 'fs';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { ReadFileResponse } from '../../../worker/file/type';
import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
export const initMarkdownText = ({
teamId,
@@ -28,7 +29,34 @@ export const initMarkdownText = ({
})
});
export const readFileRawContent = async ({
export type readRawTextByLocalFileParams = {
teamId: string;
path: string;
metadata?: Record<string, any>;
};
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
const { path } = params;
const extension = path?.split('.')?.pop()?.toLowerCase() || '';
const buffer = fs.readFileSync(path);
const encoding = detectFileEncoding(buffer);
const { rawText } = await readRawContentByFileBuffer({
extension,
isQAImport: false,
teamId: params.teamId,
encoding,
buffer,
metadata: params.metadata
});
return {
rawText
};
};
export const readRawContentByFileBuffer = async ({
extension,
isQAImport,
teamId,
@@ -69,9 +97,3 @@ export const readFileRawContent = async ({
return { rawText };
};
export const htmlToMarkdown = async (html?: string | null) => {
const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
return simpleMarkdownText(md);
};

View File

@@ -0,0 +1,38 @@
import { jsonRes } from '../response';
import type { NextApiResponse } from 'next';
import { withNextCors } from './cors';
import { ApiRequestProps } from '../../type/next';
export type NextApiHandler<T = any> = (
req: ApiRequestProps,
res: NextApiResponse<T>
) => unknown | Promise<unknown>;
export const NextEntry = ({ beforeCallback = [] }: { beforeCallback?: Promise<any>[] }) => {
return (...args: NextApiHandler[]): NextApiHandler => {
return async function api(req: ApiRequestProps, res: NextApiResponse) {
try {
await Promise.all([withNextCors(req, res), ...beforeCallback]);
let response = null;
for (const handler of args) {
response = await handler(req, res);
}
const contentType = res.getHeader('Content-Type');
if ((!contentType || contentType === 'application/json') && !res.writableFinished) {
return jsonRes(res, {
code: 200,
data: response
});
}
} catch (error) {
return jsonRes(res, {
code: 500,
error,
url: req.url
});
}
};
};
};

View File

@@ -1,7 +1,7 @@
import { UrlFetchParams, UrlFetchResponse } from '@fastgpt/global/common/file/api';
import * as cheerio from 'cheerio';
import axios from 'axios';
import { htmlToMarkdown } from '../file/read/utils';
import { htmlToMarkdown } from './utils';
export const cheerioToHtml = ({
fetchUrl,

View File

@@ -0,0 +1,8 @@
import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
import { WorkerNameEnum, runWorker } from '../../worker/utils';
export const htmlToMarkdown = async (html?: string | null) => {
const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
return simpleMarkdownText(md);
};