Mirror of https://github.com/labring/FastGPT.git
Synced 2025-07-23 05:12:39 +00:00

External dataset (#1519)

* perf: local file create collection
* rename middleware
* perf: remove code
* feat: next14
* feat: external file dataset
* collection tags field
* external file dataset doc
* fix: ts
@@ -7,7 +7,7 @@ import { MongoFileSchema } from './schema';
 import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
 import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
 import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
-import { readFileRawContent } from '../read/utils';
+import { readRawContentByFileBuffer } from '../read/utils';
 import { PassThrough } from 'stream';

 export function getGFSCollection(bucket: `${BucketNameEnum}`) {
@@ -196,7 +196,7 @@ export const readFileContentFromMongo = async ({
     });
   })();

-  const { rawText } = await readFileRawContent({
+  const { rawText } = await readRawContentByFileBuffer({
     extension,
     isQAImport,
     teamId,
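Reviewer note: the `})();` in the second hunk closes an immediately invoked async function that presumably gathers the GridFS download stream into the `buffer` handed to readRawContentByFileBuffer. A minimal sketch of that pattern, assuming a `bucket: GridFSBucket` and a `fileId` outside the shown context (not FastGPT's exact code):

import { GridFSBucket, ObjectId } from 'mongodb';

// Collect a GridFS download stream into a single Buffer.
async function downloadToBuffer(bucket: GridFSBucket, fileId: ObjectId): Promise<Buffer> {
  const chunks: Buffer[] = [];
  for await (const chunk of bucket.openDownloadStream(fileId)) {
    chunks.push(chunk as Buffer);
  }
  return Buffer.concat(chunks);
}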
@@ -1,11 +1,12 @@
-import { markdownProcess, simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
+import { markdownProcess } from '@fastgpt/global/common/string/markdown';
 import { uploadMongoImg } from '../image/controller';
 import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
 import { addHours } from 'date-fns';
 import { WorkerNameEnum, runWorker } from '../../../worker/utils';
+import fs from 'fs';
+import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
 import { ReadFileResponse } from '../../../worker/file/type';
 import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';

 export const initMarkdownText = ({
   teamId,
@@ -28,7 +29,34 @@ export const initMarkdownText = ({
-  })
+  });

-export const readFileRawContent = async ({
+export type readRawTextByLocalFileParams = {
+  teamId: string;
+  path: string;
+  metadata?: Record<string, any>;
+};
+export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
+  const { path } = params;
+
+  const extension = path?.split('.')?.pop()?.toLowerCase() || '';
+
+  const buffer = fs.readFileSync(path);
+  const encoding = detectFileEncoding(buffer);
+
+  const { rawText } = await readRawContentByFileBuffer({
+    extension,
+    isQAImport: false,
+    teamId: params.teamId,
+    encoding,
+    buffer,
+    metadata: params.metadata
+  });
+
+  return {
+    rawText
+  };
+};
+
+export const readRawContentByFileBuffer = async ({
   extension,
   isQAImport,
   teamId,
@@ -69,9 +97,3 @@ export const readFileRawContent = async ({

   return { rawText };
 };
-
-export const htmlToMarkdown = async (html?: string | null) => {
-  const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
-
-  return simpleMarkdownText(md);
-};
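Reviewer note: readRawTextByLocalFile derives both the extension and the encoding itself, so a caller only needs a path. A hedged usage sketch (the path and metadata values are illustrative, not from this commit):

const { rawText } = await readRawTextByLocalFile({
  teamId,
  path: '/tmp/upload/price-list.pdf', // extension is derived from the path
  metadata: { relatedId: fileId }     // passed through to readRawContentByFileBuffer
});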
packages/service/common/middle/entry.ts (new file, 38 lines)
@@ -0,0 +1,38 @@
+import { jsonRes } from '../response';
+import type { NextApiResponse } from 'next';
+import { withNextCors } from './cors';
+import { ApiRequestProps } from '../../type/next';
+
+export type NextApiHandler<T = any> = (
+  req: ApiRequestProps,
+  res: NextApiResponse<T>
+) => unknown | Promise<unknown>;
+
+export const NextEntry = ({ beforeCallback = [] }: { beforeCallback?: Promise<any>[] }) => {
+  return (...args: NextApiHandler[]): NextApiHandler => {
+    return async function api(req: ApiRequestProps, res: NextApiResponse) {
+      try {
+        await Promise.all([withNextCors(req, res), ...beforeCallback]);
+
+        let response = null;
+        for (const handler of args) {
+          response = await handler(req, res);
+        }
+
+        const contentType = res.getHeader('Content-Type');
+        if ((!contentType || contentType === 'application/json') && !res.writableFinished) {
+          return jsonRes(res, {
+            code: 200,
+            data: response
+          });
+        }
+      } catch (error) {
+        return jsonRes(res, {
+          code: 500,
+          error,
+          url: req.url
+        });
+      }
+    };
+  };
+};
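Reviewer note: NextEntry awaits CORS plus any beforeCallback promises, runs the handlers in order, and JSON-wraps the last return value unless a handler already wrote a non-JSON response. In use, a project would typically specialize it once and reuse it across routes; a hedged sketch (NextAPI and connectToDatabase are assumed names, not shown in this commit):

const NextAPI = NextEntry({ beforeCallback: [connectToDatabase()] });

async function handler(req: ApiRequestProps, res: NextApiResponse) {
  // whatever the last handler returns becomes jsonRes(res, { code: 200, data })
  return { hello: 'world' };
}

export default NextAPI(handler);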
@@ -1,7 +1,7 @@
 import { UrlFetchParams, UrlFetchResponse } from '@fastgpt/global/common/file/api';
 import * as cheerio from 'cheerio';
 import axios from 'axios';
-import { htmlToMarkdown } from '../file/read/utils';
+import { htmlToMarkdown } from './utils';

 export const cheerioToHtml = ({
   fetchUrl,
packages/service/common/string/utils.ts (new file, 8 lines)
|
||||
import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
|
||||
import { WorkerNameEnum, runWorker } from '../../worker/utils';
|
||||
|
||||
export const htmlToMarkdown = async (html?: string | null) => {
|
||||
const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
|
||||
|
||||
return simpleMarkdownText(md);
|
||||
};
|
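Reviewer note: running htmlStr2Md in a worker keeps heavy HTML parsing off the main event loop, and moving the helper here breaks the string utilities' dependency on the file-reading module. Usage is a single await (input is illustrative):

// roughly "# Docs\n\nHello **world**" after simpleMarkdownText cleanup
const md = await htmlToMarkdown('<h1>Docs</h1><p>Hello <b>world</b></p>');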
@@ -32,6 +32,9 @@ export async function createOneCollection({
   fileId,
   rawLink,
+
+  externalFileId,
+  externalFileUrl,

   hashRawText,
   rawTextLength,
   metadata = {},
@@ -61,6 +64,8 @@ export async function createOneCollection({

   fileId,
   rawLink,
+  externalFileId,
+  externalFileUrl,

   rawTextLength,
   hashRawText,
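Reviewer note: with the two new parameters, an external-file collection can record both the third-party id and a fetchable URL. A hedged sketch of such a call (the collection-type enum member and all field values are assumptions, not shown in these hunks):

await createOneCollection({
  teamId,
  tmbId,
  datasetId,
  type: DatasetCollectionTypeEnum.externalFile, // assumed enum member
  name: 'price-list.pdf',
  externalFileId: 'doc_123',                    // id in the external system
  externalFileUrl: 'https://example.com/files/price-list.pdf',
  rawTextLength: rawText.length,
  hashRawText: hashStr(rawText)
});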
@@ -66,7 +66,11 @@ const DatasetCollectionSchema = new Schema({
     type: String
   },
   sourceId: String,
+  tags: {
+    type: [String],
+    default: []
+  },

   // local file collection
   fileId: {
     type: Schema.Types.ObjectId,
@@ -74,13 +78,13 @@ const DatasetCollectionSchema = new Schema({
   },
   // web link collection
   rawLink: String,
+
+  // external collection
+  externalFileId: String,

   // metadata
   rawTextLength: Number,
   hashRawText: String,
-  externalSourceUrl: String, // external import url
+  externalFileUrl: String, // external import url
   metadata: {
     type: Object,
     default: {}
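Reviewer note: the new tags array makes collections filterable by label. A minimal query sketch (MongoDatasetCollection is the model name used elsewhere in FastGPT; the tag values are assumptions):

const tagged = await MongoDatasetCollection.find({
  datasetId,
  tags: { $in: ['contracts', '2024'] } // matches collections carrying either tag
}).lean();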
@@ -2,13 +2,20 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
 import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
 import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
 import { urlsFetch } from '../../common/string/cheerio';
 import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
 import { parseCsvTable2Chunks } from './training/utils';
 import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import axios from 'axios';
-import { readFileRawContent } from '../../common/file/read/utils';
+import { readRawContentByFileBuffer } from '../../common/file/read/utils';

-export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; url: string }) => {
+export const readFileRawTextByUrl = async ({
+  teamId,
+  url,
+  relatedId
+}: {
+  teamId: string;
+  url: string;
+  relatedId?: string;
+}) => {
   const response = await axios({
     method: 'get',
     url: url,
@@ -18,11 +25,14 @@ export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; ur
   const buffer = Buffer.from(response.data, 'binary');

-  const { rawText } = await readFileRawContent({
+  const { rawText } = await readRawContentByFileBuffer({
     extension,
     teamId,
     buffer,
-    encoding: 'utf-8'
+    encoding: 'utf-8',
+    metadata: {
+      relatedId
+    }
   });

   return rawText;
@@ -38,13 +48,15 @@ export const readDatasetSourceRawText = async ({
   teamId,
   type,
   sourceId,
   isQAImport,
-  selector
+  selector,
+  relatedId
 }: {
   teamId: string;
   type: DatasetSourceReadTypeEnum;
   sourceId: string;
   isQAImport?: boolean;
   selector?: string;
+  relatedId?: string;
 }): Promise<string> => {
   if (type === DatasetSourceReadTypeEnum.fileLocal) {
     const { rawText } = await readFileContentFromMongo({
@@ -64,7 +76,8 @@ export const readDatasetSourceRawText = async ({
   } else if (type === DatasetSourceReadTypeEnum.externalFile) {
     const rawText = await readFileRawTextByUrl({
       teamId,
       url: sourceId,
-      url: sourceId
+      relatedId
     });
     return rawText;
   }
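Reviewer note: in the new externalFile branch, sourceId carries the file URL and relatedId threads the external id into the read metadata. A hedged call sketch (URL and variable names are illustrative):

const rawText = await readDatasetSourceRawText({
  teamId,
  type: DatasetSourceReadTypeEnum.externalFile,
  sourceId: 'https://example.com/files/manual.pdf', // for external files, sourceId is the URL
  relatedId: externalFileId                         // stored in metadata for later tracing
});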
@@ -18,6 +18,7 @@ import { countPromptTokens } from '../../../common/string/tiktoken/index';
 import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { jiebaSplit } from '../../../common/string/jieba';
+import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';

 type SearchDatasetDataProps = {
   teamId: string;
@@ -98,7 +99,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
   },
   'datasetId collectionId q a chunkIndex indexes'
 )
-  .populate('collectionId', 'name fileId rawLink')
+  .populate('collectionId', 'name fileId rawLink externalFileId externalFileUrl')
   .lean()) as DatasetDataWithCollectionType[];

 // add score to data(It's already sorted. The first one is the one with the most points)
@@ -130,8 +131,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
   chunkIndex: data.chunkIndex,
   datasetId: String(data.datasetId),
   collectionId: String(data.collectionId?._id),
-  sourceName: data.collectionId?.name || '',
-  sourceId: data.collectionId?.fileId || data.collectionId?.rawLink,
+  ...getCollectionSourceData(data.collectionId),
   score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
 };
@@ -205,8 +205,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
   id: String(item._id),
   datasetId: String(item.datasetId),
   collectionId: String(item.collectionId),
-  sourceName: collection?.name || '',
-  sourceId: collection?.fileId || collection?.rawLink,
+  ...getCollectionSourceData(collection),
   q: item.q,
   a: item.a,
   chunkIndex: item.chunkIndex,
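Reviewer note: both result builders now spread getCollectionSourceData instead of hand-picking sourceName/sourceId, so the new external fields resolve in one place. A hedged sketch of what that helper plausibly centralizes (the real one lives in @fastgpt/global and its exact shape may differ):

export const getCollectionSourceData = (collection?: {
  name?: string;
  fileId?: string;
  rawLink?: string;
  externalFileId?: string;
  externalFileUrl?: string;
}) => ({
  sourceName: collection?.name || '',
  sourceId:
    collection?.fileId ||
    collection?.rawLink ||
    collection?.externalFileId ||
    collection?.externalFileUrl
});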
@@ -174,7 +174,7 @@ export async function pushDataListToTrainingQueue({
   } catch (error: any) {
     addLog.error(`Insert error`, error);
     // if the insert fails, push the failed documents onto the failed list
-    error.writeErrors.forEach((writeError: any) => {
+    error.writeErrors?.forEach((writeError: any) => {
       failedDocuments.push(data[writeError.index]);
     });
     console.log('failed', failedDocuments);
@@ -35,7 +35,7 @@ const TrainingDataSchema = new Schema({
   },
   billId: {
     // concat bill
-    type: Schema.Types.ObjectId
+    type: String
   },
   mode: {
     type: String,
@@ -53,7 +53,7 @@ export const dispatchLafRequest = async (props: LafRequestProps): Promise<LafRes
     appId,
     chatId,
     responseChatItemId,
-    histories: histories.slice(0, 10)
+    histories: histories?.slice(0, 10)
   },
   variables,
   ...dynamicInput,
@@ -21,7 +21,7 @@
     "mammoth": "^1.6.0",
     "mongoose": "^7.0.2",
     "multer": "1.4.5-lts.1",
-    "next": "13.5.2",
+    "next": "14.2.3",
     "nextjs-cors": "^2.1.2",
     "node-cron": "^3.0.3",
     "node-xlsx": "^0.23.0",
@@ -19,7 +19,9 @@ export const checkDatasetLimit = async ({
   if (!standardConstants) return;

   if (usedSize + insertLen >= datasetMaxSize) {
-    return Promise.reject(TeamErrEnum.datasetSizeNotEnough);
+    return Promise.reject(
+      `您的知识库容量为: ${datasetMaxSize}组,已使用: ${usedSize}组,导入当前文件需要: ${insertLen}组,请增加知识库容量后导入。`
+    );
   }

   if (usedPoints >= totalPoints) {
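Reviewer note: the rejection now carries a human-readable message in place of the bare TeamErrEnum.datasetSizeNotEnough code, telling the user their capacity, current usage, and the size of the pending import. An English-language equivalent of the same guard, as a sketch:

if (usedSize + insertLen >= datasetMaxSize) {
  return Promise.reject(
    `Dataset capacity is ${datasetMaxSize} chunks; ${usedSize} are in use and this import needs ${insertLen}. Please increase the dataset capacity before importing.`
  );
}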
@@ -9,7 +9,7 @@ import { readXlsxRawText } from './extension/xlsx';
 import { readCsvRawText } from './extension/csv';

 parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
-  const readFileRawContent = async (params: ReadRawTextByBuffer) => {
+  const readRawContentByFileBuffer = async (params: ReadRawTextByBuffer) => {
     switch (params.extension) {
       case 'txt':
       case 'md':
@@ -41,7 +41,7 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
   try {
     parentPort?.postMessage({
       type: 'success',
-      data: await readFileRawContent(newProps)
+      data: await readRawContentByFileBuffer(newProps)
     });
   } catch (error) {
     console.log(error);
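Reviewer note: on the service side, this worker is presumably reached through runWorker, which resolves when the 'success' message above is posted. A hedged sketch of that round trip (the worker name and parameter shape are assumptions beyond what the hunks show):

const { rawText } = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
  extension,
  encoding,
  buffer // delivered to the worker as the Uint8Array in ReadRawTextProps<Uint8Array>
});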