External dataset (#1519)

* perf: local file create collection

* rename middleware

* perf: remove code

* feat: next14

* feat: external file dataset

* collection tags field

* external file dataset doc

* fix: ts

Archer authored on 2024-05-17 16:44:15 +08:00, committed by GitHub
parent 2d1ec9b3ad
commit 67c52992d7
102 changed files with 1839 additions and 1282 deletions

View File

@@ -26,18 +26,27 @@ export type DatasetCollectionChunkMetadataType = {
qaPrompt?: string;
metadata?: Record<string, any>;
};
// create collection params
export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
datasetId: string;
name: string;
type: `${DatasetCollectionTypeEnum}`;
type: DatasetCollectionTypeEnum;
tags?: string[];
fileId?: string;
rawLink?: string;
externalFileId?: string;
externalFileUrl?: string;
rawTextLength?: number;
hashRawText?: string;
};
export type ApiCreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
datasetId: string;
tags?: string[];
};
export type TextCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
name: string;
@@ -58,6 +67,11 @@ export type CsvTableCreateDatasetCollectionParams = {
parentId?: string;
fileId: string;
};
export type ExternalFileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
externalFileId?: string;
externalFileUrl: string;
filename?: string;
};
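
A minimal sketch of a payload satisfying the new ExternalFileCreateDatasetCollectionParams type; every value below is a placeholder, not taken from this diff:

const params: ExternalFileCreateDatasetCollectionParams = {
  datasetId: '6650f1d0c5e9d8f0a1b2c3d4', // placeholder dataset id
  externalFileUrl: 'https://example.com/docs/report.pdf', // required: where the external file lives
  externalFileId: 'ext-001', // optional id in the external system
  filename: 'report.pdf', // optional display name
  tags: ['external', 'report'] // the new optional tags field
};
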
/* ================= data ===================== */
export type PgSearchRawType = {

View File

@@ -1,4 +1,4 @@
/* sourceId = prefix-id; id=fileId;link url;externalId */
/* sourceId = prefix-id; id=fileId;link url;externalFileId */
export enum CollectionSourcePrefixEnum {
local = 'local',
link = 'link',

View File

@@ -0,0 +1,14 @@
import { CollectionWithDatasetType, DatasetCollectionSchemaType } from '../type';
export const getCollectionSourceData = (
collection?: CollectionWithDatasetType | DatasetCollectionSchemaType
) => {
return {
sourceId:
collection?.fileId ||
collection?.rawLink ||
collection?.externalFileId ||
collection?.externalFileUrl,
sourceName: collection?.name || ''
};
};
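
A brief usage sketch, assuming collection is a loaded collection document (illustrative only):

const { sourceId, sourceName } = getCollectionSourceData(collection);
// sourceId falls back through fileId, rawLink, externalFileId and externalFileUrl;
// sourceName is the collection name, or '' when collection is undefined.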

View File

@@ -22,7 +22,7 @@ export const DatasetTypeMap = {
collectionLabel: 'common.Website'
},
[DatasetTypeEnum.externalFile]: {
icon: 'core/dataset/commonDataset',
icon: 'core/dataset/externalDataset',
label: 'External File',
collectionLabel: 'common.File'
}
@@ -44,9 +44,11 @@ export const DatasetStatusMap = {
/* ------------ collection -------------- */
export enum DatasetCollectionTypeEnum {
folder = 'folder',
virtual = 'virtual',
file = 'file',
link = 'link', // one link
virtual = 'virtual'
externalFile = 'externalFile'
}
export const DatasetCollectionTypeMap = {
[DatasetCollectionTypeEnum.folder]: {
@@ -55,6 +57,9 @@ export const DatasetCollectionTypeMap = {
[DatasetCollectionTypeEnum.file]: {
name: 'core.dataset.file'
},
[DatasetCollectionTypeEnum.externalFile]: {
name: 'core.dataset.externalFile'
},
[DatasetCollectionTypeEnum.link]: {
name: 'core.dataset.link'
},

View File

@@ -1,7 +1,5 @@
import { DatasetSourceReadTypeEnum, ImportDataSourceEnum } from './constants';
export const rawTextBackupPrefix = 'index,content';
export const importType2ReadType = (type: ImportDataSourceEnum) => {
if (type === ImportDataSourceEnum.csvTable || type === ImportDataSourceEnum.fileLocal) {
return DatasetSourceReadTypeEnum.fileLocal;

View File

@@ -41,7 +41,7 @@ export type DatasetCollectionSchemaType = {
datasetId: string;
parentId?: string;
name: string;
type: `${DatasetCollectionTypeEnum}`;
type: DatasetCollectionTypeEnum;
createTime: Date;
updateTime: Date;
@@ -50,13 +50,15 @@ export type DatasetCollectionSchemaType = {
chunkSplitter?: string;
qaPrompt?: string;
sourceId?: string; // relate CollectionSourcePrefixEnum
tags?: string[];
fileId?: string; // local file id
rawLink?: string; // link url
externalFileId?: string; //external file id
rawTextLength?: number;
hashRawText?: string;
externalSourceUrl?: string; // external import url
externalFileUrl?: string; // external import url
metadata?: {
webPageSelector?: string;
relatedImgId?: string; // The id of the associated image collections

View File

@@ -3,7 +3,7 @@ import { getFileIcon } from '../../common/file/icon';
import { strIsLink } from '../../common/string/tools';
export function getCollectionIcon(
type: `${DatasetCollectionTypeEnum}` = DatasetCollectionTypeEnum.file,
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
name = ''
) {
if (type === DatasetCollectionTypeEnum.folder) {
@@ -24,13 +24,13 @@ export function getSourceNameIcon({
sourceName: string;
sourceId?: string;
}) {
if (strIsLink(sourceId)) {
return 'common/linkBlue';
}
const fileIcon = getFileIcon(sourceName, '');
const fileIcon = getFileIcon(decodeURIComponent(sourceName), '');
if (fileIcon) {
return fileIcon;
}
if (strIsLink(sourceId)) {
return 'common/linkBlue';
}
return 'file/fill/manual';
}

View File

@@ -10,7 +10,7 @@
"js-yaml": "^4.1.0",
"jschardet": "3.1.1",
"nanoid": "^4.0.1",
"next": "13.5.2",
"next": "14.2.3",
"openai": "4.28.0",
"openapi-types": "^12.1.3",
"timezones-list": "^3.0.2"

View File

@@ -7,7 +7,7 @@ import { MongoFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
import { readFileRawContent } from '../read/utils';
import { readRawContentByFileBuffer } from '../read/utils';
import { PassThrough } from 'stream';
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
@@ -196,7 +196,7 @@ export const readFileContentFromMongo = async ({
});
})();
const { rawText } = await readFileRawContent({
const { rawText } = await readRawContentByFileBuffer({
extension,
isQAImport,
teamId,

View File

@@ -1,11 +1,12 @@
import { markdownProcess, simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { uploadMongoImg } from '../image/controller';
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
import { addHours } from 'date-fns';
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import fs from 'fs';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { ReadFileResponse } from '../../../worker/file/type';
import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
export const initMarkdownText = ({
teamId,
@@ -28,7 +29,34 @@ export const initMarkdownText = ({
})
});
export const readFileRawContent = async ({
export type readRawTextByLocalFileParams = {
teamId: string;
path: string;
metadata?: Record<string, any>;
};
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
const { path } = params;
const extension = path?.split('.')?.pop()?.toLowerCase() || '';
const buffer = fs.readFileSync(path);
const encoding = detectFileEncoding(buffer);
const { rawText } = await readRawContentByFileBuffer({
extension,
isQAImport: false,
teamId: params.teamId,
encoding,
buffer,
metadata: params.metadata
});
return {
rawText
};
};
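
A hedged usage sketch of the new readRawTextByLocalFile helper; the teamId, path and relatedId values are placeholders, not taken from this diff:

const { rawText } = await readRawTextByLocalFile({
  teamId: 'team-id-placeholder',
  path: '/tmp/uploads/report.pdf', // extension and encoding are detected from this local file
  metadata: { relatedId: 'collection-id-placeholder' } // optional, forwarded to readRawContentByFileBuffer
});
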
export const readRawContentByFileBuffer = async ({
extension,
isQAImport,
teamId,
@@ -69,9 +97,3 @@ export const readFileRawContent = async ({
return { rawText };
};
export const htmlToMarkdown = async (html?: string | null) => {
const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
return simpleMarkdownText(md);
};

View File

@@ -0,0 +1,38 @@
import { jsonRes } from '../response';
import type { NextApiResponse } from 'next';
import { withNextCors } from './cors';
import { ApiRequestProps } from '../../type/next';
export type NextApiHandler<T = any> = (
req: ApiRequestProps,
res: NextApiResponse<T>
) => unknown | Promise<unknown>;
export const NextEntry = ({ beforeCallback = [] }: { beforeCallback?: Promise<any>[] }) => {
return (...args: NextApiHandler[]): NextApiHandler => {
return async function api(req: ApiRequestProps, res: NextApiResponse) {
try {
await Promise.all([withNextCors(req, res), ...beforeCallback]);
let response = null;
for (const handler of args) {
response = await handler(req, res);
}
const contentType = res.getHeader('Content-Type');
if ((!contentType || contentType === 'application/json') && !res.writableFinished) {
return jsonRes(res, {
code: 200,
data: response
});
}
} catch (error) {
return jsonRes(res, {
code: 500,
error,
url: req.url
});
}
};
};
};
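
A minimal sketch of how an API route could compose handlers with NextEntry; the NextAPI name and the empty beforeCallback list are assumptions for illustration, since the concrete wrapper lives elsewhere in the app:

const NextAPI = NextEntry({ beforeCallback: [] });

async function handler(req: ApiRequestProps, res: NextApiResponse) {
  return { ok: true }; // returned data is wrapped by jsonRes as { code: 200, data: { ok: true } }
}

export default NextAPI(handler);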

View File

@@ -1,7 +1,7 @@
import { UrlFetchParams, UrlFetchResponse } from '@fastgpt/global/common/file/api';
import * as cheerio from 'cheerio';
import axios from 'axios';
import { htmlToMarkdown } from '../file/read/utils';
import { htmlToMarkdown } from './utils';
export const cheerioToHtml = ({
fetchUrl,

View File

@@ -0,0 +1,8 @@
import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
import { WorkerNameEnum, runWorker } from '../../worker/utils';
export const htmlToMarkdown = async (html?: string | null) => {
const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
return simpleMarkdownText(md);
};
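
A one-line usage sketch of the relocated helper (the input html is illustrative):

const markdown = await htmlToMarkdown('<h1>Title</h1><p>Hello</p>'); // runs the htmlStr2Md worker, then simplifies the result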

View File

@@ -32,6 +32,9 @@ export async function createOneCollection({
fileId,
rawLink,
externalFileId,
externalFileUrl,
hashRawText,
rawTextLength,
metadata = {},
@@ -61,6 +64,8 @@ export async function createOneCollection({
fileId,
rawLink,
externalFileId,
externalFileUrl,
rawTextLength,
hashRawText,

View File

@@ -66,7 +66,11 @@ const DatasetCollectionSchema = new Schema({
type: String
},
sourceId: String,
tags: {
type: [String],
default: []
},
// local file collection
fileId: {
type: Schema.Types.ObjectId,
@@ -74,13 +78,13 @@ const DatasetCollectionSchema = new Schema({
},
// web link collection
rawLink: String,
// external collection
externalFileId: String,
// metadata
rawTextLength: Number,
hashRawText: String,
externalSourceUrl: String, // external import url
externalFileUrl: String, // external import url
metadata: {
type: Object,
default: {}

View File

@@ -2,13 +2,20 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
import { parseCsvTable2Chunks } from './training/utils';
import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readFileRawContent } from '../../common/file/read/utils';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; url: string }) => {
export const readFileRawTextByUrl = async ({
teamId,
url,
relatedId
}: {
teamId: string;
url: string;
relatedId?: string;
}) => {
const response = await axios({
method: 'get',
url: url,
@@ -18,11 +25,14 @@ export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; ur
const buffer = Buffer.from(response.data, 'binary');
const { rawText } = await readFileRawContent({
const { rawText } = await readRawContentByFileBuffer({
extension,
teamId,
buffer,
encoding: 'utf-8'
encoding: 'utf-8',
metadata: {
relatedId
}
});
return rawText;
@@ -38,13 +48,15 @@ export const readDatasetSourceRawText = async ({
type,
sourceId,
isQAImport,
selector
selector,
relatedId
}: {
teamId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
isQAImport?: boolean;
selector?: string;
relatedId?: string;
}): Promise<string> => {
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const { rawText } = await readFileContentFromMongo({
@@ -64,7 +76,8 @@ export const readDatasetSourceRawText = async ({
} else if (type === DatasetSourceReadTypeEnum.externalFile) {
const rawText = await readFileRawTextByUrl({
teamId,
url: sourceId
url: sourceId,
relatedId
});
return rawText;
}
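
An illustrative call that reads an external file through the updated helper; the url and relatedId values are placeholders:

const rawText = await readDatasetSourceRawText({
  teamId: 'team-id-placeholder',
  type: DatasetSourceReadTypeEnum.externalFile,
  sourceId: 'https://example.com/docs/report.pdf', // for externalFile, sourceId carries the file url
  relatedId: 'collection-id-placeholder' // forwarded as metadata.relatedId when the buffer is parsed
});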

View File

@@ -18,6 +18,7 @@ import { countPromptTokens } from '../../../common/string/tiktoken/index';
import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { jiebaSplit } from '../../../common/string/jieba';
import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
type SearchDatasetDataProps = {
teamId: string;
@@ -98,7 +99,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
},
'datasetId collectionId q a chunkIndex indexes'
)
.populate('collectionId', 'name fileId rawLink')
.populate('collectionId', 'name fileId rawLink externalFileId externalFileUrl')
.lean()) as DatasetDataWithCollectionType[];
// add score to data(It's already sorted. The first one is the one with the most points)
@@ -130,8 +131,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
chunkIndex: data.chunkIndex,
datasetId: String(data.datasetId),
collectionId: String(data.collectionId?._id),
sourceName: data.collectionId?.name || '',
sourceId: data.collectionId?.fileId || data.collectionId?.rawLink,
...getCollectionSourceData(data.collectionId),
score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
};
@@ -205,8 +205,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
id: String(item._id),
datasetId: String(item.datasetId),
collectionId: String(item.collectionId),
sourceName: collection?.name || '',
sourceId: collection?.fileId || collection?.rawLink,
...getCollectionSourceData(collection),
q: item.q,
a: item.a,
chunkIndex: item.chunkIndex,

View File

@@ -174,7 +174,7 @@ export async function pushDataListToTrainingQueue({
} catch (error: any) {
addLog.error(`Insert error`, error);
// If an error occurs, add the failed documents to the failed list
error.writeErrors.forEach((writeError: any) => {
error.writeErrors?.forEach((writeError: any) => {
failedDocuments.push(data[writeError.index]);
});
console.log('failed', failedDocuments);

View File

@@ -35,7 +35,7 @@ const TrainingDataSchema = new Schema({
},
billId: {
// concat bill
type: Schema.Types.ObjectId
type: String
},
mode: {
type: String,

View File

@@ -53,7 +53,7 @@ export const dispatchLafRequest = async (props: LafRequestProps): Promise<LafRes
appId,
chatId,
responseChatItemId,
histories: histories.slice(0, 10)
histories: histories?.slice(0, 10)
},
variables,
...dynamicInput,

View File

@@ -21,7 +21,7 @@
"mammoth": "^1.6.0",
"mongoose": "^7.0.2",
"multer": "1.4.5-lts.1",
"next": "13.5.2",
"next": "14.2.3",
"nextjs-cors": "^2.1.2",
"node-cron": "^3.0.3",
"node-xlsx": "^0.23.0",

View File

@@ -19,7 +19,9 @@ export const checkDatasetLimit = async ({
if (!standardConstants) return;
if (usedSize + insertLen >= datasetMaxSize) {
return Promise.reject(TeamErrEnum.datasetSizeNotEnough);
return Promise.reject(
`Your dataset capacity is ${datasetMaxSize} chunks. ${usedSize} chunks are already used, and importing the current file requires ${insertLen} chunks. Please increase the dataset capacity before importing.`
);
}
if (usedPoints >= totalPoints) {

View File

@@ -9,7 +9,7 @@ import { readXlsxRawText } from './extension/xlsx';
import { readCsvRawText } from './extension/csv';
parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
const readFileRawContent = async (params: ReadRawTextByBuffer) => {
const readRawContentByFileBuffer = async (params: ReadRawTextByBuffer) => {
switch (params.extension) {
case 'txt':
case 'md':
@@ -41,7 +41,7 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
try {
parentPort?.postMessage({
type: 'success',
data: await readFileRawContent(newProps)
data: await readRawContentByFileBuffer(newProps)
});
} catch (error) {
console.log(error);

View File

@@ -101,6 +101,7 @@ export const iconPaths = {
'core/dataset/commonDataset': () => import('./icons/core/dataset/commonDataset.svg'),
'core/dataset/datasetFill': () => import('./icons/core/dataset/datasetFill.svg'),
'core/dataset/datasetLight': () => import('./icons/core/dataset/datasetLight.svg'),
'core/dataset/externalDataset': () => import('./icons/core/dataset/externalDataset.svg'),
'core/dataset/fileCollection': () => import('./icons/core/dataset/fileCollection.svg'),
'core/dataset/fullTextRecall': () => import('./icons/core/dataset/fullTextRecall.svg'),
'core/dataset/manualCollection': () => import('./icons/core/dataset/manualCollection.svg'),

View File

@@ -0,0 +1,5 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 23 20">
<path fill-rule="evenodd" clip-rule="evenodd"
d="M15.7233 1.21707C14.9554 1.3079 14.4221 2.00197 14.532 2.76731L16.6975 17.8447C16.8074 18.6101 17.519 19.1569 18.2868 19.066L21.5433 18.6808C22.3111 18.59 22.8445 17.8959 22.7345 17.1306L20.5691 2.05317C20.4592 1.28782 19.7476 0.741021 18.9797 0.831852L15.7233 1.21707ZM0.830017 2.32412C0.830017 1.55092 1.45682 0.924121 2.23002 0.924121H5.51815C6.29135 0.924121 6.91815 1.55092 6.91815 2.32412V17.675C6.91815 18.4482 6.29135 19.075 5.51815 19.075H2.23002C1.45682 19.075 0.830017 18.4482 0.830017 17.675V2.32412ZM7.9198 2.32412C7.9198 1.55092 8.5466 0.924121 9.3198 0.924121H12.6079C13.3811 0.924121 14.0079 1.55092 14.0079 2.32412V17.675C14.0079 18.4482 13.3811 19.075 12.6079 19.075H9.3198C8.5466 19.075 7.9198 18.4482 7.9198 17.675V2.32412Z"
fill="#8A95A7" />
</svg>

View File

@@ -10,6 +10,7 @@ import React from 'react';
type Props = Omit<NumberInputProps, 'onChange'> & {
onChange: (e: number | '') => any;
placeholder?: string;
};
const MyNumberInput = (props: Props) => {
@@ -24,7 +25,7 @@ const MyNumberInput = (props: Props) => {
}
}}
>
<NumberInputField />
<NumberInputField placeholder={props?.placeholder} />
<NumberInputStepper>
<NumberIncrementStepper />
<NumberDecrementStepper />

View File

@@ -22,6 +22,7 @@ type Props = Omit<BoxProps, 'resize' | 'onChange'> & {
onChange?: (e: string) => void;
variables?: EditorVariablePickerType[];
defaultHeight?: number;
placeholder?: string;
};
const options = {

View File

@@ -27,18 +27,18 @@
"next-i18next": "15.2.0",
"papaparse": "^5.4.1",
"pdfjs-dist": "4.0.269",
"react": "18.2.0",
"react": "18.3.1",
"use-context-selector": "^1.4.4",
"react-day-picker": "^8.7.1",
"react-dom": "18.2.0",
"react-dom": "18.3.1",
"react-i18next": "13.5.0",
"react-beautiful-dnd": "^13.1.1"
},
"devDependencies": {
"@types/lodash": "^4.14.191",
"@types/papaparse": "^5.3.7",
"@types/react": "18.2.0",
"@types/react-dom": "18.2.0",
"@types/react": "18.3.0",
"@types/react-dom": "18.3.0",
"@types/react-beautiful-dnd": "^13.1.8"
}
}