External dataset (#1519)

* perf: local file create collection

* rename middleware

* perf: remove code

* feat: next14

* feat: external file dataset

* collection tags field

* external file dataset doc

* fix: ts
Author: Archer
Date: 2024-05-17 16:44:15 +08:00
Committed by: GitHub
Parent: 2d1ec9b3ad
Commit: 67c52992d7
102 changed files with 1839 additions and 1282 deletions

View File

@@ -32,6 +32,9 @@ export async function createOneCollection({
fileId,
rawLink,
+externalFileId,
+externalFileUrl,
hashRawText,
rawTextLength,
metadata = {},
@@ -61,6 +64,8 @@ export async function createOneCollection({
fileId,
rawLink,
+externalFileId,
+externalFileUrl,
rawTextLength,
hashRawText,
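
Taken together, these two hunks thread the new `externalFileId` and `externalFileUrl` parameters from `createOneCollection`'s signature into the stored document. A minimal usage sketch, with the call-site values assumed for illustration (only the two external fields are confirmed by the diff):

```ts
// Hedged sketch: creating a collection backed by an external file store.
const collection = await createOneCollection({
  teamId,
  tmbId,
  datasetId,
  name: 'quarterly-report.pdf',
  type: 'externalFile', // assumed collection type value
  externalFileId: 'doc_8831', // id of the file in the external system
  externalFileUrl: 'https://example.com/files/doc_8831.pdf' // where raw text is fetched from
});
```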

View File

@@ -66,7 +66,11 @@ const DatasetCollectionSchema = new Schema({
type: String
},
sourceId: String,
+tags: {
+  type: [String],
+  default: []
+},
// local file collection
fileId: {
type: Schema.Types.ObjectId,
@@ -74,13 +78,13 @@ const DatasetCollectionSchema = new Schema({
},
// web link collection
rawLink: String,
+// external collection
+externalFileId: String,
// metadata
rawTextLength: Number,
hashRawText: String,
-externalSourceUrl: String, // external import url
+externalFileUrl: String, // external import url
metadata: {
type: Object,
default: {}
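
The new `tags` array defaults to empty, so existing collections need no migration. A hypothetical tag filter, assuming the usual Mongoose model for this schema:

```ts
// Hedged sketch: the model name MongoDatasetCollection is an assumption;
// the tags: [String] field itself comes from the schema change above.
const tagged = await MongoDatasetCollection.find({
  teamId,
  datasetId,
  tags: { $in: ['finance', '2024-Q1'] } // any collection carrying either tag
}).lean();
```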

View File

@@ -2,13 +2,20 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
import { parseCsvTable2Chunks } from './training/utils';
import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
-import { readFileRawContent } from '../../common/file/read/utils';
+import { readRawContentByFileBuffer } from '../../common/file/read/utils';
-export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; url: string }) => {
+export const readFileRawTextByUrl = async ({
+  teamId,
+  url,
+  relatedId
+}: {
+  teamId: string;
+  url: string;
+  relatedId?: string;
+}) => {
const response = await axios({
method: 'get',
url: url,
@@ -18,11 +25,14 @@ export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; ur
const buffer = Buffer.from(response.data, 'binary');
-const { rawText } = await readFileRawContent({
+const { rawText } = await readRawContentByFileBuffer({
extension,
teamId,
buffer,
-encoding: 'utf-8'
+encoding: 'utf-8',
+metadata: {
+  relatedId
+}
});
return rawText;
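
The optional `relatedId` rides along in the parser's `metadata`, presumably so assets extracted from the file (e.g. images in a PDF) can be tied back to their source collection. A hedged usage sketch:

```ts
// Hedged sketch: the choice of the collection id as relatedId is an assumption.
const rawText = await readFileRawTextByUrl({
  teamId,
  url: 'https://example.com/files/doc_8831.pdf',
  relatedId: String(collection._id)
});
```
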
@@ -38,13 +48,15 @@ export const readDatasetSourceRawText = async ({
type,
sourceId,
isQAImport,
-selector
+selector,
+relatedId
}: {
teamId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
isQAImport?: boolean;
selector?: string;
+relatedId?: string;
}): Promise<string> => {
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const { rawText } = await readFileContentFromMongo({
@@ -64,7 +76,8 @@ export const readDatasetSourceRawText = async ({
} else if (type === DatasetSourceReadTypeEnum.externalFile) {
const rawText = await readFileRawTextByUrl({
teamId,
-url: sourceId
+url: sourceId,
+relatedId
});
return rawText;
}
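
For an external collection the dispatcher treats `sourceId` as the file URL, so a read reduces to:

```ts
// Hedged sketch: DatasetSourceReadTypeEnum.externalFile is confirmed by the
// diff; the argument values are illustrative.
const rawText = await readDatasetSourceRawText({
  teamId,
  type: DatasetSourceReadTypeEnum.externalFile,
  sourceId: 'https://example.com/files/doc_8831.pdf',
  relatedId: collectionId // assumption: used to tag extracted assets
});
```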

View File

@@ -18,6 +18,7 @@ import { countPromptTokens } from '../../../common/string/tiktoken/index';
import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { jiebaSplit } from '../../../common/string/jieba';
+import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
type SearchDatasetDataProps = {
teamId: string;
@@ -98,7 +99,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
},
'datasetId collectionId q a chunkIndex indexes'
)
-.populate('collectionId', 'name fileId rawLink')
+.populate('collectionId', 'name fileId rawLink externalFileId externalFileUrl')
.lean()) as DatasetDataWithCollectionType[];
// add score to data(It's already sorted. The first one is the one with the most points)
@@ -130,8 +131,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
chunkIndex: data.chunkIndex,
datasetId: String(data.datasetId),
collectionId: String(data.collectionId?._id),
-sourceName: data.collectionId?.name || '',
-sourceId: data.collectionId?.fileId || data.collectionId?.rawLink,
+...getCollectionSourceData(data.collectionId),
score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
};
@@ -205,8 +205,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
id: String(item._id),
datasetId: String(item.datasetId),
collectionId: String(item.collectionId),
-sourceName: collection?.name || '',
-sourceId: collection?.fileId || collection?.rawLink,
+...getCollectionSourceData(collection),
q: item.q,
a: item.a,
chunkIndex: item.chunkIndex,
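
Both search paths previously rebuilt `sourceName`/`sourceId` by hand from `fileId || rawLink`, which would have missed the two new external fields; the widened `.populate()` projection plus the shared helper fixes that in one place. The helper itself is not part of this diff, but given how it is used it plausibly looks like this reconstruction:

```ts
// Hedged reconstruction of getCollectionSourceData (the real implementation
// lives in @fastgpt/global/core/dataset/collection/utils and is not shown here).
export const getCollectionSourceData = (collection?: {
  name?: string;
  fileId?: string;
  rawLink?: string;
  externalFileId?: string;
  externalFileUrl?: string;
}) => ({
  sourceName: collection?.name || '',
  sourceId:
    collection?.fileId ||
    collection?.rawLink ||
    collection?.externalFileUrl ||
    collection?.externalFileId
});
```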

View File

@@ -174,7 +174,7 @@ export async function pushDataListToTrainingQueue({
} catch (error: any) {
addLog.error(`Insert error`, error);
// If there are errors, add the failed documents to the failed list
-error.writeErrors.forEach((writeError: any) => {
+error.writeErrors?.forEach((writeError: any) => {
failedDocuments.push(data[writeError.index]);
});
console.log('failed', failedDocuments);
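
`writeErrors` is only present on bulk-write failures; a different error shape (validation failure, dropped connection) used to crash the catch block itself. A minimal sketch of the guarded pattern, assuming Mongoose's `insertMany` with `ordered: false` and the usual training model name:

```ts
// Hedged sketch: with ordered:false, insertMany continues past individual
// failures and reports them in error.writeErrors; a non-write error carries
// no such array, so the optional chain keeps the handler itself from throwing.
try {
  await MongoDatasetTraining.insertMany(data, { ordered: false });
} catch (error: any) {
  error.writeErrors?.forEach((writeError: any) => {
    failedDocuments.push(data[writeError.index]);
  });
}
```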

View File

@@ -35,7 +35,7 @@ const TrainingDataSchema = new Schema({
},
billId: {
// concat bill
-type: Schema.Types.ObjectId
+type: String
},
mode: {
type: String,
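
Loosening `billId` from `Schema.Types.ObjectId` to `String` stops Mongoose from casting the value, which matters once bill ids are issued by something other than MongoDB. An illustration of the difference (the id format is hypothetical):

```ts
import { Types } from 'mongoose';

// An ObjectId-typed path rejects non-hex ids at cast time...
const bad = () => new Types.ObjectId('bill_2024_0517'); // throws a BSON error
// ...while a String-typed path stores whatever the billing layer produced.
const doc = { billId: 'bill_2024_0517', mode: 'chunk' };
```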

View File

@@ -53,7 +53,7 @@ export const dispatchLafRequest = async (props: LafRequestProps): Promise<LafRes
appId,
chatId,
responseChatItemId,
-histories: histories.slice(0, 10)
+histories: histories?.slice(0, 10)
},
variables,
...dynamicInput,
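
Same defensive style as the `writeErrors` fix: when a Laf node runs without chat context, `histories` can be undefined and `.slice` would throw. With optional chaining the field simply drops out of the JSON payload:

```ts
// Hedged sketch: undefined?.slice(...) evaluates to undefined, and
// JSON.stringify omits undefined-valued keys instead of crashing.
const histories: { q: string }[] | undefined = undefined;
const body = { histories: histories?.slice(0, 10) };
console.log(JSON.stringify(body)); // "{}"
```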