4.6.3-website dataset (#532)

This commit is contained in:
Archer
2023-12-03 20:45:57 +08:00
committed by GitHub
parent b916183848
commit a9ae270335
122 changed files with 3793 additions and 1360 deletions

View File

@@ -3,6 +3,7 @@ import { ChatRoleEnum, IMG_BLOCK_KEY } from '@fastgpt/global/core/chat/constants
import { countMessagesTokens, countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
import { adaptRole_Chat2Message } from '@fastgpt/global/core/chat/adapt';
import type { ChatCompletionContentPart } from '@fastgpt/global/core/ai/type.d';
import axios from 'axios';
/* slice chat context by tokens */
export function ChatContextFilter({
@@ -81,11 +82,13 @@ export function ChatContextFilter({
}
]
*/
export function formatStr2ChatContent(str: string) {
export async function formatStr2ChatContent(str: string) {
const content: ChatCompletionContentPart[] = [];
let lastIndex = 0;
const regex = new RegExp(`\`\`\`(${IMG_BLOCK_KEY})\\n([\\s\\S]*?)\`\`\``, 'g');
const imgKey: 'image_url' = 'image_url';
let match;
while ((match = regex.exec(str)) !== null) {
@@ -115,7 +118,7 @@ export function formatStr2ChatContent(str: string) {
content.push(
...jsonLines.map((item) => ({
type: 'image_url' as any,
type: imgKey,
image_url: {
url: item.src
}
@@ -148,5 +151,18 @@ export function formatStr2ChatContent(str: string) {
if (content.length === 1 && content[0].type === 'text') {
return content[0].text;
}
if (!content) return null;
// load img to base64
for await (const item of content) {
if (item.type === imgKey && item[imgKey]?.url) {
const response = await axios.get(item[imgKey].url, {
responseType: 'arraybuffer'
});
const base64 = Buffer.from(response.data).toString('base64');
item[imgKey].url = `data:${response.headers['content-type']};base64,${base64}`;
}
}
return content ? content : null;
}

View File

@@ -0,0 +1,73 @@
import {
DatasetCollectionTrainingModeEnum,
DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constant';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
export async function createOneCollection({
name,
parentId,
datasetId,
type,
trainingType = DatasetCollectionTrainingModeEnum.manual,
chunkSize = 0,
fileId,
rawLink,
teamId,
tmbId,
metadata = {}
}: CreateDatasetCollectionParams & { teamId: string; tmbId: string }) {
const { _id } = await MongoDatasetCollection.create({
name,
teamId,
tmbId,
datasetId,
parentId: parentId || null,
type,
trainingType,
chunkSize,
fileId,
rawLink,
metadata
});
// create default collection
if (type === DatasetCollectionTypeEnum.folder) {
await createDefaultCollection({
datasetId,
parentId: _id,
teamId,
tmbId
});
}
return _id;
}
// create default collection
export function createDefaultCollection({
name = '手动录入',
datasetId,
parentId,
teamId,
tmbId
}: {
name?: '手动录入' | '手动标注';
datasetId: string;
parentId?: string;
teamId: string;
tmbId: string;
}) {
return MongoDatasetCollection.create({
name,
teamId,
tmbId,
datasetId,
parentId,
type: DatasetCollectionTypeEnum.virtual,
trainingType: DatasetCollectionTrainingModeEnum.manual,
chunkSize: 0,
updateTime: new Date('2099')
});
}

View File

@@ -39,15 +39,16 @@ const DatasetCollectionSchema = new Schema({
ref: DatasetCollectionName,
required: true
},
name: {
type: String,
required: true
},
type: {
type: String,
enum: Object.keys(DatasetCollectionTypeMap),
required: true
},
name: {
type: String,
required: true
},
createTime: {
type: Date,
default: () => new Date()

View File

@@ -0,0 +1,75 @@
import { MongoDatasetData } from './schema';
import { deletePgDataById } from './pg';
import { MongoDatasetTraining } from '../training/schema';
import { delFileById } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { MongoDatasetCollection } from '../collection/schema';
import { delDatasetFiles } from '../file/controller';
import { delay } from '@fastgpt/global/common/system/utils';
/* delete all data by datasetIds */
export async function delDatasetRelevantData({ datasetIds }: { datasetIds: string[] }) {
datasetIds = datasetIds.map((item) => String(item));
// delete training data(There could be a training mission)
await MongoDatasetTraining.deleteMany({
datasetId: { $in: datasetIds }
});
// delete related files
await Promise.all(datasetIds.map((id) => delDatasetFiles({ datasetId: id })));
await delay(1000);
// delete pg data
await deletePgDataById(`dataset_id IN ('${datasetIds.join("','")}')`);
// delete dataset.datas
await MongoDatasetData.deleteMany({ datasetId: { $in: datasetIds } });
// delete collections
await MongoDatasetCollection.deleteMany({
datasetId: { $in: datasetIds }
});
}
/**
* delete all data by collectionIds
*/
export async function delCollectionRelevantData({
collectionIds,
fileIds
}: {
collectionIds: string[];
fileIds: string[];
}) {
collectionIds = collectionIds.map((item) => String(item));
const filterFileIds = fileIds.filter(Boolean);
// delete training data
await MongoDatasetTraining.deleteMany({
collectionId: { $in: collectionIds }
});
// delete file
await Promise.all(
filterFileIds.map((fileId) => {
return delFileById({
bucketName: BucketNameEnum.dataset,
fileId
});
})
);
await delay(1000);
// delete pg data
await deletePgDataById(`collection_id IN ('${collectionIds.join("','")}')`);
// delete dataset.datas
await MongoDatasetData.deleteMany({ collectionId: { $in: collectionIds } });
}
/**
* delete one data by mongoDataId
*/
export async function delDatasetDataByDataId(mongoDataId: string) {
await deletePgDataById(['data_id', mongoDataId]);
await MongoDatasetData.findByIdAndDelete(mongoDataId);
}

View File

@@ -0,0 +1,28 @@
import { PgDatasetTableName } from '@fastgpt/global/core/dataset/constant';
import { delay } from '@fastgpt/global/common/system/utils';
import { PgClient } from '../../../common/pg';
export async function deletePgDataById(
where: ['id' | 'dataset_id' | 'collection_id' | 'data_id', string] | string
) {
let retry = 2;
async function deleteData(): Promise<any> {
try {
await PgClient.delete(PgDatasetTableName, {
where: [where]
});
} catch (error) {
if (--retry < 0) {
return Promise.reject(error);
}
await delay(500);
return deleteData();
}
}
await deleteData();
return {
tokenLen: 0
};
}

View File

@@ -79,6 +79,9 @@ const DatasetDataSchema = new Schema({
chunkIndex: {
type: Number,
default: 0
},
inited: {
type: Boolean
}
});
@@ -88,7 +91,7 @@ try {
DatasetDataSchema.index({ collectionId: 1 });
// full text index
DatasetDataSchema.index({ datasetId: 1, fullTextToken: 'text' });
DatasetDataSchema.index({ fullTextToken: 1 });
DatasetDataSchema.index({ inited: 1 });
} catch (error) {
console.log(error);
}

View File

@@ -1,7 +1,11 @@
import { connectionMongo, type Model } from '../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { DatasetTypeMap } from '@fastgpt/global/core/dataset/constant';
import {
DatasetStatusEnum,
DatasetStatusMap,
DatasetTypeMap
} from '@fastgpt/global/core/dataset/constant';
import {
TeamCollectionName,
TeamMemberCollectionName
@@ -31,9 +35,16 @@ const DatasetSchema = new Schema({
ref: TeamMemberCollectionName,
required: true
},
updateTime: {
type: Date,
default: () => new Date()
type: {
type: String,
enum: Object.keys(DatasetTypeMap),
required: true,
default: 'dataset'
},
status: {
type: String,
enum: Object.keys(DatasetStatusMap),
default: DatasetStatusEnum.active
},
avatar: {
type: String,
@@ -43,6 +54,10 @@ const DatasetSchema = new Schema({
type: String,
required: true
},
updateTime: {
type: Date,
default: () => new Date()
},
vectorModel: {
type: String,
required: true,
@@ -53,24 +68,26 @@ const DatasetSchema = new Schema({
required: true,
default: 'gpt-3.5-turbo-16k'
},
type: {
intro: {
type: String,
enum: Object.keys(DatasetTypeMap),
required: true,
default: 'dataset'
},
tags: {
type: [String],
default: [],
set(val: string | string[]) {
if (Array.isArray(val)) return val;
return val.split(' ').filter((item) => item);
}
default: ''
},
permission: {
type: String,
enum: Object.keys(PermissionTypeMap),
default: PermissionTypeEnum.private
},
websiteConfig: {
type: {
url: {
type: String,
required: true
},
selector: {
type: String,
default: 'body'
}
}
}
});