mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-22 20:37:48 +00:00
4.6.3-website dataset (#532)
This commit is contained in:
@@ -3,6 +3,7 @@ import { ChatRoleEnum, IMG_BLOCK_KEY } from '@fastgpt/global/core/chat/constants
|
||||
import { countMessagesTokens, countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
|
||||
import { adaptRole_Chat2Message } from '@fastgpt/global/core/chat/adapt';
|
||||
import type { ChatCompletionContentPart } from '@fastgpt/global/core/ai/type.d';
|
||||
import axios from 'axios';
|
||||
|
||||
/* slice chat context by tokens */
|
||||
export function ChatContextFilter({
|
||||
@@ -81,11 +82,13 @@ export function ChatContextFilter({
|
||||
}
|
||||
]
|
||||
*/
|
||||
export function formatStr2ChatContent(str: string) {
|
||||
export async function formatStr2ChatContent(str: string) {
|
||||
const content: ChatCompletionContentPart[] = [];
|
||||
let lastIndex = 0;
|
||||
const regex = new RegExp(`\`\`\`(${IMG_BLOCK_KEY})\\n([\\s\\S]*?)\`\`\``, 'g');
|
||||
|
||||
const imgKey: 'image_url' = 'image_url';
|
||||
|
||||
let match;
|
||||
|
||||
while ((match = regex.exec(str)) !== null) {
|
||||
@@ -115,7 +118,7 @@ export function formatStr2ChatContent(str: string) {
|
||||
|
||||
content.push(
|
||||
...jsonLines.map((item) => ({
|
||||
type: 'image_url' as any,
|
||||
type: imgKey,
|
||||
image_url: {
|
||||
url: item.src
|
||||
}
|
||||
@@ -148,5 +151,18 @@ export function formatStr2ChatContent(str: string) {
|
||||
if (content.length === 1 && content[0].type === 'text') {
|
||||
return content[0].text;
|
||||
}
|
||||
|
||||
if (!content) return null;
|
||||
// load img to base64
|
||||
for await (const item of content) {
|
||||
if (item.type === imgKey && item[imgKey]?.url) {
|
||||
const response = await axios.get(item[imgKey].url, {
|
||||
responseType: 'arraybuffer'
|
||||
});
|
||||
const base64 = Buffer.from(response.data).toString('base64');
|
||||
item[imgKey].url = `data:${response.headers['content-type']};base64,${base64}`;
|
||||
}
|
||||
}
|
||||
|
||||
return content ? content : null;
|
||||
}
|
||||
|
73
packages/service/core/dataset/collection/controller.ts
Normal file
73
packages/service/core/dataset/collection/controller.ts
Normal file
@@ -0,0 +1,73 @@
|
||||
import {
|
||||
DatasetCollectionTrainingModeEnum,
|
||||
DatasetCollectionTypeEnum
|
||||
} from '@fastgpt/global/core/dataset/constant';
|
||||
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
|
||||
import { MongoDatasetCollection } from './schema';
|
||||
|
||||
/**
 * Insert one dataset collection document and return its id.
 *
 * Optional fields fall back to: `trainingType` = manual, `chunkSize` = 0,
 * `metadata` = `{}`. A falsy `parentId` is stored as `null`.
 *
 * When the new collection is a folder, `createDefaultCollection` is also
 * called with the new collection's id as `parentId`.
 *
 * @returns The `_id` of the created collection document.
 */
export async function createOneCollection(
  params: CreateDatasetCollectionParams & { teamId: string; tmbId: string }
) {
  const {
    name,
    parentId,
    datasetId,
    type,
    trainingType = DatasetCollectionTrainingModeEnum.manual,
    chunkSize = 0,
    fileId,
    rawLink,
    teamId,
    tmbId,
    metadata = {}
  } = params;

  const created = await MongoDatasetCollection.create({
    name,
    teamId,
    tmbId,
    datasetId,
    parentId: parentId ? parentId : null,
    type,
    trainingType,
    chunkSize,
    fileId,
    rawLink,
    metadata
  });
  const collectionId = created._id;

  // Folders get a default collection nested under them.
  const isFolder = type === DatasetCollectionTypeEnum.folder;
  if (isFolder) {
    await createDefaultCollection({
      datasetId,
      parentId: collectionId,
      teamId,
      tmbId
    });
  }

  return collectionId;
}
|
||||
|
||||
/**
 * Create a default collection for a dataset.
 *
 * The document is always created as a `virtual` collection with manual
 * training mode and a chunk size of 0.
 *
 * @param name      Display name; defaults to '手动录入'.
 * @param parentId  Optional parent collection id.
 * @returns The promise returned by `MongoDatasetCollection.create`.
 */
export function createDefaultCollection({
  name = '手动录入',
  datasetId,
  parentId,
  teamId,
  tmbId
}: {
  name?: '手动录入' | '手动标注';
  datasetId: string;
  parentId?: string;
  teamId: string;
  tmbId: string;
}) {
  const defaultCollection = {
    name,
    teamId,
    tmbId,
    datasetId,
    parentId,
    type: DatasetCollectionTypeEnum.virtual,
    trainingType: DatasetCollectionTrainingModeEnum.manual,
    chunkSize: 0,
    // NOTE(review): far-future update time, presumably so this collection
    // sorts first when ordering by updateTime — confirm against the list query.
    updateTime: new Date('2099')
  };

  return MongoDatasetCollection.create(defaultCollection);
}
|
@@ -39,15 +39,16 @@ const DatasetCollectionSchema = new Schema({
|
||||
ref: DatasetCollectionName,
|
||||
required: true
|
||||
},
|
||||
name: {
|
||||
type: String,
|
||||
required: true
|
||||
},
|
||||
type: {
|
||||
type: String,
|
||||
enum: Object.keys(DatasetCollectionTypeMap),
|
||||
required: true
|
||||
},
|
||||
|
||||
name: {
|
||||
type: String,
|
||||
required: true
|
||||
},
|
||||
createTime: {
|
||||
type: Date,
|
||||
default: () => new Date()
|
||||
|
75
packages/service/core/dataset/data/controller.ts
Normal file
75
packages/service/core/dataset/data/controller.ts
Normal file
@@ -0,0 +1,75 @@
|
||||
import { MongoDatasetData } from './schema';
|
||||
import { deletePgDataById } from './pg';
|
||||
import { MongoDatasetTraining } from '../training/schema';
|
||||
import { delFileById } from '../../../common/file/gridfs/controller';
|
||||
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
|
||||
import { MongoDatasetCollection } from '../collection/schema';
|
||||
import { delDatasetFiles } from '../file/controller';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
|
||||
/**
 * Delete all data that belongs to the given datasets.
 *
 * Order of operations: training records, dataset files, PG rows, Mongo
 * dataset data, and finally the collections themselves.
 *
 * @param datasetIds Ids of the datasets whose related data is removed.
 *                   An empty list is a no-op.
 */
export async function delDatasetRelevantData({ datasetIds }: { datasetIds: string[] }) {
  // Coerce every id to a plain string (presumably callers may pass ObjectId
  // values — confirm against callers).
  datasetIds = datasetIds.map((item) => String(item));

  // Nothing to delete: skip the fixed delay and the no-op queries.
  if (datasetIds.length === 0) return;

  // delete training data (there could be a training mission)
  await MongoDatasetTraining.deleteMany({
    datasetId: { $in: datasetIds }
  });

  // delete related files
  await Promise.all(datasetIds.map((id) => delDatasetFiles({ datasetId: id })));

  await delay(1000);

  // delete pg data
  // The filter is passed as a raw SQL fragment, so double any single quote in
  // an id to keep each value inside its string literal.
  const pgIdList = datasetIds.map((id) => `'${id.replace(/'/g, "''")}'`).join(',');
  await deletePgDataById(`dataset_id IN (${pgIdList})`);

  // delete dataset.datas
  await MongoDatasetData.deleteMany({ datasetId: { $in: datasetIds } });

  // delete collections
  await MongoDatasetCollection.deleteMany({
    datasetId: { $in: datasetIds }
  });
}
|
||||
/**
 * Delete all data that belongs to the given collections.
 *
 * Removes training records, the listed files from the dataset bucket, PG rows
 * and Mongo dataset data. The collection documents themselves are not deleted
 * here.
 *
 * @param collectionIds Ids of the collections whose data is removed.
 * @param fileIds       File ids to delete; falsy entries are ignored.
 */
export async function delCollectionRelevantData({
  collectionIds,
  fileIds
}: {
  collectionIds: string[];
  fileIds: string[];
}) {
  collectionIds = collectionIds.map((item) => String(item));
  const filterFileIds = fileIds.filter(Boolean);
  const hasCollections = collectionIds.length > 0;

  // delete training data
  if (hasCollections) {
    await MongoDatasetTraining.deleteMany({
      collectionId: { $in: collectionIds }
    });
  }

  // delete file
  await Promise.all(
    filterFileIds.map((fileId) =>
      delFileById({
        bucketName: BucketNameEnum.dataset,
        fileId
      })
    )
  );

  // No collection ids: the remaining steps would be no-ops, so skip them
  // together with the fixed delay.
  if (!hasCollections) return;

  await delay(1000);

  // delete pg data
  // The filter is passed as a raw SQL fragment, so double any single quote in
  // an id to keep each value inside its string literal.
  const pgIdList = collectionIds.map((id) => `'${id.replace(/'/g, "''")}'`).join(',');
  await deletePgDataById(`collection_id IN (${pgIdList})`);

  // delete dataset.datas
  await MongoDatasetData.deleteMany({ collectionId: { $in: collectionIds } });
}
|
||||
/**
 * Delete a single dataset data record by its Mongo id.
 *
 * The PG rows whose `data_id` equals the id are removed first, then the Mongo
 * document itself.
 *
 * @param mongoDataId `_id` of the dataset data document.
 */
export async function delDatasetDataByDataId(mongoDataId: string) {
  const pgFilter: ['data_id', string] = ['data_id', mongoDataId];

  await deletePgDataById(pgFilter);
  await MongoDatasetData.findByIdAndDelete(mongoDataId);
}
|
28
packages/service/core/dataset/data/pg.ts
Normal file
28
packages/service/core/dataset/data/pg.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import { PgDatasetTableName } from '@fastgpt/global/core/dataset/constant';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { PgClient } from '../../../common/pg';
|
||||
|
||||
/**
 * Delete rows from the PG dataset table, retrying on failure.
 *
 * The delete is attempted up to three times (one initial try plus two
 * retries) with a 500ms pause between attempts. If the last attempt fails,
 * its error is propagated to the caller.
 *
 * @param where Either a `[column, value]` pair or a raw condition string;
 *              it is forwarded to `PgClient.delete` as the single `where` entry.
 * @returns `{ tokenLen: 0 }` once the delete succeeds.
 */
export async function deletePgDataById(
  where: ['id' | 'dataset_id' | 'collection_id' | 'data_id', string] | string
) {
  const maxRetries = 2;

  for (let attempt = 0; ; attempt++) {
    try {
      await PgClient.delete(PgDatasetTableName, {
        where: [where]
      });
      break;
    } catch (error) {
      // Out of retries: surface the last error.
      if (attempt >= maxRetries) {
        throw error;
      }
      await delay(500);
    }
  }

  return {
    tokenLen: 0
  };
}
|
@@ -79,6 +79,9 @@ const DatasetDataSchema = new Schema({
|
||||
chunkIndex: {
|
||||
type: Number,
|
||||
default: 0
|
||||
},
|
||||
inited: {
|
||||
type: Boolean
|
||||
}
|
||||
});
|
||||
|
||||
@@ -88,7 +91,7 @@ try {
|
||||
DatasetDataSchema.index({ collectionId: 1 });
|
||||
// full text index
|
||||
DatasetDataSchema.index({ datasetId: 1, fullTextToken: 'text' });
|
||||
DatasetDataSchema.index({ fullTextToken: 1 });
|
||||
DatasetDataSchema.index({ inited: 1 });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
@@ -1,7 +1,11 @@
|
||||
import { connectionMongo, type Model } from '../../common/mongo';
|
||||
const { Schema, model, models } = connectionMongo;
|
||||
import { DatasetSchemaType } from '@fastgpt/global/core/dataset/type.d';
|
||||
import { DatasetTypeMap } from '@fastgpt/global/core/dataset/constant';
|
||||
import {
|
||||
DatasetStatusEnum,
|
||||
DatasetStatusMap,
|
||||
DatasetTypeMap
|
||||
} from '@fastgpt/global/core/dataset/constant';
|
||||
import {
|
||||
TeamCollectionName,
|
||||
TeamMemberCollectionName
|
||||
@@ -31,9 +35,16 @@ const DatasetSchema = new Schema({
|
||||
ref: TeamMemberCollectionName,
|
||||
required: true
|
||||
},
|
||||
updateTime: {
|
||||
type: Date,
|
||||
default: () => new Date()
|
||||
type: {
|
||||
type: String,
|
||||
enum: Object.keys(DatasetTypeMap),
|
||||
required: true,
|
||||
default: 'dataset'
|
||||
},
|
||||
status: {
|
||||
type: String,
|
||||
enum: Object.keys(DatasetStatusMap),
|
||||
default: DatasetStatusEnum.active
|
||||
},
|
||||
avatar: {
|
||||
type: String,
|
||||
@@ -43,6 +54,10 @@ const DatasetSchema = new Schema({
|
||||
type: String,
|
||||
required: true
|
||||
},
|
||||
updateTime: {
|
||||
type: Date,
|
||||
default: () => new Date()
|
||||
},
|
||||
vectorModel: {
|
||||
type: String,
|
||||
required: true,
|
||||
@@ -53,24 +68,26 @@ const DatasetSchema = new Schema({
|
||||
required: true,
|
||||
default: 'gpt-3.5-turbo-16k'
|
||||
},
|
||||
type: {
|
||||
intro: {
|
||||
type: String,
|
||||
enum: Object.keys(DatasetTypeMap),
|
||||
required: true,
|
||||
default: 'dataset'
|
||||
},
|
||||
tags: {
|
||||
type: [String],
|
||||
default: [],
|
||||
set(val: string | string[]) {
|
||||
if (Array.isArray(val)) return val;
|
||||
return val.split(' ').filter((item) => item);
|
||||
}
|
||||
default: ''
|
||||
},
|
||||
permission: {
|
||||
type: String,
|
||||
enum: Object.keys(PermissionTypeMap),
|
||||
default: PermissionTypeEnum.private
|
||||
},
|
||||
websiteConfig: {
|
||||
type: {
|
||||
url: {
|
||||
type: String,
|
||||
required: true
|
||||
},
|
||||
selector: {
|
||||
type: String,
|
||||
default: 'body'
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
|
Reference in New Issue
Block a user