4.6.7-alpha commit (#743)

Co-authored-by: Archer <545436317@qq.com>
Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
This commit is contained in:
Archer
2024-01-19 11:17:28 +08:00
committed by GitHub
parent 8ee7407c4c
commit c031e6dcc9
324 changed files with 8509 additions and 4757 deletions

View File

@@ -18,10 +18,9 @@ export async function getVectorsByText({
}
try {
// 获取 chatAPI
const ai = getAIApi();
// 把输入的内容转成向量
// input text to vector
const result = await ai.embeddings
.create({
model,
@@ -38,7 +37,7 @@ export async function getVectorsByText({
}
return {
tokens: res.usage.total_tokens || 0,
charsLength: input.length,
vectors: await Promise.all(res.data.map((item) => unityDimensional(item.embedding)))
};
});
@@ -53,7 +52,9 @@ export async function getVectorsByText({
function unityDimensional(vector: number[]) {
if (vector.length > 1536) {
console.log(`当前向量维度为: ${vector.length}, 向量维度不能超过 1536, 已自动截取前 1536 维度`);
console.log(
`The current vector dimension is ${vector.length}, and the vector dimension cannot exceed 1536. The first 1536 dimensions are automatically captured`
);
return vector.slice(0, 1536);
}
let resultVector = vector;

View File

@@ -11,6 +11,8 @@ import { appCollectionName } from '../app/schema';
import { userCollectionName } from '../../support/user/schema';
import { ModuleOutputKeyEnum } from '@fastgpt/global/core/module/constants';
export const ChatItemCollectionName = 'chatitems';
const ChatItemSchema = new Schema({
teamId: {
type: Schema.Types.ObjectId,
@@ -79,20 +81,23 @@ const ChatItemSchema = new Schema({
});
try {
ChatItemSchema.index({ teamId: 1 });
ChatItemSchema.index({ time: -1 });
ChatItemSchema.index({ appId: 1 });
ChatItemSchema.index({ chatId: 1 });
ChatItemSchema.index({ obj: 1 });
ChatItemSchema.index({ userGoodFeedback: 1 });
ChatItemSchema.index({ userBadFeedback: 1 });
ChatItemSchema.index({ customFeedbacks: 1 });
ChatItemSchema.index({ adminFeedback: 1 });
ChatItemSchema.index({ dataId: 1 }, { background: true });
/* delete by app;
delete by chat id;
get chat list;
get chat logs;
close custom feedback;
*/
ChatItemSchema.index({ appId: 1, chatId: 1, dataId: 1 }, { background: true });
ChatItemSchema.index({ userGoodFeedback: 1 }, { background: true });
ChatItemSchema.index({ userBadFeedback: 1 }, { background: true });
ChatItemSchema.index({ customFeedbacks: 1 }, { background: true });
ChatItemSchema.index({ adminFeedback: 1 }, { background: true });
} catch (error) {
console.log(error);
}
export const MongoChatItem: Model<ChatItemType> =
models['chatItem'] || model('chatItem', ChatItemSchema);
models[ChatItemCollectionName] || model(ChatItemCollectionName, ChatItemSchema);
MongoChatItem.syncIndexes();

View File

@@ -1,13 +1,12 @@
import { connectionMongo, type Model } from '../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { ChatSchema as ChatType } from '@fastgpt/global/core/chat/type.d';
import { ChatRoleMap, ChatSourceMap } from '@fastgpt/global/core/chat/constants';
import { ChatSourceMap } from '@fastgpt/global/core/chat/constants';
import {
TeamCollectionName,
TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { appCollectionName } from '../app/schema';
import { ModuleOutputKeyEnum } from '@fastgpt/global/core/module/constants';
export const chatCollectionName = 'chat';
@@ -48,7 +47,8 @@ const ChatSchema = new Schema({
default: ''
},
top: {
type: Boolean
type: Boolean,
default: false
},
source: {
type: String,
@@ -73,10 +73,16 @@ const ChatSchema = new Schema({
});
try {
ChatSchema.index({ appId: 1 });
ChatSchema.index({ tmbId: 1 });
ChatSchema.index({ shareId: 1 });
ChatSchema.index({ updateTime: -1 });
ChatSchema.index({ chatId: 1 }, { background: true });
// get user history
ChatSchema.index({ tmbId: 1, appId: 1, top: -1, updateTime: -1 }, { background: true });
// delete by appid; clear history; init chat; update chat; auth chat;
ChatSchema.index({ appId: 1, chatId: 1 }, { background: true });
// get chat logs;
ChatSchema.index({ teamId: 1, appId: 1, updateTime: -1 }, { background: true });
// get share chat history
ChatSchema.index({ shareId: 1, outLinkUid: 1 }, { background: true });
} catch (error) {
console.log(error);
}

View File

@@ -3,10 +3,12 @@ import { MongoChatItem } from './chatItemSchema';
import { addLog } from '../../common/system/log';
export async function getChatItems({
appId,
chatId,
limit = 30,
field
}: {
appId: string;
chatId?: string;
limit?: number;
field: string;
@@ -15,7 +17,10 @@ export async function getChatItems({
return { history: [] };
}
const history = await MongoChatItem.find({ chatId }, field).sort({ _id: -1 }).limit(limit).lean();
const history = await MongoChatItem.find({ appId, chatId }, field)
.sort({ _id: -1 })
.limit(limit)
.lean();
history.reverse();
@@ -23,10 +28,12 @@ export async function getChatItems({
}
export const addCustomFeedbacks = async ({
appId,
chatId,
chatItemId,
feedbacks
}: {
appId: string;
chatId?: string;
chatItemId?: string;
feedbacks: string[];

View File

@@ -1,6 +1,20 @@
import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import {
TrainingModeEnum,
DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
import {
CollectionWithDatasetType,
DatasetCollectionSchemaType
} from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '../training/schema';
import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDatasetData } from '../data/schema';
import { delImgByRelatedId } from '../../../common/file/image/controller';
import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';
import { delFileByFileIdList } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
export async function createOneCollection({
teamId,
@@ -85,20 +99,50 @@ export function createDefaultCollection({
});
}
// check same collection
export const getSameRawTextCollection = async ({
datasetId,
hashRawText
/**
 * delete a collection and its related data
*/
export async function delCollectionAndRelatedSources({
collections
}: {
datasetId: string;
hashRawText?: string;
}) => {
if (!hashRawText) return undefined;
collections: (CollectionWithDatasetType | DatasetCollectionSchemaType)[];
}) {
if (collections.length === 0) return;
const collection = await MongoDatasetCollection.findOne({
datasetId,
hashRawText
const teamId = collections[0].teamId;
if (!teamId) return Promise.reject('teamId is not exist');
const collectionIds = collections.map((item) => String(item._id));
const fileIdList = collections.map((item) => item?.fileId || '').filter(Boolean);
const relatedImageIds = collections
.map((item) => item?.metadata?.relatedImgId || '')
.filter(Boolean);
// delete training data
await MongoDatasetTraining.deleteMany({
teamId,
collectionId: { $in: collectionIds }
});
return collection;
};
await delay(2000);
// delete dataset.datas
await MongoDatasetData.deleteMany({ teamId, collectionId: { $in: collectionIds } });
// delete pg data
await deleteDatasetDataVector({ teamId, collectionIds });
// delete file and imgs
await Promise.all([
delImgByRelatedId(relatedImageIds),
delFileByFileIdList({
bucketName: BucketNameEnum.dataset,
fileIdList
})
]);
// delete collections
await MongoDatasetCollection.deleteMany({
_id: { $in: collectionIds }
});
}

View File

@@ -1,7 +1,7 @@
import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
@@ -91,11 +91,19 @@ const DatasetCollectionSchema = new Schema({
});
try {
DatasetCollectionSchema.index({ teamId: 1 });
DatasetCollectionSchema.index({ datasetId: 1 });
DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
DatasetCollectionSchema.index({ updateTime: -1 });
DatasetCollectionSchema.index({ hashRawText: -1 });
// auth file
DatasetCollectionSchema.index({ teamId: 1, fileId: 1 }, { background: true });
// list collection; deep find collections
DatasetCollectionSchema.index(
{
teamId: 1,
datasetId: 1,
parentId: 1,
updateTime: -1
},
{ background: true }
);
} catch (error) {
console.log(error);
}

View File

@@ -4,16 +4,32 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { MongoDatasetTraining } from '../training/schema';
import { urlsFetch } from '../../../common/string/cheerio';
import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import {
DatasetCollectionTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { hashStr } from '@fastgpt/global/common/string/tools';
/**
 * get all collections under a top collectionId
*/
export async function findCollectionAndChild(id: string, fields = '_id parentId name metadata') {
export async function findCollectionAndChild({
teamId,
datasetId,
collectionId,
fields = '_id parentId name metadata'
}: {
teamId: string;
datasetId: string;
collectionId: string;
fields?: string;
}) {
async function find(id: string) {
// find children
const children = await MongoDatasetCollection.find({ parentId: id }, fields);
const children = await MongoDatasetCollection.find(
{ teamId, datasetId, parentId: id },
fields
).lean();
let collections = children;
@@ -25,8 +41,8 @@ export async function findCollectionAndChild(id: string, fields = '_id parentId
return collections;
}
const [collection, childCollections] = await Promise.all([
MongoDatasetCollection.findById(id, fields),
find(id)
MongoDatasetCollection.findById(collectionId, fields),
find(collectionId)
]);
if (!collection) {
@@ -107,8 +123,8 @@ export const getCollectionAndRawText = async ({
});
return {
title: result[0].title,
rawText: result[0].content
title: result[0]?.title,
rawText: result[0]?.content
};
}
@@ -121,7 +137,7 @@ export const getCollectionAndRawText = async ({
})();
const hashRawText = hashStr(rawText);
const isSameRawText = col.hashRawText === hashRawText;
const isSameRawText = rawText && col.hashRawText === hashRawText;
return {
collection: col,
@@ -161,8 +177,7 @@ export const reloadCollectionChunks = async ({
// split data
const { chunks } = splitText2Chunks({
text: newRawText,
chunkLen: col.chunkSize || 512,
countTokens: false
chunkLen: col.chunkSize || 512
});
// insert to training queue

View File

@@ -1,24 +1,47 @@
import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
import { CollectionWithDatasetType, DatasetSchemaType } from '@fastgpt/global/core/dataset/type';
import { MongoDatasetCollection } from './collection/schema';
import { MongoDataset } from './schema';
import { delCollectionAndRelatedSources } from './collection/controller';
/* ============= dataset ========== */
/* find a dataset and all of its child datasets by the top datasetId */
export async function findDatasetIdTreeByTopDatasetId(
id: string,
result: string[] = []
): Promise<string[]> {
let allChildrenIds = [...result];
export async function findDatasetAndAllChildren({
teamId,
datasetId,
fields
}: {
teamId: string;
datasetId: string;
fields?: string;
}): Promise<DatasetSchemaType[]> {
const find = async (id: string) => {
const children = await MongoDataset.find(
{
teamId,
parentId: id
},
fields
).lean();
// find children
const children = await MongoDataset.find({ parentId: id });
let datasets = children;
for (const child of children) {
const grandChildrenIds = await findDatasetIdTreeByTopDatasetId(child._id, result);
allChildrenIds = allChildrenIds.concat(grandChildrenIds);
for (const child of children) {
const grandChildrenIds = await find(child._id);
datasets = datasets.concat(grandChildrenIds);
}
return datasets;
};
const [dataset, childDatasets] = await Promise.all([
MongoDataset.findById(datasetId),
find(datasetId)
]);
if (!dataset) {
return Promise.reject('Dataset not found');
}
return [String(id), ...allChildrenIds];
return [dataset, ...childDatasets];
}
export async function getCollectionWithDataset(collectionId: string) {
@@ -30,3 +53,22 @@ export async function getCollectionWithDataset(collectionId: string) {
}
return data;
}
/**
 * Delete all data belonging to the given datasets.
 * Looks up every collection under the datasets (only the fields needed for
 * cleanup), then delegates the actual deletion of training data, dataset
 * datas, vector records, files/images and the collections themselves to
 * delCollectionAndRelatedSources.
 * NOTE(review): teamId is taken from the first dataset only — assumes all
 * datasets belong to the same team; confirm callers guarantee this.
 */
export async function delDatasetRelevantData({ datasets }: { datasets: DatasetSchemaType[] }) {
  // Nothing to delete for an empty list
  if (!datasets.length) return;
  const teamId = datasets[0].teamId;
  const datasetIds = datasets.map((item) => String(item._id));
  // Get _id, teamId, fileId, metadata.relatedImgId for all collections
  const collections = await MongoDatasetCollection.find(
    {
      teamId,
      datasetId: { $in: datasetIds }
    },
    '_id teamId fileId metadata'
  ).lean();
  await delCollectionAndRelatedSources({ collections });
}

View File

@@ -1,87 +1,2 @@
import { MongoDatasetData } from './schema';
import { MongoDatasetTraining } from '../training/schema';
import { delFileByFileIdList, delFileByMetadata } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { MongoDatasetCollection } from '../collection/schema';
import { delay } from '@fastgpt/global/common/system/utils';
import { delImgByFileIdList } from '../../../common/file/image/controller';
import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';
/**
 * Delete all data associated with the given datasetIds:
 * training-queue entries, dataset datas, vector-store records,
 * the collections themselves, and related GridFS files.
 */
export async function delDatasetRelevantData({ datasetIds }: { datasetIds: string[] }) {
  // Normalize ids to strings (callers may pass ObjectIds)
  datasetIds = datasetIds.map((item) => String(item));
  // delete training data(There could be a training mission)
  await MongoDatasetTraining.deleteMany({
    datasetId: { $in: datasetIds }
  });
  // Brief pause — presumably to let in-flight training tasks settle before
  // their backing data disappears; best-effort, not a hard guarantee. TODO confirm
  await delay(2000);
  // delete dataset.datas
  await MongoDatasetData.deleteMany({ datasetId: { $in: datasetIds } });
  // delete pg data (vector-store records for these datasets)
  await deleteDatasetDataVector({ datasetIds });
  // delete collections
  await MongoDatasetCollection.deleteMany({
    datasetId: { $in: datasetIds }
  });
  // delete related files stored under the dataset bucket, one call per dataset
  await Promise.all(
    datasetIds.map((id) => delFileByMetadata({ bucketName: BucketNameEnum.dataset, datasetId: id }))
  );
}
/**
 * Delete all data associated with the given collectionIds:
 * training-queue entries, dataset datas, vector-store records,
 * the collection documents themselves, and the given files/images.
 * @param collectionIds - collection ids to purge (falsy entries are skipped)
 * @param fileIds - related file ids to remove from image store and GridFS
 *                  (falsy entries are skipped)
 */
export async function delCollectionRelevantData({
  collectionIds,
  fileIds
}: {
  collectionIds: string[];
  fileIds: string[];
}) {
  // Normalize: drop falsy entries and coerce ObjectIds to strings
  collectionIds = collectionIds.filter(Boolean).map((item) => String(item));
  const filterFileIds = fileIds.filter(Boolean).map((item) => String(item));
  // delete training data
  await MongoDatasetTraining.deleteMany({
    collectionId: { $in: collectionIds }
  });
  // Brief pause — presumably to let in-flight training tasks settle. TODO confirm
  await delay(2000);
  // delete dataset.datas
  await MongoDatasetData.deleteMany({ collectionId: { $in: collectionIds } });
  // delete pg data (vector-store records for these collections)
  await deleteDatasetDataVector({ collectionIds });
  // delete collections
  await MongoDatasetCollection.deleteMany({
    _id: { $in: collectionIds }
  });
  // delete file and imgs (image store + GridFS, in parallel)
  await Promise.all([
    delImgByFileIdList(filterFileIds),
    delFileByFileIdList({
      bucketName: BucketNameEnum.dataset,
      fileIdList: filterFileIds
    })
  ]);
}
/**
 * Delete a single data item by its mongo id.
 * Removes the vector-store records for the item first, then the mongo
 * document itself.
 * @param collectionId - collection the data item belongs to (scopes the
 *                       vector deletion)
 * @param mongoDataId - _id of the MongoDatasetData document to delete
 */
export async function delDatasetDataByDataId({
  collectionId,
  mongoDataId
}: {
  collectionId: string;
  mongoDataId: string;
}) {
  // delete vector-store entries tied to this data item
  await deleteDatasetDataVector({ collectionId, dataIds: [mongoDataId] });
  // delete the mongo document
  await MongoDatasetData.findByIdAndDelete(mongoDataId);
}

View File

@@ -10,7 +10,7 @@ import { DatasetColCollectionName } from '../collection/schema';
import {
DatasetDataIndexTypeEnum,
DatasetDataIndexTypeMap
} from '@fastgpt/global/core/dataset/constant';
} from '@fastgpt/global/core/dataset/constants';
export const DatasetDataCollectionName = 'dataset.datas';
@@ -71,6 +71,7 @@ const DatasetDataSchema = new Schema({
],
default: []
},
updateTime: {
type: Date,
default: () => new Date()
@@ -85,13 +86,18 @@ const DatasetDataSchema = new Schema({
});
try {
DatasetDataSchema.index({ teamId: 1 });
DatasetDataSchema.index({ datasetId: 1 });
DatasetDataSchema.index({ collectionId: 1 });
DatasetDataSchema.index({ updateTime: -1 });
DatasetDataSchema.index({ collectionId: 1, q: 1, a: 1 });
// same data check
DatasetDataSchema.index({ teamId: 1, collectionId: 1, q: 1, a: 1 }, { background: true });
// list collection and count data; list data
DatasetDataSchema.index(
{ teamId: 1, datasetId: 1, collectionId: 1, chunkIndex: 1, updateTime: -1 },
{ background: true }
);
// full text index
DatasetDataSchema.index({ datasetId: 1, fullTextToken: 'text' });
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' }, { background: true });
// Recall vectors after data matching
DatasetDataSchema.index({ teamId: 1, datasetId: 1, 'indexes.dataId': 1 }, { background: true });
DatasetDataSchema.index({ updateTime: 1 }, { background: true });
} catch (error) {
console.log(error);
}

View File

@@ -5,7 +5,7 @@ import {
DatasetStatusEnum,
DatasetStatusMap,
DatasetTypeMap
} from '@fastgpt/global/core/dataset/constant';
} from '@fastgpt/global/core/dataset/constants';
import {
TeamCollectionName,
TeamMemberCollectionName

View File

@@ -2,7 +2,7 @@
import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import { DatasetDataIndexTypeMap, TrainingTypeMap } from '@fastgpt/global/core/dataset/constant';
import { DatasetDataIndexTypeMap, TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetCollectionName } from '../schema';
import {
@@ -102,11 +102,11 @@ const TrainingDataSchema = new Schema({
});
try {
TrainingDataSchema.index({ teamId: 1 });
// lock training data; delete training data
TrainingDataSchema.index({ teamId: 1, collectionId: 1 });
// get training data and sort
TrainingDataSchema.index({ weight: -1 });
TrainingDataSchema.index({ lockTime: 1 });
TrainingDataSchema.index({ datasetId: 1 });
TrainingDataSchema.index({ collectionId: 1 });
TrainingDataSchema.index({ expireAt: 1 }, { expireAfterSeconds: 7 * 24 * 60 });
} catch (error) {
console.log(error);