4.6.7 first pr (#726)

This commit is contained in:
Archer
2024-01-10 23:35:04 +08:00
committed by GitHub
parent 414b693303
commit 006ad17c6a
186 changed files with 2996 additions and 1838 deletions

View File

@@ -32,7 +32,7 @@ export async function getVectorsByText({
return Promise.reject('Embedding API 404');
}
if (!res?.data?.[0]?.embedding) {
console.log(res?.data);
console.log(res);
// @ts-ignore
return Promise.reject(res.data?.err?.message || 'Embedding API Error');
}

View File

@@ -2,8 +2,7 @@ import { connectionMongo, type Model } from '../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { ChatItemSchema as ChatItemType } from '@fastgpt/global/core/chat/type';
import { ChatRoleMap } from '@fastgpt/global/core/chat/constants';
import { customAlphabet } from 'nanoid';
const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 24);
import { getNanoid } from '@fastgpt/global/common/string/tools';
import {
TeamCollectionName,
TeamMemberCollectionName
@@ -13,24 +12,6 @@ import { userCollectionName } from '../../support/user/schema';
import { ModuleOutputKeyEnum } from '@fastgpt/global/core/module/constants';
const ChatItemSchema = new Schema({
dataId: {
type: String,
require: true,
default: () => nanoid()
},
appId: {
type: Schema.Types.ObjectId,
ref: appCollectionName,
required: true
},
chatId: {
type: String,
require: true
},
userId: {
type: Schema.Types.ObjectId,
ref: userCollectionName
},
teamId: {
type: Schema.Types.ObjectId,
ref: TeamCollectionName,
@@ -41,6 +22,24 @@ const ChatItemSchema = new Schema({
ref: TeamMemberCollectionName,
required: true
},
userId: {
type: Schema.Types.ObjectId,
ref: userCollectionName
},
chatId: {
type: String,
require: true
},
dataId: {
type: String,
require: true,
default: () => getNanoid(22)
},
appId: {
type: Schema.Types.ObjectId,
ref: appCollectionName,
required: true
},
time: {
type: Date,
default: () => new Date()
@@ -80,10 +79,11 @@ const ChatItemSchema = new Schema({
});
try {
ChatItemSchema.index({ dataId: -1 });
ChatItemSchema.index({ teamId: 1 });
ChatItemSchema.index({ time: -1 });
ChatItemSchema.index({ appId: 1 });
ChatItemSchema.index({ chatId: 1 });
ChatItemSchema.index({ obj: 1 });
ChatItemSchema.index({ userGoodFeedback: 1 });
ChatItemSchema.index({ userBadFeedback: 1 });
ChatItemSchema.index({ customFeedbacks: 1 });

View File

@@ -1,7 +1,4 @@
import {
DatasetCollectionTrainingModeEnum,
DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constant';
import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
@@ -12,11 +9,15 @@ export async function createOneCollection({
parentId,
datasetId,
type,
trainingType = DatasetCollectionTrainingModeEnum.manual,
trainingType = TrainingModeEnum.chunk,
chunkSize = 0,
chunkSplitter,
qaPrompt,
fileId,
rawLink,
qaPrompt,
hashRawText,
rawTextLength,
metadata = {},
@@ -30,11 +31,15 @@ export async function createOneCollection({
datasetId,
name,
type,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
fileId,
rawLink,
qaPrompt,
rawTextLength,
hashRawText,
metadata
@@ -74,7 +79,7 @@ export function createDefaultCollection({
datasetId,
parentId,
type: DatasetCollectionTypeEnum.virtual,
trainingType: DatasetCollectionTrainingModeEnum.manual,
trainingType: TrainingModeEnum.chunk,
chunkSize: 0,
updateTime: new Date('2099')
});

View File

@@ -1,10 +1,7 @@
import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import {
DatasetCollectionTrainingTypeMap,
DatasetCollectionTypeMap
} from '@fastgpt/global/core/dataset/constant';
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
@@ -56,15 +53,23 @@ const DatasetCollectionSchema = new Schema({
type: Date,
default: () => new Date()
},
trainingType: {
type: String,
enum: Object.keys(DatasetCollectionTrainingTypeMap),
enum: Object.keys(TrainingTypeMap),
required: true
},
chunkSize: {
type: Number,
required: true
},
chunkSplitter: {
type: String
},
qaPrompt: {
type: String
},
fileId: {
type: Schema.Types.ObjectId,
ref: 'dataset.files'
@@ -72,9 +77,6 @@ const DatasetCollectionSchema = new Schema({
rawLink: {
type: String
},
qaPrompt: {
type: String
},
rawTextLength: {
type: Number
@@ -89,8 +91,9 @@ const DatasetCollectionSchema = new Schema({
});
try {
DatasetCollectionSchema.index({ teamId: 1 });
DatasetCollectionSchema.index({ datasetId: 1 });
DatasetCollectionSchema.index({ datasetId: 1, parentId: 1 });
DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
DatasetCollectionSchema.index({ updateTime: -1 });
DatasetCollectionSchema.index({ hashRawText: -1 });
} catch (error) {

View File

@@ -4,7 +4,7 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { MongoDatasetTraining } from '../training/schema';
import { urlsFetch } from '../../../common/string/cheerio';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { hashStr } from '@fastgpt/global/common/string/tools';
/**
@@ -92,8 +92,12 @@ export const getCollectionAndRawText = async ({
return Promise.reject('Collection not found');
}
const rawText = await (async () => {
if (newRawText) return newRawText;
const { title, rawText } = await (async () => {
if (newRawText)
return {
title: '',
rawText: newRawText
};
// link
if (col.type === DatasetCollectionTypeEnum.link && col.rawLink) {
// crawl new data
@@ -102,12 +106,18 @@ export const getCollectionAndRawText = async ({
selector: col.datasetId?.websiteConfig?.selector || col?.metadata?.webPageSelector
});
return result[0].content;
return {
title: result[0].title,
rawText: result[0].content
};
}
// file
return '';
return {
title: '',
rawText: ''
};
})();
const hashRawText = hashStr(rawText);
@@ -115,6 +125,7 @@ export const getCollectionAndRawText = async ({
return {
collection: col,
title,
rawText,
isSameRawText
};
@@ -135,6 +146,7 @@ export const reloadCollectionChunks = async ({
rawText?: string;
}) => {
const {
title,
rawText: newRawText,
collection: col,
isSameRawText
@@ -154,6 +166,11 @@ export const reloadCollectionChunks = async ({
});
// insert to training queue
const model = await (() => {
if (col.trainingType === TrainingModeEnum.chunk) return col.datasetId.vectorModel;
if (col.trainingType === TrainingModeEnum.qa) return col.datasetId.agentModel;
return Promise.reject('Training model error');
})();
await MongoDatasetTraining.insertMany(
chunks.map((item, i) => ({
teamId: col.teamId,
@@ -163,7 +180,7 @@ export const reloadCollectionChunks = async ({
billId,
mode: col.trainingType,
prompt: '',
model: col.datasetId.vectorModel,
model,
q: item,
a: '',
chunkIndex: i
@@ -172,6 +189,7 @@ export const reloadCollectionChunks = async ({
// update raw text
await MongoDatasetCollection.findByIdAndUpdate(col._id, {
...(title && { name: title }),
rawTextLength: newRawText.length,
hashRawText: hashStr(newRawText)
});

View File

@@ -75,7 +75,13 @@ export async function delCollectionRelevantData({
/**
* delete one data by mongoDataId
*/
export async function delDatasetDataByDataId(mongoDataId: string) {
await deleteDatasetDataVector({ dataIds: [mongoDataId] });
// Delete one dataset-data record by its mongo _id.
// Order matters: vector-store entries are removed first (scoped to the owning
// collection via `collectionId`), then the mongo document itself is deleted.
// NOTE(review): if the second delete fails the vectors are already gone —
// presumably acceptable here (orphaned mongo doc, no orphaned vectors); confirm.
export async function delDatasetDataByDataId({
collectionId,
mongoDataId
}: {
collectionId: string;
mongoDataId: string;
}) {
// remove embedding vectors for this single data id before dropping the doc
await deleteDatasetDataVector({ collectionId, dataIds: [mongoDataId] });
await MongoDatasetData.findByIdAndDelete(mongoDataId);
}

View File

@@ -85,12 +85,13 @@ const DatasetDataSchema = new Schema({
});
// Register secondary indexes for DatasetDataSchema. Index registration is
// best-effort: any error is logged and swallowed so module load never fails.
// NOTE(review): this is a rendered diff hunk — the teamId/datasetId/collectionId
// lines below mix removed and added index definitions; verify against the
// actual schema file before relying on this exact index set.
try {
DatasetDataSchema.index({ teamId: 1 });
DatasetDataSchema.index({ datasetId: 1 });
DatasetDataSchema.index({ collectionId: 1 });
DatasetDataSchema.index({ updateTime: -1 });
DatasetDataSchema.index({ collectionId: 1, q: 1, a: 1 });
// full text index
DatasetDataSchema.index({ datasetId: 1, fullTextToken: 'text' });
DatasetDataSchema.index({ inited: 1 });
} catch (error) {
// swallow index-definition errors; surface them in logs only
console.log(error);
}

View File

@@ -92,7 +92,7 @@ const DatasetSchema = new Schema({
});
// Register secondary indexes for DatasetSchema; errors are logged, not thrown.
// NOTE(review): diff rendering — `userId` may be the removed line and `teamId`
// the added one (or both kept); confirm against the schema file.
try {
DatasetSchema.index({ userId: 1 });
DatasetSchema.index({ teamId: 1 });
} catch (error) {
// best-effort: never let index setup break module load
console.log(error);
}

View File

@@ -102,6 +102,7 @@ const TrainingDataSchema = new Schema({
});
try {
TrainingDataSchema.index({ teamId: 1 });
TrainingDataSchema.index({ weight: -1 });
TrainingDataSchema.index({ lockTime: 1 });
TrainingDataSchema.index({ datasetId: 1 });