Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-24 05:23:57 +00:00)
4.6.7 first pr (#726)
@@ -32,7 +32,7 @@ export async function getVectorsByText({
     return Promise.reject('Embedding API 404');
   }
   if (!res?.data?.[0]?.embedding) {
-    console.log(res?.data);
+    console.log(res);
     // @ts-ignore
     return Promise.reject(res.data?.err?.message || 'Embedding API Error');
   }
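Context for the change above: some OpenAI-compatible gateways answer 200 with an error-shaped body instead of throwing, so the code now logs the whole response (not just res.data) before rejecting. A minimal standalone sketch of the same defensive check, assuming a stock openai SDK client and model name (neither is part of this diff):

import OpenAI from 'openai';

// Sketch: defensively validate an OpenAI-compatible embeddings response.
async function embedText(client: OpenAI, text: string): Promise<number[]> {
  const res = await client.embeddings.create({
    model: 'text-embedding-ada-002', // assumed model name
    input: [text]
  });

  // Some gateways return { data: { err: { message } } } instead of throwing,
  // so check the shape before trusting it (mirrors the diff above).
  if (!res?.data?.[0]?.embedding) {
    console.log(res);
    throw new Error(
      // @ts-ignore - err is not part of the typed response
      res?.data?.err?.message || 'Embedding API Error'
    );
  }
  return res.data[0].embedding;
}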
@@ -2,8 +2,7 @@ import { connectionMongo, type Model } from '../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { ChatItemSchema as ChatItemType } from '@fastgpt/global/core/chat/type';
 import { ChatRoleMap } from '@fastgpt/global/core/chat/constants';
-import { customAlphabet } from 'nanoid';
-const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 24);
+import { getNanoid } from '@fastgpt/global/common/string/tools';
 import {
   TeamCollectionName,
   TeamMemberCollectionName
@@ -13,24 +12,6 @@ import { userCollectionName } from '../../support/user/schema';
 import { ModuleOutputKeyEnum } from '@fastgpt/global/core/module/constants';
 
 const ChatItemSchema = new Schema({
-  dataId: {
-    type: String,
-    require: true,
-    default: () => nanoid()
-  },
-  appId: {
-    type: Schema.Types.ObjectId,
-    ref: appCollectionName,
-    required: true
-  },
-  chatId: {
-    type: String,
-    require: true
-  },
-  userId: {
-    type: Schema.Types.ObjectId,
-    ref: userCollectionName
-  },
   teamId: {
     type: Schema.Types.ObjectId,
     ref: TeamCollectionName,
@@ -41,6 +22,24 @@ const ChatItemSchema = new Schema({
     ref: TeamMemberCollectionName,
     required: true
   },
+  userId: {
+    type: Schema.Types.ObjectId,
+    ref: userCollectionName
+  },
+  chatId: {
+    type: String,
+    require: true
+  },
+  dataId: {
+    type: String,
+    require: true,
+    default: () => getNanoid(22)
+  },
+  appId: {
+    type: Schema.Types.ObjectId,
+    ref: appCollectionName,
+    required: true
+  },
   time: {
     type: Date,
     default: () => new Date()
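The dataId default moves from a locally configured customAlphabet to the shared getNanoid helper, and the id shrinks from 24 to 22 characters. A plausible sketch of what the shared helper wraps (the exact alphabet used in @fastgpt/global is an assumption based on the old inline version):

import { customAlphabet } from 'nanoid';

// Sketch of a shared id helper: one alphabet, caller-chosen length.
const alphabet = 'abcdefghijklmnopqrstuvwxyz1234567890';

export function getNanoid(size = 24): string {
  return customAlphabet(alphabet, size)();
}

// Usage matching the schema default above: a 22-char lowercase id.
const dataId = getNanoid(22);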
@@ -80,10 +79,11 @@ const ChatItemSchema = new Schema({
 });
 
 try {
-  ChatItemSchema.index({ teamId: 1 });
+  ChatItemSchema.index({ dataId: -1 });
   ChatItemSchema.index({ time: -1 });
   ChatItemSchema.index({ appId: 1 });
   ChatItemSchema.index({ chatId: 1 });
   ChatItemSchema.index({ obj: 1 });
   ChatItemSchema.index({ userGoodFeedback: 1 });
   ChatItemSchema.index({ userBadFeedback: 1 });
+  ChatItemSchema.index({ customFeedbacks: 1 });
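These indexes line up with the obvious read paths; a conversation-history fetch, for example, would be served by the chatId and time indexes. A sketch of that read (model export and import path are assumed):

import { MongoChatItem } from './chatItemSchema'; // assumed export

// Latest 30 items of one conversation, newest first.
// Served by the { chatId: 1 } and { time: -1 } indexes above.
async function getChatHistory(chatId: string) {
  return MongoChatItem.find({ chatId }).sort({ time: -1 }).limit(30).lean();
}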
@@ -1,7 +1,4 @@
-import {
-  DatasetCollectionTrainingModeEnum,
-  DatasetCollectionTypeEnum
-} from '@fastgpt/global/core/dataset/constant';
+import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
 
@@ -12,11 +9,15 @@ export async function createOneCollection({
   parentId,
   datasetId,
   type,
-  trainingType = DatasetCollectionTrainingModeEnum.manual,
+
+  trainingType = TrainingModeEnum.chunk,
   chunkSize = 0,
+  chunkSplitter,
+  qaPrompt,
+
   fileId,
   rawLink,
-  qaPrompt,
+
   hashRawText,
   rawTextLength,
   metadata = {},
@@ -30,11 +31,15 @@ export async function createOneCollection({
     datasetId,
     name,
     type,
+
     trainingType,
     chunkSize,
+    chunkSplitter,
+    qaPrompt,
+
     fileId,
     rawLink,
-    qaPrompt,
+
     rawTextLength,
     hashRawText,
     metadata
@@ -74,7 +79,7 @@ export function createDefaultCollection({
     datasetId,
     parentId,
     type: DatasetCollectionTypeEnum.virtual,
-    trainingType: DatasetCollectionTrainingModeEnum.manual,
+    trainingType: TrainingModeEnum.chunk,
     chunkSize: 0,
     updateTime: new Date('2099')
   });
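Put together with the renamed enum, a hypothetical call into the new createOneCollection signature looks like this (ids and values are illustrative; import paths assumed):

import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import { createOneCollection } from './controller'; // assumed path

// Hypothetical call site: create a link-type collection under the new enum.
const collection = await createOneCollection({
  teamId: '65a...', // illustrative ids
  tmbId: '65b...',
  datasetId: '65c...',
  parentId: null,
  name: 'Docs homepage',
  type: DatasetCollectionTypeEnum.link,
  trainingType: TrainingModeEnum.chunk, // replaces DatasetCollectionTrainingModeEnum.manual
  chunkSize: 512,
  chunkSplitter: '\n',
  rawLink: 'https://example.com/docs'
});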
@@ -1,10 +1,7 @@
 import { connectionMongo, type Model } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
-import {
-  DatasetCollectionTrainingTypeMap,
-  DatasetCollectionTypeMap
-} from '@fastgpt/global/core/dataset/constant';
+import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
 import { DatasetCollectionName } from '../schema';
 import {
   TeamCollectionName,
@@ -56,15 +53,23 @@ const DatasetCollectionSchema = new Schema({
     type: Date,
     default: () => new Date()
   },
+
   trainingType: {
     type: String,
-    enum: Object.keys(DatasetCollectionTrainingTypeMap),
+    enum: Object.keys(TrainingTypeMap),
     required: true
   },
   chunkSize: {
     type: Number,
     required: true
   },
+  chunkSplitter: {
+    type: String
+  },
+  qaPrompt: {
+    type: String
+  },
+
   fileId: {
     type: Schema.Types.ObjectId,
     ref: 'dataset.files'
@@ -72,9 +77,6 @@ const DatasetCollectionSchema = new Schema({
   rawLink: {
     type: String
   },
-  qaPrompt: {
-    type: String
-  },
 
   rawTextLength: {
     type: Number
@@ -89,8 +91,9 @@ const DatasetCollectionSchema = new Schema({
 });
 
 try {
   DatasetCollectionSchema.index({ teamId: 1 });
   DatasetCollectionSchema.index({ datasetId: 1 });
-  DatasetCollectionSchema.index({ datasetId: 1, parentId: 1 });
+  DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
   DatasetCollectionSchema.index({ updateTime: -1 });
+  DatasetCollectionSchema.index({ hashRawText: -1 });
 } catch (error) {
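The widened compound index puts teamId first, so a tenant-scoped folder listing can be answered by one index walk instead of the narrower { datasetId: 1, parentId: 1 } index it replaces. A sketch of the query shape it serves (the helper itself is assumed):

import { MongoDatasetCollection } from './schema';

// Folder-style listing, scoped to a team. The compound index
// { teamId: 1, datasetId: 1, parentId: 1 } serves the filter left-to-right.
async function listChildren(teamId: string, datasetId: string, parentId: string | null) {
  return MongoDatasetCollection.find({ teamId, datasetId, parentId })
    .sort({ updateTime: -1 }) // separate { updateTime: -1 } index above
    .lean();
}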
@@ -4,7 +4,7 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { MongoDatasetTraining } from '../training/schema';
 import { urlsFetch } from '../../../common/string/cheerio';
-import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 
 /**
@@ -92,8 +92,12 @@ export const getCollectionAndRawText = async ({
     return Promise.reject('Collection not found');
   }
 
-  const rawText = await (async () => {
-    if (newRawText) return newRawText;
+  const { title, rawText } = await (async () => {
+    if (newRawText)
+      return {
+        title: '',
+        rawText: newRawText
+      };
     // link
     if (col.type === DatasetCollectionTypeEnum.link && col.rawLink) {
       // crawl new data
@@ -102,12 +106,18 @@ export const getCollectionAndRawText = async ({
         selector: col.datasetId?.websiteConfig?.selector || col?.metadata?.webPageSelector
       });
 
-      return result[0].content;
+      return {
+        title: result[0].title,
+        rawText: result[0].content
+      };
     }
 
     // file
 
-    return '';
+    return {
+      title: '',
+      rawText: ''
+    };
   })();
 
   const hashRawText = hashStr(rawText);
@@ -115,6 +125,7 @@ export const getCollectionAndRawText = async ({
 
   return {
     collection: col,
+    title,
     rawText,
     isSameRawText
   };
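Note the shape change running through these hunks: the async IIFE now resolves to { title, rawText } rather than a bare string, so every branch must return the same object shape and the caller destructures it. The pattern in miniature (a generic sketch, not FastGPT code; standard fetch stands in for urlsFetch):

// Generic sketch: every branch of the async IIFE must resolve to the same
// { title, rawText } shape that the caller destructures.
async function loadText(cached?: string, url?: string) {
  const { title, rawText } = await (async () => {
    if (cached) return { title: '', rawText: cached };
    if (url) {
      const res = await fetch(url); // standard fetch standing in for urlsFetch
      return { title: url, rawText: await res.text() };
    }
    return { title: '', rawText: '' };
  })();
  return { title, rawText };
}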
@@ -135,6 +146,7 @@ export const reloadCollectionChunks = async ({
   rawText?: string;
 }) => {
   const {
+    title,
     rawText: newRawText,
     collection: col,
     isSameRawText
@@ -154,6 +166,11 @@ export const reloadCollectionChunks = async ({
   });
 
   // insert to training queue
+  const model = await (() => {
+    if (col.trainingType === TrainingModeEnum.chunk) return col.datasetId.vectorModel;
+    if (col.trainingType === TrainingModeEnum.qa) return col.datasetId.agentModel;
+    return Promise.reject('Training model error');
+  })();
   await MongoDatasetTraining.insertMany(
     chunks.map((item, i) => ({
       teamId: col.teamId,
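The inserted IIFE dispatches on training mode: chunk rows are embedded with the dataset's vectorModel, qa rows go to the agentModel (the LLM), and any other mode rejects before rows reach the queue. The same dispatch as a plain function, with assumed string types for the mode and model names:

// Sketch: map a training mode to the model that will process the row.
// Rejecting unknown modes here keeps bad rows out of the queue entirely.
function pickTrainingModel(
  trainingType: 'chunk' | 'qa',
  dataset: { vectorModel: string; agentModel: string }
): string {
  if (trainingType === 'chunk') return dataset.vectorModel; // embedding model
  if (trainingType === 'qa') return dataset.agentModel; // LLM for QA extraction
  throw new Error('Training model error');
}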
@@ -163,7 +180,7 @@ export const reloadCollectionChunks = async ({
       billId,
       mode: col.trainingType,
       prompt: '',
-      model: col.datasetId.vectorModel,
+      model,
       q: item,
       a: '',
       chunkIndex: i
@@ -172,6 +189,7 @@ export const reloadCollectionChunks = async ({
 
   // update raw text
   await MongoDatasetCollection.findByIdAndUpdate(col._id, {
+    ...(title && { name: title }),
    rawTextLength: newRawText.length,
    hashRawText: hashStr(newRawText)
  });
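...(title && { name: title }) is the conditional-spread idiom: when title is falsy the expression spreads a falsy value, which contributes no keys, so the collection is only renamed when the crawl actually produced a title. Minimal demonstration:

// Conditional spread: a falsy left-hand side spreads to nothing.
const title: string = '';
const update = {
  ...(title && { name: title }), // no `name` key when title === ''
  rawTextLength: 120
};
console.log(update); // => { rawTextLength: 120 }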
@@ -75,7 +75,13 @@ export async function delCollectionRelevantData({
 /**
  * delete one data by mongoDataId
  */
-export async function delDatasetDataByDataId(mongoDataId: string) {
-  await deleteDatasetDataVector({ dataIds: [mongoDataId] });
+export async function delDatasetDataByDataId({
+  collectionId,
+  mongoDataId
+}: {
+  collectionId: string;
+  mongoDataId: string;
+}) {
+  await deleteDatasetDataVector({ collectionId, dataIds: [mongoDataId] });
   await MongoDatasetData.findByIdAndDelete(mongoDataId);
 }
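Call sites change accordingly, from a bare string to an object carrying both ids, which lets the vector deletion be filtered by collection as well. A hypothetical call (ids illustrative, import path assumed):

import { delDatasetDataByDataId } from './controller'; // assumed path

// Before: delDatasetDataByDataId(dataId)
// After: both ids travel together so the vector store can scope the delete.
await delDatasetDataByDataId({
  collectionId: '65a1b2c3d4e5f6a7b8c9d0e1', // illustrative ObjectId
  mongoDataId: '65a1b2c3d4e5f6a7b8c9d0e2'
});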
@@ -85,12 +85,13 @@ const DatasetDataSchema = new Schema({
 });
 
 try {
   DatasetDataSchema.index({ teamId: 1 });
   DatasetDataSchema.index({ datasetId: 1 });
   DatasetDataSchema.index({ collectionId: 1 });
-  DatasetDataSchema.index({ updateTime: -1 });
+  DatasetDataSchema.index({ collectionId: 1, q: 1, a: 1 });
   // full text index
   DatasetDataSchema.index({ datasetId: 1, fullTextToken: 'text' });
+  DatasetDataSchema.index({ inited: 1 });
 } catch (error) {
   console.log(error);
 }
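With datasetId as the leading key of the text index, full-text search can be confined to one dataset by an equality match. A query sketch under that assumption (export name and helper are assumed):

import { MongoDatasetData } from './schema'; // assumed export

// Full-text search inside one dataset; the equality match on datasetId lets
// the compound { datasetId: 1, fullTextToken: 'text' } index do the work.
async function searchDataset(datasetId: string, query: string) {
  return MongoDatasetData.find(
    { datasetId, $text: { $search: query } },
    { score: { $meta: 'textScore' } }
  )
    .sort({ score: { $meta: 'textScore' } })
    .limit(10)
    .lean();
}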
@@ -92,7 +92,7 @@ const DatasetSchema = new Schema({
 });
 
 try {
-  DatasetSchema.index({ userId: 1 });
+  DatasetSchema.index({ teamId: 1 });
 } catch (error) {
   console.log(error);
 }
@@ -102,6 +102,7 @@ const TrainingDataSchema = new Schema({
 });
 
 try {
   TrainingDataSchema.index({ teamId: 1 });
   TrainingDataSchema.index({ weight: -1 });
   TrainingDataSchema.index({ lockTime: 1 });
+  TrainingDataSchema.index({ datasetId: 1 });
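The new { datasetId: 1 } index joins teamId, weight and lockTime in backing the training queue. The usual claim-a-task poll leans on lockTime, roughly as below (a sketch of the pattern, not the exact FastGPT worker; the 5-minute staleness window is an assumption):

import { MongoDatasetTraining } from '../training/schema';

// Sketch: claim one training row. findOneAndUpdate is atomic, and the
// { lockTime: 1 } index makes the "stale or unclaimed lock" scan cheap.
const fiveMinAgo = new Date(Date.now() - 5 * 60 * 1000);
const task = await MongoDatasetTraining.findOneAndUpdate(
  { lockTime: { $lte: fiveMinAgo } }, // unclaimed or stale
  { lockTime: new Date() }, // stamp it as claimed
  { new: true }
);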