commit cd3acb44ab (parent bfd8be5df0)
Author: Archer (committed via GitHub)
Date: 2023-11-15 21:35:50 +08:00
39 changed files with 457 additions and 160 deletions

[Binary file changed (not shown)]
[Image changed; new version 54 KiB]


@@ -50,5 +50,6 @@ curl --location --request POST 'https://{{host}}/api/admin/initv46-2' \
1. New - Team workspaces
2. New - Multi-vector indexing (multiple vectors mapped to one data record)
3. New - TTS voice
4. New (cloud version) - ReRank retrieval to improve recall precision
5. Improved - dataset export now triggers a streamed download directly, no more waiting on a spinner
4. New - datasets can now be configured with a text preprocessing model (see the sketch after this list)
5. New (cloud version) - ReRank retrieval to improve recall precision
6. Improved - dataset export now triggers a streamed download directly, no more waiting on a spinner
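
The preprocessing-model item above corresponds to the new `agentModel` field added to datasets in this commit (see `CreateDatasetParams` and the create handler further down). A minimal sketch of creating a dataset with it; the route path is an assumption, and the two model ids are simply the schema defaults introduced by this commit:

```ts
// Hypothetical sketch, not a confirmed FastGPT API example.
// Field names follow CreateDatasetParams; the route path is assumed.
async function createDatasetWithAgentModel() {
  const res = await fetch('/api/core/dataset/create', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      name: 'My dataset',
      tags: '',
      avatar: '',
      vectorModel: 'text-embedding-ada-002', // schema default in this commit
      agentModel: 'gpt-3.5-turbo-16k', // schema default in this commit
      type: 'dataset'
    })
  });
  return res.json();
}
```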


@@ -1,10 +1,10 @@
---
title: 'Pricing'
description: 'FastGPT pricing'
title: 'Cloud version pricing'
description: 'FastGPT cloud version pricing'
icon: 'currency_yen'
draft: false
toc: true
weight: 10
weight: 11
---
## About Tokens
@@ -15,7 +15,7 @@ weight: 10
## FastGPT cloud billing
Currently, FastGPT cloud billing is based solely on the number of Tokens used. The detailed billing table is below (the latest pricing is subject to the online table, which you can fetch in real time after clicking Top Up):
When using [https://fastgpt.run](https://fastgpt.run) or [https://ai.fastgpt.in](https://ai.fastgpt.in), you are charged only by the number of Tokens used. You can check your usage under Account - Usage Records. The detailed billing table is below (the latest pricing is subject to the online table, which you can fetch in real time after clicking Top Up):
{{< table "table-hover table-striped-columns" >}}
| Billing item | Price: CNY / 1K tokens (context included) |
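
Since cloud billing is purely token-based, the charge for a request is just the token count divided by one thousand, times the per-1K price from the table. A toy sketch of that arithmetic; the rate used below is a placeholder, not a real price from the table:

```ts
// Toy sketch of token-based billing: cost = (tokens / 1000) * pricePer1K.
// The rate is a placeholder; real rates come from the online pricing table.
function estimateCost(totalTokens: number, pricePer1KTokens: number): number {
  return (totalTokens / 1000) * pricePer1KTokens;
}

// e.g. 2,500 tokens at an assumed 0.003 CNY / 1K tokens ≈ 0.0075 CNY
const exampleCost = estimateCost(2500, 0.003);
```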


@@ -1,6 +1,6 @@
---
title: "知识库结构讲解"
description: "本节会介绍 FastGPT 知识库结构设计,理解其 QA 的存储格式和检索格式,以便更好的构建知识库。这篇介绍主要以使用为主,详细原理不多介绍。"
description: "本节会详细介绍 FastGPT 知识库结构设计,理解其 QA 的存储格式和多向量映射,以便更好的构建知识库。这篇介绍主要以使用为主,详细原理不多介绍。"
icon: "dataset"
draft: false
toc: true
@@ -25,13 +25,21 @@ FastGPT 采用了 RAG 中的 Embedding 方案构建知识库,要使用好 Fast
FastGPT uses `PostgreSQL` with the `PG Vector` extension as its vector retriever, indexed with `HNSW`. `PostgreSQL` is used only for vector retrieval; `MongoDB` stores all other data.
In the `PostgreSQL` table, an `index` field stores the vector, a `q` field stores the content the vector corresponds to, and an `a` field holds the retrieval mapping. The fields are named `q`/`a` for historical reasons and need not be read strictly as a "question-answer pair". In practice, the combination of `q` and `a` can add further clarification to retrieved content and improve the LLM's comprehension (note: this does not directly improve search precision).
In the `PostgreSQL` table, an `index` field stores the vector, and a `data_id` field is used to look up the mapped record in `MongoDB`. Multiple `index` rows can map to the same `data_id`; in other words, one data record can be mapped to multiple vectors. At retrieval time, hits that resolve to the same record are merged.
Currently, vector search precision can mainly be improved in a few ways:
![](/imgs/datasetSetting1.png)
1. Trim the content of `q` to shorten what gets vectorized: when `q` is shorter and more precise, retrieval precision naturally improves, at the cost of some retrieval coverage. Suitable for scenarios where answers are strict.
2. Better tokenization and segmentation: precision also improves when a passage is structurally and semantically complete and focused on a single topic, so many systems tune their splitters to keep each data record as intact as possible.
3. Text diversity: adding descriptive information such as keywords, summaries, or similar questions to a piece of content gives its vector broader retrieval coverage.
## Purpose and usage of multiple vectors
If we want a data record to be as long as possible while still having its semantics well represented, a single vector cannot do both. We therefore use multi-vector mapping: one data record is mapped to multiple vectors, preserving the completeness of the data while still capturing its semantics.
You can attach several vectors to a longer piece of text, so that as long as any one of those vectors is retrieved, the data record is recalled.
## Ways to improve vector search precision
1. Better tokenization and segmentation: precision improves when a passage is structurally and semantically complete and focused on a single topic, so many systems tune their splitters to keep each data record as intact as possible.
2. Trim the content of each `index` to shorten what gets vectorized: when an `index` is shorter and more precise, retrieval precision naturally improves, at the cost of some retrieval coverage. Suitable for scenarios where answers are strict.
3. Increase the number of `index` entries: the same `chunk` of content can be given multiple `index` entries (a minimal sketch follows this list).
4. Optimize the query: in practice, users' questions are often vague or incomplete rather than clear, fully formed questions, so optimizing the user's query can also significantly improve precision.
5. Fine-tune the embedding model: off-the-shelf embedding models are general-purpose and may retrieve poorly in specialized domains, so fine-tuning can substantially improve retrieval in those fields.
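
To make the multi-vector mapping concrete, here is a minimal sketch of the shape described above: one data record carrying several index entries, each embedded as its own vector row pointing back via `dataId`, with retrieval hits on the same record merged. The field names loosely follow the `indexes` array used by the migration script later in this commit; this is an illustration, not the actual FastGPT implementation.

```ts
// Minimal sketch of multi-vector mapping, assuming a simplified record shape.
type DatasetDataRecord = {
  _id: string; // MongoDB record id (stored as data_id in PostgreSQL)
  q: string;   // main content
  a: string;   // optional auxiliary content
  indexes: { dataId: string; text: string }[]; // one vector per entry
};

// Merge retrieval hits that resolve to the same data record,
// keeping the best (smallest distance) score per record.
function mergeHits(hits: { dataId: string; score: number }[]) {
  const best = new Map<string, number>();
  for (const hit of hits) {
    const prev = best.get(hit.dataId);
    if (prev === undefined || hit.score < prev) best.set(hit.dataId, hit.score);
  }
  return [...best.entries()].map(([dataId, score]) => ({ dataId, score }));
}
```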


@@ -63,8 +63,8 @@ export const splitText2Chunks = (props: { text: string; maxLen: number; overlapL
let chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
let text = splitTexts[i];
let chunkToken = countPromptTokens(lastChunk, '');
const textToken = countPromptTokens(text, '');
let chunkToken = lastChunk.length;
const textToken = text.length;
// next chunk is too large / new chunk is too large(The current chunk must be smaller than maxLen)
if (textToken >= maxLen || chunkToken + textToken > maxLen * 1.4) {
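
The hunk above swaps tiktoken-based counting for raw string length when sizing chunks, a cheaper approximation. A rough sketch of the sizing rule as it reads after the change, with `maxLen` being the same limit used in the surrounding loop:

```ts
// Sketch of the length-based rule: a new piece of text is rejected when it is
// itself at or over the limit, or when appending it would overflow the current
// chunk by more than 40%.
const exceedsLimit = (lastChunk: string, text: string, maxLen: number) =>
  text.length >= maxLen || lastChunk.length + text.length > maxLen * 1.4;
```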


@@ -1,4 +1,4 @@
import type { VectorModelItemType } from '../../core/ai/model.d';
import type { LLMModelItemType, VectorModelItemType } from '../../core/ai/model.d';
import { PermissionTypeEnum } from '../../support/permission/constant';
import { PushDatasetDataChunkProps } from './api';
import {
@@ -19,6 +19,7 @@ export type DatasetSchemaType = {
avatar: string;
name: string;
vectorModel: string;
agentModel: string;
tags: string[];
type: `${DatasetTypeEnum}`;
permission: `${PermissionTypeEnum}`;
@@ -84,8 +85,9 @@ export type CollectionWithDatasetType = Omit<DatasetCollectionSchemaType, 'datas
};
/* ================= dataset ===================== */
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel'> & {
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel'> & {
vectorModel: VectorModelItemType;
agentModel: LLMModelItemType;
isOwner: boolean;
canWrite: boolean;
};


@@ -3,6 +3,8 @@ import { BillListItemType } from './type';
export type CreateTrainingBillProps = {
name: string;
vectorModel?: string;
agentModel?: string;
};
export type ConcatBillProps = {


@@ -61,7 +61,6 @@ const AppSchema = new Schema({
try {
AppSchema.index({ updateTime: -1 });
AppSchema.index({ 'share.collection': -1 });
} catch (error) {
console.log(error);
}


@@ -69,7 +69,6 @@ const DatasetCollectionSchema = new Schema({
try {
DatasetCollectionSchema.index({ datasetId: 1 });
DatasetCollectionSchema.index({ userId: 1 });
DatasetCollectionSchema.index({ updateTime: -1 });
} catch (error) {
console.log(error);


@@ -48,6 +48,11 @@ const DatasetSchema = new Schema({
required: true,
default: 'text-embedding-ada-002'
},
agentModel: {
type: String,
required: true,
default: 'gpt-3.5-turbo-16k'
},
type: {
type: String,
enum: Object.keys(DatasetTypeMap),


@@ -95,7 +95,7 @@ const TrainingDataSchema = new Schema({
try {
TrainingDataSchema.index({ lockTime: 1 });
TrainingDataSchema.index({ userId: 1 });
TrainingDataSchema.index({ datasetId: 1 });
TrainingDataSchema.index({ collectionId: 1 });
TrainingDataSchema.index({ expireAt: 1 }, { expireAfterSeconds: 7 * 24 * 60 });
} catch (error) {


@@ -250,6 +250,7 @@
}
},
"dataset": {
"Agent Model": "Learning Model",
"Chunk Length": "Chunk Length",
"Confirm move the folder": "Confirm Move",
"Confirm to delete the data": "Confirm to delete the data?",
@@ -259,6 +260,7 @@
"Delete Dataset Error": "Delete dataset failed",
"Edit Folder": "Edit Folder",
"Export": "Export",
"Export Dataset Limit Error": "Export Data Error",
"File Input": "Import File",
"File Size": "File Size",
"Filename": "Filename",


@@ -250,6 +250,7 @@
}
},
"dataset": {
"Agent Model": "文件处理模型",
"Chunk Length": "数据总量",
"Confirm move the folder": "确认移动到该目录",
"Confirm to delete the data": "确认删除该数据?",
@@ -259,6 +260,7 @@
"Delete Dataset Error": "删除知识库异常",
"Edit Folder": "编辑文件夹",
"Export": "导出",
"Export Dataset Limit Error": "导出数据失败",
"File Input": "文件导入",
"File Size": "文件大小",
"Filename": "文件名",


@@ -1,3 +1,4 @@
import { defaultQAModels, defaultVectorModels } from '@fastgpt/global/core/ai/model';
import type {
DatasetCollectionItemType,
DatasetItemType
@@ -17,13 +18,8 @@ export const defaultDatasetDetail: DatasetItemType = {
permission: 'private',
isOwner: false,
canWrite: false,
vectorModel: {
model: 'text-embedding-ada-002',
name: 'Embedding-2',
price: 0.2,
defaultToken: 500,
maxToken: 3000
}
vectorModel: defaultVectorModels[0],
agentModel: defaultQAModels[0]
};
export const defaultCollectionDetail: DatasetCollectionItemType = {
@@ -43,7 +39,8 @@ export const defaultCollectionDetail: DatasetCollectionItemType = {
name: '',
tags: [],
permission: 'private',
vectorModel: 'text-embedding-ada-002'
vectorModel: defaultVectorModels[0].model,
agentModel: defaultQAModels[0].model
},
parentId: '',
name: '',


@@ -5,6 +5,7 @@ import type { SearchTestItemType } from '@/types/core/dataset';
import { UploadChunkItemType } from '@fastgpt/global/core/dataset/type';
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type';
import { PermissionTypeEnum } from '@fastgpt/global/support/permission/constant';
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
/* ===== dataset ===== */
export type DatasetUpdateParams = {
@@ -14,6 +15,7 @@ export type DatasetUpdateParams = {
name?: string;
avatar?: string;
permission?: `${PermissionTypeEnum}`;
agentModel?: LLMModelItemType;
};
export type SearchTestProps = {


@@ -9,6 +9,7 @@ export type CreateDatasetParams = {
tags: string;
avatar: string;
vectorModel?: string;
agentModel?: string;
type: `${DatasetTypeEnum}`;
};


@@ -1,8 +1,8 @@
export const Prompt_AgentQA = {
prompt: `我会给你一段文本,{{theme}},学习它们,并整理学习成果,要求为:
1. 提出最多 25 个问题。
2. 给出每个问题的答案
3. 答案要详细完整,答案可以包含普通文字、链接、代码、表格、公示、媒体链接等 markdown 元素
1. 提出问题并给出每个问题的答案
2. 每个答案都要详细完整,给出相关原文描述,答案可以包含普通文字、链接、代码、表格、公示、媒体链接等 markdown 元素
3. 最多提出 30 个问题
4. 按格式返回多个问题和答案:
Q1: 问题。
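
The prompt asks the model to return numbered `Q…:`/`A…:` pairs. A hedged sketch of how such output could be split back into QA items; the regex FastGPT actually uses is not part of this hunk, so this is only an illustration:

```ts
// Illustrative parser for "Q1: ... A1: ... Q2: ..." style output.
// Not the parser FastGPT ships with; the real format handling lives elsewhere.
function parseQAPairs(answer: string): { q: string; a: string }[] {
  const pairs: { q: string; a: string }[] = [];
  const regex = /Q\d+:\s*([\s\S]*?)\s*A\d+:\s*([\s\S]*?)(?=\s*Q\d+:|$)/g;
  let match: RegExpExecArray | null;
  while ((match = regex.exec(answer)) !== null) {
    pairs.push({ q: match[1].trim(), a: match[2].trim() });
  }
  return pairs;
}
```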


@@ -11,6 +11,8 @@ import {
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { getUserDefaultTeam } from '@fastgpt/service/support/user/team/controller';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
import { defaultQAModels } from '@fastgpt/global/core/ai/model';
let success = 0;
/* Move data from pg into mongo dataset.datas and build the mapping */
@@ -41,6 +43,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
await initPgData();
await MongoDataset.updateMany(
{},
{
agentModel: defaultQAModels[0].model
}
);
jsonRes(res, {
data: await init(limit),
message:
@@ -76,14 +85,19 @@ async function initPgData() {
for (let i = 0; i < limit; i++) {
init(i);
}
async function init(index: number): Promise<any> {
const userId = rows[index]?.user_id;
if (!userId) return;
try {
const tmb = await getUserDefaultTeam({ userId });
console.log(tmb);
// update pg
await PgClient.query(
`Update ${PgDatasetTableName} set team_id = '${tmb.teamId}', tmb_id = '${tmb.tmbId}' where user_id = '${userId}' AND team_id='null';`
`Update ${PgDatasetTableName} set team_id = '${String(tmb.teamId)}', tmb_id = '${String(
tmb.tmbId
)}' where user_id = '${userId}' AND team_id='null';`
);
console.log(++success);
init(index + limit);


@@ -0,0 +1,101 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { delay } from '@/utils/tools';
import { PgClient } from '@fastgpt/service/common/pg';
import {
DatasetDataIndexTypeEnum,
PgDatasetTableName
} from '@fastgpt/global/core/dataset/constant';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
let success = 0;
/* Move data from pg into mongo dataset.datas and build the mapping */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
const { limit = 50 } = req.body as { limit: number };
await authCert({ req, authRoot: true });
await connectToDatabase();
success = 0;
jsonRes(res, {
data: await init(limit)
});
} catch (error) {
console.log(error);
jsonRes(res, {
code: 500,
error
});
}
}
type PgItemType = {
id: string;
q: string;
a: string;
dataset_id: string;
collection_id: string;
data_id: string;
};
async function init(limit: number): Promise<any> {
const { rows: idList } = await PgClient.query<{ id: string }>(
`SELECT id FROM ${PgDatasetTableName} WHERE inited=1`
);
console.log('totalCount', idList.length);
await delay(2000);
if (idList.length === 0) return;
for (let i = 0; i < limit; i++) {
initData(i);
}
async function initData(index: number): Promise<any> {
const dataId = idList[index]?.id;
if (!dataId) {
console.log('done');
return;
}
// fetch this row's q/a and mapping fields from pg
const { rows } = await PgClient.query<PgItemType>(
`SELECT id,q,a,dataset_id,collection_id,data_id FROM ${PgDatasetTableName} WHERE id=${dataId};`
);
const data = rows[0];
if (!data) {
console.log('done');
return;
}
try {
// update mongo data and update inited
await MongoDatasetData.findByIdAndUpdate(data.data_id, {
q: data.q,
a: data.a,
indexes: [
{
defaultIndex: !data.a,
type: data.a ? DatasetDataIndexTypeEnum.qa : DatasetDataIndexTypeEnum.chunk,
dataId: data.id,
text: data.q
}
]
});
// update pg data_id
await PgClient.query(`UPDATE ${PgDatasetTableName} SET inited=0 WHERE id=${dataId};`);
return initData(index + limit);
} catch (error) {
console.log(error);
console.log(data);
await delay(500);
return initData(index);
}
}
}
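
For reference, the hunk header near the top of this commit shows this migration being called as `POST https://{{host}}/api/admin/initv46-2`. A hedged sketch of invoking it from a script; how `authCert({ authRoot: true })` expects the credential to be passed is not shown in this diff, so the header below is a placeholder assumption:

```ts
// Hypothetical invocation sketch for the initv46-2 migration route.
// The body shape ({ limit }) matches the handler above; the auth header is assumed.
async function runInitV46Part2(host: string, rootKey: string) {
  const res = await fetch(`https://${host}/api/admin/initv46-2`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      rootkey: rootKey // placeholder for whatever authRoot actually expects
    },
    body: JSON.stringify({ limit: 50 }) // number of concurrent workers in init()
  });
  return res.json();
}
```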


@@ -2,7 +2,7 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
import { getVectorModel } from '@/service/core/ai/model';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
import type { DatasetItemType } from '@fastgpt/global/core/dataset/type.d';
import { mongoRPermission } from '@fastgpt/global/support/permission/utils';
import { authUserRole } from '@fastgpt/service/support/permission/auth/user';
@@ -22,6 +22,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
const data = datasets.map((item) => ({
...item.toJSON(),
vectorModel: getVectorModel(item.vectorModel),
agentModel: getQAModel(item.agentModel),
canWrite: String(item.tmbId) === tmbId,
isOwner: teamOwner || String(item.tmbId) === tmbId
}));


@@ -0,0 +1,73 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { MongoUser } from '@fastgpt/service/support/user/schema';
import { addLog } from '@fastgpt/service/common/mongo/controller';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { findDatasetIdTreeByTopDatasetId } from '@fastgpt/service/core/dataset/controller';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
await connectToDatabase();
let { datasetId } = req.query as {
datasetId: string;
};
if (!datasetId) {
throw new Error('缺少参数');
}
// credential check
const { userId } = await authDataset({ req, authToken: true, datasetId, per: 'w' });
await limitCheck({
datasetId,
userId
});
jsonRes(res);
} catch (err) {
res.status(500);
jsonRes(res, {
code: 500,
error: err
});
}
}
export async function limitCheck({ datasetId, userId }: { datasetId: string; userId: string }) {
const exportIds = await findDatasetIdTreeByTopDatasetId(datasetId);
const limitMinutesAgo = new Date(
Date.now() - (global.feConfigs?.limit?.exportLimitMinutes || 0) * 60 * 1000
);
// auth export times
const authTimes = await MongoUser.findOne(
{
_id: userId,
$or: [
{ 'limit.exportKbTime': { $exists: false } },
{ 'limit.exportKbTime': { $lte: limitMinutesAgo } }
]
},
'_id limit'
);
if (!authTimes) {
const minutes = `${global.feConfigs?.limit?.exportLimitMinutes || 0} 分钟`;
return Promise.reject(`上次导出未到 ${minutes},每 ${minutes}仅可导出一次。`);
}
// auth max data
const total = await MongoDatasetData.countDocuments({
datasetId: { $in: exportIds }
});
addLog.info(`export datasets: ${datasetId}`, { total });
if (total > 100000) {
return Promise.reject('数据量超出 10 万,无法导出');
}
}


@@ -9,7 +9,8 @@ import { authUserNotVisitor } from '@fastgpt/service/support/permission/auth/use
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
await connectToDatabase();
const { name, tags, avatar, vectorModel, parentId, type } = req.body as CreateDatasetParams;
const { name, tags, avatar, vectorModel, agentModel, parentId, type } =
req.body as CreateDatasetParams;
// credential check
const { teamId, tmbId } = await authUserNotVisitor({ req, authToken: true });
@@ -20,6 +21,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
tmbId,
tags,
vectorModel,
agentModel,
avatar,
parentId: parentId || null,
type


@@ -10,7 +10,7 @@ import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
import type { PushDataResponse } from '@/global/core/api/datasetRes.d';
import type { PushDatasetDataProps } from '@/global/core/dataset/api.d';
import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
import { getVectorModel } from '@/service/core/ai/model';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
import { authDatasetCollection } from '@fastgpt/service/support/permission/auth/dataset';
import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controller';
@@ -63,24 +63,14 @@ export async function pushDataToDatasetCollection({
mode,
prompt,
billId
}: { teamId: string; tmbId: string } & PushDatasetDataProps): Promise<PushDataResponse> {
// get dataset vector model
const {
datasetId: { _id: datasetId, vectorModel }
} = await getCollectionWithDataset(collectionId);
const vectorModelData = getVectorModel(vectorModel);
const modeMap = {
[TrainingModeEnum.chunk]: {
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model
},
[TrainingModeEnum.qa]: {
maxToken: global.qaModels[0].maxContext * 0.8,
model: global.qaModels[0].model
}
};
}: {
teamId: string;
tmbId: string;
} & PushDatasetDataProps): Promise<PushDataResponse> {
const { datasetId, model, maxToken } = await checkModelValid({
mode,
collectionId
});
// filter repeat or equal content
const set = new Set();
@@ -102,12 +92,13 @@ export async function pushDataToDatasetCollection({
// count q token
const token = countPromptTokens(item.q);
if (token > modeMap[mode].maxToken) {
if (token > maxToken) {
filterResult.overToken.push(item);
return;
}
if (set.has(text)) {
console.log('repeat', item);
filterResult.repeat.push(item);
} else {
filterResult.success.push(item);
@@ -126,7 +117,7 @@ export async function pushDataToDatasetCollection({
billId,
mode,
prompt,
model: modeMap[mode].model,
model,
q: item.q,
a: item.a,
indexes: item.indexes
@@ -142,6 +133,44 @@ export async function pushDataToDatasetCollection({
};
}
export async function checkModelValid({
mode,
collectionId
}: {
mode: `${TrainingModeEnum}`;
collectionId: string;
}) {
const {
datasetId: { _id: datasetId, vectorModel, agentModel }
} = await getCollectionWithDataset(collectionId);
if (mode === TrainingModeEnum.chunk) {
if (!collectionId) return Promise.reject(`CollectionId is empty`);
const vectorModelData = getVectorModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Model ${vectorModel} is inValid`);
}
return {
datasetId,
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model
};
}
if (mode === TrainingModeEnum.qa) {
const qaModelData = getQAModel(agentModel);
if (!qaModelData) {
return Promise.reject(`Model ${agentModel} is inValid`);
}
return {
datasetId,
maxToken: qaModelData.maxContext * 0.8,
model: qaModelData.model
};
}
return Promise.reject(`Mode ${mode} is inValid`);
}
export const config = {
api: {
bodyParser: {


@@ -1,7 +1,7 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { getVectorModel } from '@/service/core/ai/model';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
import type { DatasetItemType } from '@fastgpt/global/core/dataset/type.d';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
@@ -28,6 +28,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
data: {
...dataset,
vectorModel: getVectorModel(dataset.vectorModel),
agentModel: getQAModel(dataset.agentModel),
canWrite,
isOwner
}


@@ -1,5 +1,5 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { jsonRes, responseWriteController } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { MongoUser } from '@fastgpt/service/support/user/schema';
import { addLog } from '@fastgpt/service/common/mongo/controller';
@@ -8,6 +8,7 @@ import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { findDatasetIdTreeByTopDatasetId } from '@fastgpt/service/core/dataset/controller';
import { Readable } from 'stream';
import type { Cursor } from '@fastgpt/service/common/mongo';
import { limitCheck } from './checkExportLimit';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
@@ -23,39 +24,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
// credential check
const { userId } = await authDataset({ req, authToken: true, datasetId, per: 'w' });
const exportIds = await findDatasetIdTreeByTopDatasetId(datasetId);
const limitMinutesAgo = new Date(
Date.now() - (global.feConfigs?.limit?.exportLimitMinutes || 0) * 60 * 1000
);
// auth export times
const authTimes = await MongoUser.findOne(
{
_id: userId,
$or: [
{ 'limit.exportKbTime': { $exists: false } },
{ 'limit.exportKbTime': { $lte: limitMinutesAgo } }
]
},
'_id limit'
);
if (!authTimes) {
const minutes = `${global.feConfigs?.limit?.exportLimitMinutes || 0} 分钟`;
throw new Error(`上次导出未到 ${minutes},每 ${minutes}仅可导出一次。`);
}
// auth max data
const total = await MongoDatasetData.countDocuments({
datasetId: { $in: exportIds }
await limitCheck({
userId,
datasetId
});
addLog.info(`export datasets: ${datasetId}`, { total });
if (total > 100000) {
throw new Error('数据量超出 10 万,无法导出');
}
const exportIds = await findDatasetIdTreeByTopDatasetId(datasetId);
res.setHeader('Content-Type', 'text/csv; charset=utf-8;');
res.setHeader('Content-Disposition', 'attachment; filename=dataset.csv; ');
@@ -72,35 +46,27 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
'q a'
).cursor();
function cursorToReadableStream(cursor: Cursor) {
const readable = new Readable({
objectMode: true,
read() {}
const write = responseWriteController({
res,
readStream: cursor
});
write(`\uFEFFindex,content`);
cursor.on('data', (doc) => {
const q = doc.q.replace(/"/g, '""') || '';
const a = doc.a.replace(/"/g, '""') || '';
write(`\n"${q}","${a}"`);
});
cursor.on('end', async () => {
cursor.close();
res.end();
await MongoUser.findByIdAndUpdate(userId, {
'limit.exportKbTime': new Date()
});
readable.push(`\uFEFFindex,content`);
cursor.on('data', (doc) => {
const q = doc.q.replace(/"/g, '""') || '';
const a = doc.a.replace(/"/g, '""') || '';
readable.push(`\n"${q}","${a}"`);
});
cursor.on('end', async () => {
readable.push(null);
cursor.close();
await MongoUser.findByIdAndUpdate(userId, {
'limit.exportKbTime': new Date()
});
});
return readable;
}
// @ts-ignore
const stream = cursorToReadableStream(cursor);
stream.pipe(res);
});
} catch (err) {
res.status(500);
jsonRes(res, {


@@ -1,7 +1,7 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { getVectorModel } from '@/service/core/ai/model';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
import type { DatasetItemType } from '@fastgpt/global/core/dataset/type.d';
import { DatasetTypeEnum } from '@fastgpt/global/core/dataset/constant';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
@@ -28,6 +28,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
datasets.map(async (item) => ({
...item.toJSON(),
vectorModel: getVectorModel(item.vectorModel),
agentModel: getQAModel(item.agentModel),
canWrite,
isOwner: teamOwner || String(item.tmbId) === tmbId
}))


@@ -8,7 +8,8 @@ import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
await connectToDatabase();
const { id, parentId, name, avatar, tags, permission } = req.body as DatasetUpdateParams;
const { id, parentId, name, avatar, tags, permission, agentModel } =
req.body as DatasetUpdateParams;
if (!id) {
throw new Error('缺少参数');
@@ -26,7 +27,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
...(name && { name }),
...(avatar && { avatar }),
...(tags && { tags }),
...(permission && { permission })
...(permission && { permission }),
...(agentModel && { agentModel: agentModel.model })
}
);


@@ -5,15 +5,17 @@ import { MongoBill } from '@fastgpt/service/support/wallet/bill/schema';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { BillSourceEnum } from '@fastgpt/global/support/wallet/bill/constants';
import { CreateTrainingBillProps } from '@fastgpt/global/support/wallet/bill/api.d';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
await connectToDatabase();
const { name } = req.body as CreateTrainingBillProps;
const { name, vectorModel, agentModel } = req.body as CreateTrainingBillProps;
const { teamId, tmbId } = await authCert({ req, authToken: true, authApiKey: true });
const qaModel = global.qaModels[0];
const vectorModelData = getVectorModel(vectorModel);
const agentModelData = getQAModel(agentModel);
const { _id } = await MongoBill.create({
teamId,
@@ -23,13 +25,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
list: [
{
moduleName: '索引生成',
model: 'embedding',
model: vectorModelData.name,
amount: 0,
tokenLen: 0
},
{
moduleName: 'QA 拆分',
model: qaModel?.name,
model: agentModelData.name,
amount: 0,
tokenLen: 0
}


@@ -170,7 +170,7 @@ const DataCard = () => {
</Flex>
<Grid
minH={'100px'}
gridTemplateColumns={['1fr', 'repeat(2,1fr)', 'repeat(3,1fr)']}
gridTemplateColumns={['1fr', 'repeat(2,1fr)', 'repeat(3,1fr)', 'repeat(4,1fr)']}
gridGap={4}
>
{datasetDataList.map((item) => (


@@ -34,10 +34,10 @@ const ImportData = ({
const theme = useTheme();
const { datasetDetail } = useDatasetStore();
const [importType, setImportType] = useState<`${ImportTypeEnum}`>(ImportTypeEnum.chunk);
const vectorModel = datasetDetail.vectorModel;
const agentModel = datasetDetail.agentModel;
const typeMap = useMemo(() => {
const vectorModel = datasetDetail.vectorModel;
const qaModel = qaModelList[0];
const map = {
[ImportTypeEnum.chunk]: {
defaultChunkLen: vectorModel?.defaultToken || 500,
@@ -45,8 +45,8 @@ const ImportData = ({
mode: TrainingModeEnum.chunk
},
[ImportTypeEnum.qa]: {
defaultChunkLen: qaModel?.maxContext * 0.5 || 8000,
unitPrice: qaModel?.price || 3,
defaultChunkLen: agentModel?.maxContext * 0.6 || 9000,
unitPrice: agentModel?.price || 3,
mode: TrainingModeEnum.qa
},
[ImportTypeEnum.csv]: {
@@ -56,7 +56,13 @@ const ImportData = ({
}
};
return map[importType];
}, [datasetDetail.vectorModel, importType]);
}, [
agentModel?.maxContext,
agentModel?.price,
importType,
vectorModel?.defaultToken,
vectorModel?.price
]);
const TitleStyle: BoxProps = {
fontWeight: 'bold',
@@ -104,8 +110,10 @@ const ImportData = ({
<Provider
{...typeMap}
vectorModel={vectorModel.model}
agentModel={agentModel.model}
datasetId={datasetDetail._id}
importType={importType}
datasetId={datasetId}
parentId={parentId}
onUploadSuccess={uploadSuccess}
>


@@ -90,6 +90,8 @@ const Provider = ({
parentId,
unitPrice,
mode,
vectorModel,
agentModel,
defaultChunkLen = 500,
importType,
onUploadSuccess,
@@ -99,6 +101,8 @@ const Provider = ({
parentId: string;
unitPrice: number;
mode: `${TrainingModeEnum}`;
vectorModel: string;
agentModel: string;
defaultChunkLen: number;
importType: `${ImportTypeEnum}`;
onUploadSuccess: () => void;
@@ -132,7 +136,9 @@ const Provider = ({
const chunks = file.chunks;
// create training bill
const billId = await postCreateTrainingBill({
name: t('dataset.collections.Create Training Data', { filename: file.filename })
name: t('dataset.collections.Create Training Data', { filename: file.filename }),
vectorModel,
agentModel
});
// create a file collection and training bill
const collectionId = await postDatasetCollection({


@@ -13,8 +13,8 @@ const fileExtension = '.txt, .doc, .docx, .pdf, .md';
const QAImport = () => {
const { datasetDetail } = useDatasetStore();
const vectorModel = datasetDetail.vectorModel;
const unitPrice = vectorModel?.price || 0.2;
const agentModel = datasetDetail.agentModel;
const unitPrice = agentModel?.price || 3;
const {
successChunks,


@@ -9,7 +9,7 @@ import React, {
import { useRouter } from 'next/router';
import { Box, Flex, Button, FormControl, IconButton, Input } from '@chakra-ui/react';
import { QuestionOutlineIcon, DeleteIcon } from '@chakra-ui/icons';
import { delDatasetById, putDatasetById } from '@/web/core/dataset/api';
import { delDatasetById } from '@/web/core/dataset/api';
import { useSelectFile } from '@/web/common/file/hooks/useSelectFile';
import { useToast } from '@/web/common/hooks/useToast';
import { useDatasetStore } from '@/web/core/dataset/store/dataset';
@@ -22,6 +22,8 @@ import Tag from '@/components/Tag';
import MyTooltip from '@/components/MyTooltip';
import { useTranslation } from 'next-i18next';
import PermissionRadio from '@/components/support/permission/Radio';
import MySelect from '@/components/Select';
import { qaModelList } from '@/web/common/system/staticData';
export interface ComponentRef {
initInput: (tags: string) => void;
@@ -50,7 +52,7 @@ const Info = (
multiple: false
});
const { datasetDetail, loadDatasetDetail, loadDatasets } = useDatasetStore();
const { datasetDetail, loadDatasetDetail, loadDatasets, updateDataset } = useDatasetStore();
/* handle delete click */
const onclickDelKb = useCallback(async () => {
@@ -76,11 +78,10 @@ const Info = (
async (data: DatasetItemType) => {
setBtnLoading(true);
try {
await putDatasetById({
await updateDataset({
id: datasetId,
...data
});
await loadDatasetDetail(datasetId, true);
toast({
title: '更新成功',
status: 'success'
@@ -94,7 +95,7 @@ const Info = (
}
setBtnLoading(false);
},
[loadDatasetDetail, datasetId, loadDatasets, toast]
[updateDataset, datasetId, loadDatasetDetail, toast, loadDatasets]
);
const saveSubmitError = useCallback(() => {
// deep search message
@@ -194,6 +195,27 @@ const Info = (
})}
/>
</FormControl>
<Flex mt={6} alignItems={'center'}>
<Box flex={['0 0 90px', '0 0 160px']} w={0}>
{t('dataset.Agent Model')}
</Box>
<Box flex={[1, '0 0 300px']}>
<MySelect
w={'100%'}
value={getValues('agentModel').model}
list={qaModelList.map((item) => ({
label: item.name,
value: item.model
}))}
onchange={(e) => {
const agentModel = qaModelList.find((item) => item.model === e);
if (!agentModel) return;
setValue('agentModel', agentModel);
setRefresh((state) => !state);
}}
/>
</Box>
</Flex>
<Flex mt={8} alignItems={'center'} w={'100%'} flexWrap={'wrap'}>
<Box flex={['0 0 90px', '0 0 160px']} w={0}>


@@ -196,7 +196,7 @@ const InputDataModal = ({
const loading = useMemo(() => isImporting || isUpdating, [isImporting, isUpdating]);
return (
<MyModal isOpen={true} isCentered w={'90vw'} maxW={'90vw'} h={'90vh'}>
<MyModal isOpen={true} isCentered w={'90vw'} maxW={'1440px'} h={'90vh'}>
<Flex h={'100%'}>
<Box p={5} borderRight={theme.borders.base}>
<RawSourceText
@@ -250,7 +250,7 @@ const InputDataModal = ({
mt={1}
placeholder={`该输入框是必填项,该内容通常是对于知识点的描述,也可以是用户的问题,最多 ${maxToken} 字。`}
maxLength={maxToken}
rows={10}
rows={12}
bg={'myWhite.400'}
{...register(`q`, {
required: true
@@ -274,7 +274,7 @@ const InputDataModal = ({
maxToken * 1.5
} 字。`}
bg={'myWhite.400'}
rows={10}
rows={12}
maxLength={maxToken * 1.5}
{...register('a')}
/>


@@ -15,10 +15,12 @@ import { postCreateDataset } from '@/web/core/dataset/api';
import type { CreateDatasetParams } from '@/global/core/dataset/api.d';
import MySelect from '@/components/Select';
import { QuestionOutlineIcon } from '@chakra-ui/icons';
import { vectorModelList } from '@/web/common/system/staticData';
import { vectorModelList, qaModelList } from '@/web/common/system/staticData';
import Tag from '@/components/Tag';
import { useTranslation } from 'next-i18next';
const CreateModal = ({ onClose, parentId }: { onClose: () => void; parentId?: string }) => {
const { t } = useTranslation();
const [refresh, setRefresh] = useState(false);
const { toast } = useToast();
const router = useRouter();
@@ -29,6 +31,7 @@ const CreateModal = ({ onClose, parentId }: { onClose: () => void; parentId?: st
name: '',
tags: '',
vectorModel: vectorModelList[0].model,
agentModel: qaModelList[0].model,
type: 'dataset',
parentId
}
@@ -76,7 +79,7 @@ const CreateModal = ({ onClose, parentId }: { onClose: () => void; parentId?: st
});
return (
<MyModal isOpen onClose={onClose} isCentered={!isPc} w={'400px'}>
<MyModal isOpen onClose={onClose} isCentered={!isPc} w={'450px'}>
<ModalHeader fontSize={'2xl'}></ModalHeader>
<ModalBody>
<Box color={'myGray.800'} fontWeight={'bold'}>
@@ -106,7 +109,7 @@ const CreateModal = ({ onClose, parentId }: { onClose: () => void; parentId?: st
/>
</Flex>
<Flex mt={6} alignItems={'center'}>
<Box flex={'0 0 80px'}></Box>
<Box flex={'0 0 100px'}></Box>
<Box flex={1}>
<MySelect
w={'100%'}
@@ -122,8 +125,25 @@ const CreateModal = ({ onClose, parentId }: { onClose: () => void; parentId?: st
/>
</Box>
</Flex>
<Flex mt={6} alignItems={'center'}>
<Box flex={'0 0 100px'}>{t('dataset.Agent Model')}</Box>
<Box flex={1}>
<MySelect
w={'100%'}
value={getValues('agentModel')}
list={qaModelList.map((item) => ({
label: item.name,
value: item.model
}))}
onchange={(e) => {
setValue('agentModel', e);
setRefresh((state) => !state);
}}
/>
</Box>
</Flex>
<Flex mt={6} alignItems={'center'} w={'100%'}>
<Box flex={'0 0 80px'}>
<Box flex={'0 0 100px'}>
<MyTooltip label={'用空格隔开多个标签,便于搜索'} forceShow>
<QuestionOutlineIcon ml={1} />


@@ -20,7 +20,8 @@ import {
delDatasetById,
getDatasetPaths,
putDatasetById,
postCreateDataset
postCreateDataset,
getCheckExportLimit
} from '@/web/core/dataset/api';
import { useTranslation } from 'next-i18next';
import Avatar from '@/components/Avatar';
@@ -38,6 +39,7 @@ import { useDrag } from '@/web/common/hooks/useDrag';
import { useUserStore } from '@/web/support/user/useUserStore';
import PermissionIconText from '@/components/support/permission/IconText';
import { PermissionTypeEnum } from '@fastgpt/global/support/permission/constant';
import { DatasetItemType } from '@fastgpt/global/core/dataset/type';
const CreateModal = dynamic(() => import('./component/CreateModal'), { ssr: false });
const MoveModal = dynamic(() => import('./component/MoveModal'), { ssr: false });
@@ -89,6 +91,23 @@ const Kb = () => {
successToast: t('common.Delete Success'),
errorToast: t('dataset.Delete Dataset Error')
});
// check export limit
const { mutate: exportDataset } = useRequest({
mutationFn: async (dataset: DatasetItemType) => {
setLoading(true);
await getCheckExportLimit(dataset._id);
const a = document.createElement('a');
a.href = `/api/core/dataset/exportAll?datasetId=${dataset._id}`;
a.download = `${dataset.name}.csv`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
},
onSettled() {
setLoading(false);
},
errorToast: t('dataset.Export Dataset Limit Error')
});
const { data, refetch } = useQuery(['loadDataset', parentId], () => {
return Promise.all([loadDatasets(parentId), getDatasetPaths(parentId)]);
@@ -371,12 +390,7 @@ const Kb = () => {
</Flex>
),
onClick: () => {
const a = document.createElement('a');
a.href = `/api/core/dataset/exportAll?datasetId=${dataset._id}`;
a.download = `${dataset.name}.csv`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
exportDataset(dataset);
}
},
{


@@ -109,6 +109,7 @@ export async function generateQA(): Promise<any> {
try {
const startTime = Date.now();
const model = data.model ?? global.qaModels[0].model;
// request LLM to get QA
const messages: ChatMessageItemType[] = [
@@ -122,9 +123,10 @@ export async function generateQA(): Promise<any> {
})
}
];
const ai = getAIApi(undefined, 480000);
const ai = getAIApi(undefined, 600000);
const chatResponse = await ai.chat.completions.create({
model: global.qaModels[0].model,
model,
temperature: 0.01,
messages,
stream: false
@@ -147,8 +149,11 @@ export async function generateQA(): Promise<any> {
// delete data from training
await MongoDatasetTraining.findByIdAndDelete(data._id);
console.log(`split result length: `, qaArr.length);
console.log('生成QA成功time:', `${(Date.now() - startTime) / 1000}s`);
addLog.info(`QA Training Finish`, {
time: `${(Date.now() - startTime) / 1000}s`,
splitLength: qaArr.length,
usage: chatResponse.usage
});
// add bill
if (qaArr.length > 0) {
@@ -156,7 +161,8 @@ export async function generateQA(): Promise<any> {
teamId: data.teamId,
tmbId: data.tmbId,
totalTokens,
billId: data.billId
billId: data.billId,
model
});
} else {
addLog.info(`QA result 0:`, { answer });


@@ -1,5 +1,5 @@
import { BillSourceEnum } from '@fastgpt/global/support/wallet/bill/constants';
import { getAudioSpeechModel } from '@/service/core/ai/model';
import { getAudioSpeechModel, getQAModel } from '@/service/core/ai/model';
import type { ChatHistoryItemResType } from '@fastgpt/global/core/chat/api.d';
import { formatPrice } from '@fastgpt/global/support/wallet/bill/tools';
import { addLog } from '@fastgpt/service/common/mongo/controller';
@@ -9,10 +9,16 @@ import { POST } from '@fastgpt/service/common/api/plusRequest';
export function createBill(data: CreateBillProps) {
if (!global.systemEnv.pluginBaseUrl) return;
if (data.total === 0) {
addLog.info('0 Bill', data);
}
POST('/support/wallet/bill/createBill', data);
}
export function concatBill(data: ConcatBillProps) {
if (!global.systemEnv.pluginBaseUrl) return;
if (data.total === 0) {
addLog.info('0 Bill', data);
}
POST('/support/wallet/bill/concatBill', data);
}
@@ -59,18 +65,18 @@ export const pushChatBill = ({
export const pushQABill = async ({
teamId,
tmbId,
model,
totalTokens,
billId
}: {
teamId: string;
tmbId: string;
model: string;
totalTokens: number;
billId: string;
}) => {
addLog.info('splitData generate success', { totalTokens });
// get the model's unit price
const unitPrice = global.qaModels?.[0]?.price || 3;
const unitPrice = getQAModel(model).price;
// calculate the total price
const total = unitPrice * totalTokens;


@@ -48,6 +48,9 @@ export const putDatasetById = (data: DatasetUpdateParams) => PUT(`/core/dataset/
export const delDatasetById = (id: string) => DELETE(`/core/dataset/delete?id=${id}`);
export const getCheckExportLimit = (datasetId: string) =>
GET(`/core/dataset/checkExportLimit`, { datasetId });
/* =========== search test ============ */
export const postSearchText = (data: SearchTestProps) =>
POST<SearchDataResponseItemType[]>(`/core/dataset/searchTest`, data);