4.6.7-alpha commit (#743)

Co-authored-by: Archer <545436317@qq.com>
Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
This commit is contained in:
Archer
2024-01-19 11:17:28 +08:00
committed by GitHub
parent 8ee7407c4c
commit c031e6dcc9
324 changed files with 8509 additions and 4757 deletions

View File

@@ -16,7 +16,7 @@ import {
DatasetSearchModeMap,
SearchScoreTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constant';
} from '@fastgpt/global/core/dataset/constants';
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
import { jiebaSplit } from '@/service/common/string/jieba';
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
@@ -24,6 +24,7 @@ import { getVectorsByText } from '@fastgpt/service/core/ai/embedding';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import {
DatasetDataSchemaType,
DatasetDataWithCollectionType,
SearchDataResponseItemType
} from '@fastgpt/global/core/dataset/type';
import { reRankRecall } from '../../ai/rerank';
@@ -38,7 +39,7 @@ import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controll
import { getQAModel, getVectorModel } from '../../ai/model';
import { delay } from '@fastgpt/global/common/system/utils';
export async function pushDataToDatasetCollection({
export async function pushDataToTrainingQueue({
teamId,
tmbId,
collectionId,
@@ -222,7 +223,6 @@ export async function insertData2Dataset({
return Promise.reject("teamId and tmbId can't be the same");
}
const id = new Types.ObjectId();
const qaStr = `${q}\n${a}`.trim();
// empty indexes check, if empty, create default index
@@ -242,17 +242,14 @@ export async function insertData2Dataset({
query: item.text,
model,
teamId,
tmbId,
datasetId,
collectionId,
dataId: String(id)
collectionId
})
)
);
// create mongo
// create mongo data
const { _id } = await MongoDatasetData.create({
_id: id,
teamId,
tmbId,
datasetId,
@@ -269,7 +266,7 @@ export async function insertData2Dataset({
return {
insertId: _id,
tokens: result.reduce((acc, cur) => acc + cur.tokens, 0)
charsLength: result.reduce((acc, cur) => acc + cur.charsLength, 0)
};
}
@@ -293,7 +290,7 @@ export async function updateData2Dataset({
// patch index and update pg
const mongoData = await MongoDatasetData.findById(dataId);
if (!mongoData) return Promise.reject('Data not found');
if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// make sure have one index
if (indexes.length === 0) {
@@ -354,6 +351,11 @@ export async function updateData2Dataset({
}
}
// update mongo updateTime
mongoData.updateTime = new Date();
await mongoData.save();
// update vector
const result = await Promise.all(
patchResult.map(async (item) => {
if (item.type === 'create') {
@@ -361,38 +363,42 @@ export async function updateData2Dataset({
query: item.index.text,
model,
teamId: mongoData.teamId,
tmbId: mongoData.tmbId,
datasetId: mongoData.datasetId,
collectionId: mongoData.collectionId,
dataId
collectionId: mongoData.collectionId
});
item.index.dataId = result.insertId;
return result;
}
if (item.type === 'update' && item.index.dataId) {
return updateDatasetDataVector({
const result = await updateDatasetDataVector({
teamId: mongoData.teamId,
datasetId: mongoData.datasetId,
collectionId: mongoData.collectionId,
id: item.index.dataId,
query: item.index.text,
model
});
item.index.dataId = result.insertId;
return result;
}
if (item.type === 'delete' && item.index.dataId) {
await deleteDatasetDataVector({
teamId: mongoData.teamId,
id: item.index.dataId
});
return {
tokens: 0
charsLength: 0
};
}
return {
tokens: 0
charsLength: 0
};
})
);
const tokens = result.reduce((acc, cur) => acc + cur.tokens, 0);
const charsLength = result.reduce((acc, cur) => acc + cur.charsLength, 0);
// update mongo
// update mongo other data
mongoData.q = q || mongoData.q;
mongoData.a = a ?? mongoData.a;
mongoData.fullTextToken = jiebaSplit({ text: mongoData.q + mongoData.a });
@@ -401,11 +407,12 @@ export async function updateData2Dataset({
await mongoData.save();
return {
tokens
charsLength
};
}
export async function searchDatasetData(props: {
teamId: string;
model: string;
similarity?: number; // min distance
limit: number; // max Token limit
@@ -416,6 +423,7 @@ export async function searchDatasetData(props: {
queries: string[];
}) {
let {
teamId,
rawQuery,
queries,
model,
@@ -460,7 +468,7 @@ export async function searchDatasetData(props: {
};
};
const embeddingRecall = async ({ query, limit }: { query: string; limit: number }) => {
const { vectors, tokens } = await getVectorsByText({
const { vectors, charsLength } = await getVectorsByText({
model,
input: query
});
@@ -472,41 +480,45 @@ export async function searchDatasetData(props: {
});
// get q and a
const [collections, dataList] = await Promise.all([
MongoDatasetCollection.find(
{
_id: { $in: results.map((item) => item.collectionId) }
},
'name fileId rawLink'
).lean(),
MongoDatasetData.find(
{
_id: { $in: results.map((item) => item.dataId?.trim()) }
},
'datasetId collectionId q a chunkIndex indexes'
).lean()
]);
const dataList = (await MongoDatasetData.find(
{
teamId,
datasetId: { $in: datasetIds },
'indexes.dataId': { $in: results.map((item) => item.id?.trim()) }
},
'datasetId collectionId q a chunkIndex indexes'
)
.populate('collectionId', 'name fileId rawLink')
.lean()) as DatasetDataWithCollectionType[];
const formatResult = results
.map((item, index) => {
const collection = collections.find(
(collection) => String(collection._id) === item.collectionId
);
const data = dataList.find((data) => String(data._id) === item.dataId);
// add score to data(It's already sorted. The first one is the one with the most points)
const concatResults = dataList.map((data) => {
const dataIdList = data.indexes.map((item) => item.dataId);
// if collection or data UnExist, the relational mongo data already deleted
if (!collection || !data) return null;
const maxScoreResult = results.find((item) => {
return dataIdList.includes(item.id);
});
return {
...data,
score: maxScoreResult?.score || 0
};
});
concatResults.sort((a, b) => b.score - a.score);
const formatResult = concatResults
.map((data, index) => {
const result: SearchDataResponseItemType = {
id: String(data._id),
q: data.q,
a: data.a,
chunkIndex: data.chunkIndex,
datasetId: String(data.datasetId),
collectionId: String(data.collectionId),
sourceName: collection.name || '',
sourceId: collection?.fileId || collection?.rawLink,
score: [{ type: SearchScoreTypeEnum.embedding, value: item.score, index }]
collectionId: String(data.collectionId._id),
sourceName: data.collectionId.name || '',
sourceId: data.collectionId?.fileId || data.collectionId?.rawLink,
score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
};
return result;
@@ -515,7 +527,7 @@ export async function searchDatasetData(props: {
return {
embeddingRecallResults: formatResult,
tokens
charsLength
};
};
const fullTextRecall = async ({
@@ -540,6 +552,7 @@ export async function searchDatasetData(props: {
datasetIds.map((id) =>
MongoDatasetData.find(
{
teamId,
datasetId: id,
$text: { $search: jiebaSplit({ text: query }) }
},
@@ -688,26 +701,27 @@ export async function searchDatasetData(props: {
// multi query recall
const embeddingRecallResList: SearchDataResponseItemType[][] = [];
const fullTextRecallResList: SearchDataResponseItemType[][] = [];
let embTokens = 0;
let totalCharsLength = 0;
for await (const query of queries) {
const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
embeddingRecall({
query,
limit: embeddingLimit
}),
fullTextRecall({
query,
limit: fullTextLimit
})
]);
embTokens += tokens;
const [{ charsLength, embeddingRecallResults }, { fullTextRecallResults }] =
await Promise.all([
embeddingRecall({
query,
limit: embeddingLimit
}),
fullTextRecall({
query,
limit: fullTextLimit
})
]);
totalCharsLength += charsLength;
embeddingRecallResList.push(embeddingRecallResults);
fullTextRecallResList.push(fullTextRecallResults);
}
return {
tokens: embTokens,
charsLength: totalCharsLength,
embeddingRecallResults: embeddingRecallResList[0],
fullTextRecallResults: fullTextRecallResList[0]
};
@@ -778,7 +792,7 @@ export async function searchDatasetData(props: {
const { embeddingLimit, fullTextLimit } = countRecallLimit();
// recall
const { embeddingRecallResults, fullTextRecallResults, tokens } = await multiQueryRecall({
const { embeddingRecallResults, fullTextRecallResults, charsLength } = await multiQueryRecall({
embeddingLimit,
fullTextLimit
});
@@ -851,7 +865,7 @@ export async function searchDatasetData(props: {
return {
searchRes: filterResultsByMaxTokens(scoreFilter, maxTokens),
tokens,
charsLength,
searchMode,
limit: maxTokens,
similarity,

View File

@@ -4,18 +4,21 @@ import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
* Same value judgment
*/
export async function hasSameValue({
teamId,
collectionId,
q,
a = ''
}: {
teamId: string;
collectionId: string;
q: string;
a?: string;
}) {
const count = await MongoDatasetData.countDocuments({
teamId,
collectionId,
q,
a,
collectionId
a
});
if (count > 0) {

View File

@@ -1,6 +1,6 @@
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { pushQABill } from '@/service/support/wallet/bill/push';
import { DatasetDataIndexTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { DatasetDataIndexTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { sendOneInform } from '../support/user/inform/api';
import { getAIApi } from '@fastgpt/service/core/ai/config';
import type { ChatMessageItemType } from '@fastgpt/global/core/ai/type.d';
@@ -13,7 +13,7 @@ import { authTeamBalance } from '../support/permission/auth/bill';
import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';
import { UserErrEnum } from '@fastgpt/global/common/error/code/user';
import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller';
import { pushDataToDatasetCollection } from '@/service/core/dataset/data/controller';
import { pushDataToTrainingQueue } from '@/service/core/dataset/data/controller';
const reduceQueue = () => {
global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -135,7 +135,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
const qaArr = formatSplitText(answer, text); // 格式化后的QA对
// get vector and insert
const { insertLen } = await pushDataToDatasetCollection({
const { insertLen } = await pushDataToTrainingQueue({
teamId: data.teamId,
tmbId: data.tmbId,
collectionId: data.collectionId,
@@ -161,8 +161,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
pushQABill({
teamId: data.teamId,
tmbId: data.tmbId,
inputTokens: chatResponse.usage?.prompt_tokens || 0,
outputTokens: chatResponse.usage?.completion_tokens || 0,
charsLength: `${prompt}${answer}`.length,
billId: data.billId,
model
});
@@ -238,7 +237,7 @@ function formatSplitText(text: string, rawText: string) {
// empty result. direct split chunk
if (result.length === 0) {
const { chunks } = splitText2Chunks({ text: rawText, chunkLen: 512, countTokens: false });
const { chunks } = splitText2Chunks({ text: rawText, chunkLen: 512 });
chunks.forEach((chunk) => {
result.push({
q: chunk,

View File

@@ -1,6 +1,6 @@
import { insertData2Dataset } from '@/service/core/dataset/data/controller';
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { sendOneInform } from '../support/user/inform/api';
import { addLog } from '@fastgpt/service/common/system/log';
import { getErrText } from '@fastgpt/global/common/error/utils';
@@ -125,7 +125,7 @@ export async function generateVector(): Promise<any> {
}
// insert data to pg
const { tokens } = await insertData2Dataset({
const { charsLength } = await insertData2Dataset({
teamId: data.teamId,
tmbId: data.tmbId,
datasetId: data.datasetId,
@@ -141,7 +141,7 @@ export async function generateVector(): Promise<any> {
pushGenerateVectorBill({
teamId: data.teamId,
tmbId: data.tmbId,
tokens,
charsLength,
model: data.model,
billId: data.billId
});

View File

@@ -6,7 +6,7 @@ import type { ModuleDispatchProps } from '@fastgpt/global/core/module/type.d';
import { ModelTypeEnum } from '@/service/core/ai/model';
import { searchDatasetData } from '@/service/core/dataset/data/controller';
import { ModuleInputKeyEnum, ModuleOutputKeyEnum } from '@fastgpt/global/core/module/constants';
import { DatasetSearchModeEnum } from '@fastgpt/global/core/dataset/constant';
import { DatasetSearchModeEnum } from '@fastgpt/global/core/dataset/constants';
type DatasetSearchProps = ModuleDispatchProps<{
[ModuleInputKeyEnum.datasetSelectList]: SelectedDatasetType;
@@ -27,6 +27,7 @@ export async function dispatchDatasetSearch(
props: DatasetSearchProps
): Promise<DatasetSearchResponse> {
const {
teamId,
inputs: { datasets = [], similarity, limit = 1500, usingReRank, searchMode, userChatInput }
} = props as DatasetSearchProps;
@@ -39,7 +40,7 @@ export async function dispatchDatasetSearch(
}
if (!userChatInput) {
return Promise.reject('core.chat.error.User question empty');
return Promise.reject('core.chat.error.User input empty');
}
// get vector
@@ -54,10 +55,11 @@ export async function dispatchDatasetSearch(
// start search
const {
searchRes,
tokens,
charsLength,
usingSimilarityFilter,
usingReRank: searchUsingReRank
} = await searchDatasetData({
teamId,
rawQuery: `${userChatInput}`,
queries: concatQueries,
model: vectorModel.model,
@@ -70,7 +72,7 @@ export async function dispatchDatasetSearch(
const { total, modelName } = formatModelPrice2Store({
model: vectorModel.model,
inputLen: tokens,
inputLen: charsLength,
type: ModelTypeEnum.vector
});
@@ -82,7 +84,7 @@ export async function dispatchDatasetSearch(
price: total,
query: concatQueries.join('\n'),
model: modelName,
inputTokens: tokens,
charsLength,
similarity: usingSimilarityFilter ? similarity : undefined,
limit,
searchMode,

View File

@@ -42,8 +42,7 @@ const callbackMap: Record<`${FlowNodeTypeEnum}`, Function> = {
[FlowNodeTypeEnum.cfr]: dispatchCFR,
// none
[FlowNodeTypeEnum.userGuide]: () => Promise.resolve(),
[FlowNodeTypeEnum.variable]: () => Promise.resolve()
[FlowNodeTypeEnum.userGuide]: () => Promise.resolve()
};
/* running */

View File

@@ -5,6 +5,7 @@ import { authOutLink } from './outLink';
import { ChatErrEnum } from '@fastgpt/global/common/error/code/chat';
import { authUserRole } from '@fastgpt/service/support/permission/auth/user';
import { TeamMemberRoleEnum } from '@fastgpt/global/support/user/team/constant';
import { AuthResponseType } from '@fastgpt/global/support/permission/type';
/*
outLink: Must be the owner

View File

@@ -14,7 +14,7 @@ export async function authDatasetData({
const datasetData = await MongoDatasetData.findById(dataId);
if (!datasetData) {
return Promise.reject('Data not found');
return Promise.reject('core.dataset.error.Data not found');
}
const result = await authDatasetCollection({

View File

@@ -36,7 +36,8 @@ export const pushChatBill = ({
amount: item.price || 0,
model: item.model,
inputTokens: item.inputTokens,
outputTokens: item.outputTokens
outputTokens: item.outputTokens,
charsLength: item.charsLength
}))
});
addLog.info(`finish completions`, {
@@ -52,22 +53,19 @@ export const pushQABill = async ({
teamId,
tmbId,
model,
inputTokens,
outputTokens,
charsLength,
billId
}: {
teamId: string;
tmbId: string;
model: string;
inputTokens: number;
outputTokens: number;
charsLength: number;
billId: string;
}) => {
// 计算价格
const { total } = formatModelPrice2Store({
model,
inputLen: inputTokens,
outputLen: outputTokens,
inputLen: charsLength,
type: ModelTypeEnum.qa
});
@@ -76,8 +74,7 @@ export const pushQABill = async ({
teamId,
tmbId,
total,
inputTokens,
outputTokens,
charsLength,
listIndex: 1
});
@@ -88,25 +85,23 @@ export const pushGenerateVectorBill = ({
billId,
teamId,
tmbId,
tokens,
charsLength,
model,
source = BillSourceEnum.fastgpt
}: {
billId?: string;
teamId: string;
tmbId: string;
tokens: number;
charsLength: number;
model: string;
source?: `${BillSourceEnum}`;
}) => {
let { total, modelName } = formatModelPrice2Store({
model,
inputLen: tokens,
inputLen: charsLength,
type: ModelTypeEnum.vector
});
total = total < 1 ? 1 : total;
// 插入 Bill 记录
if (billId) {
concatBill({
@@ -114,7 +109,7 @@ export const pushGenerateVectorBill = ({
tmbId,
total,
billId,
inputTokens: tokens,
charsLength,
listIndex: 0
});
} else {
@@ -129,7 +124,7 @@ export const pushGenerateVectorBill = ({
moduleName: 'wallet.moduleName.index',
amount: total,
model: modelName,
inputTokens: tokens
charsLength
}
]
});
@@ -177,21 +172,21 @@ export const pushQuestionGuideBill = ({
export function pushAudioSpeechBill({
appName = 'wallet.bill.Audio Speech',
model,
textLen,
charsLength,
teamId,
tmbId,
source = BillSourceEnum.fastgpt
}: {
appName?: string;
model: string;
textLen: number;
charsLength: number;
teamId: string;
tmbId: string;
source: `${BillSourceEnum}`;
}) {
const { total, modelName } = formatModelPrice2Store({
model,
inputLen: textLen,
inputLen: charsLength,
type: ModelTypeEnum.audioSpeech
});
@@ -206,7 +201,7 @@ export function pushAudioSpeechBill({
moduleName: appName,
amount: total,
model: modelName,
textLen
charsLength
}
]
});
@@ -265,11 +260,11 @@ export function pushReRankBill({
const reRankModel = global.reRankModels[0];
if (!reRankModel) return { total: 0 };
const textLen = inputs.reduce((sum, item) => sum + item.text.length, 0);
const charsLength = inputs.reduce((sum, item) => sum + item.text.length, 0);
const { total, modelName } = formatModelPrice2Store({
model: reRankModel.model,
inputLen: textLen,
inputLen: charsLength,
type: ModelTypeEnum.rerank
});
const name = 'wallet.bill.ReRank';
@@ -285,7 +280,7 @@ export function pushReRankBill({
moduleName: name,
amount: total,
model: modelName,
textLen
charsLength
}
]
});

View File

@@ -69,7 +69,7 @@ export async function saveChat({
if (chat) {
promise.push(
MongoChat.updateOne(
{ chatId },
{ appId, chatId },
{
title,
updateTime: new Date(),