mirror of
https://github.com/labring/FastGPT.git
synced 2025-08-01 03:48:24 +00:00
4.6.7 first pr (#726)
This commit is contained in:
18
projects/app/src/service/common/system/cron.ts
Normal file
18
projects/app/src/service/common/system/cron.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
import { initSystemConfig } from '@/pages/api/common/system/getInitData';
|
||||
import { generateQA } from '@/service/events/generateQA';
|
||||
import { generateVector } from '@/service/events/generateVector';
|
||||
import { setCron } from '@fastgpt/service/common/system/cron';
|
||||
|
||||
/**
 * Register a cron job (expression `*/5 * * * *`) that refreshes the system config.
 *
 * The refresh is awaited inside a try/catch so that a thrown error or a
 * rejected promise is logged instead of escaping the cron callback, and the
 * success message is only printed once `initSystemConfig` has finished.
 */
export const setUpdateSystemConfigCron = () => {
  setCron('*/5 * * * *', async () => {
    try {
      // `await` is harmless if initSystemConfig turns out to be synchronous.
      await initSystemConfig();
      console.log('refresh system config');
    } catch (error) {
      console.log('refresh system config error:', error);
    }
  });
};
|
||||
|
||||
/**
 * Register a cron job (expression `*/3 * * * *`) that kicks the training workers.
 *
 * On every tick `generateVector` and `generateQA` are invoked, in that order,
 * without waiting for either of them to finish.
 */
export const setTrainingQueueCron = () => {
  const kickTrainingWorkers = () => {
    generateVector();
    generateQA();
  };

  setCron('*/3 * * * *', kickTrainingWorkers);
};
|
@@ -27,7 +27,7 @@ export function reRankRecall({ query, inputs }: PostReRankProps) {
|
||||
return data;
|
||||
})
|
||||
.catch((err) => {
|
||||
console.log(err);
|
||||
console.log('rerank error:', err);
|
||||
|
||||
return [];
|
||||
});
|
||||
|
@@ -14,7 +14,8 @@ import {
|
||||
DatasetDataIndexTypeEnum,
|
||||
DatasetSearchModeEnum,
|
||||
DatasetSearchModeMap,
|
||||
SearchScoreTypeEnum
|
||||
SearchScoreTypeEnum,
|
||||
TrainingModeEnum
|
||||
} from '@fastgpt/global/core/dataset/constant';
|
||||
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
|
||||
import { jiebaSplit } from '@/service/common/string/jieba';
|
||||
@@ -27,7 +28,173 @@ import {
|
||||
} from '@fastgpt/global/core/dataset/type';
|
||||
import { reRankRecall } from '../../ai/rerank';
|
||||
import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
|
||||
import { hashStr } from '@fastgpt/global/common/string/tools';
|
||||
import { hashStr, simpleText } from '@fastgpt/global/common/string/tools';
|
||||
import type { PushDatasetDataProps } from '@/global/core/dataset/api.d';
|
||||
import type { PushDataResponse } from '@/global/core/api/datasetRes';
|
||||
import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
|
||||
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
|
||||
import { startQueue } from '@/service/utils/tools';
|
||||
import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controller';
|
||||
import { getQAModel, getVectorModel } from '../../ai/model';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
|
||||
/**
 * Validate, de-duplicate and enqueue dataset items into the training queue.
 *
 * Steps:
 * 1. Resolve the collection's dataset and the model matching `trainingMode`
 *    (vector model for `chunk`, agent model for `qa`) to obtain the token limit.
 * 2. Normalize `q`, `a` and every index text with `simpleText`
 *    (the items of `data` are updated in place).
 * 3. Classify each item: empty `q` -> `error`; `q` over the token limit ->
 *    `overToken`; `q + a` already seen in this call -> `repeat`; otherwise queued.
 * 4. Insert the queued items into `MongoDatasetTraining` in batches of 50,
 *    then call `startQueue`.
 *
 * @returns `insertLen` (number of inserted training records) together with the
 *   `overToken`, `repeat` and `error` lists of items that were not queued.
 * @throws Rejects with a string when `collectionId` is empty, the configured
 *   model cannot be resolved, or `trainingMode` is not `chunk`/`qa`; rejects
 *   with the insert error once the retries of a batch are exhausted.
 */
export async function pushDataToDatasetCollection({
  teamId,
  tmbId,
  collectionId,
  data,
  prompt,
  billId,
  trainingMode
}: {
  teamId: string;
  tmbId: string;
} & PushDatasetDataProps): Promise<PushDataResponse> {
  // Guard before the lookup: the id is required to resolve the dataset at all,
  // regardless of the training mode.
  if (!collectionId) return Promise.reject(`CollectionId is empty`);

  // Resolve dataset + model and derive the per-item token limit for this mode.
  const checkModelValid = async ({ collectionId }: { collectionId: string }) => {
    const {
      datasetId: { _id: datasetId, vectorModel, agentModel }
    } = await getCollectionWithDataset(collectionId);

    if (trainingMode === TrainingModeEnum.chunk) {
      const vectorModelData = getVectorModel(vectorModel);
      if (!vectorModelData) {
        return Promise.reject(`Model ${vectorModel} is inValid`);
      }

      return {
        datasetId,
        maxToken: vectorModelData.maxToken * 1.5,
        model: vectorModelData.model,
        weight: vectorModelData.weight
      };
    }

    if (trainingMode === TrainingModeEnum.qa) {
      const qaModelData = getQAModel(agentModel);
      if (!qaModelData) {
        return Promise.reject(`Model ${agentModel} is inValid`);
      }

      return {
        datasetId,
        maxToken: qaModelData.maxContext * 0.8,
        model: qaModelData.model,
        weight: 0
      };
    }

    return Promise.reject(`Mode ${trainingMode} is inValid`);
  };

  const { datasetId, model, maxToken, weight } = await checkModelValid({
    collectionId
  });

  // Normalize q, a and index texts (in place, as callers receive these items back).
  data.forEach((item) => {
    item.q = simpleText(item.q);
    item.a = simpleText(item.a);

    item.indexes = item.indexes?.map((index) => ({
      ...index,
      text: simpleText(index.text)
    }));
  });

  // Split items into queued / rejected buckets; duplicates are detected on q + a.
  const seenTexts = new Set<string>();
  const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
    success: [],
    overToken: [],
    repeat: [],
    error: []
  };

  data.forEach((item) => {
    if (!item.q) {
      filterResult.error.push(item);
      return;
    }

    const text = item.q + item.a;

    // Only the question is counted against the token limit.
    const token = countPromptTokens(item.q);

    if (token > maxToken) {
      filterResult.overToken.push(item);
      return;
    }

    if (seenTexts.has(text)) {
      console.log('repeat', item);
      filterResult.repeat.push(item);
    } else {
      filterResult.success.push(item);
      seenTexts.add(text);
    }
  });

  // Insert one batch of training records; retried up to `retry` more times on failure.
  // `startIndex` is the batch's offset in the queued list, so the fallback
  // chunkIndex stays unique across batches instead of restarting at 0.
  const insertData = async (
    dataList: PushDatasetDataChunkProps[],
    startIndex: number,
    retry = 3
  ): Promise<number> => {
    try {
      const results = await MongoDatasetTraining.insertMany(
        dataList.map((item, i) => ({
          teamId,
          tmbId,
          datasetId,
          collectionId,
          billId,
          mode: trainingMode,
          prompt,
          model,
          q: item.q,
          a: item.a,
          chunkIndex: item.chunkIndex ?? (startIndex + i),
          weight: weight ?? 0,
          indexes: item.indexes
        }))
      );
      await delay(500);
      return results.length;
    } catch (error) {
      if (retry > 0) {
        await delay(1000);
        return insertData(dataList, startIndex, retry - 1);
      }
      return Promise.reject(error);
    }
  };

  // `success` is internal bookkeeping and is not part of the response.
  const { success, ...rejected } = filterResult;

  let insertLen = 0;
  const chunkSize = 50;
  // Sequential batches; an empty queued list performs no insert (and no delay).
  for (let offset = 0; offset < success.length; offset += chunkSize) {
    insertLen += await insertData(success.slice(offset, offset + chunkSize), offset);
  }

  startQueue();

  return {
    insertLen,
    ...rejected
  };
}
|
||||
|
||||
/* insert data.
|
||||
* 1. create data id
|
||||
@@ -439,7 +606,9 @@ export async function searchDatasetData(props: {
|
||||
}))
|
||||
});
|
||||
|
||||
if (!Array.isArray(results)) return [];
|
||||
if (!Array.isArray(results)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
// add new score to data
|
||||
const mergeResult = results
|
||||
@@ -457,7 +626,6 @@ export async function searchDatasetData(props: {
|
||||
|
||||
return mergeResult;
|
||||
} catch (error) {
|
||||
usingReRank = false;
|
||||
return [];
|
||||
}
|
||||
};
|
||||
|
@@ -8,20 +8,15 @@ import { addLog } from '@fastgpt/service/common/system/log';
|
||||
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { replaceVariable } from '@fastgpt/global/common/string/tools';
|
||||
import { Prompt_AgentQA } from '@/global/core/prompt/agent';
|
||||
import { pushDataToDatasetCollection } from '@/pages/api/core/dataset/data/pushData';
|
||||
import { getErrText } from '@fastgpt/global/common/error/utils';
|
||||
import { authTeamBalance } from '../support/permission/auth/bill';
|
||||
import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';
|
||||
import { UserErrEnum } from '@fastgpt/global/common/error/code/user';
|
||||
import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller';
|
||||
import { pushDataToDatasetCollection } from '@/service/core/dataset/data/controller';
|
||||
|
||||
const reduceQueue = (retry = false) => {
|
||||
const reduceQueue = () => {
|
||||
global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
|
||||
if (global.qaQueueLen === 0 && retry) {
|
||||
setTimeout(() => {
|
||||
generateQA();
|
||||
}, 60000);
|
||||
}
|
||||
|
||||
return global.vectorQueueLen === 0;
|
||||
};
|
||||
@@ -144,11 +139,11 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
|
||||
teamId: data.teamId,
|
||||
tmbId: data.tmbId,
|
||||
collectionId: data.collectionId,
|
||||
trainingMode: TrainingModeEnum.chunk,
|
||||
data: qaArr.map((item) => ({
|
||||
...item,
|
||||
chunkIndex: data.chunkIndex
|
||||
})),
|
||||
mode: TrainingModeEnum.chunk,
|
||||
billId: data.billId
|
||||
});
|
||||
|
||||
@@ -178,7 +173,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
|
||||
reduceQueue();
|
||||
generateQA();
|
||||
} catch (err: any) {
|
||||
reduceQueue(true);
|
||||
reduceQueue();
|
||||
// log
|
||||
if (err?.response) {
|
||||
addLog.info('openai error: 生成QA错误', {
|
||||
|
@@ -9,15 +9,9 @@ import { pushGenerateVectorBill } from '@/service/support/wallet/bill/push';
|
||||
import { UserErrEnum } from '@fastgpt/global/common/error/code/user';
|
||||
import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller';
|
||||
|
||||
const reduceQueue = (retry = false) => {
|
||||
const reduceQueue = () => {
|
||||
global.vectorQueueLen = global.vectorQueueLen > 0 ? global.vectorQueueLen - 1 : 0;
|
||||
|
||||
if (global.vectorQueueLen === 0 && retry) {
|
||||
setTimeout(() => {
|
||||
generateVector();
|
||||
}, 60000);
|
||||
}
|
||||
|
||||
return global.vectorQueueLen === 0;
|
||||
};
|
||||
|
||||
@@ -159,7 +153,7 @@ export async function generateVector(): Promise<any> {
|
||||
|
||||
console.log(`embedding finished, time: ${Date.now() - start}ms`);
|
||||
} catch (err: any) {
|
||||
reduceQueue(true);
|
||||
reduceQueue();
|
||||
// log
|
||||
if (err?.response) {
|
||||
addLog.info('openai error: 生成向量错误', {
|
||||
|
@@ -214,7 +214,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
|
||||
model: modelName,
|
||||
inputTokens,
|
||||
outputTokens,
|
||||
query: userChatInput,
|
||||
query: `${userChatInput}`,
|
||||
maxToken: max_tokens,
|
||||
quoteList: filterQuoteQA,
|
||||
historyPreview: getHistoryPreview(completeMessages),
|
||||
@@ -407,7 +407,7 @@ async function streamResponse({
|
||||
}
|
||||
|
||||
if (!answer) {
|
||||
return Promise.reject('Chat API is error or undefined');
|
||||
return Promise.reject('core.chat API is error or undefined');
|
||||
}
|
||||
|
||||
return { answer };
|
||||
|
@@ -58,7 +58,7 @@ export async function dispatchDatasetSearch(
|
||||
usingSimilarityFilter,
|
||||
usingReRank: searchUsingReRank
|
||||
} = await searchDatasetData({
|
||||
rawQuery: userChatInput,
|
||||
rawQuery: `${userChatInput}`,
|
||||
queries: concatQueries,
|
||||
model: vectorModel.model,
|
||||
similarity,
|
||||
|
@@ -61,7 +61,7 @@ A: ${systemPrompt}
|
||||
{
|
||||
role: 'user',
|
||||
content: replaceVariable(defaultPrompt, {
|
||||
query: userChatInput,
|
||||
query: `${userChatInput}`,
|
||||
histories: concatFewShot
|
||||
})
|
||||
}
|
||||
|
@@ -6,6 +6,8 @@ import { hashStr } from '@fastgpt/global/common/string/tools';
|
||||
import { createDefaultTeam } from '@fastgpt/service/support/user/team/controller';
|
||||
import { exit } from 'process';
|
||||
import { initVectorStore } from '@fastgpt/service/common/vectorStore/controller';
|
||||
import { getInitConfig } from '@/pages/api/common/system/getInitData';
|
||||
import { setUpdateSystemConfigCron, setTrainingQueueCron } from './common/system/cron';
|
||||
|
||||
/**
|
||||
* connect MongoDB and init data
|
||||
@@ -13,11 +15,18 @@ import { initVectorStore } from '@fastgpt/service/common/vectorStore/controller'
|
||||
export function connectToDatabase(): Promise<void> {
|
||||
return connectMongo({
|
||||
beforeHook: () => {},
|
||||
afterHook: () => {
|
||||
afterHook: async () => {
|
||||
initVectorStore();
|
||||
// start queue
|
||||
startQueue();
|
||||
return initRootUser();
|
||||
// init system config
|
||||
getInitConfig();
|
||||
|
||||
// cron
|
||||
setUpdateSystemConfigCron();
|
||||
setTrainingQueueCron();
|
||||
|
||||
initRootUser();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@@ -60,7 +60,6 @@ export async function saveChat({
|
||||
}))
|
||||
)
|
||||
];
|
||||
console.log(metadataUpdate);
|
||||
|
||||
const title =
|
||||
chatContentReplaceBlock(content[0].value).slice(0, 20) ||
|
||||
|
@@ -2,20 +2,9 @@ import { generateQA } from '../events/generateQA';
|
||||
import { generateVector } from '../events/generateVector';
|
||||
|
||||
/* start task */
|
||||
export const startQueue = (limit?: number) => {
|
||||
export const startQueue = () => {
|
||||
if (!global.systemEnv) return;
|
||||
|
||||
if (limit) {
|
||||
for (let i = 0; i < limit; i++) {
|
||||
generateVector();
|
||||
generateQA();
|
||||
}
|
||||
return;
|
||||
}
|
||||
for (let i = 0; i < global.systemEnv.qaMaxProcess; i++) {
|
||||
generateQA();
|
||||
}
|
||||
for (let i = 0; i < global.systemEnv.vectorMaxProcess; i++) {
|
||||
generateVector();
|
||||
}
|
||||
generateQA();
|
||||
generateVector();
|
||||
};
|
||||
|
Reference in New Issue
Block a user