mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 21:13:50 +00:00

* Aiproxy (#3649) * model config * feat: model config ui * perf: rename variable * feat: custom request url * perf: model buffer * perf: init model * feat: json model config * auto login * fix: ts * update packages * package * fix: dockerfile * feat: usage filter & export & dashboard (#3538) * feat: usage filter & export & dashboard * adjust ui * fix tmb scroll * fix code & select all * merge * perf: usages list;perf: move components (#3654) * perf: usages list * team sub plan load * perf: usage dashboard code * perf: dashboard ui * perf: move components * add default model config (#3653) * 4.8.20 test (#3656) * provider * perf: model config * model perf (#3657) * fix: model * dataset quote * perf: model config * model tag * doubao model config * perf: config model * feat: model test * fix: POST 500 error on dingtalk bot (#3655) * feat: default model (#3662) * move model config * feat: default model * fix: false triggered org selection (#3661) * export usage csv i18n (#3660) * export usage csv i18n * fix build * feat: markdown extension (#3663) * feat: markdown extension * media cors * rerank test * default price * perf: default model * fix: cannot custom provider * fix: default model select * update bg * perf: default model selector * fix: usage export * i18n * fix: rerank * update init extension * perf: ip limit check * doubao model order * web default model * perf: tts selector * perf: tts error * qrcode package * reload buffer (#3665) * reload buffer * reload buffer * tts selector * fix: err tip (#3666) * fix: err tip * perf: training queue * doc * fix interactive edge (#3659) * fix interactive edge * fix * comment * add gemini model * fix: chat model select * perf: supplement assistant empty response (#3669) * perf: supplement assistant empty response * check array * perf: max_token count;feat: support reasoner output;fix: member scroll (#3681) * perf: supplement assistant empty response * check array * perf: max_token count * feat: support reasoner output *
member scroll * update provider order * i18n * fix: stream response (#3682) * perf: supplement assistant empty response * check array * fix: stream response * fix: model config cannot set to null * fix: reasoning response (#3684) * perf: supplement assistant empty response * check array * fix: reasoning response * fix: reasoning response * doc (#3685) * perf: supplement assistant empty response * check array * doc * lock * animation * update doc * update compose * doc * doc --------- Co-authored-by: heheer <heheer@sealos.io> Co-authored-by: a.e. <49438478+I-Info@users.noreply.github.com>
206 lines
5.1 KiB
TypeScript
206 lines
5.1 KiB
TypeScript
import { MongoDatasetTraining } from './schema';
|
|
import type {
|
|
PushDatasetDataChunkProps,
|
|
PushDatasetDataProps,
|
|
PushDatasetDataResponse
|
|
} from '@fastgpt/global/core/dataset/api.d';
|
|
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
|
import { simpleText } from '@fastgpt/global/common/string/tools';
|
|
import { ClientSession } from '../../../common/mongo';
|
|
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
|
|
import { addLog } from '../../../common/system/log';
|
|
import { getCollectionWithDataset } from '../controller';
|
|
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
|
|
|
|
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
|
|
try {
|
|
await MongoDatasetTraining.updateMany(
|
|
{
|
|
teamId
|
|
},
|
|
{
|
|
lockTime: new Date('2999/5/5')
|
|
}
|
|
);
|
|
} catch (error) {}
|
|
};
|
|
|
|
export const pushDataListToTrainingQueueByCollectionId = async ({
|
|
collectionId,
|
|
...props
|
|
}: {
|
|
teamId: string;
|
|
tmbId: string;
|
|
session?: ClientSession;
|
|
} & PushDatasetDataProps) => {
|
|
const {
|
|
dataset: { _id: datasetId, agentModel, vectorModel }
|
|
} = await getCollectionWithDataset(collectionId);
|
|
return pushDataListToTrainingQueue({
|
|
...props,
|
|
datasetId,
|
|
collectionId,
|
|
agentModel,
|
|
vectorModel
|
|
});
|
|
};
|
|
|
|
export async function pushDataListToTrainingQueue({
|
|
teamId,
|
|
tmbId,
|
|
datasetId,
|
|
collectionId,
|
|
agentModel,
|
|
vectorModel,
|
|
data,
|
|
prompt,
|
|
billId,
|
|
trainingMode = TrainingModeEnum.chunk,
|
|
session
|
|
}: {
|
|
teamId: string;
|
|
tmbId: string;
|
|
datasetId: string;
|
|
agentModel: string;
|
|
vectorModel: string;
|
|
session?: ClientSession;
|
|
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
|
|
const { model, maxToken, weight } = await (async () => {
|
|
const agentModelData = getLLMModel(agentModel);
|
|
if (!agentModelData) {
|
|
return Promise.reject(`File model ${agentModel} is inValid`);
|
|
}
|
|
const vectorModelData = getEmbeddingModel(vectorModel);
|
|
if (!vectorModelData) {
|
|
return Promise.reject(`Vector model ${vectorModel} is inValid`);
|
|
}
|
|
|
|
if (trainingMode === TrainingModeEnum.chunk) {
|
|
return {
|
|
maxToken: vectorModelData.maxToken * 1.5,
|
|
model: vectorModelData.model,
|
|
weight: vectorModelData.weight
|
|
};
|
|
}
|
|
|
|
if (trainingMode === TrainingModeEnum.qa || trainingMode === TrainingModeEnum.auto) {
|
|
return {
|
|
maxToken: agentModelData.maxContext * 0.8,
|
|
model: agentModelData.model,
|
|
weight: 0
|
|
};
|
|
}
|
|
|
|
return Promise.reject(`Training mode "${trainingMode}" is inValid`);
|
|
})();
|
|
|
|
// filter repeat or equal content
|
|
const set = new Set();
|
|
const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
|
|
success: [],
|
|
overToken: [],
|
|
repeat: [],
|
|
error: []
|
|
};
|
|
|
|
// format q and a, remove empty char
|
|
data.forEach((item) => {
|
|
item.q = simpleText(item.q);
|
|
item.a = simpleText(item.a);
|
|
|
|
item.indexes = item.indexes
|
|
?.map((index) => {
|
|
return {
|
|
...index,
|
|
text: simpleText(index.text)
|
|
};
|
|
})
|
|
.filter(Boolean);
|
|
|
|
// filter repeat content
|
|
if (!item.q) {
|
|
filterResult.error.push(item);
|
|
return;
|
|
}
|
|
|
|
const text = item.q + item.a;
|
|
|
|
if (text.length > maxToken) {
|
|
filterResult.overToken.push(item);
|
|
return;
|
|
}
|
|
|
|
if (set.has(text)) {
|
|
console.log('repeat', item);
|
|
filterResult.repeat.push(item);
|
|
} else {
|
|
filterResult.success.push(item);
|
|
set.add(text);
|
|
}
|
|
});
|
|
|
|
// insert data to db
|
|
const insertLen = filterResult.success.length;
|
|
const failedDocuments: PushDatasetDataChunkProps[] = [];
|
|
|
|
// 使用 insertMany 批量插入
|
|
const batchSize = 200;
|
|
const insertData = async (startIndex: number, session: ClientSession) => {
|
|
const list = filterResult.success.slice(startIndex, startIndex + batchSize);
|
|
|
|
if (list.length === 0) return;
|
|
|
|
try {
|
|
await MongoDatasetTraining.insertMany(
|
|
list.map((item) => ({
|
|
teamId,
|
|
tmbId,
|
|
datasetId,
|
|
collectionId,
|
|
billId,
|
|
mode: trainingMode,
|
|
prompt,
|
|
model,
|
|
q: item.q,
|
|
a: item.a,
|
|
chunkIndex: item.chunkIndex ?? 0,
|
|
weight: weight ?? 0,
|
|
indexes: item.indexes,
|
|
retryCount: 5
|
|
})),
|
|
{
|
|
session,
|
|
ordered: true
|
|
}
|
|
);
|
|
} catch (error: any) {
|
|
addLog.error(`Insert error`, error);
|
|
// 如果有错误,将失败的文档添加到失败列表中
|
|
error.writeErrors?.forEach((writeError: any) => {
|
|
failedDocuments.push(data[writeError.index]);
|
|
});
|
|
console.log('failed', failedDocuments);
|
|
}
|
|
|
|
// 对于失败的文档,尝试单独插入
|
|
await MongoDatasetTraining.create(failedDocuments, { session });
|
|
|
|
return insertData(startIndex + batchSize, session);
|
|
};
|
|
|
|
if (session) {
|
|
await insertData(0, session);
|
|
} else {
|
|
await mongoSessionRun(async (session) => {
|
|
await insertData(0, session);
|
|
});
|
|
}
|
|
|
|
delete filterResult.success;
|
|
|
|
return {
|
|
insertLen,
|
|
...filterResult
|
|
};
|
|
}
|