import { MongoDatasetTraining } from './schema';
import type { PushDatasetDataResponse } from '@fastgpt/global/core/dataset/api.d';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { type ClientSession } from '../../../common/mongo';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { type PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
import { i18nT } from '../../../../web/i18n/utils';
import { getLLMMaxChunkSize } from '../../../../global/core/dataset/training/utils';
import { retryFn } from '@fastgpt/global/common/system/utils';

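/**
 * Mark every training record of the given team with a far-future lockTime,
 * effectively stopping the training queue from picking that team's data up again.
 */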
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
  try {
    await MongoDatasetTraining.updateMany(
      {
        teamId
      },
      {
        lockTime: new Date('2999/5/5')
      }
    );
  } catch (error) {
    // Locking is best-effort, but log the failure instead of swallowing it silently
    addLog.error('lockTrainingDataByTeamId error', error);
  }
};

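/**
 * Push a list of dataset items into the training queue.
 * Validates the models required by the selected training mode, drops empty or
 * oversized items, then batch-inserts the rest. Large payloads are split into
 * multiple MongoDB transactions to avoid transaction timeouts.
 */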
export async function pushDataListToTrainingQueue({
  teamId,
  tmbId,
  datasetId,
  collectionId,
  agentModel,
  vectorModel,
  vlmModel,
  data,
  billId,
  mode = TrainingModeEnum.chunk,
  indexSize,
  session
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
  const vectorModelData = getEmbeddingModel(vectorModel);
  if (!vectorModelData) {
    return Promise.reject(i18nT('common:error_embedding_not_config'));
  }
  const agentModelData = getLLMModel(agentModel);
  if (!agentModelData) {
    return Promise.reject(i18nT('common:error_llm_not_config'));
  }

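  // Resolve the model, token ceiling and training weight for the selected mode:
  // chunk -> embedding model (no token limit), qa/auto -> LLM, image/imageParse -> VLM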
  const { model, maxToken, weight } = await (async () => {
    if (mode === TrainingModeEnum.chunk) {
      return {
        maxToken: Infinity,
        model: vectorModelData.model,
        weight: vectorModelData.weight
      };
    }
    if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
      return {
        maxToken: getLLMMaxChunkSize(agentModelData),
        model: agentModelData.model,
        weight: 0
      };
    }
    if (mode === TrainingModeEnum.image || mode === TrainingModeEnum.imageParse) {
      const vllmModelData = getVlmModel(vlmModel);
      if (!vllmModelData) {
        return Promise.reject(i18nT('common:error_vlm_not_config'));
      }
      return {
        maxToken: getLLMMaxChunkSize(vllmModelData),
        model: vllmModelData.model,
        weight: 0
      };
    }

    return Promise.reject(`Training mode "${mode}" is invalid`);
  })();

  // Filter out invalid items before queuing
  data = data.filter((item) => {
    const q = item.q || '';
    const a = item.a || '';

    // Skip items with no content (neither an image nor question text)
    if (!item.imageId && !q) {
      return false;
    }

    const text = q + a;

    // Skip oversized items that exceed the model's max chunk size
    if (text.length > maxToken) {
      return false;
    }

    return true;
  });

  // Insert data into the database
  const batchSize = 500; // Batch insert size
  const maxBatchesPerTransaction = 20; // Each transaction inserts at most 20 batches

  const insertDataIterative = async (
    dataToInsert: typeof data,
    session: ClientSession
  ): Promise<number> => {
    let insertedCount = 0;

    for (let i = 0; i < dataToInsert.length; i += batchSize) {
      const batch = dataToInsert.slice(i, i + batchSize);

      if (batch.length === 0) continue;

      const result = await MongoDatasetTraining.insertMany(
        batch.map((item) => ({
          teamId,
          tmbId,
          datasetId,
          collectionId,
          billId,
          mode,
          ...(item.q && { q: item.q }),
          ...(item.a && { a: item.a }),
          ...(item.imageId && { imageId: item.imageId }),
          chunkIndex: item.chunkIndex ?? 0,
          indexSize,
          weight: weight ?? 0,
          indexes: item.indexes,
          retryCount: 5
        })),
        {
          session,
          ordered: true, // ordered: true: any failure stops immediately and the transaction rolls back
          rawResult: true,
          includeResultMetadata: false
        }
      );

      // With ordered: true, a successful insert always equals the batch size
      insertedCount += result.insertedCount;

      addLog.debug(`Training data insert progress: ${insertedCount}/${dataToInsert.length}`);
    }

    return insertedCount;
  };

  // Large datasets: split into chunked transactions to avoid transaction timeouts
  const chunkSize = maxBatchesPerTransaction * batchSize; // 10,000 items
  let start = Date.now();

  if (data.length > chunkSize) {
    addLog.info(`Large dataset detected (${data.length} items), using chunked transactions`);

    let totalInserted = 0;

    for (let i = 0; i < data.length; i += chunkSize) {
      const chunk = data.slice(i, i + chunkSize);

      await retryFn(async () => {
        const inserted = await mongoSessionRun(async (chunkSession) => {
          return insertDataIterative(chunk, chunkSession);
        });
        totalInserted += inserted;
      });
    }

    addLog.info(`Chunked transactions completed in ${Date.now() - start}ms`);

    return { insertLen: totalInserted };
  }

  // Small datasets: a single transaction, reusing the caller's session if one was provided
  if (session) {
    const insertedCount = await insertDataIterative(data, session);
    addLog.info(`Single transaction completed in ${Date.now() - start}ms`);
    return { insertLen: insertedCount };
  } else {
    const insertedCount = await mongoSessionRun(async (session) => {
      return insertDataIterative(data, session);
    });
    addLog.info(`Single transaction completed in ${Date.now() - start}ms`);
    return { insertLen: insertedCount };
  }
}

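/**
 * Queue a dataset collection for parsing by creating a single training record
 * with mode "parse" inside the caller's MongoDB session.
 */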
export const pushDatasetToParseQueue = async ({
  teamId,
  tmbId,
  datasetId,
  collectionId,
  billId,
  session
}: {
  teamId: string;
  tmbId: string;
  datasetId: string;
  collectionId: string;
  billId: string;
  session: ClientSession;
}) => {
  await MongoDatasetTraining.create(
    [
      {
        teamId,
        tmbId,
        datasetId,
        collectionId,
        billId,
        mode: TrainingModeEnum.parse
      }
    ],
    { session, ordered: true }
  );
};
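
/*
 * Hypothetical usage sketch (not part of this module): how a caller might push
 * parsed chunks into the training queue. The IDs, model names and chunk values
 * below are placeholders; real callers pass the collection being trained and
 * the team's configured models.
 *
 * await pushDataListToTrainingQueue({
 *   teamId,
 *   tmbId,
 *   datasetId: collection.datasetId,
 *   collectionId: collection._id,
 *   agentModel: 'gpt-4o-mini',              // placeholder LLM name
 *   vectorModel: 'text-embedding-3-small',  // placeholder embedding model name
 *   vlmModel: 'gpt-4o',                     // placeholder VLM, only used for image modes
 *   billId,
 *   mode: TrainingModeEnum.chunk,
 *   data: chunks.map((text, index) => ({ q: text, chunkIndex: index })),
 *   indexSize: 512
 * });
 */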