Files
FastGPT/packages/service/core/dataset/training/controller.ts
Archer a499d05a02 V4.14.0 features (#5850)
* feat: migrate chat files to s3 (#5802)

* feat: migrate chat files to s3

* feat: add delete jobs for deleting s3 files

* chore: improvements

* fix: lockfile

* fix: imports

* feat: add ttl for those uploaded files but not send yet

* feat: init bullmq worker

* fix: s3 key

* perf: s3 internal url

* remove env

* fix: re-sign a new url

* fix: re-sign a new url

* perf: s3 code

---------

Co-authored-by: archer <545436317@qq.com>

* update package

* feat: add more file type for uploading (#5807)

* fix: re-sign a new url

* wip: file selector

* feat: add more file type for uploading

* feat: migrate chat files to s3 (#5802)

* feat: migrate chat files to s3

* feat: add delete jobs for deleting s3 files

* chore: improvements

* fix: lockfile

* fix: imports

* feat: add ttl for those uploaded files but not send yet

* feat: init bullmq worker

* fix: s3 key

* perf: s3 internal url

* remove env

* fix: re-sign a new url

* fix: re-sign a new url

* perf: s3 code

---------

Co-authored-by: archer <545436317@qq.com>

* fix: limit minmax available file upload number

* perf: file select modal code

* fix: fileselect refresh

* fix: ts

---------

Co-authored-by: archer <545436317@qq.com>

* bugfix: chat page (#5809)

* fix: upload avatar

* fix: chat page username display issue and setting button visibility

* doc

* Markdown match base64 performance

* feat: improve global variables(time, file, dataset) (#5804)

* feat: improve global variables(time, file, dataset)

* feat: optimize code

* perf: time variables code

* fix: model, file

* fix: hide file upload

* fix: ts

* hide dataset select

---------

Co-authored-by: archer <545436317@qq.com>

* perf: insert training queue

* perf: s3 upload error i18n

* fix: share page s3

* fix: timeselector ui error

* var update node

* Timepicker ui

* feat: plugin support password

* fix: password disabled UX

* fix: button size

* fix: no model cache for chat page (#5820)

* rename function

* fix: workflow bug

* fix: interactive loop

* fix test

* perf: common textarea no richtext

* move system plugin config (#5803) (#5813)

* move system plugin config (#5803)

* move system plugin config

* extract tag bar

* filter

* tool detail temp

* marketplace

* params

* fix

* type

* search

* tags render

* status

* ui

* code

* connect to backend (#5815)

* feat: marketplace apis & type definitions (#5817)

* chore: marketplace init

* chore: marketplace list api type

* chore: detail api

* marketplace & import

* feat: marketplace ui (#5826)

* temp

* marketplace

* import

* feat: detail return readme

* chore: cache data expire 10 mins

* chore: update docs

* feat: marketplace ui

---------

Co-authored-by: heheer <zhiyu44@qq.com>

* feat: marketplace (#5830)

* temp

* marketplace

* chore: tool list tag filter

* chore: adjust

---------

Co-authored-by: heheer <zhiyu44@qq.com>

* tool detail drawer

* remove tag filter

* fix

* fix

* fix build

* update pnpm-lock

* fix type

* perf code

* marketplace router

* fix build

* navbar icon

* fix ui

* fix init

* docs: marketplace/plugin (#5832)

* temp

* marketplace

* docs(plugin): system tool docs

---------

Co-authored-by: heheer <zhiyu44@qq.com>

* default url

* feat: i18n/ docker build (#5833)

* chore: docker build

* feat: i18n selector

* fix

* fix

* fix: i18n parse

* fix: i18n parse

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: heheer <zhiyu44@qq.com>

* marketplace url

* update action

* market place code

* market place code

* title

* fix: nextconfig

* fix: copilot review

* Remove bypassable regex-based XSS sanitization from marketplace search (#5835)

* Initial plan

* Remove problematic regex-based XSS sanitization from search inputs

Co-authored-by: c121914yu <50446880+c121914yu@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: c121914yu <50446880+c121914yu@users.noreply.github.com>

* feat: tool tag openapi

* api check

* fix: tsc

* fix: ts

* fix: lock

* sdk version

* ts

* sdk version

* remove invalid tip

* perf: export data add timezone

* perf: admin plugin api move

* perf: tool code

* move tag code

* perf: marketplace and team plugin code

* remove workflow invalid request

* rename global tool code

* rename global tool code

* rename api

* fix some bugs (#5841)

* fix some bugs

* fix

* perf: Tag filter

* fix: ts

* fix: ts

---------

Co-authored-by: archer <545436317@qq.com>

* perf: Concat function

* fix: workflow snapshot push

* fix: ts type

* fix: login to config/*

* fix: ts

* fix: model avatar (#5848)

* fix: model avatar

* fix: ts

* fix: avatar migration to s3

* update lock

* fix: avatar redirect

---------

Co-authored-by: archer <545436317@qq.com>

* fix tool detail (#5847)

* fix tool detail

* init script

* fix build

* perf: plugin detail modal

* change tooltags to tags

* fix icon

---------

Co-authored-by: archer <545436317@qq.com>

* fix tag filter scroll (#5852)

* fix create app plugin & import info (#5853)

* tag size

* rename toolkit

* download url

* import plugin status (#5854)

* init doc

* fix: init shell

---------

Co-authored-by: 伍闲犬 <whoeverimf5@gmail.com>
Co-authored-by: Zeng Qingwen <143274079+fishwww-ww@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
2025-11-04 16:58:12 +08:00

216 lines
5.9 KiB
TypeScript

import { MongoDatasetTraining } from './schema';
import type { PushDatasetDataResponse } from '@fastgpt/global/core/dataset/api.d';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { type ClientSession } from '../../../common/mongo';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { type PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
import { i18nT } from '../../../../web/i18n/utils';
import { getLLMMaxChunkSize } from '../../../../global/core/dataset/training/utils';
import { retryFn } from '@fastgpt/global/common/system/utils';
/**
 * Set `lockTime` to 2999/5/5 on every training-queue record that belongs to a team.
 *
 * NOTE(review): presumably the far-future `lockTime` keeps queue consumers from
 * picking these records up — confirm against the consumer query.
 *
 * This is a best-effort operation: a failed update is logged and swallowed, so the
 * returned promise always resolves.
 *
 * @param teamId - Team whose training records are updated.
 */
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
  try {
    await MongoDatasetTraining.updateMany({ teamId }, { lockTime: new Date('2999/5/5') });
  } catch (error) {
    // Keep the call non-throwing for callers, but do not hide the failure.
    addLog.error('Lock training data by teamId error', error);
  }
};
/**
 * Validate a list of dataset items and insert them into the training queue.
 *
 * Steps:
 * 1. Resolve the embedding and LLM models (both must be configured, whatever the mode).
 * 2. Derive the per-item length limit and the record weight from the training mode.
 * 3. Drop items that have neither an image nor a question, and items whose `q + a`
 *    length exceeds the limit.
 * 4. Insert the remaining items in batches of 500. Up to 10,000 items go into a single
 *    transaction (the caller's `session` if given, otherwise a new one). Larger inputs
 *    are split into several transactions of at most 10,000 items each.
 *
 * @returns `insertLen`: the number of records inserted.
 * @throws Rejects with an i18n key string when a required model is not configured, or
 *         with a message string when `mode` is not a supported training mode.
 */
export async function pushDataListToTrainingQueue({
  teamId,
  tmbId,
  datasetId,
  collectionId,
  agentModel,
  vectorModel,
  vlmModel,
  data,
  billId,
  mode = TrainingModeEnum.chunk,
  indexSize,
  session
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
  const vectorModelData = getEmbeddingModel(vectorModel);
  if (!vectorModelData) {
    return Promise.reject(i18nT('common:error_embedding_not_config'));
  }
  const agentModelData = getLLMModel(agentModel);
  if (!agentModelData) {
    return Promise.reject(i18nT('common:error_llm_not_config'));
  }

  // Length limit and record weight depend on which model handles the mode.
  const { maxToken, weight } = await (async () => {
    if (mode === TrainingModeEnum.chunk) {
      // Chunk mode has no length limit and takes its weight from the embedding model.
      return {
        maxToken: Infinity,
        weight: vectorModelData.weight
      };
    }
    if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
      return {
        maxToken: getLLMMaxChunkSize(agentModelData),
        weight: 0
      };
    }
    if (mode === TrainingModeEnum.image || mode === TrainingModeEnum.imageParse) {
      const vllmModelData = getVlmModel(vlmModel);
      if (!vllmModelData) {
        return Promise.reject(i18nT('common:error_vlm_not_config'));
      }
      return {
        maxToken: getLLMMaxChunkSize(vllmModelData),
        weight: 0
      };
    }
    return Promise.reject(`Training mode "${mode}" is inValid`);
  })();

  // Keep only items that carry content and fit within the length limit.
  const validData = data.filter((item) => {
    const q = item.q || '';
    const a = item.a || '';
    // Nothing to train on: no image and no question text.
    if (!item.imageId && !q) {
      return false;
    }
    // Combined text is longer than the limit (compared by character count).
    if ((q + a).length > maxToken) {
      return false;
    }
    return true;
  });

  // Nothing left to insert: skip opening a session/transaction altogether.
  if (validData.length === 0) {
    return { insertLen: 0 };
  }

  const batchSize = 500; // Documents per insertMany call
  const maxBatchesPerTransaction = 20; // Every session can insert at most 20 batches

  /** Insert `dataToInsert` batch by batch within `insertSession`; returns the inserted count. */
  const insertDataIterative = async (
    dataToInsert: typeof validData,
    insertSession: ClientSession
  ): Promise<number> => {
    let insertedCount = 0;
    for (let i = 0; i < dataToInsert.length; i += batchSize) {
      const batch = dataToInsert.slice(i, i + batchSize);
      if (batch.length === 0) continue;
      const result = await MongoDatasetTraining.insertMany(
        batch.map((item) => ({
          teamId,
          tmbId,
          datasetId,
          collectionId,
          billId,
          mode,
          // Only persist optional fields that actually carry a value.
          ...(item.q && { q: item.q }),
          ...(item.a && { a: item.a }),
          ...(item.imageId && { imageId: item.imageId }),
          chunkIndex: item.chunkIndex ?? 0,
          indexSize,
          weight: weight ?? 0,
          indexes: item.indexes,
          retryCount: 5
        })),
        {
          session: insertSession,
          ordered: true, // Stop at the first failure so the surrounding transaction can roll back
          rawResult: true,
          includeResultMetadata: false
        }
      );
      // With ordered: true, a successful call inserted the whole batch.
      insertedCount += result.insertedCount;
      addLog.debug(`Training data insert progress: ${insertedCount}/${dataToInsert.length}`);
    }
    return insertedCount;
  };

  // Large inputs are split across several transactions to avoid transaction timeouts.
  const chunkSize = maxBatchesPerTransaction * batchSize; // 10,000 items
  const start = Date.now();

  if (validData.length > chunkSize) {
    addLog.info(`Large dataset detected (${validData.length} items), using chunked transactions`);
    let totalInserted = 0;
    // The caller-supplied `session` is not used on this path: every chunk runs in its
    // own session from mongoSessionRun.
    // NOTE(review): presumably chunks that already succeeded stay committed if a later
    // chunk fails — confirm against mongoSessionRun.
    for (let i = 0; i < validData.length; i += chunkSize) {
      const chunk = validData.slice(i, i + chunkSize);
      await retryFn(async () => {
        const inserted = await mongoSessionRun(async (chunkSession) => {
          return insertDataIterative(chunk, chunkSession);
        });
        // Only reached after the chunk succeeded, so retries never double count.
        totalInserted += inserted;
      });
    }
    addLog.info(`Chunked transactions completed in ${Date.now() - start}ms`);
    return { insertLen: totalInserted };
  }

  // Small inputs: one transaction, reusing the caller's session when one was given.
  const insertedCount = session
    ? await insertDataIterative(validData, session)
    : await mongoSessionRun(async (ownSession) => insertDataIterative(validData, ownSession));
  addLog.info(`Single transaction completed in ${Date.now() - start}ms`);
  return { insertLen: insertedCount };
}
/**
 * Insert a single `parse`-mode record into the training queue for a collection.
 *
 * The record is created inside the caller's `session`.
 */
export const pushDatasetToParseQueue = async ({
  teamId,
  tmbId,
  datasetId,
  collectionId,
  billId,
  session
}: {
  teamId: string;
  tmbId: string;
  datasetId: string;
  collectionId: string;
  billId: string;
  session: ClientSession;
}) => {
  // The one queue entry that represents the pending parse job.
  const parseTask = {
    teamId,
    tmbId,
    datasetId,
    collectionId,
    billId,
    mode: TrainingModeEnum.parse
  };
  const createOptions = { session, ordered: true };

  // The array form of `create` is used so that the options argument can be passed.
  await MongoDatasetTraining.create([parseTask], createOptions);
};