Files
FastGPT/packages/service/core/dataset/training/controller.ts
Archer db2c0a0bdb V4.8.20 feature (#3686)
* Aiproxy (#3649)

* model config

* feat: model config ui

* perf: rename variable

* feat: custom request url

* perf: model buffer

* perf: init model

* feat: json model config

* auto login

* fix: ts

* update packages

* package

* fix: dockerfile

* feat: usage filter & export & dashboard (#3538)

* feat: usage filter & export & dashboard

* adjust ui

* fix tmb scroll

* fix code & selecte all

* merge

* perf: usages list;perf: move components (#3654)

* perf: usages list

* team sub plan load

* perf: usage dashboard code

* perf: dashboard ui

* perf: move components

* add default model config (#3653)

* 4.8.20 test (#3656)

* provider

* perf: model config

* model perf (#3657)

* fix: model

* dataset quote

* perf: model config

* model tag

* doubao model config

* perf: config model

* feat: model test

* fix: POST 500 error on dingtalk bot (#3655)

* feat: default model (#3662)

* move model config

* feat: default model

* fix: falsely triggered org selection (#3661)

* export usage csv i18n (#3660)

* export usage csv i18n

* fix build

* feat: markdown extension (#3663)

* feat: markdown extension

* media cros

* rerank test

* default price

* perf: default model

* fix: cannot custom provider

* fix: default model select

* update bg

* perf: default model selector

* fix: usage export

* i18n

* fix: rerank

* update init extension

* perf: ip limit check

* doubao model order

* web default model

* perf: tts selector

* perf: tts error

* qrcode package

* reload buffer (#3665)

* reload buffer

* reload buffer

* tts selector

* fix: err tip (#3666)

* fix: err tip

* perf: training queue

* doc

* fix interactive edge (#3659)

* fix interactive edge

* fix

* comment

* add gemini model

* fix: chat model select

* perf: supplement assistant empty response (#3669)

* perf: supplement assistant empty response

* check array

* perf: max_token count;feat: support reasoner output;fix: member scroll (#3681)

* perf: supplement assistant empty response

* check array

* perf: max_token count

* feat: support reasoner output

* member scroll

* update provider order

* i18n

* fix: stream response (#3682)

* perf: supplement assistant empty response

* check array

* fix: stream response

* fix: model config cannot set to null

* fix: reasoning response (#3684)

* perf: supplement assistant empty response

* check array

* fix: reasoning response

* fix: reasoning response

* doc (#3685)

* perf: supplement assistant empty response

* check array

* doc

* lock

* animation

* update doc

* update compose

* doc

* doc

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: a.e. <49438478+I-Info@users.noreply.github.com>
2025-02-05 00:10:47 +08:00

206 lines
5.1 KiB
TypeScript

import { MongoDatasetTraining } from './schema';
import type {
PushDatasetDataChunkProps,
PushDatasetDataProps,
PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { ClientSession } from '../../../common/mongo';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { getCollectionWithDataset } from '../controller';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try {
await MongoDatasetTraining.updateMany(
{
teamId
},
{
lockTime: new Date('2999/5/5')
}
);
} catch (error) {}
};
/**
 * Resolve a collection's parent dataset (id + configured models) and forward
 * the push request to pushDataListToTrainingQueue.
 */
export const pushDataListToTrainingQueueByCollectionId = async ({
  collectionId,
  ...props
}: {
  teamId: string;
  tmbId: string;
  session?: ClientSession;
} & PushDatasetDataProps) => {
  const collection = await getCollectionWithDataset(collectionId);
  const dataset = collection.dataset;

  return pushDataListToTrainingQueue({
    ...props,
    collectionId,
    datasetId: dataset._id,
    agentModel: dataset.agentModel,
    vectorModel: dataset.vectorModel
  });
};
/**
 * Validate, normalize, dedupe and enqueue dataset chunks into the training
 * collection.
 *
 * Pipeline:
 *  1. Resolve the model + token budget for the selected training mode.
 *  2. Normalize q/a/index text and bucket items into success / overToken /
 *     repeat / error.
 *  3. Batch-insert successful items (200 per insertMany call), retrying a
 *     batch's failed documents individually via `create`.
 *
 * @returns insertLen (count of enqueued items) plus the rejected buckets
 *          (overToken / repeat / error).
 * @throws rejects when the agent model, vector model or training mode is
 *         unknown.
 */
export async function pushDataListToTrainingQueue({
  teamId,
  tmbId,
  datasetId,
  collectionId,
  agentModel,
  vectorModel,
  data,
  prompt,
  billId,
  trainingMode = TrainingModeEnum.chunk,
  session
}: {
  teamId: string;
  tmbId: string;
  datasetId: string;
  agentModel: string;
  vectorModel: string;
  session?: ClientSession;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
  // Pick the model and token budget according to the training mode
  const { model, maxToken, weight } = await (async () => {
    const agentModelData = getLLMModel(agentModel);
    if (!agentModelData) {
      return Promise.reject(`File model ${agentModel} is inValid`);
    }
    const vectorModelData = getEmbeddingModel(vectorModel);
    if (!vectorModelData) {
      return Promise.reject(`Vector model ${vectorModel} is inValid`);
    }

    if (trainingMode === TrainingModeEnum.chunk) {
      return {
        // 1.5x headroom over the embedding model's token budget
        maxToken: vectorModelData.maxToken * 1.5,
        model: vectorModelData.model,
        weight: vectorModelData.weight
      };
    }

    if (trainingMode === TrainingModeEnum.qa || trainingMode === TrainingModeEnum.auto) {
      return {
        // Leave 20% of the LLM context free for the generated output
        maxToken: agentModelData.maxContext * 0.8,
        model: agentModelData.model,
        weight: 0
      };
    }

    return Promise.reject(`Training mode "${trainingMode}" is inValid`);
  })();

  // Dedupe on the concatenated q+a text
  const seen = new Set<string>();
  const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
    success: [],
    overToken: [],
    repeat: [],
    error: []
  };

  // Normalize q/a and index text (remove empty chars), then bucket each item
  data.forEach((item) => {
    item.q = simpleText(item.q);
    item.a = simpleText(item.a);
    item.indexes = item.indexes
      ?.map((index) => {
        return {
          ...index,
          text: simpleText(index.text)
        };
      })
      .filter(Boolean);

    if (!item.q) {
      filterResult.error.push(item);
      return;
    }

    const text = item.q + item.a;

    // NOTE(review): character length is used as a proxy for token count here
    if (text.length > maxToken) {
      filterResult.overToken.push(item);
      return;
    }

    if (seen.has(text)) {
      console.log('repeat', item);
      filterResult.repeat.push(item);
    } else {
      filterResult.success.push(item);
      seen.add(text);
    }
  });

  const insertLen = filterResult.success.length;

  // Batch insert via insertMany, 200 documents per call
  const batchSize = 200;
  const insertData = async (startIndex: number, session: ClientSession): Promise<void> => {
    const list = filterResult.success.slice(startIndex, startIndex + batchSize);
    if (list.length === 0) return;

    // Build the full training documents once so the retry path can reuse them
    const documents = list.map((item) => ({
      teamId,
      tmbId,
      datasetId,
      collectionId,
      billId,
      mode: trainingMode,
      prompt,
      model,
      q: item.q,
      a: item.a,
      chunkIndex: item.chunkIndex ?? 0,
      weight: weight ?? 0,
      indexes: item.indexes,
      retryCount: 5
    }));

    // Failures are scoped per batch. (Fixed: the old code pushed
    // `data[writeError.index]` — indexing the full input instead of the
    // batch — and kept accumulating failures across batches, re-inserting
    // them on every subsequent batch.)
    const failedDocuments: typeof documents = [];

    try {
      await MongoDatasetTraining.insertMany(documents, {
        session,
        // NOTE(review): with ordered inserts the write stops at the first
        // failing document, so later documents in the batch are neither
        // inserted nor reported in writeErrors — confirm this is intended
        ordered: true
      });
    } catch (error: any) {
      addLog.error(`Insert error`, error);
      // writeError.index refers to the array passed to insertMany
      error.writeErrors?.forEach((writeError: any) => {
        failedDocuments.push(documents[writeError.index]);
      });
      console.log('failed', failedDocuments);
    }

    // Retry only this batch's failed documents, one by one
    if (failedDocuments.length > 0) {
      await MongoDatasetTraining.create(failedDocuments, { session });
    }

    return insertData(startIndex + batchSize, session);
  };

  if (session) {
    await insertData(0, session);
  } else {
    // No caller session: run all batches inside a fresh mongo session
    await mongoSessionRun(async (session) => {
      await insertData(0, session);
    });
  }

  // `success` is internal bookkeeping; callers only receive the rejected buckets
  delete filterResult.success;

  return {
    insertLen,
    ...filterResult
  };
}