Files
FastGPT/packages/service/core/dataset/training/controller.ts
Archer 064c64e74c V4.6.9-first commit (#899)
* perf: insert mongo dataset data session

* perf: dataset data index

* remove delay

* rename bill schema

* rename bill record

* perf: bill table

* perf: prompt

* perf: sub plan

* change the usage count

* feat: usage bill

* publish usages

* doc

* 新增团队聊天功能 (#20)

* perf: doc

* feat 添加标签部分

feat 信息团队标签配置

feat 新增团队同步管理

feat team分享页面

feat 完成team分享页面

feat 实现模糊搜索

style 格式化

fix 修复迷糊匹配

style 样式修改

fix 团队标签功能修复

* fix 修复鉴权功能

* merge 合并代码

* fix 修复引用错误

* fix 修复pr问题

* fix 修复ts格式问题

---------

Co-authored-by: archer <545436317@qq.com>
Co-authored-by: liuxingwan <liuxingwan.lxw@alibaba-inc.com>

* update extra plan

* fix: ts

* format

* perf: bill field

* feat: standard plan

* fix: ts

* feat 个人账号页面修改 (#22)

* feat 添加标签部分

feat 信息团队标签配置

feat 新增团队同步管理

feat team分享页面

feat 完成team分享页面

feat 实现模糊搜索

style 格式化

fix 修复迷糊匹配

style 样式修改

fix 团队标签功能修复

* fix 修复鉴权功能

* merge 合并代码

* fix 修复引用错误

* fix 修复pr问题

* fix 修复ts格式问题

* feat 修改个人账号页

---------

Co-authored-by: liuxingwan <liuxingwan.lxw@alibaba-inc.com>

* sub plan page (#23)

* fix chunk index; error page text

* feat: dataset process Integral prediction

* feat: stand plan field

* feat: sub plan limit

* perf: index

* query extension

* perf: share link push app name

* perf: plan point unit

* perf: get sub plan

* perf: account page

* feat 新增套餐详情弹窗代码 (#24)

* merge 合并代码

* fix 新增套餐详情弹框

* fix 修复pr问题

* feat: change http node input to prompt editor (#21)

* feat: change http node input to prompt editor

* fix

* split PromptEditor to HttpInput

* Team plans (#25)

* perf: pay check

* perf: team plan test

* plan limit check

* replace sensitive text

* perf: fix some null

* collection null check

* perf: plans modal

* perf: http module

* pacakge (#26)

* individuation page and pay modal amount (#27)

* feat: individuation page

* team chat config

* pay modal

* plan count and replace invalid chars (#29)

* fix: user oneapi

* fix: training queue

* fix: qa queue

* perf: remove space chars

* replace invalid chars

* change httpinput dropdown menu (#28)

* perf: http

* reseet free plan

* perf: plan code to packages

* remove llm config to package

* perf: code

* perf: faq

* fix: get team plan

---------

Co-authored-by: yst <77910600+yu-and-liu@users.noreply.github.com>
Co-authored-by: liuxingwan <liuxingwan.lxw@alibaba-inc.com>
Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
2024-02-28 13:19:15 +08:00

184 lines
4.6 KiB
TypeScript

import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDatasetTraining } from './schema';
import type {
PushDatasetDataChunkProps,
PushDatasetDataProps,
PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { getCollectionWithDataset } from '../controller';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
import type { VectorModelItemType, LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try {
await MongoDatasetTraining.updateMany(
{
teamId
},
{
lockTime: new Date('2999/5/5')
}
);
} catch (error) {}
};
export async function pushDataListToTrainingQueue({
teamId,
tmbId,
collectionId,
data,
prompt,
billId,
trainingMode = TrainingModeEnum.chunk,
vectorModelList = [],
datasetModelList = []
}: {
teamId: string;
tmbId: string;
vectorModelList: VectorModelItemType[];
datasetModelList: LLMModelItemType[];
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
const {
datasetId: { _id: datasetId, vectorModel, agentModel }
} = await getCollectionWithDataset(collectionId);
const checkModelValid = async () => {
if (trainingMode === TrainingModeEnum.chunk) {
const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel);
if (!vectorModelData) {
return Promise.reject(`File model ${vectorModel} is inValid`);
}
return {
maxToken: vectorModelData.maxToken * 1.3,
model: vectorModelData.model,
weight: vectorModelData.weight
};
}
if (trainingMode === TrainingModeEnum.qa) {
const qaModelData = datasetModelList?.find((item) => item.model === agentModel);
if (!qaModelData) {
return Promise.reject(`Vector model ${agentModel} is inValid`);
}
return {
maxToken: qaModelData.maxContext * 0.8,
model: qaModelData.model,
weight: 0
};
}
return Promise.reject(`Training mode "${trainingMode}" is inValid`);
};
const { model, maxToken, weight } = await checkModelValid();
// format q and a, remove empty char
data.forEach((item) => {
item.q = simpleText(item.q);
item.a = simpleText(item.a);
item.indexes = item.indexes
?.map((index) => {
return {
...index,
text: simpleText(index.text)
};
})
.filter(Boolean);
});
// filter repeat or equal content
const set = new Set();
const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
success: [],
overToken: [],
repeat: [],
error: []
};
// filter repeat content
data.forEach((item) => {
if (!item.q) {
filterResult.error.push(item);
return;
}
const text = item.q + item.a;
// count q token
const token = countPromptTokens(item.q);
if (token > maxToken) {
filterResult.overToken.push(item);
return;
}
if (set.has(text)) {
console.log('repeat', item);
filterResult.repeat.push(item);
} else {
filterResult.success.push(item);
set.add(text);
}
});
// insert data to db
const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
try {
const results = await MongoDatasetTraining.insertMany(
dataList.map((item, i) => ({
teamId,
tmbId,
datasetId,
collectionId,
billId,
mode: trainingMode,
prompt,
model,
q: item.q,
a: item.a,
chunkIndex: item.chunkIndex ?? 0,
weight: weight ?? 0,
indexes: item.indexes
}))
);
await delay(500);
return results.length;
} catch (error) {
if (retry > 0) {
await delay(500);
return insertData(dataList, retry - 1);
}
return Promise.reject(error);
}
};
let insertLen = 0;
const chunkSize = 50;
const chunkList = filterResult.success.reduce(
(acc, cur) => {
const lastChunk = acc[acc.length - 1];
if (lastChunk.length < chunkSize) {
lastChunk.push(cur);
} else {
acc.push([cur]);
}
return acc;
},
[[]] as PushDatasetDataChunkProps[][]
);
for await (const chunks of chunkList) {
insertLen += await insertData(chunks);
}
delete filterResult.success;
return {
insertLen,
...filterResult
};
}