Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-28 17:29:44 +00:00)
V4.9.11 feature (#4969)
* Feat: Images dataset collection (#4941)
  * New pic (#4858)
  * Update dataset-related types: add image file ID and preview URL support; improve dataset import with a new image-dataset processing component; fix some i18n strings; update the file-upload logic to support the new feature.
  * Note the differences from the original code.
  * Add the V4.9.10 release notes: support the PG `systemEnv.hnswMaxScanTuples` setting, improve LLM stream call timeouts, and fix result ordering for full-text search across multiple knowledge bases. Also update the dataset indexes, removing the datasetId field to simplify queries.
  * Switch to the fileId_image logic and add training-queue matching logic.
  * Add image-collection detection and streamline preview-URL generation so preview URLs are only built when the dataset is an image collection; add related debug logging.
  * Refactor the Docker Compose configuration to comment out exposed ports for production environments, update image versions for pgvector, fastgpt, and mcp_server, and add a Redis health check. Standardize dataset collection labels in constants and improve internationalization strings across multiple languages.
  * Enhance the TrainingStates component with i18n support for the imageParse training mode, and include imageParse in defaultCounts in the trainingDetail API.
  * Add extra steps to the dataset import context for the image-dataset import flow, and improve i18n strings for modal buttons in the useEditTitle hook.
  * Update DatasetImportContext to render the MyStep component conditionally based on data source type, improving the import flow for non-image datasets.
  * Refactor image-dataset handling: improve i18n strings, sharpen error messages, and streamline the preview-URL generation flow.
  * Upload images to the new dataset_collection_images collection and adjust the related logic.
  * Fix issues in all parts other than the controller.
  * Consolidate the image-dataset logic into the controller.
  * Add missing i18n strings.
  * Add missing i18n strings.
  * Resolve review comments: mainly upload-logic changes and component reuse.
  * Show an icon for image names.
  * Fix naming issues that caused build errors.
  * Remove the unneeded collectionId parts.
  * Clean up redundant files and adjust a delete button.
  * Resolve all review comments except loading and the unified imageId.
  * Fix icon errors.
  * Reuse MyPhotoView and rename imageFileId to imageId via a global replace.
  * Revert unnecessary file changes.
  * Fix errors and field names.
  * Delete temporary files after a successful upload, and roll back some changes.
  * Remove the path field, store images in GridFS, and update the create/delete code accordingly.
  * Fix build errors.
  ---------
  Co-authored-by: archer <545436317@qq.com>
* perf: image dataset
* feat: insert image
* perf: image icon
* fix: training state
---------
Co-authored-by: Zhuangzai fa <143257420+ctrlz526@users.noreply.github.com>
* fix: ts (#4948)
* Thirddatasetmd (#4942)
  * add thirddataset.md
  * fix thirddataset.md
  * fix
  * delete wrong png
  ---------
  Co-authored-by: dreamer6680 <146868355@qq.com>
* perf: api dataset code
* perf: log
* add secondary.tsx (#4946)
  * add secondary.tsx
  * fix
  ---------
  Co-authored-by: dreamer6680 <146868355@qq.com>
* perf: multiple menu
* perf: i18n
* feat: parse queue (#4960)
  * feat: parse queue
  * feat: sync parse queue
* fix thirddataset.md (#4962)
* fix thirddataset-4.png (#4963)
* feat: Dataset template import (#4934)
  * Template import is done except for the documentation.
  * Fix template-import build errors.
  * Document production
  * compress pictures
  * Change some constants to variables
  ---------
  Co-authored-by: Archer <545436317@qq.com>
* perf: template import
* doc
* llm paragraph
* bocha tool
* fix: del collection
---------
Co-authored-by: Zhuangzai fa <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: dreamer6680 <1468683855@qq.com>
Co-authored-by: dreamer6680 <146868355@qq.com>
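For context before the diff: the heart of the image-dataset change is a check that keeps a chunk in the image training mode only when its content actually embeds a markdown image. The sketch below restates that check from the diff as standalone TypeScript; the name hasMarkdownImage and the sample URLs are illustrative, not identifiers from the commit.

// Minimal sketch of the markdown-image check performed by formatTrainingMode
// in the diff below. `hasMarkdownImage` is an illustrative name.
const hasMarkdownImage = (q?: string, a?: string): boolean => {
  // The old code computed `data.q + data.a || ''`, which produces the literal
  // string "undefined" when a field is missing; the fix defaults each field.
  const text = (q || '') + (a || '');
  // Matches markdown images of the form ![](url)
  const regex = /!\[\]\((.*?)\)/g;
  return regex.test(text);
};

// Example: only chunks that embed an image keep the image training mode.
// hasMarkdownImage('see ![](https://example.com/cat.png)')  -> true
// hasMarkdownImage('plain text', 'no images here')          -> false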
@@ -12,10 +12,7 @@ import { getCollectionWithDataset } from '../controller';
 import { mongoSessionRun } from '../../../common/mongo/sessionRun';
 import { type PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
 import { i18nT } from '../../../../web/i18n/utils';
-import {
-  getLLMDefaultChunkSize,
-  getLLMMaxChunkSize
-} from '../../../../global/core/dataset/training/utils';
+import { getLLMMaxChunkSize } from '../../../../global/core/dataset/training/utils';
 
 export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
   try {
@@ -62,10 +59,10 @@ export async function pushDataListToTrainingQueue({
   indexSize,
   session
 }: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
-  const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
+  const formatTrainingMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
     if (mode !== TrainingModeEnum.image) return mode;
     // Check whether the content contains images in markdown ![](url) format
-    const text = data.q + data.a || '';
+    const text = (data.q || '') + (data.a || '');
     const regex = /!\[\]\((.*?)\)/g;
     const match = text.match(regex);
     if (match) {
@@ -82,9 +79,6 @@ export async function pushDataListToTrainingQueue({
   if (!agentModelData) {
     return Promise.reject(i18nT('common:error_llm_not_config'));
   }
-  if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
-    prompt = undefined;
-  }
 
   const { model, maxToken, weight } = await (async () => {
     if (mode === TrainingModeEnum.chunk) {
@@ -101,7 +95,7 @@ export async function pushDataListToTrainingQueue({
         weight: 0
       };
     }
-    if (mode === TrainingModeEnum.image) {
+    if (mode === TrainingModeEnum.image || mode === TrainingModeEnum.imageParse) {
       const vllmModelData = getVlmModel(vlmModel);
       if (!vllmModelData) {
         return Promise.reject(i18nT('common:error_vlm_not_config'));
@@ -116,17 +110,8 @@ export async function pushDataListToTrainingQueue({
     return Promise.reject(`Training mode "${mode}" is inValid`);
   })();
 
-  // filter repeat or equal content
-  const set = new Set();
-  const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
-    success: [],
-    overToken: [],
-    repeat: [],
-    error: []
-  };
-
   // format q and a, remove empty char
-  data.forEach((item) => {
+  data = data.filter((item) => {
     item.q = simpleText(item.q);
     item.a = simpleText(item.a);
 
@@ -140,8 +125,7 @@ export async function pushDataListToTrainingQueue({
       .filter(Boolean);
 
     // filter repeat content
-    if (!item.q) {
-      filterResult.error.push(item);
+    if (!item.imageId && !item.q) {
       return;
     }
 
@@ -149,42 +133,36 @@ export async function pushDataListToTrainingQueue({
 
     // Oversize llm tokens
     if (text.length > maxToken) {
-      filterResult.overToken.push(item);
       return;
     }
 
-    if (set.has(text)) {
-      filterResult.repeat.push(item);
-    } else {
-      filterResult.success.push(item);
-      set.add(text);
-    }
+    return true;
   });
 
   // insert data to db
-  const insertLen = filterResult.success.length;
-  const failedDocuments: PushDatasetDataChunkProps[] = [];
+  const insertLen = data.length;
 
   // Batch insert with insertMany
-  const batchSize = 200;
+  const batchSize = 500;
   const insertData = async (startIndex: number, session: ClientSession) => {
-    const list = filterResult.success.slice(startIndex, startIndex + batchSize);
+    const list = data.slice(startIndex, startIndex + batchSize);
 
     if (list.length === 0) return;
 
     try {
-      await MongoDatasetTraining.insertMany(
+      const result = await MongoDatasetTraining.insertMany(
         list.map((item) => ({
           teamId,
           tmbId,
-          datasetId,
-          collectionId,
+          datasetId: datasetId,
+          collectionId: collectionId,
           billId,
-          mode: getImageChunkMode(item, mode),
+          mode: formatTrainingMode(item, mode),
           prompt,
           model,
-          q: item.q,
-          a: item.a,
+          ...(item.q && { q: item.q }),
+          ...(item.a && { a: item.a }),
+          ...(item.imageId && { imageId: item.imageId }),
           chunkIndex: item.chunkIndex ?? 0,
           indexSize,
           weight: weight ?? 0,
@@ -193,21 +171,20 @@ export async function pushDataListToTrainingQueue({
         })),
         {
           session,
-          ordered: true
+          ordered: false,
+          rawResult: true,
+          includeResultMetadata: false // further reduce the returned data
        }
      );
 
+      if (result.insertedCount !== list.length) {
+        return Promise.reject(`Insert data error, ${JSON.stringify(result)}`);
+      }
    } catch (error: any) {
      addLog.error(`Insert error`, error);
-      // If there are errors, add the failed documents to the failure list
-      error.writeErrors?.forEach((writeError: any) => {
-        failedDocuments.push(data[writeError.index]);
-      });
-      console.log('failed', failedDocuments);
      return Promise.reject(error);
    }
 
-    // Retry the failed documents individually
-    await MongoDatasetTraining.create(failedDocuments, { session });
-
    return insertData(startIndex + batchSize, session);
  };
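A note on the insertMany options changed in the hunk above: with ordered: false, MongoDB keeps inserting the remaining documents of a batch after an individual failure, and rawResult: true makes Mongoose return the driver's raw insert result, which is why the code can compare insertedCount against the batch length instead of collecting failedDocuments and retrying them. A minimal sketch of that pattern, assuming a generic Mongoose model (MyModel and docs are stand-ins, not names from the commit):

import { Model, ClientSession } from 'mongoose';

// Sketch: unordered batch insert with an insertedCount check.
// `MyModel` stands in for MongoDatasetTraining in the commit.
async function insertBatch(MyModel: Model<any>, docs: object[], session: ClientSession) {
  const result = await MyModel.insertMany(docs, {
    session,
    ordered: false, // keep inserting the rest of the batch on individual errors
    rawResult: true // return the raw driver result, which carries insertedCount
  });
  if (result.insertedCount !== docs.length) {
    // Surface a partial failure instead of silently dropping documents
    return Promise.reject(`Insert data error, ${JSON.stringify(result)}`);
  }
}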
@@ -219,10 +196,37 @@ export async function pushDataListToTrainingQueue({
     });
   }
 
-  delete filterResult.success;
-
   return {
-    insertLen,
-    ...filterResult
+    insertLen
   };
 }
 
+export const pushDatasetToParseQueue = async ({
+  teamId,
+  tmbId,
+  datasetId,
+  collectionId,
+  billId,
+  session
+}: {
+  teamId: string;
+  tmbId: string;
+  datasetId: string;
+  collectionId: string;
+  billId: string;
+  session: ClientSession;
+}) => {
+  await MongoDatasetTraining.create(
+    [
+      {
+        teamId,
+        tmbId,
+        datasetId,
+        collectionId,
+        billId,
+        mode: TrainingModeEnum.parse
+      }
+    ],
+    { session, ordered: true }
+  );
+};
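The new pushDatasetToParseQueue enqueues a single parse-mode training record for a collection. Given the mongoSessionRun import at the top of this file, it is presumably called inside a Mongo transaction; a hedged usage sketch follows, where all ID strings are placeholders:

// Sketch: enqueue a collection for parsing inside a transaction.
// mongoSessionRun wraps the callback in a Mongo session; the ID values
// below are placeholders, not real identifiers.
await mongoSessionRun(async (session) => {
  await pushDatasetToParseQueue({
    teamId: 'team-id',
    tmbId: 'tmb-id',
    datasetId: 'dataset-id',
    collectionId: 'collection-id',
    billId: 'bill-id',
    session
  });
});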