From 27332743c717d9d91b4ca09f897e2574c7367144 Mon Sep 17 00:00:00 2001 From: Archer <545436317@qq.com> Date: Wed, 2 Apr 2025 10:53:15 +0800 Subject: [PATCH] Training status (#4424) * dataset data training state (#4311) * dataset data training state * fix * fix ts * fix * fix api format * fix * fix * perf: count training * format * fix: dataset training state (#4417) * fix * add test * fix * fix * fix test * fix test * perf: training count * count * loading status --------- Co-authored-by: heheer --- packages/global/core/dataset/type.d.ts | 2 + .../service/core/dataset/training/schema.ts | 4 +- .../web/components/common/Icon/constants.ts | 2 + .../common/Icon/icons/common/check.svg | 4 +- .../common/Icon/icons/common/maximize.svg | 3 + .../common/Icon/icons/common/running.svg | 3 + packages/web/hooks/useScrollPagination.tsx | 8 +- packages/web/i18n/en/common.json | 2 + packages/web/i18n/en/dataset.json | 25 + packages/web/i18n/zh-CN/common.json | 2 + packages/web/i18n/zh-CN/dataset.json | 26 + packages/web/i18n/zh-Hant/common.json | 2 + packages/web/i18n/zh-Hant/dataset.json | 25 + .../app/src/global/core/dataset/type.d.ts | 1 + .../detail/CollectionCard/TrainingStates.tsx | 502 ++++++++++++++++++ .../dataset/detail/CollectionCard/index.tsx | 39 +- .../dataset/detail/DataCard.tsx | 31 +- .../api/core/dataset/collection/detail.ts | 20 +- .../api/core/dataset/collection/listV2.ts | 7 +- .../core/dataset/collection/trainingDetail.ts | 170 ++++++ .../dataset/training/deleteTrainingData.ts | 39 ++ .../dataset/training/getTrainingDataDetail.ts | 52 ++ .../core/dataset/training/getTrainingError.ts | 51 ++ .../dataset/training/updateTrainingData.ts | 59 ++ .../app/src/pages/api/v1/chat/completions.ts | 1 - projects/app/src/service/events/generateQA.ts | 14 +- .../app/src/service/events/generateVector.ts | 13 +- projects/app/src/web/core/dataset/api.ts | 24 + .../app/src/web/core/dataset/constants.ts | 35 +- .../training/deleteTrainingData.test.ts | 58 ++ .../training/getTrainingDataDetail.test.ts | 59 ++ .../dataset/training/getTrainingError.test.ts | 56 ++ .../training/updateTrainingData.test.ts | 63 +++ 33 files changed, 1383 insertions(+), 19 deletions(-) create mode 100644 packages/web/components/common/Icon/icons/common/maximize.svg create mode 100644 packages/web/components/common/Icon/icons/common/running.svg create mode 100644 projects/app/src/pageComponents/dataset/detail/CollectionCard/TrainingStates.tsx create mode 100644 projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts create mode 100644 projects/app/src/pages/api/core/dataset/training/deleteTrainingData.ts create mode 100644 projects/app/src/pages/api/core/dataset/training/getTrainingDataDetail.ts create mode 100644 projects/app/src/pages/api/core/dataset/training/getTrainingError.ts create mode 100644 projects/app/src/pages/api/core/dataset/training/updateTrainingData.ts create mode 100644 test/cases/api/core/dataset/training/deleteTrainingData.test.ts create mode 100644 test/cases/api/core/dataset/training/getTrainingDataDetail.test.ts create mode 100644 test/cases/api/core/dataset/training/getTrainingError.test.ts create mode 100644 test/cases/api/core/dataset/training/updateTrainingData.test.ts diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts index a92785b94..67bde78fb 100644 --- a/packages/global/core/dataset/type.d.ts +++ b/packages/global/core/dataset/type.d.ts @@ -163,6 +163,7 @@ export type DatasetTrainingSchemaType = { weight: number; indexes: Omit[]; retryCount: 
number; + errorMsg?: string; }; export type CollectionWithDatasetType = DatasetCollectionSchemaType & { @@ -216,6 +217,7 @@ export type DatasetCollectionItemType = CollectionWithDatasetType & { file?: DatasetFileSchema; permission: DatasetPermission; indexAmount: number; + errorCount?: number; }; /* ================= data ===================== */ diff --git a/packages/service/core/dataset/training/schema.ts b/packages/service/core/dataset/training/schema.ts index d11d2e109..ad4a26fa0 100644 --- a/packages/service/core/dataset/training/schema.ts +++ b/packages/service/core/dataset/training/schema.ts @@ -98,7 +98,9 @@ const TrainingDataSchema = new Schema({ } ], default: [] - } + }, + + errorMsg: String }); TrainingDataSchema.virtual('dataset', { diff --git a/packages/web/components/common/Icon/constants.ts b/packages/web/components/common/Icon/constants.ts index 65817717f..bb3371b3d 100644 --- a/packages/web/components/common/Icon/constants.ts +++ b/packages/web/components/common/Icon/constants.ts @@ -67,6 +67,7 @@ export const iconPaths = { 'common/list': () => import('./icons/common/list.svg'), 'common/loading': () => import('./icons/common/loading.svg'), 'common/logLight': () => import('./icons/common/logLight.svg'), + 'common/maximize': () => import('./icons/common/maximize.svg'), 'common/microsoft': () => import('./icons/common/microsoft.svg'), 'common/model': () => import('./icons/common/model.svg'), 'common/monitor': () => import('./icons/common/monitor.svg'), @@ -85,6 +86,7 @@ export const iconPaths = { 'common/rightArrowFill': () => import('./icons/common/rightArrowFill.svg'), 'common/rightArrowLight': () => import('./icons/common/rightArrowLight.svg'), 'common/routePushLight': () => import('./icons/common/routePushLight.svg'), + 'common/running': () => import('./icons/common/running.svg'), 'common/saveFill': () => import('./icons/common/saveFill.svg'), 'common/searchLight': () => import('./icons/common/searchLight.svg'), 'common/select': () => import('./icons/common/select.svg'), diff --git a/packages/web/components/common/Icon/icons/common/check.svg b/packages/web/components/common/Icon/icons/common/check.svg index f9d5d37f1..1662108f8 100644 --- a/packages/web/components/common/Icon/icons/common/check.svg +++ b/packages/web/components/common/Icon/icons/common/check.svg @@ -1,3 +1,3 @@ - - + + \ No newline at end of file diff --git a/packages/web/components/common/Icon/icons/common/maximize.svg b/packages/web/components/common/Icon/icons/common/maximize.svg new file mode 100644 index 000000000..d5545f478 --- /dev/null +++ b/packages/web/components/common/Icon/icons/common/maximize.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/packages/web/components/common/Icon/icons/common/running.svg b/packages/web/components/common/Icon/icons/common/running.svg new file mode 100644 index 000000000..2ad538b38 --- /dev/null +++ b/packages/web/components/common/Icon/icons/common/running.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/packages/web/hooks/useScrollPagination.tsx b/packages/web/hooks/useScrollPagination.tsx index 8188f4a4d..9fae0ce95 100644 --- a/packages/web/hooks/useScrollPagination.tsx +++ b/packages/web/hooks/useScrollPagination.tsx @@ -308,7 +308,13 @@ export function useScrollPagination< ); return ( - + {scrollLoadType === 'top' && total > 0 && isLoading && ( {t('common:common.is_requesting')} diff --git a/packages/web/i18n/en/common.json b/packages/web/i18n/en/common.json index 5581d22e2..f729cca3e 100644 --- 
a/packages/web/i18n/en/common.json +++ b/packages/web/i18n/en/common.json @@ -1,5 +1,6 @@ { "App": "Application", + "Click_to_expand": "Click to expand", "Download": "Download", "Export": "Export", "FAQ.ai_point_a": "Each time you use the AI model, a certain amount of AI points will be deducted. For detailed calculation standards, please refer to the 'AI Points Calculation Standards' above.\nToken calculation uses the same formula as GPT-3.5, where 1 Token ≈ 0.7 Chinese characters ≈ 0.9 English words. Consecutive characters may be considered as 1 Token.", @@ -538,6 +539,7 @@ "core.dataset.collection.metadata.source name": "Source Name", "core.dataset.collection.metadata.source size": "Source Size", "core.dataset.collection.status.active": "Ready", + "core.dataset.collection.status.error": "Error", "core.dataset.collection.sync.result.sameRaw": "Content Unchanged, No Update Needed", "core.dataset.collection.sync.result.success": "Sync Started", "core.dataset.data.Data Content": "Related Data Content", diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json index 529c08b86..3eae9589a 100644 --- a/packages/web/i18n/en/dataset.json +++ b/packages/web/i18n/en/dataset.json @@ -28,9 +28,24 @@ "custom_data_process_params_desc": "Customize data processing rules", "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.", "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes", + "data_error_amount": "{{errorAmount}} Group training exception", "data_index_num": "Index {{index}}", "data_process_params": "Params", "data_process_setting": "Processing config", + "dataset.Chunk_Number": "Block number", + "dataset.Completed": "Finish", + "dataset.Delete_Chunk": "delete", + "dataset.Edit_Chunk": "edit", + "dataset.Error_Message": "Report an error message", + "dataset.No_Error": "No exception information yet", + "dataset.Operation": "operate", + "dataset.ReTrain": "Retrain", + "dataset.Training Process": "Training status", + "dataset.Training_Count": "{{count}} Group training", + "dataset.Training_Errors": "Errors", + "dataset.Training_QA": "{{count}} Group Q&A pair training", + "dataset.Training_Status": "Training status", + "dataset.Training_Waiting": "Need to wait for {{count}} group data", "dataset.Unsupported operation": "dataset.Unsupported operation", "dataset.no_collections": "No datasets available", "dataset.no_tags": "No tags available", @@ -82,6 +97,13 @@ "preview_chunk_empty": "Unable to read the contents of the file", "preview_chunk_intro": "A total of {{total}} blocks, up to 10", "preview_chunk_not_selected": "Click on the file on the left to preview", + "process.Auto_Index": "Automatic index generation", + "process.Get QA": "Q&A extraction", + "process.Image_Index": "Image index generation", + "process.Is_Ready": "Ready", + "process.Parsing": "Parsing", + "process.Vectorizing": "Index vectorization", + "process.Waiting": "Queue", "rebuild_embedding_start_tip": "Index model switching task has started", "rebuilding_index_count": "Number of indexes being rebuilt: {{count}}", "request_headers": "Request headers, will automatically append 'Bearer '", @@ -114,7 +136,10 @@ "tag.total_tags": "Total {{total}} tags", 
"the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "The Dataset has indexes that are being trained or rebuilt", "total_num_files": "Total {{total}} files", + "training.Error": "{{count}} Group exception", + "training.Normal": "Normal", "training_mode": "Chunk mode", + "training_ready": "{{count}} Group", "vector_model_max_tokens_tip": "Each chunk of data has a maximum length of 3000 tokens", "vllm_model": "Image understanding model", "website_dataset": "Website Sync", diff --git a/packages/web/i18n/zh-CN/common.json b/packages/web/i18n/zh-CN/common.json index c7068843b..245f06b7c 100644 --- a/packages/web/i18n/zh-CN/common.json +++ b/packages/web/i18n/zh-CN/common.json @@ -1,5 +1,6 @@ { "App": "应用", + "Click_to_expand": "点击查看详情", "Download": "下载", "Export": "导出", "FAQ.ai_point_a": "每次调用AI模型时,都会消耗一定的AI积分。具体的计算标准可参考上方的“AI 积分计算标准”。\nToken计算采用GPT3.5相同公式,1Token≈0.7中文字符≈0.9英文单词,连续出现的字符可能被认为是1个Tokens。", @@ -541,6 +542,7 @@ "core.dataset.collection.metadata.source name": "来源名", "core.dataset.collection.metadata.source size": "来源大小", "core.dataset.collection.status.active": "已就绪", + "core.dataset.collection.status.error": "训练异常", "core.dataset.collection.sync.result.sameRaw": "内容未变动,无需更新", "core.dataset.collection.sync.result.success": "开始同步", "core.dataset.data.Data Content": "相关数据内容", diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json index 7e11fa021..53a07dfda 100644 --- a/packages/web/i18n/zh-CN/dataset.json +++ b/packages/web/i18n/zh-CN/dataset.json @@ -28,9 +28,24 @@ "custom_data_process_params_desc": "自定义设置数据处理规则", "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。", "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引", + "data_error_amount": "{{errorAmount}} 组训练异常", "data_index_num": "索引 {{index}}", "data_process_params": "处理参数", "data_process_setting": "数据处理配置", + "dataset.Chunk_Number": "分块号", + "dataset.Completed": "完成", + "dataset.Delete_Chunk": "删除", + "dataset.Edit_Chunk": "编辑", + "dataset.Error_Message": "报错信息", + "dataset.No_Error": "暂无异常信息", + "dataset.Operation": "操作", + "dataset.ReTrain": "重试", + "dataset.Training Process": "训练状态", + "dataset.Training_Count": "{{count}} 组训练中", + "dataset.Training_Errors": "异常 ({{count}})", + "dataset.Training_QA": "{{count}} 组问答对训练中", + "dataset.Training_Status": "训练状态", + "dataset.Training_Waiting": "需等待 {{count}} 组数据", "dataset.Unsupported operation": "操作不支持", "dataset.no_collections": "暂无数据集", "dataset.no_tags": "暂无标签", @@ -82,6 +97,14 @@ "preview_chunk_empty": "无法读取该文件内容", "preview_chunk_intro": "共 {{total}} 个分块,最多展示 10 个", "preview_chunk_not_selected": "点击左侧文件后进行预览", + "process.Auto_Index": "自动索引生成", + "process.Get QA": "问答对提取", + "process.Image_Index": "图片索引生成", + "process.Is_Ready": "已就绪", + "process.Is_Ready_Count": "{{count}} 组已就绪", + "process.Parsing": "内容解析中", + "process.Vectorizing": "索引向量化", + "process.Waiting": "排队中", "rebuild_embedding_start_tip": "切换索引模型任务已开始", "rebuilding_index_count": "重建中索引数量:{{count}}", "request_headers": "请求头参数,会自动补充 Bearer", @@ -114,7 +137,10 @@ "tag.total_tags": "共{{total}}个标签", "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "知识库有训练中或正在重建的索引", "total_num_files": "共 {{total}} 个文件", + "training.Error": "{{count}} 组异常", + "training.Normal": "正常", "training_mode": "处理方式", + "training_ready": "{{count}} 组", "vector_model_max_tokens_tip": "每个分块数据,最大长度为 3000 tokens", "vllm_model": "图片理解模型", "website_dataset": "Web 站点同步", diff --git 
a/packages/web/i18n/zh-Hant/common.json b/packages/web/i18n/zh-Hant/common.json index 2d3336cc7..955858e7d 100644 --- a/packages/web/i18n/zh-Hant/common.json +++ b/packages/web/i18n/zh-Hant/common.json @@ -1,5 +1,6 @@ { "App": "應用程式", + "Click_to_expand": "點擊查看詳情", "Download": "下載", "Export": "匯出", "FAQ.ai_point_a": "每次呼叫 AI 模型時,都會消耗一定數量的 AI 點數。詳細的計算標準請參考上方的「AI 點數計算標準」。\nToken 計算採用與 GPT3.5 相同的公式,1 Token ≈ 0.7 個中文字 ≈ 0.9 個英文單字,連續出現的字元可能會被視為 1 個 Token。", @@ -537,6 +538,7 @@ "core.dataset.collection.metadata.source name": "來源名稱", "core.dataset.collection.metadata.source size": "來源大小", "core.dataset.collection.status.active": "已就緒", + "core.dataset.collection.status.error": "訓練異常", "core.dataset.collection.sync.result.sameRaw": "內容未變更,無需更新", "core.dataset.collection.sync.result.success": "開始同步", "core.dataset.data.Data Content": "相關資料內容", diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json index 76e510142..7bc6c16c2 100644 --- a/packages/web/i18n/zh-Hant/dataset.json +++ b/packages/web/i18n/zh-Hant/dataset.json @@ -28,9 +28,24 @@ "custom_data_process_params_desc": "自訂資料處理規則", "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的數據,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.” 表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如: * () [] {} 等。", "data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引", + "data_error_amount": "{{errorAmount}} 組訓練異常", "data_index_num": "索引 {{index}}", "data_process_params": "處理參數", "data_process_setting": "資料處理設定", + "dataset.Chunk_Number": "分塊號", + "dataset.Completed": "完成", + "dataset.Delete_Chunk": "刪除", + "dataset.Edit_Chunk": "編輯", + "dataset.Error_Message": "報錯信息", + "dataset.No_Error": "暫無異常信息", + "dataset.Operation": "操作", + "dataset.ReTrain": "重試", + "dataset.Training Process": "訓練狀態", + "dataset.Training_Count": "{{count}} 組訓練中", + "dataset.Training_Errors": "異常", + "dataset.Training_QA": "{{count}} 組問答對訓練中", + "dataset.Training_Status": "訓練狀態", + "dataset.Training_Waiting": "需等待 {{count}} 組數據", "dataset.Unsupported operation": "操作不支持", "dataset.no_collections": "尚無資料集", "dataset.no_tags": "尚無標籤", @@ -82,6 +97,13 @@ "preview_chunk_empty": "無法讀取該文件內容", "preview_chunk_intro": "共 {{total}} 個分塊,最多展示 10 個", "preview_chunk_not_selected": "點擊左側文件後進行預覽", + "process.Auto_Index": "自動索引生成", + "process.Get QA": "問答對提取", + "process.Image_Index": "圖片索引生成", + "process.Is_Ready": "已就緒", + "process.Parsing": "內容解析中", + "process.Vectorizing": "索引向量化", + "process.Waiting": "排隊中", "rebuild_embedding_start_tip": "切換索引模型任務已開始", "rebuilding_index_count": "重建中索引數量:{{count}}", "request_headers": "請求頭", @@ -114,7 +136,10 @@ "tag.total_tags": "共 {{total}} 個標籤", "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "資料集有索引正在訓練或重建中", "total_num_files": "共 {{total}} 個文件", + "training.Error": "{{count}} 組異常", + "training.Normal": "正常", "training_mode": "分段模式", + "training_ready": "{{count}} 組", "vector_model_max_tokens_tip": "每個分塊數據,最大長度為 3000 tokens", "vllm_model": "圖片理解模型", "website_dataset": "網站同步", diff --git a/projects/app/src/global/core/dataset/type.d.ts b/projects/app/src/global/core/dataset/type.d.ts index 3eca9728b..0f0e5c305 100644 --- a/projects/app/src/global/core/dataset/type.d.ts +++ b/projects/app/src/global/core/dataset/type.d.ts @@ -29,6 +29,7 @@ export type DatasetCollectionsListItemType = { dataAmount: number; trainingAmount: number; + hasError?: boolean; }; /* ================= data ===================== */ diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/TrainingStates.tsx 
b/projects/app/src/pageComponents/dataset/detail/CollectionCard/TrainingStates.tsx new file mode 100644 index 000000000..c7636d71b --- /dev/null +++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/TrainingStates.tsx @@ -0,0 +1,502 @@ +import { + Box, + Button, + Flex, + ModalBody, + Table, + TableContainer, + Tbody, + Td, + Th, + Thead, + Tr +} from '@chakra-ui/react'; +import MyModal from '@fastgpt/web/components/common/MyModal'; +import { useTranslation } from 'next-i18next'; +import MyTag from '@fastgpt/web/components/common/Tag/index'; +import FillRowTabs from '@fastgpt/web/components/common/Tabs/FillRowTabs'; +import { useMemo, useState } from 'react'; +import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; +import { + deleteTrainingData, + getDatasetCollectionTrainingDetail, + getTrainingDataDetail, + getTrainingError, + updateTrainingData +} from '@/web/core/dataset/api'; +import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants'; +import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; +import MyIcon from '@fastgpt/web/components/common/Icon'; +import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; +import { getTrainingDataDetailResponse } from '@/pages/api/core/dataset/training/getTrainingDataDetail'; +import MyTextarea from '@/components/common/Textarea/MyTextarea'; +import { TrainingProcess } from '@/web/core/dataset/constants'; +import { useForm } from 'react-hook-form'; +import type { getTrainingDetailResponse } from '@/pages/api/core/dataset/collection/trainingDetail'; +import { useScrollPagination } from '@fastgpt/web/hooks/useScrollPagination'; +import EmptyTip from '@fastgpt/web/components/common/EmptyTip'; + +enum TrainingStatus { + NotStart = 'NotStart', + Queued = 'Queued', // wait count>0 + Running = 'Running', // wait count=0; training count>0. + Ready = 'Ready', + Error = 'Error' +} + +const ProgressView = ({ trainingDetail }: { trainingDetail: getTrainingDetailResponse }) => { + const { t } = useTranslation(); + + const isQA = trainingDetail?.trainingType === DatasetCollectionDataProcessModeEnum.qa; + + /* + 状态计算 + 1. 暂时没有内容解析的状态 + 2. 完全没有训练数据时候,已就绪 + 3. 有训练数据,中间过程全部是进行中 + */ + const statesArray = useMemo(() => { + const isReady = + Object.values(trainingDetail.queuedCounts).every((count) => count === 0) && + Object.values(trainingDetail.trainingCounts).every((count) => count === 0) && + Object.values(trainingDetail.errorCounts).every((count) => count === 0); + + const getTrainingStatus = ({ errorCount }: { errorCount: number }) => { + if (isReady) return TrainingStatus.Ready; + if (errorCount > 0) { + return TrainingStatus.Error; + } + return TrainingStatus.Running; + }; + + // 只显示排队和处理中的数量 + const getStatusText = (mode: TrainingModeEnum) => { + if (isReady) return; + + if (trainingDetail.queuedCounts[mode] > 0) { + return t('dataset:dataset.Training_Waiting', { + count: trainingDetail.queuedCounts[mode] + }); + } + if (trainingDetail.trainingCounts[mode] > 0) { + return t('dataset:dataset.Training_Count', { + count: trainingDetail.trainingCounts[mode] + }); + } + return; + }; + + const states: { + label: string; + statusText?: string; + status: TrainingStatus; + errorCount: number; + }[] = [ + // { + // label: TrainingProcess.waiting.label, + // status: TrainingStatus.Queued, + // statusText: t('dataset:dataset.Completed') + // }, + { + label: t(TrainingProcess.parsing.label), + status: TrainingStatus.Ready, + errorCount: 0 + }, + ...(isQA + ? 
[ + { + errorCount: trainingDetail.errorCounts.qa, + label: t(TrainingProcess.getQA.label), + statusText: getStatusText(TrainingModeEnum.qa), + status: getTrainingStatus({ + errorCount: trainingDetail.errorCounts.qa + }) + } + ] + : []), + ...(trainingDetail?.advancedTraining.imageIndex && !isQA + ? [ + { + errorCount: trainingDetail.errorCounts.image, + label: t(TrainingProcess.imageIndex.label), + statusText: getStatusText(TrainingModeEnum.image), + status: getTrainingStatus({ + errorCount: trainingDetail.errorCounts.image + }) + } + ] + : []), + ...(trainingDetail?.advancedTraining.autoIndexes && !isQA + ? [ + { + errorCount: trainingDetail.errorCounts.auto, + label: t(TrainingProcess.autoIndex.label), + statusText: getStatusText(TrainingModeEnum.auto), + status: getTrainingStatus({ + errorCount: trainingDetail.errorCounts.auto + }) + } + ] + : []), + { + errorCount: trainingDetail.errorCounts.chunk, + label: t(TrainingProcess.vectorizing.label), + statusText: getStatusText(TrainingModeEnum.chunk), + status: getTrainingStatus({ + errorCount: trainingDetail.errorCounts.chunk + }) + }, + { + errorCount: 0, + label: t('dataset:process.Is_Ready'), + status: isReady ? TrainingStatus.Ready : TrainingStatus.NotStart, + statusText: isReady + ? undefined + : t('dataset:training_ready', { + count: trainingDetail.trainedCount + }) + } + ]; + + return states; + }, [trainingDetail, t, isQA]); + + return ( + + {statesArray.map((item, index) => ( + + {/* Status round */} + + {item.status === TrainingStatus.Ready && ( + + )} + + {/* Card */} + + + {t(item.label as any)} + + {item.status === TrainingStatus.Error && ( + + {t('dataset:training.Error', { count: item.errorCount })} + + )} + + {!!item.statusText && ( + + {item.statusText} + + )} + + + ))} + + ); +}; + +const ErrorView = ({ datasetId, collectionId }: { datasetId: string; collectionId: string }) => { + const { t } = useTranslation(); + const TrainingText = { + [TrainingModeEnum.chunk]: t('dataset:process.Vectorizing'), + [TrainingModeEnum.qa]: t('dataset:process.Get QA'), + [TrainingModeEnum.image]: t('dataset:process.Image_Index'), + [TrainingModeEnum.auto]: t('dataset:process.Auto_Index') + }; + + const [editChunk, setEditChunk] = useState(); + + const { + data: errorList, + ScrollData, + isLoading, + refreshList + } = useScrollPagination(getTrainingError, { + pageSize: 15, + params: { + collectionId + }, + EmptyTip: + }); + + const { runAsync: getData, loading: getDataLoading } = useRequest2( + (data: { datasetId: string; collectionId: string; dataId: string }) => { + return getTrainingDataDetail(data); + }, + { + manual: true, + onSuccess: (data) => { + setEditChunk(data); + } + } + ); + const { runAsync: deleteData, loading: deleteLoading } = useRequest2( + (data: { datasetId: string; collectionId: string; dataId: string }) => { + return deleteTrainingData(data); + }, + { + manual: true, + onSuccess: () => { + refreshList(); + } + } + ); + const { runAsync: updateData, loading: updateLoading } = useRequest2( + (data: { datasetId: string; collectionId: string; dataId: string; q?: string; a?: string }) => { + return updateTrainingData(data); + }, + { + manual: true, + onSuccess: () => { + refreshList(); + setEditChunk(undefined); + } + } + ); + + if (editChunk) { + return ( + setEditChunk(undefined)} + onSave={(data) => { + updateData({ + datasetId, + collectionId, + dataId: editChunk._id, + ...data + }); + }} + /> + ); + } + + return ( + + + + + + + + + + + + + {errorList.map((item, index) => ( + + + + + + + ))} + +
{t('dataset:dataset.Chunk_Number')}{t('dataset:dataset.Training_Status')}{t('dataset:dataset.Error_Message')}{t('dataset:dataset.Operation')}
{item.chunkIndex + 1}{TrainingText[item.mode]} + {item.errorMsg} + + + + + + + + +
+ + + ); +}; + +const EditView = ({ + editChunk, + onCancel, + onSave +}: { + editChunk: getTrainingDataDetailResponse; + onCancel: () => void; + onSave: (data: { q: string; a?: string }) => void; +}) => { + const { t } = useTranslation(); + const { register, handleSubmit } = useForm({ + defaultValues: { + q: editChunk?.q || '', + a: editChunk?.a || '' + } + }); + + return ( + + {editChunk?.a && q} + + {editChunk?.a && ( + <> + a + + + )} + + + + + + ); +}; + +const TrainingStates = ({ + datasetId, + collectionId, + defaultTab = 'states', + onClose +}: { + datasetId: string; + collectionId: string; + defaultTab?: 'states' | 'errors'; + onClose: () => void; +}) => { + const { t } = useTranslation(); + const [tab, setTab] = useState(defaultTab); + + const { data: trainingDetail, loading } = useRequest2( + () => getDatasetCollectionTrainingDetail(collectionId), + { + pollingInterval: 5000, + pollingWhenHidden: false, + manual: false + } + ); + + const errorCounts = (Object.values(trainingDetail?.errorCounts || {}) as number[]).reduce( + (acc, count) => acc + count, + 0 + ); + + return ( + + + setTab(e as 'states' | 'errors')} + list={[ + { label: t('dataset:dataset.Training Process'), value: 'states' }, + { + label: t('dataset:dataset.Training_Errors', { + count: errorCounts + }), + value: 'errors' + } + ]} + /> + {tab === 'states' && trainingDetail && } + {tab === 'errors' && } + + + ); +}; + +export default TrainingStates; diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx index cdfc9748c..6555f8dc9 100644 --- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx +++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx @@ -51,6 +51,7 @@ import { import { useFolderDrag } from '@/components/common/folder/useFolderDrag'; import TagsPopOver from './TagsPopOver'; import { useSystemStore } from '@/web/common/system/useSystemStore'; +import TrainingStates from './TrainingStates'; const Header = dynamic(() => import('./Header')); const EmptyCollectionTip = dynamic(() => import('./EmptyCollectionTip')); @@ -73,16 +74,25 @@ const CollectionCard = () => { }); const [moveCollectionData, setMoveCollectionData] = useState<{ collectionId: string }>(); + const [trainingStatesCollection, setTrainingStatesCollection] = useState<{ + collectionId: string; + }>(); const { collections, Pagination, total, getData, isGetting, pageNum, pageSize } = useContextSelector(CollectionPageContext, (v) => v); - // Ad file status icon + // Add file status icon const formatCollections = useMemo( () => collections.map((collection) => { const icon = getCollectionIcon(collection.type, collection.name); const status = (() => { + if (collection.hasError) { + return { + statusText: t('common:core.dataset.collection.status.error'), + colorSchema: 'red' + }; + } if (collection.trainingAmount > 0) { return { statusText: t('common:dataset.collections.Collection Embedding', { @@ -269,9 +279,22 @@ const CollectionCard = () => { {formatTime2YMDHM(collection.updateTime)} - - {t(collection.statusText as any)} - + + { + e.stopPropagation(); + setTrainingStatesCollection({ collectionId: collection._id }); + }} + > + + {t(collection.statusText as any)} + + + + e.stopPropagation()}> { + {!!trainingStatesCollection && ( + setTrainingStatesCollection(undefined)} + /> + )} + {!!moveCollectionData && ( { const theme = useTheme(); @@ -44,6 +45,7 @@ const DataCard = () => { const { t } = 
useTranslation(); const [searchText, setSearchText] = useState(''); + const [errorModalId, setErrorModalId] = useState(''); const { toast } = useToast(); const scrollParams = useMemo( @@ -174,7 +176,7 @@ const DataCard = () => { - + {t('dataset:data_amount', { @@ -182,6 +184,25 @@ const DataCard = () => { indexAmount: collection?.indexAmount ?? '-' })} + {!!collection?.errorCount && ( + { + setErrorModalId(collection._id); + }} + > + + {t('dataset:data_error_amount', { + errorAmount: collection?.errorCount + })} + + + + )} { }} /> )} + {errorModalId && ( + setErrorModalId('')} + /> + )} ); diff --git a/projects/app/src/pages/api/core/dataset/collection/detail.ts b/projects/app/src/pages/api/core/dataset/collection/detail.ts index ff988ba0c..5da4feca7 100644 --- a/projects/app/src/pages/api/core/dataset/collection/detail.ts +++ b/projects/app/src/pages/api/core/dataset/collection/detail.ts @@ -12,6 +12,9 @@ import { DatasetCollectionItemType } from '@fastgpt/global/core/dataset/type'; import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; import { collectionTagsToTagLabel } from '@fastgpt/service/core/dataset/collection/utils'; import { getVectorCountByCollectionId } from '@fastgpt/service/common/vectorStore/controller'; +import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { Types } from 'mongoose'; +import { readFromSecondary } from '@fastgpt/service/common/mongo/utils'; async function handler(req: NextApiRequest): Promise { const { id } = req.query as { id: string }; @@ -30,11 +33,21 @@ async function handler(req: NextApiRequest): Promise }); // get file - const [file, indexAmount] = await Promise.all([ + const [file, indexAmount, errorCount] = await Promise.all([ collection?.fileId ? await getFileById({ bucketName: BucketNameEnum.dataset, fileId: collection.fileId }) : undefined, - getVectorCountByCollectionId(collection.teamId, collection.datasetId, collection._id) + getVectorCountByCollectionId(collection.teamId, collection.datasetId, collection._id), + MongoDatasetTraining.countDocuments( + { + teamId: collection.teamId, + datasetId: collection.datasetId, + collectionId: id, + errorMsg: { $exists: true }, + retryCount: { $lte: 0 } + }, + readFromSecondary + ) ]); return { @@ -46,7 +59,8 @@ async function handler(req: NextApiRequest): Promise tags: collection.tags }), permission, - file + file, + errorCount }; } diff --git a/projects/app/src/pages/api/core/dataset/collection/listV2.ts b/projects/app/src/pages/api/core/dataset/collection/listV2.ts index b92e02d56..d3c3abf80 100644 --- a/projects/app/src/pages/api/core/dataset/collection/listV2.ts +++ b/projects/app/src/pages/api/core/dataset/collection/listV2.ts @@ -93,6 +93,7 @@ async function handler( dataAmount: 0, indexAmount: 0, trainingAmount: 0, + hasError: false, permission })) ), @@ -113,7 +114,7 @@ async function handler( // Compute data amount const [trainingAmount, dataAmount]: [ - { _id: string; count: number }[], + { _id: string; count: number; hasError: boolean }[], { _id: string; count: number }[] ] = await Promise.all([ MongoDatasetTraining.aggregate( @@ -128,7 +129,8 @@ async function handler( { $group: { _id: '$collectionId', - count: { $sum: 1 } + count: { $sum: 1 }, + hasError: { $max: { $cond: [{ $ifNull: ['$errorMsg', false] }, true, false] } } } } ], @@ -168,6 +170,7 @@ async function handler( trainingAmount: trainingAmount.find((amount) => String(amount._id) === String(item._id))?.count || 0, dataAmount: dataAmount.find((amount) => String(amount._id) 
=== String(item._id))?.count || 0, + hasError: trainingAmount.find((amount) => String(amount._id) === String(item._id))?.hasError, permission })) ); diff --git a/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts b/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts new file mode 100644 index 000000000..89a0b20aa --- /dev/null +++ b/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts @@ -0,0 +1,170 @@ +import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { + DatasetCollectionDataProcessModeEnum, + TrainingModeEnum +} from '@fastgpt/global/core/dataset/constants'; +import { readFromSecondary } from '@fastgpt/service/common/mongo/utils'; +import { NextAPI } from '@/service/middleware/entry'; +import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant'; +import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth'; +import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema'; +import { ApiRequestProps } from '@fastgpt/service/type/next'; + +type getTrainingDetailParams = { + collectionId: string; +}; + +export type getTrainingDetailResponse = { + trainingType: DatasetCollectionDataProcessModeEnum; + advancedTraining: { + customPdfParse: boolean; + imageIndex: boolean; + autoIndexes: boolean; + }; + queuedCounts: Record; + trainingCounts: Record; + errorCounts: Record; + trainedCount: number; +}; + +const defaultCounts: Record = { + qa: 0, + chunk: 0, + image: 0, + auto: 0 +}; + +async function handler( + req: ApiRequestProps<{}, getTrainingDetailParams> +): Promise { + const { collectionId } = req.query; + + const { collection } = await authDatasetCollection({ + req, + authToken: true, + collectionId: collectionId as string, + per: ReadPermissionVal + }); + + const match = { + teamId: collection.teamId, + datasetId: collection.datasetId, + collectionId: collection._id + }; + + // Computed global queue + const minId = ( + await MongoDatasetTraining.findOne( + { + teamId: collection.teamId, + datasetId: collection.datasetId, + collectionId: collection._id + }, + { sort: { _id: 1 }, select: '_id' }, + readFromSecondary + ).lean() + )?._id; + + const [ququedCountData, trainingCountData, errorCountData, trainedCount] = (await Promise.all([ + minId + ? 
MongoDatasetTraining.aggregate( + [ + { + $match: { + _id: { $lt: minId }, + retryCount: { $gt: 0 }, + lockTime: { $lt: new Date('2050/1/1') } + } + }, + { + $group: { + _id: '$mode', + count: { $sum: 1 } + } + } + ], + readFromSecondary + ) + : Promise.resolve([]), + MongoDatasetTraining.aggregate( + [ + { + $match: { + ...match, + retryCount: { $gt: 0 }, + lockTime: { $lt: new Date('2050/1/1') } + } + }, + { + $group: { + _id: '$mode', + count: { $sum: 1 } + } + } + ], + readFromSecondary + ), + MongoDatasetTraining.aggregate( + [ + { + $match: { + ...match, + retryCount: { $lte: 0 }, + errorMsg: { $exists: true } + } + }, + { + $group: { + _id: '$mode', + count: { $sum: 1 } + } + } + ], + readFromSecondary + ), + MongoDatasetData.countDocuments(match, readFromSecondary) + ])) as [ + { _id: TrainingModeEnum; count: number }[], + { _id: TrainingModeEnum; count: number }[], + { _id: TrainingModeEnum; count: number }[], + number + ]; + + const queuedCounts = ququedCountData.reduce( + (acc, item) => { + acc[item._id] = item.count; + return acc; + }, + { ...defaultCounts } + ); + const trainingCounts = trainingCountData.reduce( + (acc, item) => { + acc[item._id] = item.count; + return acc; + }, + { ...defaultCounts } + ); + const errorCounts = errorCountData.reduce( + (acc, item) => { + acc[item._id] = item.count; + return acc; + }, + { ...defaultCounts } + ); + + return { + trainingType: collection.trainingType, + advancedTraining: { + customPdfParse: !!collection.customPdfParse, + imageIndex: !!collection.imageIndex, + autoIndexes: !!collection.autoIndexes + }, + + queuedCounts, + trainingCounts, + errorCounts, + trainedCount + }; +} + +export default NextAPI(handler); diff --git a/projects/app/src/pages/api/core/dataset/training/deleteTrainingData.ts b/projects/app/src/pages/api/core/dataset/training/deleteTrainingData.ts new file mode 100644 index 000000000..d92ddd123 --- /dev/null +++ b/projects/app/src/pages/api/core/dataset/training/deleteTrainingData.ts @@ -0,0 +1,39 @@ +import { ManagePermissionVal } from '@fastgpt/global/support/permission/constant'; +import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth'; +import { NextAPI } from '@/service/middleware/entry'; +import { ApiRequestProps } from '@fastgpt/service/type/next'; + +export type deleteTrainingDataBody = { + datasetId: string; + collectionId: string; + dataId: string; +}; + +export type deleteTrainingDataQuery = {}; + +export type deleteTrainingDataResponse = {}; + +async function handler( + req: ApiRequestProps +): Promise { + const { datasetId, collectionId, dataId } = req.body; + + const { teamId } = await authDatasetCollection({ + req, + authToken: true, + authApiKey: true, + collectionId, + per: ManagePermissionVal + }); + + await MongoDatasetTraining.deleteOne({ + teamId, + datasetId, + _id: dataId + }); + + return {}; +} + +export default NextAPI(handler); diff --git a/projects/app/src/pages/api/core/dataset/training/getTrainingDataDetail.ts b/projects/app/src/pages/api/core/dataset/training/getTrainingDataDetail.ts new file mode 100644 index 000000000..b1b585178 --- /dev/null +++ b/projects/app/src/pages/api/core/dataset/training/getTrainingDataDetail.ts @@ -0,0 +1,52 @@ +import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant'; +import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { authDatasetCollection } from 
'@fastgpt/service/support/permission/dataset/auth'; +import { NextAPI } from '@/service/middleware/entry'; +import { ApiRequestProps } from '@fastgpt/service/type/next'; + +export type getTrainingDataDetailQuery = {}; + +export type getTrainingDataDetailBody = { + datasetId: string; + collectionId: string; + dataId: string; +}; + +export type getTrainingDataDetailResponse = + | { + _id: string; + datasetId: string; + mode: string; + q: string; + a: string; + } + | undefined; + +async function handler( + req: ApiRequestProps +): Promise { + const { datasetId, collectionId, dataId } = req.body; + + const { teamId } = await authDatasetCollection({ + req, + authToken: true, + collectionId, + per: ReadPermissionVal + }); + + const data = await MongoDatasetTraining.findOne({ teamId, datasetId, _id: dataId }).lean(); + + if (!data) { + return undefined; + } + + return { + _id: data._id, + datasetId: data.datasetId, + mode: data.mode, + q: data.q, + a: data.a + }; +} + +export default NextAPI(handler); diff --git a/projects/app/src/pages/api/core/dataset/training/getTrainingError.ts b/projects/app/src/pages/api/core/dataset/training/getTrainingError.ts new file mode 100644 index 000000000..10d928ffe --- /dev/null +++ b/projects/app/src/pages/api/core/dataset/training/getTrainingError.ts @@ -0,0 +1,51 @@ +import { NextAPI } from '@/service/middleware/entry'; +import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type'; +import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant'; +import { parsePaginationRequest } from '@fastgpt/service/common/api/pagination'; +import { readFromSecondary } from '@fastgpt/service/common/mongo/utils'; +import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth'; +import { ApiRequestProps } from '@fastgpt/service/type/next'; +import { PaginationProps, PaginationResponse } from '@fastgpt/web/common/fetch/type'; + +export type getTrainingErrorBody = PaginationProps<{ + collectionId: string; +}>; + +export type getTrainingErrorResponse = PaginationResponse; + +async function handler(req: ApiRequestProps) { + const { collectionId } = req.body; + const { offset, pageSize } = parsePaginationRequest(req); + + const { collection } = await authDatasetCollection({ + req, + authToken: true, + collectionId, + per: ReadPermissionVal + }); + + const match = { + teamId: collection.teamId, + datasetId: collection.datasetId, + collectionId: collection._id, + errorMsg: { $exists: true } + }; + + const [errorList, total] = await Promise.all([ + MongoDatasetTraining.find(match, undefined, { + ...readFromSecondary + }) + .skip(offset) + .limit(pageSize) + .lean(), + MongoDatasetTraining.countDocuments(match, { ...readFromSecondary }) + ]); + + return { + list: errorList, + total + }; +} + +export default NextAPI(handler); diff --git a/projects/app/src/pages/api/core/dataset/training/updateTrainingData.ts b/projects/app/src/pages/api/core/dataset/training/updateTrainingData.ts new file mode 100644 index 000000000..75d8edb23 --- /dev/null +++ b/projects/app/src/pages/api/core/dataset/training/updateTrainingData.ts @@ -0,0 +1,59 @@ +import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; +import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth'; +import { NextAPI } from 
'@/service/middleware/entry'; +import { ApiRequestProps } from '@fastgpt/service/type/next'; +import { addMinutes } from 'date-fns'; + +export type updateTrainingDataBody = { + datasetId: string; + collectionId: string; + dataId: string; + q?: string; + a?: string; + chunkIndex?: number; +}; + +export type updateTrainingDataQuery = {}; + +export type updateTrainingDataResponse = {}; + +async function handler( + req: ApiRequestProps +): Promise { + const { datasetId, collectionId, dataId, q, a, chunkIndex } = req.body; + + const { teamId } = await authDatasetCollection({ + req, + authToken: true, + authApiKey: true, + collectionId, + per: WritePermissionVal + }); + + const data = await MongoDatasetTraining.findOne({ teamId, datasetId, _id: dataId }); + + if (!data) { + return Promise.reject('data not found'); + } + + await MongoDatasetTraining.updateOne( + { + teamId, + datasetId, + _id: dataId + }, + { + $unset: { errorMsg: '' }, + retryCount: 3, + ...(q !== undefined && { q }), + ...(a !== undefined && { a }), + ...(chunkIndex !== undefined && { chunkIndex }), + lockTime: addMinutes(new Date(), -10) + } + ); + + return {}; +} + +export default NextAPI(handler); diff --git a/projects/app/src/pages/api/v1/chat/completions.ts b/projects/app/src/pages/api/v1/chat/completions.ts index 14f79d53f..5657a2783 100644 --- a/projects/app/src/pages/api/v1/chat/completions.ts +++ b/projects/app/src/pages/api/v1/chat/completions.ts @@ -59,7 +59,6 @@ import { getWorkflowResponseWrite } from '@fastgpt/service/core/workflow/dispatc import { WORKFLOW_MAX_RUN_TIMES } from '@fastgpt/service/core/workflow/constants'; import { getPluginInputsFromStoreNodes } from '@fastgpt/global/core/app/plugin/utils'; import { ExternalProviderType } from '@fastgpt/global/core/workflow/runtime/type'; -import { FlowNodeTypeEnum } from '@fastgpt/global/core/workflow/node/constant'; type FastGptWebChatProps = { chatId?: string; // undefined: get histories from messages, '': new chat, 'xxxxx': get histories from db diff --git a/projects/app/src/service/events/generateQA.ts b/projects/app/src/service/events/generateQA.ts index 2273d1edf..ebf8a829c 100644 --- a/projects/app/src/service/events/generateQA.ts +++ b/projects/app/src/service/events/generateQA.ts @@ -26,6 +26,7 @@ import { chunkAutoChunkSize, getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; +import { getErrText } from '@fastgpt/global/common/error/utils'; const reduceQueue = () => { global.qaQueueLen = global.qaQueueLen > 0 ? 
global.qaQueueLen - 1 : 0; @@ -50,7 +51,7 @@ export async function generateQA(): Promise { const data = await MongoDatasetTraining.findOneAndUpdate( { mode: TrainingModeEnum.qa, - retryCount: { $gte: 0 }, + retryCount: { $gt: 0 }, lockTime: { $lte: addMinutes(new Date(), -10) } }, { @@ -176,7 +177,16 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`; generateQA(); } catch (err: any) { addLog.error(`[QA Queue] Error`, err); - reduceQueue(); + await MongoDatasetTraining.updateOne( + { + teamId: data.teamId, + datasetId: data.datasetId, + _id: data._id + }, + { + errorMsg: getErrText(err, 'unknown error') + } + ); setTimeout(() => { generateQA(); diff --git a/projects/app/src/service/events/generateVector.ts b/projects/app/src/service/events/generateVector.ts index 0e7f8c2b4..ada65e5ba 100644 --- a/projects/app/src/service/events/generateVector.ts +++ b/projects/app/src/service/events/generateVector.ts @@ -14,6 +14,7 @@ import { getEmbeddingModel } from '@fastgpt/service/core/ai/model'; import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type'; import { Document } from '@fastgpt/service/common/mongo'; +import { getErrText } from '@fastgpt/global/common/error/utils'; const reduceQueue = () => { global.vectorQueueLen = global.vectorQueueLen > 0 ? global.vectorQueueLen - 1 : 0; @@ -48,7 +49,7 @@ export async function generateVector(): Promise { const data = await MongoDatasetTraining.findOneAndUpdate( { mode: TrainingModeEnum.chunk, - retryCount: { $gte: 0 }, + retryCount: { $gt: 0 }, lockTime: { $lte: addMinutes(new Date(), -3) } }, { @@ -117,6 +118,16 @@ export async function generateVector(): Promise { return reduceQueueAndReturn(); } catch (err: any) { addLog.error(`[Vector Queue] Error`, err); + await MongoDatasetTraining.updateOne( + { + teamId: data.teamId, + datasetId: data.datasetId, + _id: data._id + }, + { + errorMsg: getErrText(err, 'unknown error') + } + ); return reduceQueueAndReturn(1000); } } diff --git a/projects/app/src/web/core/dataset/api.ts b/projects/app/src/web/core/dataset/api.ts index a782d97bd..04eb30855 100644 --- a/projects/app/src/web/core/dataset/api.ts +++ b/projects/app/src/web/core/dataset/api.ts @@ -63,6 +63,17 @@ import type { import type { GetQuoteDataResponse } from '@/pages/api/core/dataset/data/getQuoteData'; import type { GetQuotePermissionResponse } from '@/pages/api/core/dataset/data/getPermission'; import type { GetQueueLenResponse } from '@/pages/api/core/dataset/training/getQueueLen'; +import type { updateTrainingDataBody } from '@/pages/api/core/dataset/training/updateTrainingData'; +import type { + getTrainingDataDetailBody, + getTrainingDataDetailResponse +} from '@/pages/api/core/dataset/training/getTrainingDataDetail'; +import type { deleteTrainingDataBody } from '@/pages/api/core/dataset/training/deleteTrainingData'; +import type { getTrainingDetailResponse } from '@/pages/api/core/dataset/collection/trainingDetail'; +import type { + getTrainingErrorBody, + getTrainingErrorResponse +} from '@/pages/api/core/dataset/training/getTrainingError'; /* ======================== dataset ======================= */ export const getDatasets = (data: GetDatasetListBody) => @@ -113,6 +124,10 @@ export const getDatasetCollectionPathById = (parentId: string) => GET(`/core/dataset/collection/paths`, { parentId }); export const getDatasetCollectionById = (id: string) => GET(`/core/dataset/collection/detail`, { id }); +export const 
getDatasetCollectionTrainingDetail = (collectionId: string) => + GET(`/core/dataset/collection/trainingDetail`, { + collectionId + }); export const postDatasetCollection = (data: CreateDatasetCollectionParams) => POST(`/core/dataset/collection/create`, data); export const postCreateDatasetFileCollection = (data: FileIdCreateDatasetCollectionParams) => @@ -224,6 +239,15 @@ export const getPreviewChunks = (data: PostPreviewFilesChunksProps) => timeout: 600000 }); +export const deleteTrainingData = (data: deleteTrainingDataBody) => + POST(`/core/dataset/training/deleteTrainingData`, data); +export const updateTrainingData = (data: updateTrainingDataBody) => + PUT(`/core/dataset/training/updateTrainingData`, data); +export const getTrainingDataDetail = (data: getTrainingDataDetailBody) => + POST(`/core/dataset/training/getTrainingDataDetail`, data); +export const getTrainingError = (data: getTrainingErrorBody) => + POST(`/core/dataset/training/getTrainingError`, data); + /* ================== read source ======================== */ export const getCollectionSource = (data: readCollectionSourceBody) => POST('/core/dataset/collection/read', data); diff --git a/projects/app/src/web/core/dataset/constants.ts b/projects/app/src/web/core/dataset/constants.ts index 3976d93ee..a8b6af022 100644 --- a/projects/app/src/web/core/dataset/constants.ts +++ b/projects/app/src/web/core/dataset/constants.ts @@ -2,13 +2,15 @@ import { defaultQAModels, defaultVectorModels } from '@fastgpt/global/core/ai/mo import { DatasetCollectionDataProcessModeEnum, DatasetCollectionTypeEnum, - DatasetTypeEnum + DatasetTypeEnum, + TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import type { DatasetCollectionItemType, DatasetItemType } from '@fastgpt/global/core/dataset/type.d'; import { DatasetPermission } from '@fastgpt/global/support/permission/dataset/controller'; +import { i18nT } from '@fastgpt/web/i18n/utils'; export const defaultDatasetDetail: DatasetItemType = { _id: '', @@ -74,3 +76,34 @@ export const datasetTypeCourseMap: Record<`${DatasetTypeEnum}`, string> = { [DatasetTypeEnum.yuque]: '/docs/guide/knowledge_base/yuque_dataset/', [DatasetTypeEnum.externalFile]: '' }; + +export const TrainingProcess = { + waiting: { + label: i18nT('dataset:process.Waiting'), + value: 'waiting' + }, + parsing: { + label: i18nT('dataset:process.Parsing'), + value: 'parsing' + }, + getQA: { + label: i18nT('dataset:process.Get QA'), + value: 'getQA' + }, + imageIndex: { + label: i18nT('dataset:process.Image_Index'), + value: 'imageIndex' + }, + autoIndex: { + label: i18nT('dataset:process.Auto_Index'), + value: 'autoIndex' + }, + vectorizing: { + label: i18nT('dataset:process.Vectorizing'), + value: 'vectorizing' + }, + isReady: { + label: i18nT('dataset:process.Is_Ready'), + value: 'isReady' + } +}; diff --git a/test/cases/api/core/dataset/training/deleteTrainingData.test.ts b/test/cases/api/core/dataset/training/deleteTrainingData.test.ts new file mode 100644 index 000000000..552a24bcc --- /dev/null +++ b/test/cases/api/core/dataset/training/deleteTrainingData.test.ts @@ -0,0 +1,58 @@ +import handler, { + type deleteTrainingDataBody, + type deleteTrainingDataResponse +} from '@/pages/api/core/dataset/training/deleteTrainingData'; +import { + DatasetCollectionTypeEnum, + TrainingModeEnum +} from '@fastgpt/global/core/dataset/constants'; +import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; +import { MongoDataset } from '@fastgpt/service/core/dataset/schema'; +import { 
MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { getRootUser } from '@test/datas/users'; +import { Call } from '@test/utils/request'; +import { describe, expect, it } from 'vitest'; + +describe('delete training data test', () => { + it('should delete training data', async () => { + const root = await getRootUser(); + const dataset = await MongoDataset.create({ + name: 'test', + teamId: root.teamId, + tmbId: root.tmbId + }); + const collection = await MongoDatasetCollection.create({ + name: 'test', + type: DatasetCollectionTypeEnum.file, + teamId: root.teamId, + tmbId: root.tmbId, + datasetId: dataset._id + }); + const trainingData = await MongoDatasetTraining.create({ + teamId: root.teamId, + tmbId: root.tmbId, + datasetId: dataset._id, + collectionId: collection._id, + mode: TrainingModeEnum.chunk, + model: 'test' + }); + + const res = await Call(handler, { + auth: root, + body: { + datasetId: dataset._id, + collectionId: collection._id, + dataId: trainingData._id + } + }); + + const deletedTrainingData = await MongoDatasetTraining.findOne({ + teamId: root.teamId, + datasetId: dataset._id, + _id: trainingData._id + }); + + expect(res.code).toBe(200); + expect(deletedTrainingData).toBeNull(); + }); +}); diff --git a/test/cases/api/core/dataset/training/getTrainingDataDetail.test.ts b/test/cases/api/core/dataset/training/getTrainingDataDetail.test.ts new file mode 100644 index 000000000..7c157a3e4 --- /dev/null +++ b/test/cases/api/core/dataset/training/getTrainingDataDetail.test.ts @@ -0,0 +1,59 @@ +import handler, { + type getTrainingDataDetailBody, + type getTrainingDataDetailResponse +} from '@/pages/api/core/dataset/training/getTrainingDataDetail'; +import { + DatasetCollectionTypeEnum, + TrainingModeEnum +} from '@fastgpt/global/core/dataset/constants'; +import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; +import { MongoDataset } from '@fastgpt/service/core/dataset/schema'; +import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { getRootUser } from '@test/datas/users'; +import { Call } from '@test/utils/request'; +import { describe, expect, it } from 'vitest'; + +describe('get training data detail test', () => { + it('should return training data detail', async () => { + const root = await getRootUser(); + const dataset = await MongoDataset.create({ + name: 'test', + teamId: root.teamId, + tmbId: root.tmbId + }); + const collection = await MongoDatasetCollection.create({ + name: 'test', + type: DatasetCollectionTypeEnum.file, + teamId: root.teamId, + tmbId: root.tmbId, + datasetId: dataset._id + }); + const trainingData = await MongoDatasetTraining.create({ + teamId: root.teamId, + tmbId: root.tmbId, + datasetId: dataset._id, + collectionId: collection._id, + model: 'test', + mode: TrainingModeEnum.chunk, + q: 'test', + a: 'test' + }); + + const res = await Call(handler, { + auth: root, + body: { + datasetId: dataset._id, + collectionId: collection._id, + dataId: trainingData._id + } + }); + + expect(res.code).toBe(200); + expect(res.data).toBeDefined(); + expect(res.data?._id).toStrictEqual(trainingData._id); + expect(res.data?.datasetId).toStrictEqual(dataset._id); + expect(res.data?.mode).toBe(TrainingModeEnum.chunk); + expect(res.data?.q).toBe('test'); + expect(res.data?.a).toBe('test'); + }); +}); diff --git a/test/cases/api/core/dataset/training/getTrainingError.test.ts b/test/cases/api/core/dataset/training/getTrainingError.test.ts new file mode 100644 index 
000000000..e99f70bfc --- /dev/null +++ b/test/cases/api/core/dataset/training/getTrainingError.test.ts @@ -0,0 +1,56 @@ +import handler, { + type getTrainingErrorBody, + type getTrainingErrorResponse +} from '@/pages/api/core/dataset/training/getTrainingError'; +import { + DatasetCollectionTypeEnum, + TrainingModeEnum +} from '@fastgpt/global/core/dataset/constants'; +import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; +import { MongoDataset } from '@fastgpt/service/core/dataset/schema'; +import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { getRootUser } from '@test/datas/users'; +import { Call } from '@test/utils/request'; +import { describe, expect, it } from 'vitest'; + +describe('training error list test', () => { + it('should return training error list', async () => { + const root = await getRootUser(); + const dataset = await MongoDataset.create({ + name: 'test', + teamId: root.teamId, + tmbId: root.tmbId + }); + const collection = await MongoDatasetCollection.create({ + name: 'test', + type: DatasetCollectionTypeEnum.file, + teamId: root.teamId, + tmbId: root.tmbId, + datasetId: dataset._id + }); + await MongoDatasetTraining.create( + [...Array(10).keys()].map((i) => ({ + teamId: root.teamId, + tmbId: root.tmbId, + datasetId: dataset._id, + collectionId: collection._id, + mode: TrainingModeEnum.chunk, + model: 'test', + errorMsg: 'test' + })) + ); + + const res = await Call(handler, { + auth: root, + body: { + collectionId: collection._id, + pageSize: 10, + offset: 0 + } + }); + + expect(res.code).toBe(200); + expect(res.data.total).toBe(10); + expect(res.data.list.length).toBe(10); + }); +}); diff --git a/test/cases/api/core/dataset/training/updateTrainingData.test.ts b/test/cases/api/core/dataset/training/updateTrainingData.test.ts new file mode 100644 index 000000000..e95a2222b --- /dev/null +++ b/test/cases/api/core/dataset/training/updateTrainingData.test.ts @@ -0,0 +1,63 @@ +import handler, { + type updateTrainingDataBody, + type updateTrainingDataResponse +} from '@/pages/api/core/dataset/training/updateTrainingData'; +import { + DatasetCollectionTypeEnum, + TrainingModeEnum +} from '@fastgpt/global/core/dataset/constants'; +import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; +import { MongoDataset } from '@fastgpt/service/core/dataset/schema'; +import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; +import { getRootUser } from '@test/datas/users'; +import { Call } from '@test/utils/request'; +import { describe, expect, it } from 'vitest'; + +describe('update training data test', () => { + it('should update training data', async () => { + const root = await getRootUser(); + const dataset = await MongoDataset.create({ + name: 'test', + teamId: root.teamId, + tmbId: root.tmbId + }); + const collection = await MongoDatasetCollection.create({ + name: 'test', + type: DatasetCollectionTypeEnum.file, + teamId: root.teamId, + tmbId: root.tmbId, + datasetId: dataset._id + }); + const trainingData = await MongoDatasetTraining.create({ + teamId: root.teamId, + tmbId: root.tmbId, + datasetId: dataset._id, + collectionId: collection._id, + mode: TrainingModeEnum.chunk, + model: 'test' + }); + + const res = await Call(handler, { + auth: root, + body: { + datasetId: dataset._id, + collectionId: collection._id, + dataId: trainingData._id, + q: 'test', + a: 'test', + chunkIndex: 1 + } + }); + + const updatedTrainingData = await 
MongoDatasetTraining.findOne({ + teamId: root.teamId, + datasetId: dataset._id, + _id: trainingData._id + }); + + expect(res.code).toBe(200); + expect(updatedTrainingData?.q).toBe('test'); + expect(updatedTrainingData?.a).toBe('test'); + expect(updatedTrainingData?.chunkIndex).toBe(1); + }); +});
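
Usage sketch (not part of the patch above): the diff adds client-side helpers in projects/app/src/web/core/dataset/api.ts (getDatasetCollectionTrainingDetail, getTrainingError, updateTrainingData) plus the matching API routes. The snippet below is a hedged, minimal example of how those helpers might be combined to detect failed training chunks for a collection and re-queue them; the datasetId/collectionId values and the retryFailedChunks function name are hypothetical placeholders, and the response shapes are assumed to match the types declared in the new routes (getTrainingDetailResponse, getTrainingErrorResponse).

// Sketch only — assumes the helpers resolve to the response types declared in
// the new API routes of this patch. IDs below are hypothetical placeholders.
import {
  getDatasetCollectionTrainingDetail,
  getTrainingError,
  updateTrainingData
} from '@/web/core/dataset/api';

async function retryFailedChunks(datasetId: string, collectionId: string) {
  // 1. Read the aggregated training status for the collection
  //    (queuedCounts / trainingCounts / errorCounts per TrainingModeEnum).
  const detail = await getDatasetCollectionTrainingDetail(collectionId);
  const totalErrors = Object.values(detail.errorCounts).reduce((sum, n) => sum + n, 0);
  if (totalErrors === 0) return;

  // 2. Page through the failed items (errorMsg set, retryCount exhausted),
  //    using the same offset/pageSize pagination the UI and tests use.
  const { list } = await getTrainingError({ collectionId, offset: 0, pageSize: 15 });

  // 3. Re-queue each failed item. Per updateTrainingData.ts, the endpoint
  //    unsets errorMsg, resets retryCount to 3 and rolls back lockTime so the
  //    QA/vector queues pick the item up again; q/a/chunkIndex are optional edits.
  for (const item of list) {
    await updateTrainingData({ datasetId, collectionId, dataId: String(item._id) });
  }
}

This mirrors what the new TrainingStates.tsx modal does interactively (ErrorView lists getTrainingError results and calls updateTrainingData on save), just collapsed into a single programmatic pass.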