diff --git a/docSite/content/zh-cn/docs/development/upgrading/4913.md b/docSite/content/zh-cn/docs/development/upgrading/4913.md index 11be6ba18..836e797f6 100644 --- a/docSite/content/zh-cn/docs/development/upgrading/4913.md +++ b/docSite/content/zh-cn/docs/development/upgrading/4913.md @@ -21,4 +21,5 @@ weight: 787 1. 对话日志,日期范围选择问题。 2. API 调用时,传入的 system 提示词可能会重复。 -3. AI 对话/工具调用,未选择文件链接时,也会从历史记录读取文件。 \ No newline at end of file +3. AI 对话/工具调用,未选择文件链接时,也会从历史记录读取文件。 +4. 手动更新知识库索引时,错误地删除旧索引,导致手动索引无效。 \ No newline at end of file diff --git a/packages/service/common/file/image/schema.ts b/packages/service/common/file/image/schema.ts index 970e91f3d..7d1db38aa 100644 --- a/packages/service/common/file/image/schema.ts +++ b/packages/service/common/file/image/schema.ts @@ -23,6 +23,12 @@ try { ImageSchema.index({ type: 1 }); // delete related img ImageSchema.index({ teamId: 1, 'metadata.relatedId': 1 }); + + // Cron clear invalid img + ImageSchema.index( + { createTime: 1 }, + { partialFilterExpression: { 'metadata.relatedId': { $exists: true } } } + ); } catch (error) { console.log(error); } diff --git a/packages/service/core/dataset/collection/schema.ts b/packages/service/core/dataset/collection/schema.ts index b789c0b15..674275520 100644 --- a/packages/service/core/dataset/collection/schema.ts +++ b/packages/service/core/dataset/collection/schema.ts @@ -132,6 +132,12 @@ try { } } ); + + // Clear invalid image + DatasetCollectionSchema.index({ + teamId: 1, + 'metadata.relatedImgId': 1 + }); } catch (error) { console.log(error); } diff --git a/packages/service/core/dataset/data/schema.ts b/packages/service/core/dataset/data/schema.ts index 5b8d07e94..11cc34446 100644 --- a/packages/service/core/dataset/data/schema.ts +++ b/packages/service/core/dataset/data/schema.ts @@ -103,12 +103,14 @@ try { }); // Recall vectors after data matching DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 }); - DatasetDataSchema.index({ updateTime: 1 
}); // rebuild data DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 }); // 为查询 initJieba 字段不存在的数据添加索引 DatasetDataSchema.index({ initJieba: 1, updateTime: 1 }); + + // Cron clear invalid data + DatasetDataSchema.index({ updateTime: 1 }); } catch (error) { console.log(error); } diff --git a/packages/service/support/operationLog/schema.ts b/packages/service/support/operationLog/schema.ts index d41123005..894b5b570 100644 --- a/packages/service/support/operationLog/schema.ts +++ b/packages/service/support/operationLog/schema.ts @@ -6,7 +6,7 @@ import { TeamMemberCollectionName } from '@fastgpt/global/support/user/team/constant'; -export const OperationLogCollectionName = 'operationLog'; +export const OperationLogCollectionName = 'operationLogs'; const OperationLogSchema = new Schema({ tmbId: { @@ -34,6 +34,9 @@ const OperationLogSchema = new Schema({ } }); +OperationLogSchema.index({ teamId: 1, tmbId: 1, event: 1 }); +OperationLogSchema.index({ timestamp: 1 }, { expireAfterSeconds: 14 * 24 * 60 * 60 }); // Auto delete after 14 days + export const MongoOperationLog = getMongoLogModel( OperationLogCollectionName, OperationLogSchema diff --git a/projects/app/src/pages/api/admin/clearInvalidData.ts b/projects/app/src/pages/api/admin/clearInvalidData.ts index 763e281e7..f75b6edf7 100644 --- a/projects/app/src/pages/api/admin/clearInvalidData.ts +++ b/projects/app/src/pages/api/admin/clearInvalidData.ts @@ -2,16 +2,20 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { jsonRes } from '@fastgpt/service/common/response'; import { authCert } from '@fastgpt/service/support/permission/auth/common'; import { addHours } from 'date-fns'; -import { MongoImage } from '@fastgpt/service/common/file/image/schema'; -import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; import { checkInvalidDatasetFiles, checkInvalidDatasetData, checkInvalidVector } from '@/service/common/system/cronTask'; +import dayjs from 'dayjs'; 
+import { retryFn } from '@fastgpt/global/common/system/utils'; +import { NextAPI } from '@/service/middleware/entry'; +import { useIPFrequencyLimit } from '@fastgpt/service/common/middle/reqFrequencyLimit'; +import { MongoImage } from '@fastgpt/service/common/file/image/schema'; +import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; let deleteImageAmount = 0; -async function checkInvalidImg(start: Date, end: Date, limit = 50) { +async function checkInvalidImg(start: Date, end: Date) { const images = await MongoImage.find( { createTime: { @@ -52,8 +56,8 @@ async function checkInvalidImg(start: Date, end: Date, limit = 50) { console.log(`检测完成,共删除 ${deleteImageAmount} 个无效图片`); } -/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */ -export default async function handler(req: NextApiRequest, res: NextApiResponse) { +async function handler(req: NextApiRequest, res: NextApiResponse) { + deleteImageAmount = 0; try { await authCert({ req, authRoot: true }); const { start = -2, end = -360 * 24 } = req.body as { start: number; end: number }; @@ -61,13 +65,37 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) (async () => { try { console.log('执行脏数据清理任务'); - // 360天 ~ 2小时前 - const endTime = addHours(new Date(), start); - const startTime = addHours(new Date(), end); - await checkInvalidDatasetFiles(startTime, endTime); - await checkInvalidImg(startTime, endTime); - await checkInvalidDatasetData(startTime, endTime); - await checkInvalidVector(startTime, endTime); + + // Split time range into 6-hour chunks to avoid processing too much data at once + const totalHours = Math.abs(start - end); + const chunkHours = 6; + const chunks = Math.ceil(totalHours / chunkHours); + + console.log( + `Total time range: ${totalHours} hours, split into ${chunks} chunks of ${chunkHours} hours each` + ); + + for (let i = 0; i < chunks; i++) { + const chunkStart = start - i * chunkHours; + const chunkEnd = Math.max(start - (i + 1) * 
chunkHours, end); + + const chunkEndTime = addHours(new Date(), chunkStart); + const chunkStartTime = addHours(new Date(), chunkEnd); + + console.log( + `Processing chunk ${i + 1}/${chunks}: ${dayjs(chunkStartTime).format( + 'YYYY-MM-DD HH:mm' + )} to ${dayjs(chunkEndTime).format('YYYY-MM-DD HH:mm')}` + ); + + await retryFn(() => checkInvalidDatasetFiles(chunkStartTime, chunkEndTime)); + await retryFn(() => checkInvalidImg(chunkStartTime, chunkEndTime)); + await retryFn(() => checkInvalidDatasetData(chunkStartTime, chunkEndTime)); + await retryFn(() => checkInvalidVector(chunkStartTime, chunkEndTime)); + + console.log(`Chunk ${i + 1}/${chunks} completed`); + } + console.log('执行脏数据清理任务完毕'); } catch (error) { console.log('执行脏数据清理任务出错了'); @@ -86,3 +114,5 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) }); } } + +export default NextAPI(useIPFrequencyLimit({ id: 'admin-api', seconds: 60, limit: 1 }), handler); diff --git a/projects/app/src/service/common/system/cronTask.ts b/projects/app/src/service/common/system/cronTask.ts index cf92e140a..b1e9dea85 100644 --- a/projects/app/src/service/common/system/cronTask.ts +++ b/projects/app/src/service/common/system/cronTask.ts @@ -145,16 +145,20 @@ export async function checkInvalidDatasetData(start: Date, end: Date) { datasetId: item.datasetId, collectionId: item.collectionId }); - await MongoDatasetDataText.deleteMany({ - teamId: item.teamId, - datasetId: item.datasetId, - collectionId: item.collectionId - }); - await deleteDatasetDataVector({ - teamId: item.teamId, - datasetIds: [item.datasetId], - collectionIds: [item.collectionId] - }); + + await Promise.all([ + MongoDatasetDataText.deleteMany({ + teamId: item.teamId, + datasetId: item.datasetId, + collectionId: item.collectionId + }), + deleteDatasetDataVector({ + teamId: item.teamId, + datasetIds: [item.datasetId], + collectionIds: [item.collectionId] + }) + ]); + await MongoDatasetData.deleteMany({ teamId: item.teamId, datasetId: 
item.datasetId, diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts index e5cccf07e..2315153cb 100644 --- a/projects/app/src/service/core/dataset/data/controller.ts +++ b/projects/app/src/service/core/dataset/data/controller.ts @@ -318,6 +318,11 @@ export async function updateData2Dataset({ } } + const deleteVectorIdList = patchResult + .filter((item) => item.type === 'delete' || item.type === 'update') + .map((item) => item.index.dataId) + .filter(Boolean) as string[]; + // 4. Update mongo updateTime(便于脏数据检查器识别) const updateTime = mongoData.updateTime; mongoData.updateTime = new Date(); @@ -377,14 +382,10 @@ export async function updateData2Dataset({ ); // Delete vector - const deleteIdList = patchResult - .filter((item) => item.type === 'delete' || item.type === 'update') - .map((item) => item.index.dataId) - .filter(Boolean) as string[]; - if (deleteIdList.length > 0) { + if (deleteVectorIdList.length > 0) { await deleteDatasetDataVector({ teamId: mongoData.teamId, - idList: deleteIdList + idList: deleteVectorIdList }); } });