feat: operation index (#5056)

* feat: operation index

* fix: delete update vector

* perf: Clear invalid data

* perf: index

* perf: clear invalid data

* index
This commit is contained in:
Archer
2025-06-18 00:46:31 +08:00
committed by GitHub
parent 6060543222
commit 6b2ea696c5
8 changed files with 84 additions and 31 deletions

View File

@@ -21,4 +21,5 @@ weight: 787
1. 对话日志,日期范围选择问题。 1. 对话日志,日期范围选择问题。
2. API 调用时,传入的 system 提示词可能会重复。 2. API 调用时,传入的 system 提示词可能会重复。
3. AI 对话/工具调用,未选择文件链接时,也会从历史记录读取文件。 3. AI 对话/工具调用,未选择文件链接时,也会从历史记录读取文件。
4. 手动更新知识库索引时,错误地删除旧索引,导致手动索引无效。

View File

@@ -23,6 +23,12 @@ try {
ImageSchema.index({ type: 1 }); ImageSchema.index({ type: 1 });
// delete related img // delete related img
ImageSchema.index({ teamId: 1, 'metadata.relatedId': 1 }); ImageSchema.index({ teamId: 1, 'metadata.relatedId': 1 });
// Cron clear invalid img
ImageSchema.index(
{ createTime: 1 },
{ partialFilterExpression: { 'metadata.relatedId': { $exists: true } } }
);
} catch (error) { } catch (error) {
console.log(error); console.log(error);
} }

View File

@@ -132,6 +132,12 @@ try {
} }
} }
); );
// Clear invalid image
DatasetCollectionSchema.index({
teamId: 1,
'metadata.relatedImgId': 1
});
} catch (error) { } catch (error) {
console.log(error); console.log(error);
} }

View File

@@ -103,12 +103,14 @@ try {
}); });
// Recall vectors after data matching // Recall vectors after data matching
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 }); DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
DatasetDataSchema.index({ updateTime: 1 });
// rebuild data // rebuild data
DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 }); DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
// 为查询 initJieba 字段不存在的数据添加索引 // 为查询 initJieba 字段不存在的数据添加索引
DatasetDataSchema.index({ initJieba: 1, updateTime: 1 }); DatasetDataSchema.index({ initJieba: 1, updateTime: 1 });
// Cron clear invalid data
DatasetDataSchema.index({ updateTime: 1 });
} catch (error) { } catch (error) {
console.log(error); console.log(error);
} }

View File

@@ -6,7 +6,7 @@ import {
TeamMemberCollectionName TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant'; } from '@fastgpt/global/support/user/team/constant';
export const OperationLogCollectionName = 'operationLog'; export const OperationLogCollectionName = 'operationLogs';
const OperationLogSchema = new Schema({ const OperationLogSchema = new Schema({
tmbId: { tmbId: {
@@ -34,6 +34,9 @@ const OperationLogSchema = new Schema({
} }
}); });
OperationLogSchema.index({ teamId: 1, tmbId: 1, event: 1 });
OperationLogSchema.index({ timestamp: 1 }, { expireAfterSeconds: 14 * 24 * 60 * 60 }); // Auto delete after 14 days
export const MongoOperationLog = getMongoLogModel<OperationLogSchema>( export const MongoOperationLog = getMongoLogModel<OperationLogSchema>(
OperationLogCollectionName, OperationLogCollectionName,
OperationLogSchema OperationLogSchema

View File

@@ -2,16 +2,20 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response'; import { jsonRes } from '@fastgpt/service/common/response';
import { authCert } from '@fastgpt/service/support/permission/auth/common'; import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { addHours } from 'date-fns'; import { addHours } from 'date-fns';
import { MongoImage } from '@fastgpt/service/common/file/image/schema';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { import {
checkInvalidDatasetFiles, checkInvalidDatasetFiles,
checkInvalidDatasetData, checkInvalidDatasetData,
checkInvalidVector checkInvalidVector
} from '@/service/common/system/cronTask'; } from '@/service/common/system/cronTask';
import dayjs from 'dayjs';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { NextAPI } from '@/service/middleware/entry';
import { useIPFrequencyLimit } from '@fastgpt/service/common/middle/reqFrequencyLimit';
import { MongoImage } from '@fastgpt/service/common/file/image/schema';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
let deleteImageAmount = 0; let deleteImageAmount = 0;
async function checkInvalidImg(start: Date, end: Date, limit = 50) { async function checkInvalidImg(start: Date, end: Date) {
const images = await MongoImage.find( const images = await MongoImage.find(
{ {
createTime: { createTime: {
@@ -52,8 +56,8 @@ async function checkInvalidImg(start: Date, end: Date, limit = 50) {
console.log(`检测完成,共删除 ${deleteImageAmount} 个无效图片`); console.log(`检测完成,共删除 ${deleteImageAmount} 个无效图片`);
} }
/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */ async function handler(req: NextApiRequest, res: NextApiResponse) {
export default async function handler(req: NextApiRequest, res: NextApiResponse) { deleteImageAmount = 0;
try { try {
await authCert({ req, authRoot: true }); await authCert({ req, authRoot: true });
const { start = -2, end = -360 * 24 } = req.body as { start: number; end: number }; const { start = -2, end = -360 * 24 } = req.body as { start: number; end: number };
@@ -61,13 +65,37 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
(async () => { (async () => {
try { try {
console.log('执行脏数据清理任务'); console.log('执行脏数据清理任务');
// 360天 ~ 2小时前
const endTime = addHours(new Date(), start); // Split time range into 6-hour chunks to avoid processing too much data at once
const startTime = addHours(new Date(), end); const totalHours = Math.abs(start - end);
await checkInvalidDatasetFiles(startTime, endTime); const chunkHours = 6;
await checkInvalidImg(startTime, endTime); const chunks = Math.ceil(totalHours / chunkHours);
await checkInvalidDatasetData(startTime, endTime);
await checkInvalidVector(startTime, endTime); console.log(
`Total time range: ${totalHours} hours, split into ${chunks} chunks of ${chunkHours} hours each`
);
for (let i = 0; i < chunks; i++) {
const chunkStart = start - i * chunkHours;
const chunkEnd = Math.max(start - (i + 1) * chunkHours, end);
const chunkEndTime = addHours(new Date(), chunkStart);
const chunkStartTime = addHours(new Date(), chunkEnd);
console.log(
`Processing chunk ${i + 1}/${chunks}: ${dayjs(chunkStartTime).format(
'YYYY-MM-DD HH:mm'
)} to ${dayjs(chunkEndTime).format('YYYY-MM-DD HH:mm')}`
);
await retryFn(() => checkInvalidDatasetFiles(chunkStartTime, chunkEndTime));
await retryFn(() => checkInvalidImg(chunkStartTime, chunkEndTime));
await retryFn(() => checkInvalidDatasetData(chunkStartTime, chunkEndTime));
await retryFn(() => checkInvalidVector(chunkStartTime, chunkEndTime));
console.log(`Chunk ${i + 1}/${chunks} completed`);
}
console.log('执行脏数据清理任务完毕'); console.log('执行脏数据清理任务完毕');
} catch (error) { } catch (error) {
console.log('执行脏数据清理任务出错了'); console.log('执行脏数据清理任务出错了');
@@ -86,3 +114,5 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}); });
} }
} }
export default NextAPI(useIPFrequencyLimit({ id: 'admin-api', seconds: 60, limit: 1 }), handler);

View File

@@ -145,16 +145,20 @@ export async function checkInvalidDatasetData(start: Date, end: Date) {
datasetId: item.datasetId, datasetId: item.datasetId,
collectionId: item.collectionId collectionId: item.collectionId
}); });
await MongoDatasetDataText.deleteMany({
teamId: item.teamId, await Promise.all([
datasetId: item.datasetId, MongoDatasetDataText.deleteMany({
collectionId: item.collectionId teamId: item.teamId,
}); datasetId: item.datasetId,
await deleteDatasetDataVector({ collectionId: item.collectionId
teamId: item.teamId, }),
datasetIds: [item.datasetId], deleteDatasetDataVector({
collectionIds: [item.collectionId] teamId: item.teamId,
}); datasetIds: [item.datasetId],
collectionIds: [item.collectionId]
})
]);
await MongoDatasetData.deleteMany({ await MongoDatasetData.deleteMany({
teamId: item.teamId, teamId: item.teamId,
datasetId: item.datasetId, datasetId: item.datasetId,

View File

@@ -318,6 +318,11 @@ export async function updateData2Dataset({
} }
} }
const deleteVectorIdList = patchResult
.filter((item) => item.type === 'delete' || item.type === 'update')
.map((item) => item.index.dataId)
.filter(Boolean) as string[];
// 4. Update mongo updateTime(便于脏数据检查器识别) // 4. Update mongo updateTime(便于脏数据检查器识别)
const updateTime = mongoData.updateTime; const updateTime = mongoData.updateTime;
mongoData.updateTime = new Date(); mongoData.updateTime = new Date();
@@ -377,14 +382,10 @@ export async function updateData2Dataset({
); );
// Delete vector // Delete vector
const deleteIdList = patchResult if (deleteVectorIdList.length > 0) {
.filter((item) => item.type === 'delete' || item.type === 'update')
.map((item) => item.index.dataId)
.filter(Boolean) as string[];
if (deleteIdList.length > 0) {
await deleteDatasetDataVector({ await deleteDatasetDataVector({
teamId: mongoData.teamId, teamId: mongoData.teamId,
idList: deleteIdList idList: deleteVectorIdList
}); });
} }
}); });