4.6.3-website dataset (#532)

This commit is contained in:
Archer
2023-12-03 20:45:57 +08:00
committed by GitHub
parent b916183848
commit a9ae270335
122 changed files with 3793 additions and 1360 deletions

View File

@@ -1,7 +1,7 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { delay } from '@/utils/tools';
import { delay } from '@fastgpt/global/common/system/utils';
import { PgClient } from '@fastgpt/service/common/pg';
import {
DatasetDataIndexTypeEnum,

View File

@@ -1,12 +1,9 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { delay } from '@/utils/tools';
import { delay } from '@fastgpt/global/common/system/utils';
import { PgClient } from '@fastgpt/service/common/pg';
import {
DatasetDataIndexTypeEnum,
PgDatasetTableName
} from '@fastgpt/global/core/dataset/constant';
import { PgDatasetTableName } from '@fastgpt/global/core/dataset/constant';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';

View File

@@ -8,7 +8,7 @@ import {
} from '@fastgpt/service/support/user/team/controller';
import { MongoUser } from '@fastgpt/service/support/user/schema';
import { UserModelSchema } from '@fastgpt/global/support/user/type';
import { delay } from '@/utils/tools';
import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
import { PermissionTypeEnum } from '@fastgpt/global/support/permission/constant';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';

View File

@@ -1,7 +1,7 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { delay } from '@/utils/tools';
import { delay } from '@fastgpt/global/common/system/utils';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoApp } from '@fastgpt/service/core/app/schema';
import { FlowNodeInputTypeEnum, FlowNodeTypeEnum } from '@fastgpt/global/core/module/node/constant';

View File

@@ -1,7 +1,7 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { delay } from '@/utils/tools';
import { delay } from '@fastgpt/global/common/system/utils';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { jiebaSplit } from '@/service/core/dataset/utils';
@@ -17,10 +17,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
console.log(
'total',
await MongoDatasetData.countDocuments({ fullTextToken: { $exists: false } })
await MongoDatasetData.countDocuments({
fullTextToken: { $exists: false },
updateTime: { $lt: new Date() }
})
);
await initFullTextToken(limit);
await initFullTextToken(limit, new Date());
jsonRes(res, {
message: 'success'
@@ -34,9 +37,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
});
}
}
export async function initFullTextToken(limit = 50): Promise<any> {
export async function initFullTextToken(limit = 50, endDate: Date): Promise<any> {
try {
const dataList = await MongoDatasetData.find({ fullTextToken: { $exists: false } }, '_id q a')
const dataList = await MongoDatasetData.find(
{ fullTextToken: { $exists: false }, updateTime: { $lt: endDate } },
'_id q a'
)
.limit(limit)
.lean();
if (dataList.length === 0) return;
@@ -56,9 +62,9 @@ export async function initFullTextToken(limit = 50): Promise<any> {
success += result.filter((item) => item.status === 'fulfilled').length;
console.log(`success: ${success}`);
return initFullTextToken(limit);
return initFullTextToken(limit, endDate);
} catch (error) {
await delay(1000);
return initFullTextToken(limit);
return initFullTextToken(limit, endDate);
}
}

View File

@@ -0,0 +1,62 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { delay } from '@fastgpt/global/common/system/utils';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { jiebaSplit } from '@/service/core/dataset/utils';
// Running count of fulfilled document updates for the current run:
// reset to 0 by the handler and incremented by initFullTextToken after every batch.
let success = 0;
/**
 * Root-only endpoint that back-fills dataset data documents which have no `inited` field.
 *
 * Reads an optional `limit` (batch size, default 50) from the request body, logs how many
 * documents are still pending, runs `initFullTextToken` to completion and then answers
 * with `{ message: 'success' }`. Any thrown error is logged and returned as a 500 response.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    const { limit = 50 } = req.body as { limit: number };

    // Authorise first, then make sure the database connection exists.
    await authCert({ req, authRoot: true });
    await connectToDatabase();

    // Start a fresh progress count for this run.
    success = 0;

    const pendingTotal = await MongoDatasetData.countDocuments({ inited: { $exists: false } });
    console.log('total', pendingTotal);

    await initFullTextToken(limit);

    jsonRes(res, { message: 'success' });
  } catch (err) {
    console.log(err);
    jsonRes(res, { code: 500, error: err });
  }
}
/**
 * Back-fill `fullTextToken` for every dataset data document that has no `inited` field.
 *
 * Documents are processed in batches of `limit`. For each document the text `q + a`
 * is tokenised with `jiebaSplit`, and the document is updated with the tokens and
 * `inited: true`. The function resolves once a query returns no more pending documents.
 *
 * @param limit Maximum number of documents fetched and updated per batch (default 50).
 * @returns A promise that resolves (with `undefined`) when no pending document is left.
 */
export async function initFullTextToken(limit = 50): Promise<any> {
  // Loop instead of self-recursing: a long back-fill would otherwise build an
  // ever-growing chain of pending promises, one per batch.
  while (true) {
    try {
      const dataList = await MongoDatasetData.find({ inited: { $exists: false } }, '_id q a')
        .limit(limit)
        .lean();

      if (dataList.length === 0) return;

      // allSettled: one failing update must not abort the rest of the batch.
      const result = await Promise.allSettled(
        dataList.map((item) => {
          const text = item.q + (item.a || '');
          const tokens = jiebaSplit({ text });

          return MongoDatasetData.findByIdAndUpdate(item._id, {
            $set: {
              inited: true,
              fullTextToken: tokens
            }
          });
        })
      );

      const fulfilled = result.filter((item) => item.status === 'fulfilled').length;
      success += fulfilled;
      console.log(`success: ${success}`);

      // Nothing in this batch was written, so the very same documents would be selected
      // again right away; back off instead of spinning against the database.
      if (fulfilled === 0) {
        await delay(1000);
      }
    } catch (error) {
      // Log the failure rather than swallowing it, then retry after a short pause.
      // NOTE(review): retries are unbounded, as before; a permanently failing query or
      // tokeniser call keeps this loop alive — confirm that is acceptable.
      console.log(error);
      await delay(1000);
    }
  }
}

View File

@@ -4,7 +4,8 @@ import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { DatasetStatusEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
let success = 0;
/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */
@@ -15,32 +16,85 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
await connectToDatabase();
success = 0;
await MongoDatasetCollection.updateMany({}, [
await MongoDatasetCollection.updateMany({ createTime: { $exists: false } }, [
{
$set: {
createTime: '$updateTime'
}
}
]);
await MongoDatasetCollection.updateMany({ trainingType: { $exists: false } }, [
{
$set: {
createTime: '$updateTime',
trainingType: {
$cond: {
if: { $ifNull: ['$a', false] },
then: TrainingModeEnum.qa,
else: TrainingModeEnum.chunk
}
},
chunkSize: 0,
fileId: '$metadata.fileId',
}
}
}
]);
await MongoDatasetCollection.updateMany({ chunkSize: { $exists: false } }, [
{
$set: {
chunkSize: 0
}
}
]);
await MongoDatasetCollection.updateMany({ fileId: { $exists: false } }, [
{
$set: {
fileId: '$metadata.fileId'
}
}
]);
await MongoDatasetCollection.updateMany({ rawLink: { $exists: false } }, [
{
$set: {
rawLink: '$metadata.rawLink'
}
}
]);
await MongoDatasetData.updateMany(
{},
{ chunkIndex: { $exists: false } },
{
chunkIndex: 0
}
);
await MongoDatasetData.updateMany(
{ updateTime: { $exists: false } },
{
chunkIndex: 0,
updateTime: new Date()
}
);
await MongoDataset.updateMany(
{ status: { $exists: false } },
{
$set: {
status: DatasetStatusEnum.active
}
}
);
// dataset tags to intro
await MongoDataset.updateMany({ tags: { $exists: true } }, [
{
$set: {
intro: {
$reduce: {
input: '$tags',
initialValue: '',
in: { $concat: ['$$value', ' ', '$$this'] }
}
}
}
}
]);
jsonRes(res, {
message: 'success'
});

View File

@@ -0,0 +1,92 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { delFileById, getGFSCollection } from '@fastgpt/service/common/file/gridfs/controller';
import { addLog } from '@fastgpt/service/common/mongo/controller';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { delay } from '@fastgpt/global/common/system/utils';
/*
  Check the files stored in the `dataset` GridFS bucket: a file whose _id is not
  referenced by the `fileId` of any dataset collection document is deleted.
*/
// Number of files deleted by the current run: reset to 0 by the handler,
// incremented in checkFiles and reported in its completion log.
let deleteFileAmount = 0;
/**
 * Root-only endpoint that starts a background scan for dataset files which are no
 * longer referenced by any dataset collection.
 *
 * Request body (all optional):
 * - `startDay` (default 10): the scan window starts `startDay` days before now.
 * - `endDay` (default 3): the scan window ends `endDay` days before now.
 * - `limit` (default 30): passed to `checkFiles` as the number of concurrent workers.
 *
 * The scan is not awaited: the response `{ message: 'success' }` is sent as soon as the
 * scan has been started. Errors raised before that point are logged and returned as 500.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    const {
      startDay = 10,
      endDay = 3,
      limit = 30
    } = req.body as { startDay?: number; endDay?: number; limit?: number };
    await authCert({ req, authRoot: true });
    await connectToDatabase();

    // start: now - startDay days, end: now - endDay days
    const start = new Date(Date.now() - startDay * 24 * 60 * 60 * 1000);
    const end = new Date(Date.now() - endDay * 24 * 60 * 60 * 1000);
    deleteFileAmount = 0;

    // Fire-and-forget on purpose, but a rejection must still be handled: the surrounding
    // try/catch cannot see it, and an unhandled rejection can terminate the Node process.
    checkFiles(start, end, limit).catch((error) => {
      addLog.error(`check valid dataset files error`, error);
    });

    jsonRes(res, {
      message: 'success'
    });
  } catch (error) {
    addLog.error(`check valid dataset files error`, error);
    jsonRes(res, {
      code: 500,
      error
    });
  }
}
/**
 * Delete files of the `dataset` GridFS bucket that no dataset collection references.
 *
 * All file ids with `uploadDate` between `start` and `end` (inclusive) are loaded first.
 * `limit` workers then walk the id list concurrently: worker `i` handles the indexes
 * `i, i + limit, i + 2 * limit, ...`. For every id the dataset collections are counted
 * by `fileId`; when the count is 0 the file is deleted and `deleteFileAmount` is incremented.
 *
 * The returned promise resolves only after every worker has finished, and the completion
 * message is logged exactly once with the final count.
 *
 * @param start Lower bound (inclusive) of the upload date window.
 * @param end Upper bound (inclusive) of the upload date window.
 * @param limit Number of concurrent workers, which is also the index stride of each worker.
 */
export async function checkFiles(start: Date, end: Date, limit: number) {
  const collection = getGFSCollection('dataset');
  const where = {
    uploadDate: { $gte: start, $lte: end }
  };

  // 1. get all _id
  const ids = await collection
    .find(where, {
      projection: {
        _id: 1
      }
    })
    .toArray();
  console.log('total files', ids.length);

  // Keep the worker promises so completion can be awaited and reported once.
  const workers: Promise<void>[] = [];
  for (let i = 0; i < limit; i++) {
    workers.push(check(i));
  }
  await Promise.all(workers);

  console.log(`检测完成,共删除 ${deleteFileAmount} 个无效文件`);

  // One worker: iterates over its slice of the list (a loop, so a long list does not
  // build a deep chain of nested promises).
  async function check(startIndex: number): Promise<void> {
    let index = startIndex;

    while (true) {
      const id = ids[index];
      // Past the end of the list: this worker is done.
      if (!id) return;

      try {
        const { _id } = id;

        // 2. find fileId in dataset.collections
        const hasCollection = await MongoDatasetCollection.countDocuments({ fileId: _id });

        // 3. if not found, delete file
        if (hasCollection === 0) {
          await delFileById({ bucketName: 'dataset', fileId: String(_id) });
          console.log('delete file', _id);
          deleteFileAmount++;
        }

        // Progress output every 100th index.
        if (index % 100 === 0) {
          console.log(index);
        }
        index += limit;
      } catch (error) {
        // Same index is retried after a pause.
        // NOTE(review): retries are unbounded, as before; a file that fails permanently
        // keeps its worker (and therefore this function) from finishing — confirm.
        console.log(error);
        await delay(2000);
      }
    }
  }
}