mirror of
https://github.com/labring/FastGPT.git
synced 2025-08-05 22:55:27 +00:00
4.6.3-website dataset (#532)
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { jsonRes } from '@fastgpt/service/common/response';
|
||||
import { connectToDatabase } from '@/service/mongo';
|
||||
import { delay } from '@/utils/tools';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { PgClient } from '@fastgpt/service/common/pg';
|
||||
import {
|
||||
DatasetDataIndexTypeEnum,
|
||||
|
@@ -1,12 +1,9 @@
|
||||
import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { jsonRes } from '@fastgpt/service/common/response';
|
||||
import { connectToDatabase } from '@/service/mongo';
|
||||
import { delay } from '@/utils/tools';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { PgClient } from '@fastgpt/service/common/pg';
|
||||
import {
|
||||
DatasetDataIndexTypeEnum,
|
||||
PgDatasetTableName
|
||||
} from '@fastgpt/global/core/dataset/constant';
|
||||
import { PgDatasetTableName } from '@fastgpt/global/core/dataset/constant';
|
||||
|
||||
import { authCert } from '@fastgpt/service/support/permission/auth/common';
|
||||
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
|
||||
|
@@ -8,7 +8,7 @@ import {
|
||||
} from '@fastgpt/service/support/user/team/controller';
|
||||
import { MongoUser } from '@fastgpt/service/support/user/schema';
|
||||
import { UserModelSchema } from '@fastgpt/global/support/user/type';
|
||||
import { delay } from '@/utils/tools';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
|
||||
import { PermissionTypeEnum } from '@fastgpt/global/support/permission/constant';
|
||||
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
|
||||
|
@@ -1,7 +1,7 @@
|
||||
import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { jsonRes } from '@fastgpt/service/common/response';
|
||||
import { connectToDatabase } from '@/service/mongo';
|
||||
import { delay } from '@/utils/tools';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { authCert } from '@fastgpt/service/support/permission/auth/common';
|
||||
import { MongoApp } from '@fastgpt/service/core/app/schema';
|
||||
import { FlowNodeInputTypeEnum, FlowNodeTypeEnum } from '@fastgpt/global/core/module/node/constant';
|
||||
|
@@ -1,7 +1,7 @@
|
||||
import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { jsonRes } from '@fastgpt/service/common/response';
|
||||
import { connectToDatabase } from '@/service/mongo';
|
||||
import { delay } from '@/utils/tools';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { authCert } from '@fastgpt/service/support/permission/auth/common';
|
||||
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
|
||||
import { jiebaSplit } from '@/service/core/dataset/utils';
|
||||
@@ -17,10 +17,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
|
||||
|
||||
console.log(
|
||||
'total',
|
||||
await MongoDatasetData.countDocuments({ fullTextToken: { $exists: false } })
|
||||
await MongoDatasetData.countDocuments({
|
||||
fullTextToken: { $exists: false },
|
||||
updateTime: { $lt: new Date() }
|
||||
})
|
||||
);
|
||||
|
||||
await initFullTextToken(limit);
|
||||
await initFullTextToken(limit, new Date());
|
||||
|
||||
jsonRes(res, {
|
||||
message: 'success'
|
||||
@@ -34,9 +37,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
|
||||
});
|
||||
}
|
||||
}
|
||||
export async function initFullTextToken(limit = 50): Promise<any> {
|
||||
export async function initFullTextToken(limit = 50, endDate: Date): Promise<any> {
|
||||
try {
|
||||
const dataList = await MongoDatasetData.find({ fullTextToken: { $exists: false } }, '_id q a')
|
||||
const dataList = await MongoDatasetData.find(
|
||||
{ fullTextToken: { $exists: false }, updateTime: { $lt: endDate } },
|
||||
'_id q a'
|
||||
)
|
||||
.limit(limit)
|
||||
.lean();
|
||||
if (dataList.length === 0) return;
|
||||
@@ -56,9 +62,9 @@ export async function initFullTextToken(limit = 50): Promise<any> {
|
||||
|
||||
success += result.filter((item) => item.status === 'fulfilled').length;
|
||||
console.log(`success: ${success}`);
|
||||
return initFullTextToken(limit);
|
||||
return initFullTextToken(limit, endDate);
|
||||
} catch (error) {
|
||||
await delay(1000);
|
||||
return initFullTextToken(limit);
|
||||
return initFullTextToken(limit, endDate);
|
||||
}
|
||||
}
|
||||
|
62
projects/app/src/pages/api/admin/initv463-2.ts
Normal file
62
projects/app/src/pages/api/admin/initv463-2.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { jsonRes } from '@fastgpt/service/common/response';
|
||||
import { connectToDatabase } from '@/service/mongo';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { authCert } from '@fastgpt/service/support/permission/auth/common';
|
||||
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
|
||||
import { jiebaSplit } from '@/service/core/dataset/utils';
|
||||
|
||||
// Running total of records updated successfully. The handler resets it to 0 on
// each request and initFullTextToken adds to it after every batch.
// NOTE(review): module-level state — presumably shared by overlapping requests; confirm only one run happens at a time.
let success = 0;
|
||||
/* Back-fill `fullTextToken` (and set `inited`) on dataset data records that do not have an `inited` field yet */
|
||||
/**
 * Admin route that back-fills full-text tokens on dataset data records.
 *
 * Reads an optional `limit` (batch size, default 50) from the request body,
 * calls `authCert` with `authRoot: true`, connects to the database, logs how
 * many records still lack the `inited` field, and then waits for
 * `initFullTextToken` to finish before answering with `success`.
 * Any thrown error is logged and returned as a 500 response.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    const { limit: batchSize = 50 } = req.body as { limit: number };

    await authCert({ req, authRoot: true });
    await connectToDatabase();

    // Start every run from a clean counter.
    success = 0;

    const pendingCount = await MongoDatasetData.countDocuments({ inited: { $exists: false } });
    console.log('total', pendingCount);

    await initFullTextToken(batchSize);

    jsonRes(res, { message: 'success' });
  } catch (error) {
    console.log(error);

    jsonRes(res, { code: 500, error });
  }
}
|
||||
/**
 * Back-fills `fullTextToken` on every dataset data record that has no `inited` field.
 *
 * Records are fetched in batches of `limit`. For each record the text `q + a`
 * is tokenized with `jiebaSplit`, then the record is updated with the tokens
 * and `inited: true`. Batches repeat until the query returns no records.
 *
 * Failures are retried indefinitely (records that fail to update keep matching
 * the query), but every failure is logged and followed by a pause so that a
 * persistent error neither goes unnoticed nor spins in a tight loop.
 *
 * @param limit Maximum number of records processed per batch (default 50).
 * @returns Resolves with `undefined` once no un-initialized records remain.
 */
export async function initFullTextToken(limit = 50): Promise<any> {
  // Iterate instead of self-recursing: a long migration would otherwise build
  // an ever-growing chain of pending promises, one per batch.
  for (;;) {
    try {
      const dataList = await MongoDatasetData.find({ inited: { $exists: false } }, '_id q a')
        .limit(limit)
        .lean();
      if (dataList.length === 0) return;

      const result = await Promise.allSettled(
        dataList.map((item) => {
          const text = item.q + (item.a || '');
          const tokens = jiebaSplit({ text });

          return MongoDatasetData.findByIdAndUpdate(item._id, {
            $set: {
              inited: true,
              fullTextToken: tokens
            }
          });
        })
      );

      const fulfilled = result.filter((item) => item.status === 'fulfilled').length;
      success += fulfilled;
      console.log(`success: ${success}`);

      const failed = result.length - fulfilled;
      if (failed > 0) {
        // allSettled never rejects, so surface per-record failures explicitly.
        console.log(`failed: ${failed}`);
        // A batch with no progress would be re-fetched unchanged; back off first.
        if (fulfilled === 0) {
          await delay(1000);
        }
      }
    } catch (error) {
      // Log before retrying so a persistent failure is visible to the operator.
      console.log(error);
      await delay(1000);
    }
  }
}
|
@@ -4,7 +4,8 @@ import { connectToDatabase } from '@/service/mongo';
|
||||
import { authCert } from '@fastgpt/service/support/permission/auth/common';
|
||||
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
|
||||
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
|
||||
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
|
||||
import { DatasetStatusEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
|
||||
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
|
||||
|
||||
let success = 0;
|
||||
/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */
|
||||
@@ -15,32 +16,85 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
|
||||
await connectToDatabase();
|
||||
success = 0;
|
||||
|
||||
await MongoDatasetCollection.updateMany({}, [
|
||||
await MongoDatasetCollection.updateMany({ createTime: { $exists: false } }, [
|
||||
{
|
||||
$set: {
|
||||
createTime: '$updateTime'
|
||||
}
|
||||
}
|
||||
]);
|
||||
await MongoDatasetCollection.updateMany({ trainingType: { $exists: false } }, [
|
||||
{
|
||||
$set: {
|
||||
createTime: '$updateTime',
|
||||
trainingType: {
|
||||
$cond: {
|
||||
if: { $ifNull: ['$a', false] },
|
||||
then: TrainingModeEnum.qa,
|
||||
else: TrainingModeEnum.chunk
|
||||
}
|
||||
},
|
||||
chunkSize: 0,
|
||||
fileId: '$metadata.fileId',
|
||||
}
|
||||
}
|
||||
}
|
||||
]);
|
||||
await MongoDatasetCollection.updateMany({ chunkSize: { $exists: false } }, [
|
||||
{
|
||||
$set: {
|
||||
chunkSize: 0
|
||||
}
|
||||
}
|
||||
]);
|
||||
await MongoDatasetCollection.updateMany({ fileId: { $exists: false } }, [
|
||||
{
|
||||
$set: {
|
||||
fileId: '$metadata.fileId'
|
||||
}
|
||||
}
|
||||
]);
|
||||
await MongoDatasetCollection.updateMany({ rawLink: { $exists: false } }, [
|
||||
{
|
||||
$set: {
|
||||
rawLink: '$metadata.rawLink'
|
||||
}
|
||||
}
|
||||
]);
|
||||
|
||||
await MongoDatasetData.updateMany(
|
||||
{},
|
||||
{ chunkIndex: { $exists: false } },
|
||||
{
|
||||
chunkIndex: 0
|
||||
}
|
||||
);
|
||||
await MongoDatasetData.updateMany(
|
||||
{ updateTime: { $exists: false } },
|
||||
{
|
||||
chunkIndex: 0,
|
||||
updateTime: new Date()
|
||||
}
|
||||
);
|
||||
|
||||
await MongoDataset.updateMany(
|
||||
{ status: { $exists: false } },
|
||||
{
|
||||
$set: {
|
||||
status: DatasetStatusEnum.active
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
// dataset tags to intro
|
||||
await MongoDataset.updateMany({ tags: { $exists: true } }, [
|
||||
{
|
||||
$set: {
|
||||
intro: {
|
||||
$reduce: {
|
||||
input: '$tags',
|
||||
initialValue: '',
|
||||
in: { $concat: ['$$value', ' ', '$$this'] }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]);
|
||||
|
||||
jsonRes(res, {
|
||||
message: 'success'
|
||||
});
|
||||
|
@@ -0,0 +1,92 @@
|
||||
import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { jsonRes } from '@fastgpt/service/common/response';
|
||||
import { connectToDatabase } from '@/service/mongo';
|
||||
import { authCert } from '@fastgpt/service/support/permission/auth/common';
|
||||
import { delFileById, getGFSCollection } from '@fastgpt/service/common/file/gridfs/controller';
|
||||
import { addLog } from '@fastgpt/service/common/mongo/controller';
|
||||
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
|
||||
/*
|
||||
check dataset.files data. If there is no match in dataset.collections, delete it
|
||||
*/
|
||||
// Number of files deleted during the current run. The handler resets it to 0
// before starting a check, and checkFiles increments it after each deletion.
let deleteFileAmount = 0;
|
||||
|
||||
/**
 * Admin route that starts a cleanup of `dataset` bucket files which no dataset
 * collection references.
 *
 * Request body (all fields optional):
 * - `startDay`: files uploaded at most this many days ago are scanned (default 10).
 * - `endDay`: files uploaded at least this many days ago are scanned (default 3).
 * - `limit`: number of files checked concurrently (default 30).
 *
 * `authCert` is called with `authRoot: true` before anything else runs.
 * The scan is started in the background; the response is sent immediately and
 * does not wait for (or report) the scan result.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    const {
      startDay = 10,
      endDay = 3,
      limit = 30
    } = req.body as { startDay?: number; endDay?: number; limit?: number };
    await authCert({ req, authRoot: true });
    await connectToDatabase();

    // Scan window: from (now - startDay days) to (now - endDay days).
    const msPerDay = 24 * 60 * 60 * 1000;
    const start = new Date(Date.now() - startDay * msPerDay);
    const end = new Date(Date.now() - endDay * msPerDay);
    deleteFileAmount = 0;

    // Intentionally not awaited so the request returns right away. The catch
    // keeps a failure inside the background scan from surfacing as an
    // unhandled promise rejection.
    void checkFiles(start, end, limit).catch((error) => {
      addLog.error(`check valid dataset files error`, error);
    });

    jsonRes(res, {
      message: 'success'
    });
  } catch (error) {
    addLog.error(`check valid dataset files error`, error);

    jsonRes(res, {
      code: 500,
      error
    });
  }
}
|
||||
|
||||
/**
 * Deletes files from the `dataset` bucket that no dataset collection points to.
 *
 * Steps:
 * 1. Load the `_id` of every file whose `uploadDate` lies in `[start, end]`.
 * 2. For each file, count dataset collections whose `fileId` equals that `_id`.
 * 3. If the count is zero, delete the file and bump `deleteFileAmount`.
 *
 * The work is split across `limit` workers: worker `i` handles indexes
 * `i, i + limit, i + 2 * limit, ...`. The returned promise resolves only after
 * every worker has finished, and the summary line is logged once at that point.
 *
 * A file whose check or deletion throws is retried after a 2 second pause,
 * without an upper bound on the number of attempts.
 *
 * @param start Lower bound (inclusive) of the upload date window.
 * @param end Upper bound (inclusive) of the upload date window.
 * @param limit Number of concurrent workers, which is also the index stride.
 */
export async function checkFiles(start: Date, end: Date, limit: number) {
  const collection = getGFSCollection('dataset');
  const where = {
    uploadDate: { $gte: start, $lte: end }
  };

  // 1. get all _id
  const ids = await collection
    .find(where, {
      projection: {
        _id: 1
      }
    })
    .toArray();
  console.log('total files', ids.length);

  const workers: Promise<void>[] = [];
  for (let i = 0; i < limit; i++) {
    workers.push(check(i));
  }
  // Wait for every worker so callers can tell when the scan is really done and
  // the final count below is complete.
  await Promise.all(workers);

  console.log(`检测完成,共删除 ${deleteFileAmount} 个无效文件`);

  /** Processes indexes startIndex, startIndex + limit, ... until past the end of `ids`. */
  async function check(startIndex: number): Promise<void> {
    let index = startIndex;

    for (;;) {
      const id = ids[index];
      if (!id) return;

      try {
        const { _id } = id;

        // 2. find fileId in dataset.collections
        const hasCollection = await MongoDatasetCollection.countDocuments({ fileId: _id });

        // 3. if not found, delete file
        if (hasCollection === 0) {
          await delFileById({ bucketName: 'dataset', fileId: String(_id) });
          console.log('delete file', _id);
          deleteFileAmount++;
        }

        // Progress marker every 100th index.
        if (index % 100 === 0) {
          console.log(index);
        }
        index += limit;
      } catch (error) {
        // Keep the same index so this file is retried after the pause.
        console.log(error);
        await delay(2000);
      }
    }
  }
}
|
Reference in New Issue
Block a user