mirror of
https://github.com/labring/FastGPT.git
synced 2026-05-08 01:08:43 +08:00
1 (#3924)
This commit is contained in:
@@ -0,0 +1,77 @@
|
||||
|
||||
import NodeCache from 'node-cache';
|
||||
import { MongoClient } from 'mongodb';
|
||||
import crypto from 'crypto';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
// Load variables from a local .env file before any setting below is read.
dotenv.config();

// Default TTL (seconds) for in-memory entries. Parsed with an explicit radix; if STD_TTL
// is not a number, fall back to 3600 rather than handing NaN to NodeCache.
const parsedStdTtl = Number.parseInt(process.env.STD_TTL || '3600', 10);
const stdTtlSeconds = Number.isNaN(parsedStdTtl) ? 3600 : parsedStdTtl;

// First-level cache: in-process, keyed by page URL.
const cache = new NodeCache({ stdTTL: stdTtlSeconds });

// Second-level cache: MongoDB, shared by every helper in this module.
const mongoClient = new MongoClient(process.env.MONGODB_URI || 'mongodb://localhost:27017');
const dbName = 'pageCache';
const collectionName = 'pages';
|
||||
|
||||
/**
 * Connects the shared Mongo client and returns a handle to the page-cache database.
 *
 * `connect()` is invoked on every call, exactly as before.
 */
async function connectToMongo() {
  await mongoClient.connect();
  const database = mongoClient.db(dbName);
  return database;
}
|
||||
|
||||
/**
 * Creates a TTL index on `updatedAt` for the pages collection.
 *
 * The lifetime comes from EXPIRE_AFTER_SECONDS (default 9000). The value is parsed with an
 * explicit radix, and a non-numeric or negative value falls back to the default instead of
 * being sent to MongoDB. Failures are logged and never thrown, so this is safe to call
 * without awaiting.
 */
const createTTLIndex = async (): Promise<void> => {
  const defaultExpirySeconds = 9000;
  const parsedExpiry = Number.parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000', 10);
  const isValidExpiry = !Number.isNaN(parsedExpiry) && parsedExpiry >= 0;
  if (!isValidExpiry) {
    console.warn(
      `Invalid EXPIRE_AFTER_SECONDS value, falling back to ${defaultExpirySeconds} seconds`
    );
  }
  const expireAfterSeconds = isValidExpiry ? parsedExpiry : defaultExpirySeconds;

  try {
    const db = await connectToMongo();
    await db.collection(collectionName).createIndex({ "updatedAt": 1 }, { expireAfterSeconds });
    console.log("TTL index created successfully");
  } catch (error) {
    console.error("Error creating TTL index:", error);
  }
};
|
||||
|
||||
/**
 * Returns the hex-encoded MD5 digest of `content`.
 */
function getPageHash(content: string) {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}
|
||||
|
||||
/**
 * Looks up a cached page by URL.
 *
 * The in-memory cache is checked first. On a miss the MongoDB collection is queried, and a
 * document found there is copied into the in-memory cache before being returned.
 *
 * @param url Page URL used as the cache key.
 * @returns The cached entry, or the result of `findOne` (null when nothing is stored).
 * @throws Re-throws any error raised while querying MongoDB, after logging it.
 */
export const getCachedPage = async (url: string) => {
  const fromMemory = cache.get(url);
  if (fromMemory) {
    return fromMemory;
  }

  try {
    const pages = (await connectToMongo()).collection(collectionName);
    const stored = await pages.findOne({ url });
    if (stored) {
      cache.set(url, stored);
    }
    return stored;
  } catch (error) {
    console.error('Error getting cached page:', error);
    throw error;
  }
};
|
||||
|
||||
/**
 * Stores a page in both cache levels.
 *
 * The in-memory cache is written first, then the entry is upserted into MongoDB keyed by
 * URL. If the MongoDB write fails the error is logged and re-thrown; the in-memory entry
 * has already been written at that point.
 *
 * @param url Page URL used as the cache key.
 * @param content Page content to store; its MD5 hash is saved alongside it.
 */
async function savePageToCache(url: string, content: string) {
  const record = { url, content, hash: getPageHash(content), updatedAt: new Date() };

  // Refresh the in-memory cache.
  cache.set(url, record);

  try {
    const pages = (await connectToMongo()).collection(collectionName);
    // Refresh the persistent cache.
    await pages.updateOne({ url }, { $set: record }, { upsert: true });
  } catch (error) {
    console.error('Error saving page to cache:', error);
    throw error;
  }
}
|
||||
|
||||
/**
 * Public entry point for writing a page into the cache; delegates to `savePageToCache`.
 *
 * @param url Page URL used as the cache key.
 * @param content Page content to store.
 * @returns A promise that settles when the cache write has finished (rejects if it fails).
 */
export const updateCacheAsync = async (url: string, content: string) =>
  savePageToCache(url, content);
|
||||
|
||||
// On Ctrl-C, close the MongoDB connection and then exit. The exit must happen even if
// close() rejects: registering a SIGINT listener disables Node's default exit, so a
// rejection here would otherwise surface as an unhandled rejection and skip process.exit.
process.on('SIGINT', async () => {
  let exitCode = 0;
  try {
    await mongoClient.close();
  } catch (error) {
    console.error('Error closing MongoDB connection:', error);
    exitCode = 1;
  }
  process.exit(exitCode);
});
|
||||
|
||||
// Create the TTL index when the application starts. The call is intentionally not awaited;
// createTTLIndex logs its own failures.
void createTTLIndex();
|
||||
@@ -0,0 +1,140 @@
|
||||
import { Cluster } from 'puppeteer-cluster';
|
||||
import * as cheerio from 'cheerio';
|
||||
import UserAgent from 'user-agents';
|
||||
import { setupPage } from './setupPage';
|
||||
import { getCachedPage, updateCacheAsync } from './cacheUpdater';
|
||||
import { handleSpecialWebsite } from '../specialHandlers';
|
||||
import fetch from 'node-fetch';
|
||||
|
||||
/** Shape this module expects from `getCachedPage` for a cache hit. */
interface CachedPage {
  /** URL the entry was stored under. */
  url: string;
  /** Stored page content (copied onto the search result on a cache hit). */
  content: string;
  /** Hash stored with the content. */
  hash: string;
  /** Time of the last write of this entry. */
  updatedAt: Date;
}
|
||||
|
||||
/** Turns any thrown value into a printable message. */
const describeCrawlError = (error: unknown): string =>
  error instanceof Error ? error.message : String(error);

/** Flags the entry for `url` as failed, keeping its other fields (e.g. `score`); creates a minimal entry if none exists. */
const markCrawlFailed = (results: Map<string, any>, url: string, error: unknown): void => {
  const message = describeCrawlError(error);
  const entry = results.get(url);
  if (entry) {
    entry.error = message;
    entry.crawlStatus = 'Failed';
  } else {
    results.set(url, { url, error: message, crawlStatus: 'Failed' });
  }
};

/** Stores crawled content on the existing entry for `url`; does nothing when there is no entry. */
const markCrawlSucceeded = (results: Map<string, any>, url: string, content: unknown): void => {
  const entry = results.get(url);
  if (entry) {
    entry.content = content;
    entry.crawlStatus = 'Success';
  }
};

/** Writes a page to the cache, logging (not propagating) failures so a cache outage cannot fail a crawl. */
const writeCacheSafely = async (url: string, content: string): Promise<void> => {
  try {
    await updateCacheAsync(url, content);
  } catch (error) {
    console.error(`写入页面 ${url} 缓存时发生错误:`, error);
  }
};

/**
 * Fetches page content for search results and attaches it to the matching `results` entries.
 *
 * For each queued URL the task tries, in order:
 *   1. the page cache (`getCachedPage`);
 *   2. a plain HTTP fetch with browser-like headers;
 *   3. a Puppeteer navigation, trying each entry of `strategies` until one loads.
 * Content obtained in step 2 or 3 is written back to the cache on a best-effort basis:
 * cache read/write errors are logged but never change a URL's crawl status.
 *
 * @param clusterInstance Cluster used to run the tasks. It is idled and CLOSED before returning.
 * @param resultUrls URLs to crawl; at most `pageCount + 10` are queued.
 * @param results Map from URL to result object; entries are mutated in place with
 *   `content` / `error` and `crawlStatus` ('Success' | 'Failed').
 * @param strategies Navigation options; each is read for `waitUntil` and `timeout`.
 * @param detectWebsites URL substrings for which `setupPage` is applied instead of only
 *   setting a user agent.
 * @param pageCount Requested number of pages (see `resultUrls`).
 * @returns All values of `results`, sorted by descending `score` (a missing score counts as 0).
 */
export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: string[], results: Map<string, any>, strategies: any[], detectWebsites: string[], pageCount: number) => {
  const tasks: Promise<unknown>[] = [];

  // Same desktop user agent settings are used for the plain fetch and the browser fallback.
  const desktopUserAgent = (): string =>
    new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString();

  await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
    // Step 1: cache lookup. A cache failure is logged and the crawl continues live.
    try {
      const cachedPage = (await getCachedPage(searchUrl)) as CachedPage | null;
      if (cachedPage) {
        markCrawlSucceeded(results, searchUrl, cachedPage.content);
        return;
      }
    } catch (error) {
      console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
    }

    // Step 2: plain HTTP fetch. Any failure here falls through to the browser.
    try {
      const response = await fetch(searchUrl, {
        headers: {
          'User-Agent': desktopUserAgent(),
          'Referer': 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Connection': 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }

      const html = await response.text();
      const bodyHtml = cheerio.load(html)('body').html() || '';

      markCrawlSucceeded(results, searchUrl, bodyHtml);
      await writeCacheSafely(searchUrl, bodyHtml);
      return;
    } catch (error) {
      console.error(`快速抓取页面 ${searchUrl} 时发生错误:`, error);
    }

    // Step 3a: prepare the browser page. Setup problems are logged, navigation is still tried.
    try {
      if (detectWebsites.some(website => searchUrl.includes(website))) {
        await setupPage(page);
      } else {
        await page.setUserAgent(desktopUserAgent());
      }
    } catch (error) {
      console.error(`访问页面 ${searchUrl} 设置用户代理时发生错误:`, error);
    }

    // Step 3b: navigate. Timeouts move on to the next strategy; any other error aborts.
    let pageLoaded = false;
    let pageLoadError: unknown = null;
    for (const strategy of strategies) {
      try {
        await page.goto(searchUrl, { waitUntil: strategy.waitUntil, timeout: strategy.timeout });
        pageLoaded = true;
        break;
      } catch (error) {
        pageLoadError = error;
        if (error instanceof Error && error.name === 'TimeoutError') {
          continue;
        }
        // Record the failure before rethrowing so the result is not left without a status;
        // the rethrow keeps the error visible to the cluster as before.
        markCrawlFailed(results, searchUrl, error);
        throw error;
      }
    }

    if (!pageLoaded) {
      markCrawlFailed(
        results,
        searchUrl,
        pageLoadError || new Error('Page failed to load: no navigation strategy succeeded')
      );
      return;
    }

    // Step 3c: extract content from the loaded page.
    try {
      let cleanedContent = await handleSpecialWebsite(page, searchUrl);
      if (!cleanedContent) {
        const content = await page.content();
        const $ = cheerio.load(content);
        cleanedContent = $('body').html() || '';
      }

      markCrawlSucceeded(results, searchUrl, cleanedContent);
      await writeCacheSafely(searchUrl, cleanedContent || '');
    } catch (error) {
      markCrawlFailed(results, searchUrl, error);
    } finally {
      await page.close().catch(() => {});
    }
  });

  for (const url of resultUrls) {
    if (tasks.length >= pageCount + 10) {
      break;
    }
    tasks.push(clusterInstance.queue({ searchUrl: url }));
  }

  await Promise.all(tasks);

  await clusterInstance.idle();
  await clusterInstance.close();

  // A missing score is treated as 0 so the comparator never returns NaN.
  return Array.from(results.values()).sort((a, b) => (b.score || 0) - (a.score || 0));
};
|
||||
@@ -0,0 +1,88 @@
|
||||
import { Page } from 'puppeteer';
|
||||
import randomUseragent from 'random-useragent';
|
||||
|
||||
/** Returns a random user-agent string from the `random-useragent` package. */
function getRandomUserAgent() {
  const userAgent = randomUseragent.getRandom();
  return userAgent;
}
|
||||
|
||||
/** Picks one of three fixed `navigator.platform` values uniformly at random. */
function getRandomPlatform() {
  const candidates = ["Win32", "MacIntel", "Linux x86_64"];
  const pick = Math.floor(Math.random() * candidates.length);
  return candidates[pick];
}
|
||||
|
||||
/** Address of one proxy in the pool. */
interface ProxyEndpoint {
  ip: string;
  port: number;
}

// Proxy pool: hard-coded list of proxy endpoints that getRandomProxy draws from.
const validateproxy: ProxyEndpoint[] = [
  { ip: "39.102.210.222", port: 8080 },
  { ip: "8.130.71.75", port: 8080 },
  { ip: "39.102.214.208", port: 9999 },
  { ip: "39.104.59.56", port: 8080 },
  { ip: "8.130.37.235", port: 3128 },
  { ip: "8.138.131.110", port: 8080 },
  { ip: "8.140.105.75", port: 8009 },
  { ip: "114.80.38.120", port: 3081 },
  { ip: "8.148.23.165", port: 8081 },
  { ip: "119.96.72.199", port: 59394 },
  { ip: "120.55.14.137", port: 80 },
  { ip: "47.116.181.146", port: 5060 },
  { ip: "39.102.214.199", port: 3128 },
  { ip: "47.121.183.107", port: 8080 },
  { ip: "39.104.16.201", port: 8080 },
  { ip: "39.102.209.163", port: 10002 },
  { ip: "101.201.76.157", port: 9090 },
  { ip: "122.224.124.26", port: 12080 },
  { ip: "180.105.244.199", port: 1080 },
  { ip: "119.3.113.150", port: 9094 }
];
|
||||
|
||||
/** Picks one entry of the `validateproxy` pool uniformly at random. */
function getRandomProxy() {
  const pick = Math.floor(Math.random() * validateproxy.length);
  return validateproxy[pick];
}
|
||||
|
||||
/** Picks one of three fixed `navigator.languages` lists uniformly at random. */
function getRandomLanguages() {
  const languageSets = [
    ["zh-CN", "zh", "en"],
    ["en-US", "en", "fr"],
    ["es-ES", "es", "en"]
  ];
  const pick = Math.floor(Math.random() * languageSets.length);
  return languageSets[pick];
}
|
||||
|
||||
/**
 * Prepares a Puppeteer page for sites that run automation detection.
 *
 * Registers a script that runs before every document's own scripts and:
 *   - removes `navigator.webdriver` from the navigator prototype;
 *   - installs a stub `window.chrome` object;
 *   - overrides `navigator.userAgent`, `platform`, `plugins` and `languages`;
 *   - answers `permissions.query({ name: 'notifications' })` from `Notification.permission`.
 *
 * The random user agent, platform and languages are chosen once per call, on the Node side,
 * and passed into the page as arguments. The injected function is serialized and executed in
 * the browser, where this module's helpers do not exist, so calling them from inside it would
 * throw a ReferenceError whenever a site read the spoofed property.
 *
 * @param page Page to configure; must be called before navigation for the script to apply.
 */
export const setupPage = async (page: Page): Promise<void> => {
  const proxy = getRandomProxy();
  // NOTE(review): page.authenticate() only supplies credentials for HTTP authentication
  // challenges; it does not route traffic through `proxy`, and an IP/port pair is not a
  // username/password. Kept unchanged for backward compatibility; using the proxy pool
  // needs a launch-level proxy setting (e.g. --proxy-server). Confirm intent with the author.
  await page.authenticate({
    username: proxy.ip,
    password: proxy.port.toString()
  });

  // Resolved here (Node side) and handed to the browser as serializable arguments.
  const fingerprint = {
    userAgent: getRandomUserAgent(),
    platform: getRandomPlatform(),
    languages: getRandomLanguages()
  };

  await page.evaluateOnNewDocument((spoofed: typeof fingerprint) => {
    const newProto = (navigator as any).__proto__;
    delete newProto.webdriver;
    (navigator as any).__proto__ = newProto;

    (window as any).chrome = {};
    (window as any).chrome.app = {"InstallState":"testt", "RunningState":"estt", "getDetails":"stte", "getIsInstalled":"ttes"};
    (window as any).chrome.csi = function(){};
    (window as any).chrome.loadTimes = function(){};
    (window as any).chrome.runtime = function(){};

    Object.defineProperty(navigator, 'userAgent', {
      get: () => spoofed.userAgent,
    });
    Object.defineProperty(navigator, 'platform', {
      get: () => spoofed.platform,
    });
    Object.defineProperty(navigator, 'plugins', {
      get: () => [{"description": "Shockwave Flash",
                   "filename": "pepflashplayer.dll",
                   "length": 1,
                   "name": "Shockwave Flash"}]
    });
    Object.defineProperty(navigator, 'languages', {
      get: () => spoofed.languages,
    });

    const permissions = window.navigator.permissions as any;
    if (permissions && typeof permissions.query === 'function') {
      // Bind to the Permissions object: calling the native method detached from its
      // receiver throws "Illegal invocation".
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters: any) => (
        parameters.name === 'notifications' ?
          Promise.resolve({ state: Notification.permission } as PermissionStatus) :
          originalQuery(parameters)
      );
    }
  }, fingerprint);
};
|
||||
Reference in New Issue
Block a user