This commit is contained in:
gggaaallleee
2025-02-28 19:00:58 +08:00
committed by GitHub
parent cf0aaa1091
commit f7b2a57ca3
29 changed files with 7469 additions and 0 deletions

View File

@@ -0,0 +1,60 @@
import { Request, Response } from 'express';
import fetch from 'node-fetch';
import dotenv from 'dotenv';
dotenv.config();
/**
 * Pool of desktop browser User-Agent strings. One entry is picked at random
 * for every outgoing quick-fetch request.
 */
const userAgents: string[] = [
  // Chrome on Windows 10
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
  // Safari on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
  // Firefox on Windows 10
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
];
/**
 * GET handler that downloads a remote page over plain HTTP (no browser) and
 * returns its raw body as text.
 *
 * Query parameters:
 * - `url` — absolute http(s) URL to download.
 *
 * Responses:
 * - 400 `MISSING_PARAM` when `url` is absent.
 * - 400 `INVALID_PARAM` when `url` is not a single, absolute http(s) URL.
 * - 200 `{ data: { content } }` with the upstream body.
 * - 500 `INTERNAL_SERVER_ERROR` when the upstream request fails or is not 2xx.
 */
export const quickFetch = async (req: Request, res: Response): Promise<void> => {
  const { url } = req.query;
  if (!url) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: url"
      }
    });
    return;
  }

  // Express yields arrays/objects for repeated or bracketed parameters, and
  // the value is caller-controlled: accept only one well-formed http(s) URL
  // so bad input is reported as a client error instead of a generic 500.
  const rawUrl = typeof url === 'string' ? url : '';
  let parsedUrl: URL | null = null;
  try {
    parsedUrl = rawUrl ? new URL(rawUrl) : null;
  } catch {
    parsedUrl = null;
  }
  if (!parsedUrl || (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:')) {
    res.status(400).json({
      status: 400,
      error: {
        code: "INVALID_PARAM",
        message: "参数 url 无效: 必须是有效的 http(s) 地址"
      }
    });
    return;
  }

  try {
    const response = await fetch(rawUrl, {
      headers: {
        // Rotate the User-Agent so consecutive requests do not look identical.
        'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    const data = await response.text();
    res.status(200).json({
      status: 200,
      data: {
        content: data
      }
    });
  } catch (error) {
    console.error('Error fetching the page:', error);
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "发生错误"
      }
    });
  }
};
export default { quickFetch };

View File

@@ -0,0 +1,142 @@
import { Request, Response } from 'express';
import puppeteer, { Page } from 'puppeteer';
import * as cheerio from 'cheerio';
import UserAgent from 'user-agents';
import { setupPage } from '../utils/setupPage'; // 导入 setupPage 模块
import dotenv from 'dotenv'; // 导入 dotenv 模块
import { URL } from 'url'; // 导入 URL 模块
import { handleSpecialWebsite } from '../specialHandlers'; // 导入 handleSpecialWebsite 模块
import fetch from 'node-fetch';
import { getCachedPage, updateCacheAsync } from '../utils/cacheUpdater'; // 导入缓存相关模块
dotenv.config(); // 加载环境变量
/**
 * Splits a comma-separated environment value into trimmed, non-empty entries.
 * An unset or empty variable yields `[]`; a plain `''.split(',')` would yield
 * `['']`, and `anyString.includes('')` is always true.
 */
const parseCsvEnv = (raw: string | undefined): string[] =>
  (raw ?? '')
    .split(',')
    .map(entry => entry.trim())
    .filter(entry => entry.length > 0);

/**
 * Parses a JSON array of domain suffixes from an environment value.
 * Malformed JSON still throws at startup (fail fast, as before). A value that
 * parses but is not an array is rejected explicitly instead of breaking every
 * request later. Empty and non-string entries are dropped because
 * `host.endsWith('')` would blacklist every domain.
 */
const parseDomainListEnv = (raw: string | undefined): string[] => {
  if (!raw) {
    return [];
  }
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error('BLACKLIST must be a JSON array of domain strings');
  }
  return parsed
    .filter((entry): entry is string => typeof entry === 'string')
    .map(entry => entry.trim())
    .filter(entry => entry.length > 0);
};

// Sites whose pages need the stealth page setup before navigation.
const detectWebsites = parseCsvEnv(process.env.DETECT_WEBSITES);
// Domain suffixes this service refuses to read.
const blacklistDomains: string[] = parseDomainListEnv(process.env.BLACKLIST);
/**
 * Renders `target` in headless Chromium and extracts the page title and HTML.
 * The browser is always closed, including when navigation or extraction throws.
 *
 * @param target Parsed URL to navigate to.
 * @param rawUrl The URL exactly as supplied by the caller (used for site matching).
 * @returns The document title and either the site-specific content or the `<body>` HTML.
 */
const renderWithBrowser = async (
  target: URL,
  rawUrl: string
): Promise<{ title: string; content: string | null }> => {
  const browser = await puppeteer.launch({
    ignoreDefaultArgs: ["--enable-automation"],
    headless: true,
    executablePath: "/usr/bin/chromium", // explicit Chromium path
    pipe: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      // '--single-process'
    ]
  });
  try {
    const page = await browser.newPage();
    // Sites listed in DETECT_WEBSITES get the full stealth setup; everything
    // else only gets a desktop User-Agent.
    if (detectWebsites.some(website => rawUrl.includes(website))) {
      await setupPage(page);
    } else {
      const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
      await page.setUserAgent(userAgent.toString());
    }
    await page.goto(target.toString(), { waitUntil: 'load' });
    await page.waitForSelector('body');
    const title = await page.title();
    let content = await handleSpecialWebsite(page, rawUrl);
    if (!content) {
      const $ = cheerio.load(await page.content());
      content = $('body').html();
    }
    await page.close();
    return { title, content };
  } finally {
    // Never leak a Chromium process, whatever happened above.
    await browser.close().catch(closeError => {
      console.error('Error closing browser:', closeError);
    });
  }
};

/**
 * GET handler that returns the title and `<body>` HTML of `queryUrl`.
 *
 * It first tries a plain HTTP fetch; if that fails or is not 2xx it falls back
 * to rendering the page in headless Chromium. Successful reads are written to
 * the page cache on a best-effort basis after the response has been sent.
 *
 * Responses:
 * - 400 `MISSING_PARAM` when `queryUrl` is absent.
 * - 400 `INVALID_PARAM` when `queryUrl` is not a single absolute http(s) URL.
 * - 403 `BLACKLISTED_DOMAIN` when the host ends with a blacklisted suffix.
 * - 200 `{ data: { title, content } }` on success.
 * - 500 `INTERNAL_SERVER_ERROR` when both strategies fail.
 */
export const readPage = async (req: Request, res: Response): Promise<void> => {
  const { queryUrl } = req.query;
  console.log("-------");
  console.log(queryUrl);
  console.log("-------");
  if (!queryUrl) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: queryUrl"
      }
    });
    return;
  }

  // The value is caller-controlled. Parse it inside a try (new URL throws on
  // malformed input) and allow only http(s): the browser fallback would
  // otherwise happily open file:// and other local schemes.
  const rawUrl = typeof queryUrl === 'string' ? queryUrl : '';
  let target: URL | null = null;
  try {
    target = rawUrl ? new URL(rawUrl) : null;
  } catch {
    target = null;
  }
  if (!target || (target.protocol !== 'http:' && target.protocol !== 'https:')) {
    res.status(400).json({
      status: 400,
      error: {
        code: "INVALID_PARAM",
        message: "参数 queryUrl 无效: 必须是有效的 http(s) 地址"
      }
    });
    return;
  }

  const urlDomain = target.hostname;
  if (blacklistDomains.some((domain: string) => urlDomain.endsWith(domain))) {
    res.status(403).json({
      status: 403,
      error: {
        code: "BLACKLISTED_DOMAIN",
        message: "该域名受到保护中"
      }
    });
    return;
  }

  // Cache writes happen after the response is sent; a cache failure must not
  // trigger a second response or a pointless browser fallback.
  const cacheBestEffort = async (content: string | null): Promise<void> => {
    try {
      await updateCacheAsync(rawUrl, content || '');
    } catch (cacheError) {
      console.error('Error updating page cache:', cacheError);
    }
  };

  // Strategy 1: plain HTTP fetch.
  try {
    const response = await fetch(rawUrl, {
      headers: {
        'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    const content = await response.text();
    const $ = cheerio.load(content);
    const cleanedContent = $('body').html();
    res.status(200).json({
      status: 200,
      data: {
        title: $('title').text(),
        content: cleanedContent
      }
    });
    await cacheBestEffort(cleanedContent);
    console.log("Page read successfully");
    return;
  } catch (error) {
    console.error('快速抓取页面时发生错误:', error);
  }
  if (res.headersSent) {
    // The quick path already answered; nothing left to do.
    return;
  }

  // Strategy 2: render the page in headless Chromium.
  try {
    const { title, content: cleanedContent } = await renderWithBrowser(target, rawUrl);
    res.status(200).json({
      status: 200,
      data: {
        title,
        content: cleanedContent
      }
    });
    await cacheBestEffort(cleanedContent);
    console.log("Page read successfully");
  } catch (error) {
    console.error(error);
    if (!res.headersSent) {
      res.status(500).json({
        status: 500,
        error: {
          code: "INTERNAL_SERVER_ERROR",
          message: "读取页面时发生内部服务器错误"
        }
      });
    }
  }
};

View File

@@ -0,0 +1,114 @@
import { Request, Response } from 'express';
import { Cluster } from 'puppeteer-cluster';
import dotenv from 'dotenv';
import { performDeepSearch } from '../utils/deepSearch';
import { fetchSearchResults as fetchBaiduResults } from '../engines/baiduEngine';
import { fetchSearchResults as fetchSearchxngResults } from '../engines/searchxngEngine';
dotenv.config();
/**
 * Parses the STRATEGIES environment value: a JSON array of page-load
 * strategies. Unset means no strategies. Malformed JSON throws at startup as
 * before; a non-array value is rejected explicitly because it is later
 * iterated with `for...of`.
 */
const parseStrategiesEnv = (raw: string | undefined): any[] => {
  const parsed: unknown = JSON.parse(raw || '[]');
  if (!Array.isArray(parsed)) {
    throw new Error('STRATEGIES must be a JSON array');
  }
  return parsed;
};

/**
 * Splits a comma-separated environment value into trimmed, non-empty entries.
 * A plain `''.split(',')` yields `['']`, and `url.includes('')` is always
 * true, which would flag every site as needing special handling.
 */
const parseCsvEnv = (raw: string | undefined): string[] =>
  (raw ?? '')
    .split(',')
    .map(entry => entry.trim())
    .filter(entry => entry.length > 0);

/** Parses a positive integer, returning `fallback` for unset, NaN or non-positive values. */
const parsePositiveIntEnv = (raw: string | undefined, fallback: number): number => {
  const value = parseInt(raw ?? '', 10);
  return Number.isInteger(value) && value > 0 ? value : fallback;
};

const strategies = parseStrategiesEnv(process.env.STRATEGIES);
const detectWebsites = parseCsvEnv(process.env.DETECT_WEBSITES);
const maxConcurrency = parsePositiveIntEnv(process.env.MAX_CONCURRENCY, 10);
/**
 * GET handler that runs a web search and optionally crawls every hit.
 *
 * Query parameters:
 * - `query` (required) — search terms.
 * - `pageCount` (default 10) — number of results wanted; must be a positive integer.
 * - `needDetails` (default 'false') — 'true' to crawl each result page.
 * - `engine` (default 'baidu') — 'baidu' or 'searchxng'.
 * - `categories` (default 'general') — passed through to the engine.
 *
 * Responses:
 * - 400 `MISSING_PARAM` / `INVALID_PARAM` / `INVALID_ENGINE` for bad input.
 * - 200 `{ data: { results } }` on success (possibly an empty list).
 * - 500 `INTERNAL_SERVER_ERROR` on any unexpected failure.
 */
export const search = async (req: Request, res: Response): Promise<void> => {
  const { query, pageCount = 10, needDetails = 'false', engine = 'baidu', categories = 'general' } = req.query;
  const needDetailsBool = (needDetails === 'true');
  if (!query) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: query"
      }
    });
    return;
  }

  // Number('abc') is NaN, which would silently flow through the engines and
  // produce an empty result; report it as a client error instead.
  const requestedCount = Number(pageCount);
  if (!Number.isInteger(requestedCount) || requestedCount <= 0) {
    res.status(400).json({
      status: 400,
      error: {
        code: "INVALID_PARAM",
        message: "参数 pageCount 必须是正整数"
      }
    });
    return;
  }

  let fetchSearchResults;
  let searchUrlBase;
  try {
    if (engine === 'baidu') {
      fetchSearchResults = fetchBaiduResults;
      searchUrlBase = process.env.ENGINE_BAIDUURL;
    } else if (engine === 'searchxng') {
      fetchSearchResults = fetchSearchxngResults;
      searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
    } else {
      res.status(400).json({
        status: 400,
        error: {
          code: "INVALID_ENGINE",
          message: "无效的搜索引擎"
        }
      });
      return;
    }

    const { resultUrls, results } = await fetchSearchResults(query as string, requestedCount, searchUrlBase || '', categories as string);

    // Nothing found: answer with an empty list.
    if (results.size === 0) {
      console.log('No results found');
      res.status(200).json({
        status: 200,
        data: {
          results: []
        }
      });
      return;
    }

    if (!needDetailsBool) {
      console.log('Need details is false');
      // No crawl will happen, so nothing is left pending.
      results.forEach((value: any) => {
        if (value.crawlStatus === 'Pending') {
          value.crawlStatus = 'Success';
        }
      });
      res.status(200).json({
        status: 200,
        data: {
          results: Array.from(results.values())
        }
      });
    } else {
      console.log('Need details is true');
      const clusterInstance = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_CONTEXT,
        maxConcurrency: maxConcurrency,
        puppeteerOptions: {
          ignoreDefaultArgs: ["--enable-automation"],
          headless: true,
          executablePath: "/usr/bin/chromium", // explicit Chromium path
          pipe: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
          ]
        }
      });
      let sortedResults;
      try {
        // performDeepSearch closes the cluster itself once it completes.
        sortedResults = await performDeepSearch(clusterInstance, resultUrls, results, strategies, detectWebsites, requestedCount);
      } catch (deepSearchError) {
        // Make sure a failed deep search does not leave Chromium running.
        await clusterInstance.close().catch(() => {});
        throw deepSearchError;
      }
      res.status(200).json({
        status: 200,
        data: {
          results: sortedResults.slice(0, requestedCount)
        }
      });
    }
  } catch (error) {
    // Log the cause; the client only receives a generic message.
    console.error('Error handling search request:', error);
    if (!res.headersSent) {
      res.status(500).json({
        status: 500,
        error: {
          code: "INTERNAL_SERVER_ERROR",
          message: "发生错误"
        }
      });
    }
  }
};
export default { search };

View File

@@ -0,0 +1,204 @@
import { URL } from 'url';
import { JSDOM } from 'jsdom';
import puppeteer from 'puppeteer';
import { setupPage } from '../utils/setupPage';
import { Cluster } from 'puppeteer-cluster';
/**
 * Pauses for a randomly chosen number of milliseconds.
 * For integer bounds the delay is drawn from the inclusive range [min, max].
 */
async function randomWait(min: number, max: number) {
  const span = max - min + 1;
  const pauseMs = Math.floor(Math.random() * span) + min;
  return new Promise(done => {
    setTimeout(done, pauseMs);
  });
}
/**
 * Scrapes Baidu result pages with headless Chromium and resolves each Baidu
 * redirect link to the real target URL.
 *
 * Steps:
 * 1. Load `ceil(pageCount / 10)` result pages, retrying up to 5 times per page
 *    (Baidu's security-verification page counts as a failed attempt).
 * 2. Resolve every redirect link with a plain HTTP request.
 * 3. Retry the links that failed in step 2 inside a puppeteer-cluster
 *    (up to 3 attempts each). The cluster is started only when needed.
 *
 * @param query Search terms.
 * @param pageCount Number of results wanted.
 * @param searchUrlBase Baidu search endpoint; when empty, no search is performed.
 * @param categories Unused by this engine; kept for signature parity with the other engine.
 * @returns `resultUrls` (at most `pageCount` entries) and a `results` map keyed by URL.
 */
export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {
  console.log(`Fetching Baidu search results for query: ${query}`);
  // No endpoint configured: return an empty result.
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();
  const pagesToFetch = Math.ceil(pageCount / 10);

  const browser = await puppeteer.launch({
    ignoreDefaultArgs: ["--enable-automation"],
    headless: true,
    executablePath: "/usr/bin/chromium", // explicit Chromium path
    pipe: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      // '--single-process'
    ]
  });
  try {
    const page = await browser.newPage();
    await setupPage(page);
    for (let i = 0; i < pagesToFetch; i++) {
      const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
      console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);
      let retryCount = 0;
      let success = false;
      while (retryCount < 5 && !success) {
        try {
          const loadLabel = `Page Load Time for page ${i + 1}`;
          console.time(loadLabel);
          try {
            await page.goto(searchUrl.toString(), { waitUntil: 'load' });
          } finally {
            // Always end the timer so a retry does not reuse a live label.
            console.timeEnd(loadLabel);
          }
          const content = await page.content();
          const dom = new JSDOM(content);
          const document = dom.window.document;
          console.log(document.title);
          // Baidu security-verification page: re-apply the page setup, wait, retry.
          if (document.title.includes('百度安全验证')) {
            console.log('Detected Baidu security verification, retrying...');
            await setupPage(page);
            retryCount++;
            // Random pause before retrying.
            await randomWait(1000, 3000);
            continue;
          }
          // Parse the result entries.
          console.time(`Link Retrieval Time for page ${i + 1}`);
          const resultContainers = document.querySelectorAll('.result.c-container');
          for (const result of resultContainers) {
            if (resultUrls.length > pageCount + 5) {
              break;
            }
            const titleElement = result.querySelector('h3 a');
            const title = titleElement ? titleElement.textContent : '';
            const url = titleElement ? titleElement.getAttribute('href') : '';
            const contentElement = result.querySelector('[class^="content"]');
            const snippet = contentElement ? contentElement.textContent : '';
            // Skip links already collected so resultUrls never holds duplicates.
            if (url && !results.has(url)) {
              resultUrls.push(url);
              results.set(url, {
                title,
                url,
                snippet,
                source: 'baidu',
                crawlStatus: 'Pending',
                score: 0
              });
            }
          }
          console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
          success = true;
        } catch (error) {
          console.error(`Error fetching page ${i + 1}:`, error);
          retryCount++;
        }
      }
    }
  } finally {
    // Close Chromium even when scraping throws.
    await browser.close();
  }
  console.log('fetch all fake urls');

  // Fast path: resolve the redirect links with plain HTTP requests.
  const urlsToProcessWithPuppeteer: string[] = [];
  for (const url of resultUrls) {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
          'Referer': 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Connection': 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });
      if (response.ok) {
        const realUrl = response.url;
        console.log('realurl:', realUrl);
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      console.error(`Error fetching original URL for ${url}:`, error);
      urlsToProcessWithPuppeteer.push(url);
    }
  }
  console.log('pass quickfetch');

  // Slow path: resolve the remaining links concurrently in real browser pages.
  let failedUrlCount = 0;
  if (urlsToProcessWithPuppeteer.length > 0) {
    const cluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_CONTEXT,
      maxConcurrency: 10,
      puppeteerOptions: {
        ignoreDefaultArgs: ["--enable-automation"],
        headless: true,
        executablePath: "/usr/bin/chromium", // explicit Chromium path
        pipe: true,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
        ]
      }
    });
    try {
      await cluster.task(async ({ page, data: url }) => {
        let retryUrlCount = 0;
        let urlSuccess = false;
        while (retryUrlCount < 3 && !urlSuccess) {
          console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
          try {
            await page.goto(url, { waitUntil: 'load' });
            // Guard against the page having been closed during navigation.
            if (page.isClosed()) {
              throw new Error('Page has been closed');
            }
            const realUrl = page.url(); // URL after all redirects
            const result = results.get(url);
            if (result) {
              result.url = realUrl;
              result.crawlStatus = 'Success';
            }
            urlSuccess = true;
          } catch (error) {
            console.error(`Error fetching original URL, retrying...`, error);
            retryUrlCount++;
            await randomWait(1000, 3000);
          }
        }
        if (!urlSuccess) {
          failedUrlCount++;
        }
      });
      for (const url of urlsToProcessWithPuppeteer) {
        void cluster.queue(url);
      }
      await cluster.idle();
    } finally {
      // Close the cluster even when a step above throws.
      await cluster.close();
    }
  }
  console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);

  // Keep the first pageCount results and re-key the map by the resolved URL.
  const filteredResults = Array.from(results.values()).slice(0, pageCount);
  return { resultUrls: filteredResults.map(result => result.url), results: new Map(filteredResults.map(result => [result.url, result])) };
};

View File

@@ -0,0 +1,55 @@
import axios from 'axios';
import { URL } from 'url';
import dotenv from 'dotenv';
dotenv.config();
/**
 * Parses a JSON array of domain suffixes from an environment value.
 * Malformed JSON still throws at startup (fail fast, as before). A value that
 * parses but is not an array is rejected explicitly instead of breaking every
 * search later. Empty and non-string entries are dropped because
 * `host.endsWith('')` would filter out every result.
 */
const parseDomainListEnv = (raw: string | undefined): string[] => {
  if (!raw) {
    return [];
  }
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error('BLACKLIST must be a JSON array of domain strings');
  }
  return parsed
    .filter((entry): entry is string => typeof entry === 'string')
    .map(entry => entry.trim())
    .filter(entry => entry.length > 0);
};

// Domain suffixes whose results are dropped from search output.
const blacklistDomains: string[] = parseDomainListEnv(process.env.BLACKLIST);
/**
 * Queries a SearXNG instance through its JSON API and collects up to
 * `pageCount` unique, non-blacklisted results.
 *
 * @param query Search terms.
 * @param pageCount Number of results wanted.
 * @param searchUrlBase SearXNG search endpoint; when empty, no search is performed.
 * @param categories SearXNG categories value, sent as the `categories` parameter.
 * @returns `resultUrls` in discovery order and a `results` map keyed by URL.
 */
export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {
  const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1; // upper bound on result pages to request
  // No endpoint configured: return an empty result.
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();
  let pageIndex = 0;
  // results.size is the number of unique hits collected so far.
  while (results.size < pageCount && pageIndex < MAX_PAGES) {
    // categories is caller-supplied, so encode it like the query to keep
    // '&' or '#' from altering the request.
    const searchUrl = new URL(`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${encodeURIComponent(categories)}`);
    console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);
    const response = await axios.get(searchUrl.toString());
    // Tolerate a response without a results array.
    const jsonResults: any[] = Array.isArray(response.data?.results) ? response.data.results : [];
    for (const result of jsonResults) {
      let resultDomain: string;
      try {
        resultDomain = new URL(result.url).hostname;
      } catch {
        // One malformed entry must not abort the whole search.
        continue;
      }
      if (blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) || resultDomain.includes('zhihu')) {
        continue;
      }
      // The same URL can reappear on later pages; count and queue it once.
      if (results.has(result.url)) {
        continue;
      }
      resultUrls.push(result.url);
      results.set(result.url, {
        title: result.title,
        url: result.url,
        snippet: result.content,
        source: result.engine,
        crawlStatus: 'Pending',
        score: result.score
      });
      if (results.size >= pageCount) {
        break;
      }
    }
    pageIndex++;
    if (jsonResults.length === 0) {
      break; // no more results available
    }
  }
  return { resultUrls, results };
};

View File

@@ -0,0 +1,18 @@
import express, { Application } from 'express';
import bodyParser from 'body-parser';
import searchRoutes from './routes/searchRoutes';
import readRoutes from './routes/readRoutes';
import quickfetchRoutes from './routes/quickfetchRoutes';
import dotenv from 'dotenv';
dotenv.config();
// Create the Express application and parse JSON request bodies.
const app: Application = express();
app.use(bodyParser.json());

// Every feature router is mounted under the same /api prefix, in this order.
for (const router of [searchRoutes, readRoutes, quickfetchRoutes]) {
  app.use('/api', router);
}

// Listen on PORT from the environment, falling back to 3000.
const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
  console.log(`Server running on port ${PORT}`);
});

View File

@@ -0,0 +1,21 @@
import { Request, Response, NextFunction } from 'express';
/**
 * Express middleware that authorises a request by comparing the token in the
 * `Authorization` header (second space-separated part, e.g. `Bearer <token>`)
 * with the `ACCESS_TOKEN` environment variable.
 *
 * - 401 when the header is missing.
 * - 403 when the token is missing, wrong, or no `ACCESS_TOKEN` is configured.
 * - Calls `next()` only on an exact match.
 */
const authMiddleware = (req: Request, res: Response, next: NextFunction) => {
  const bearerHeader = req.headers['authorization'];
  if (!bearerHeader) {
    res.status(401).json({ message: 'Bearer token not found' });
    return;
  }

  // The header carries the secret, so it is deliberately not logged.
  const bearerToken = bearerHeader.split(' ')[1];
  const expectedToken = process.env.ACCESS_TOKEN;

  // Both sides must be non-empty: with ACCESS_TOKEN unset, a header without
  // a token part would otherwise pass as `undefined === undefined`.
  if (expectedToken && bearerToken && bearerToken === expectedToken) {
    next();
    return;
  }
  res.status(403).json({ message: 'Invalid token' });
};
export default authMiddleware;

View File

@@ -0,0 +1,9 @@
import express from 'express';
import { quickFetch } from '../controllers/quickfetchController';
import authMiddleware from '../middleware/authMiddleware';
// Router for the quick-fetch endpoint. authMiddleware runs before the handler.
const quickfetchRoutes = express.Router();
quickfetchRoutes.route('/quickFetch').get(authMiddleware, quickFetch);
export default quickfetchRoutes;

View File

@@ -0,0 +1,9 @@
import express from 'express';
import { readPage } from '../controllers/readController';
import authMiddleware from '../middleware/authMiddleware';
// Router for the page-reading endpoint. authMiddleware runs before the handler.
const readRoutes = express.Router();
readRoutes.route('/read').get(authMiddleware, readPage);
export default readRoutes;

View File

@@ -0,0 +1,9 @@
import express from 'express';
import searchController from '../controllers/searchController';
import authMiddleware from '../middleware/authMiddleware';
// Router for the search endpoint. authMiddleware runs before the handler.
const searchRoutes = express.Router();
searchRoutes.route('/search').get(authMiddleware, searchController.search);
export default searchRoutes;

View File

@@ -0,0 +1,21 @@
import { Page } from 'puppeteer';
/**
 * Extracts the main article HTML for sites that need custom handling.
 *
 * - `blog.csdn.net`: waits for `<article>` and returns its inner HTML.
 * - `zhuanlan.zhihu.com`: returns `null` when Zhihu answered with its
 *   rate-limit error payload; otherwise dismisses the popup if one shows up
 *   and returns the inner HTML of `<article>`.
 *
 * @param page An already-navigated Puppeteer page.
 * @param url The URL that was loaded, used to pick the handler.
 * @returns The article HTML, or `null` when no special handling applies.
 */
export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => {
  if (url.includes('blog.csdn.net')) {
    await page.waitForSelector('article');
    return page.$eval('article', el => el.innerHTML);
  }

  if (url.includes('zhuanlan.zhihu.com')) {
    console.log('是知乎,需要点击按掉!');
    // Read the markup once. The full page is not logged: it floods the logs.
    const html = await page.content();
    if (html.includes('{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}')) {
      return null;
    }

    // The close button only exists when the popup is shown. Its absence is
    // not an error, so wait briefly and carry on either way.
    const closeButton = 'button[aria-label="关闭"]';
    try {
      await page.waitForSelector(closeButton, { timeout: 5000 });
      await page.click(closeButton);
    } catch (error) {
      console.warn('Zhihu close button not found; continuing without dismissing the popup');
    }

    await page.waitForSelector('article');
    return page.$eval('article', el => el.innerHTML);
  }

  // Handlers for further sites can be added here.
  return null;
};

View File

@@ -0,0 +1,77 @@
import NodeCache from 'node-cache';
import { MongoClient } from 'mongodb';
import crypto from 'crypto';
import dotenv from 'dotenv';
dotenv.config();
// In-memory cache; the default entry TTL comes from STD_TTL (fallback '3600').
const memoryTtl = parseInt(process.env.STD_TTL || '3600');
const cache = new NodeCache({ stdTTL: memoryTtl });

// Shared MongoDB client backing the persistent cache.
const mongoUri = process.env.MONGODB_URI || 'mongodb://localhost:27017';
const mongoClient = new MongoClient(mongoUri);

// Database and collection that hold the cached pages.
const dbName = 'pageCache';
const collectionName = 'pages';
/**
 * Awaits `connect()` on the shared client and returns a handle to the
 * database named by `dbName`.
 */
async function connectToMongo() {
  await mongoClient.connect();
  const database = mongoClient.db(dbName);
  return database;
}
/**
 * Creates an index on `updatedAt` with `expireAfterSeconds` taken from
 * EXPIRE_AFTER_SECONDS (fallback '9000'). Failures are logged, never thrown.
 */
async function createTTLIndex() {
  const expireAfterSeconds = parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000');
  try {
    const database = await connectToMongo();
    const pages = database.collection(collectionName);
    await pages.createIndex({ "updatedAt": 1 }, { expireAfterSeconds });
    console.log("TTL index created successfully");
  } catch (error) {
    console.error("Error creating TTL index:", error);
  }
}
/** Returns the hex-encoded MD5 digest of `content`. */
function getPageHash(content: string) {
  const md5 = crypto.createHash('md5');
  md5.update(content);
  return md5.digest('hex');
}
/**
 * Looks a page up by URL: first in the in-memory cache, then in MongoDB.
 * A MongoDB hit is copied into the in-memory cache before being returned.
 * Lookup errors are logged and rethrown.
 */
export async function getCachedPage(url: string) {
  const inMemory = cache.get(url);
  if (inMemory) {
    return inMemory;
  }

  try {
    const database = await connectToMongo();
    const stored = await database.collection(collectionName).findOne({ url });
    if (stored) {
      cache.set(url, stored);
    }
    return stored;
  } catch (error) {
    console.error('Error getting cached page:', error);
    throw error;
  }
}
/**
 * Stores a page in both caches: the in-memory cache first, then an upsert
 * into MongoDB keyed by URL. Persistence errors are logged and rethrown.
 */
async function savePageToCache(url: string, content: string) {
  const page = {
    url,
    content,
    hash: getPageHash(content),
    updatedAt: new Date(),
  };

  // Refresh the in-memory copy.
  cache.set(url, page);

  try {
    const database = await connectToMongo();
    const pages = database.collection(collectionName);
    // Refresh the persistent copy.
    await pages.updateOne({ url }, { $set: page }, { upsert: true });
  } catch (error) {
    console.error('Error saving page to cache:', error);
    throw error;
  }
}
/**
 * Public entry point for writing a page to the cache. Resolves once both the
 * in-memory and the persistent copy are written; rejects if persisting fails.
 */
export async function updateCacheAsync(url: string, content: string) {
  await savePageToCache(url, content);
}
// On SIGINT, close the MongoDB client and then exit with code 0.
async function closeMongoAndExit() {
  await mongoClient.close();
  process.exit(0);
}
process.on('SIGINT', closeMongoAndExit);

// Create the TTL index when this module is loaded.
createTTLIndex();

View File

@@ -0,0 +1,140 @@
import { Cluster } from 'puppeteer-cluster';
import * as cheerio from 'cheerio';
import UserAgent from 'user-agents';
import { setupPage } from './setupPage';
import { getCachedPage, updateCacheAsync } from './cacheUpdater';
import { handleSpecialWebsite } from '../specialHandlers';
import fetch from 'node-fetch';
/** Shape this module expects for a page record returned by the cache layer. */
interface CachedPage {
  /** URL the page was fetched from. */
  url: string;
  /** Stored page HTML. */
  content: string;
  /** Digest of `content`. */
  hash: string;
  /** Time the record was last written. */
  updatedAt: Date;
}
/**
 * Crawls the pages behind the search results and attaches their content.
 *
 * For every queued URL the task tries, in order:
 * 1. the page cache,
 * 2. a plain HTTP fetch,
 * 3. a real browser page, trying each load strategy until one succeeds.
 *
 * Outcomes are written onto the existing entry in `results` (`content`,
 * `crawlStatus`, `error`), so title, snippet and score are preserved. The
 * cluster is always closed before this function returns or throws.
 *
 * @param clusterInstance A launched puppeteer-cluster; closed by this function.
 * @param resultUrls URLs to crawl, in priority order.
 * @param results Result entries keyed by URL; mutated in place.
 * @param strategies Page-load strategies (`waitUntil`, `timeout`), tried in order.
 *   When empty, one attempt with Puppeteer's default navigation options is made.
 * @param detectWebsites URL fragments identifying sites that need the stealth page setup.
 * @param pageCount Number of results wanted; at most `pageCount + 10` URLs are queued.
 * @returns All result entries sorted by descending score (missing scores count as 0).
 */
export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: string[], results: Map<string, any>, strategies: any[], detectWebsites: string[], pageCount: number) => {
  const tasks: Promise<void>[] = [];

  // Records a successful crawl on the existing entry.
  const markSuccess = (searchUrl: string, content: string) => {
    const result = results.get(searchUrl);
    if (result) {
      result.content = content;
      result.crawlStatus = 'Success';
    }
  };

  // Records a failure without discarding the entry's title/snippet/score.
  // The message string is stored because Error objects serialise to `{}`.
  const markFailed = (searchUrl: string, error: unknown) => {
    const message = error instanceof Error ? error.message : String(error);
    const result = results.get(searchUrl);
    if (result) {
      result.error = message;
      result.crawlStatus = 'Failed';
    } else {
      results.set(searchUrl, { url: searchUrl, error: message, crawlStatus: 'Failed', score: 0 });
    }
  };

  // Cache writes are best effort: a cache outage must not undo or repeat a
  // crawl that already succeeded.
  const cacheBestEffort = async (searchUrl: string, content: string) => {
    try {
      await updateCacheAsync(searchUrl, content);
    } catch (error) {
      console.error(`Error updating cache for ${searchUrl}:`, error);
    }
  };

  // With no configured strategy, make one attempt using Puppeteer's defaults
  // instead of never loading the page at all.
  const loadStrategies: any[] = strategies.length > 0 ? strategies : [{}];

  try {
    await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
      // 1. Cache lookup. A cache error is logged and the live crawl continues.
      try {
        const cachedPage = await getCachedPage(searchUrl) as CachedPage | null;
        if (cachedPage) {
          markSuccess(searchUrl, cachedPage.content);
          return;
        }
      } catch (error) {
        console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
      }

      // 2. Plain HTTP fetch.
      try {
        const response = await fetch(searchUrl, {
          headers: {
            'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
            'Referer': 'https://www.google.com/',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Cache-Control': 'no-cache'
          }
        });
        if (!response.ok) {
          throw new Error(`HTTP error! status: ${response.status}`);
        }
        const content = await response.text();
        const $ = cheerio.load(content);
        const cleanedContent = $('body').html() || '';
        markSuccess(searchUrl, cleanedContent);
        await cacheBestEffort(searchUrl, cleanedContent);
        return;
      } catch (error) {
        console.error(`快速抓取页面 ${searchUrl} 时发生错误:`, error);
      }

      // 3. Browser fallback: prepare the page.
      try {
        if (detectWebsites.some(website => searchUrl.includes(website))) {
          await setupPage(page);
        } else {
          const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
          await page.setUserAgent(userAgent.toString());
        }
      } catch (error) {
        console.error(`访问页面 ${searchUrl} 设置用户代理时发生错误:`, error);
      }

      // Try each strategy in turn. Only timeouts move on to the next one;
      // any other navigation error ends the attempts.
      let pageLoaded = false;
      let pageLoadError: unknown = null;
      for (const strategy of loadStrategies) {
        try {
          await page.goto(searchUrl, { waitUntil: strategy.waitUntil, timeout: strategy.timeout });
          pageLoaded = true;
          break;
        } catch (error: any) {
          pageLoadError = error;
          if (error?.name !== 'TimeoutError') {
            break;
          }
        }
      }
      if (!pageLoaded) {
        markFailed(searchUrl, pageLoadError ?? new Error('Page could not be loaded'));
        return;
      }

      try {
        let cleanedContent = await handleSpecialWebsite(page, searchUrl);
        if (!cleanedContent) {
          const content = await page.content();
          const $ = cheerio.load(content);
          cleanedContent = $('body').html() || '';
        }
        markSuccess(searchUrl, cleanedContent);
        await cacheBestEffort(searchUrl, cleanedContent);
      } catch (error) {
        markFailed(searchUrl, error);
      } finally {
        await page.close().catch(() => {});
      }
    });

    for (const url of resultUrls) {
      if (tasks.length >= pageCount + 10) {
        break;
      }
      tasks.push(clusterInstance.queue({ searchUrl: url }));
    }
    await Promise.all(tasks);
    await clusterInstance.idle();
  } finally {
    // Close the cluster even when a step above throws.
    await clusterInstance.close();
  }

  // Entries without a numeric score sort as 0 instead of producing NaN.
  return Array.from(results.values()).sort((a, b) => (b.score ?? 0) - (a.score ?? 0));
};

View File

@@ -0,0 +1,88 @@
import { Page } from 'puppeteer';
import randomUseragent from 'random-useragent';
/** Returns whatever `randomUseragent.getRandom()` yields. */
function getRandomUserAgent() {
  const agent = randomUseragent.getRandom();
  return agent;
}
/** Picks one of three `navigator.platform` values uniformly at random. */
function getRandomPlatform() {
  const choices = ["Win32", "MacIntel", "Linux x86_64"];
  const pick = Math.floor(Math.random() * choices.length);
  return choices[pick];
}
// Proxy pool: hard-coded list of proxy endpoints (IP address and port).
const validateproxy: { ip: string; port: number }[] = [
  { ip: "39.102.210.222", port: 8080 },
  { ip: "8.130.71.75", port: 8080 },
  { ip: "39.102.214.208", port: 9999 },
  { ip: "39.104.59.56", port: 8080 },
  { ip: "8.130.37.235", port: 3128 },
  { ip: "8.138.131.110", port: 8080 },
  { ip: "8.140.105.75", port: 8009 },
  { ip: "114.80.38.120", port: 3081 },
  { ip: "8.148.23.165", port: 8081 },
  { ip: "119.96.72.199", port: 59394 },
  { ip: "120.55.14.137", port: 80 },
  { ip: "47.116.181.146", port: 5060 },
  { ip: "39.102.214.199", port: 3128 },
  { ip: "47.121.183.107", port: 8080 },
  { ip: "39.104.16.201", port: 8080 },
  { ip: "39.102.209.163", port: 10002 },
  { ip: "101.201.76.157", port: 9090 },
  { ip: "122.224.124.26", port: 12080 },
  { ip: "180.105.244.199", port: 1080 },
  { ip: "119.3.113.150", port: 9094 },
];
/** Picks one entry of the proxy pool uniformly at random. */
function getRandomProxy() {
  const pick = Math.floor(Math.random() * validateproxy.length);
  return validateproxy[pick];
}
/** Picks one of three `navigator.languages` lists uniformly at random. */
function getRandomLanguages() {
  const presets = [
    ["zh-CN", "zh", "en"],
    ["en-US", "en", "fr"],
    ["es-ES", "es", "en"],
  ];
  const pick = Math.floor(Math.random() * presets.length);
  return presets[pick];
}
/**
 * Prepares a Puppeteer page so that it looks less like an automated browser.
 *
 * It registers a script that runs before any page script on every new
 * document. The script removes `navigator.webdriver`, fakes `window.chrome`,
 * overrides `navigator.userAgent` / `platform` / `plugins` / `languages`, and
 * patches `navigator.permissions.query` for the notifications permission.
 *
 * The spoofed values are chosen once per call, in Node, and handed to the
 * browser as arguments. The injected function is serialised and evaluated
 * inside the page, where Node-side helpers do not exist, so it must not
 * reference anything from this module's scope.
 *
 * @param page The page to prepare; call this before navigating.
 */
export const setupPage = async (page: Page): Promise<void> => {
  const proxy = getRandomProxy();
  // NOTE(review): page.authenticate() only supplies HTTP authentication
  // credentials; it does not route traffic through the chosen proxy. Using the
  // proxy IP and port as username/password looks unintended. Real proxying
  // would need a `--proxy-server` launch argument. Kept as-is pending
  // confirmation.
  await page.authenticate({
    username: proxy.ip,
    password: proxy.port.toString()
  });

  // Pick the spoofed values here, in Node, where the helpers are defined.
  const spoofedUserAgent = getRandomUserAgent();
  const spoofedPlatform = getRandomPlatform();
  const spoofedLanguages = getRandomLanguages();

  await page.evaluateOnNewDocument((userAgent: string, platform: string, languages: string[]) => {
    const newProto = (navigator as any).__proto__;
    delete newProto.webdriver;
    (navigator as any).__proto__ = newProto;
    (window as any).chrome = {};
    (window as any).chrome.app = {"InstallState":"testt", "RunningState":"estt", "getDetails":"stte", "getIsInstalled":"ttes"};
    (window as any).chrome.csi = function(){};
    (window as any).chrome.loadTimes = function(){};
    (window as any).chrome.runtime = function(){};
    Object.defineProperty(navigator, 'userAgent', {
      get: () => userAgent,
    });
    Object.defineProperty(navigator, 'platform', {
      get: () => platform,
    });
    Object.defineProperty(navigator, 'plugins', {
      get: () => [{"description": "Shockwave Flash",
        "filename": "pepflashplayer.dll",
        "length": 1,
        "name": "Shockwave Flash"}]
    });
    Object.defineProperty(navigator, 'languages', {
      get: () => languages,
    });
    const permissions = window.navigator.permissions as any;
    const originalQuery = permissions.query;
    permissions.query = (parameters: any) => (
      parameters.name === 'notifications' ?
        Promise.resolve({ state: Notification.permission } as PermissionStatus) :
        // Call with the original receiver; an unbound call throws
        // "Illegal invocation".
        originalQuery.call(permissions, parameters)
    );
  }, spoofedUserAgent, spoofedPlatform, spoofedLanguages);
};