Mirror of https://github.com/labring/FastGPT.git (synced 2025-10-18 17:51:24 +00:00)

Commit: 1 (#3924)
@@ -0,0 +1,60 @@
import { Request, Response } from 'express';
import fetch from 'node-fetch';
import dotenv from 'dotenv';

dotenv.config();

const userAgents = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
];

export const quickFetch = async (req: Request, res: Response): Promise<void> => {
  const { url } = req.query;

  if (!url) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: url"
      }
    });
    return;
  }

  try {
    const response = await fetch(url as string, {
      headers: {
        'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    const data = await response.text();
    res.status(200).json({
      status: 200,
      data: {
        content: data
      }
    });
  } catch (error) {
    console.error('Error fetching the page:', error);
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "发生错误"
      }
    });
  }
};

export default { quickFetch };
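For reference, a minimal client sketch for this endpoint is shown below. It assumes the service is running locally on its default port (3000, per index.ts), that ACCESS_TOKEN matches the token checked by authMiddleware, and that the target URL is a placeholder; none of these values are part of this commit.

import fetch from 'node-fetch';

(async () => {
  const token = process.env.ACCESS_TOKEN || ''; // assumed to match the server's ACCESS_TOKEN
  const target = encodeURIComponent('https://example.com'); // placeholder target URL
  const res = await fetch(`http://localhost:3000/api/quickFetch?url=${target}`, {
    headers: { Authorization: `Bearer ${token}` }
  });
  const body = (await res.json()) as any;
  console.log(body.status, body.data?.content?.length);
})();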
142 plugins/webcrawler/SPIDER/src/controllers/readController.ts Normal file
@@ -0,0 +1,142 @@
import { Request, Response } from 'express';
import puppeteer, { Page } from 'puppeteer';
import * as cheerio from 'cheerio';
import UserAgent from 'user-agents';
import { setupPage } from '../utils/setupPage'; // import the setupPage helper
import dotenv from 'dotenv';
import { URL } from 'url';
import { handleSpecialWebsite } from '../specialHandlers'; // import the site-specific handlers
import fetch from 'node-fetch';
import { getCachedPage, updateCacheAsync } from '../utils/cacheUpdater'; // import the cache helpers

dotenv.config(); // load environment variables

const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];

export const readPage = async (req: Request, res: Response): Promise<void> => {
  const { queryUrl } = req.query;
  console.log("-------");
  console.log(queryUrl);
  console.log("-------");

  if (!queryUrl) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: queryUrl"
      }
    });
    return;
  }

  const urlDomain = new URL(queryUrl as string).hostname;
  if (blacklistDomains.some((domain: string) => urlDomain.endsWith(domain))) {
    res.status(403).json({
      status: 403,
      error: {
        code: "BLACKLISTED_DOMAIN",
        message: "该域名受到保护中"
      }
    });
    return;
  }

  // First try a plain HTTP fetch; fall back to a full puppeteer render below if it fails.
  try {
    const response = await fetch(queryUrl as string, {
      headers: {
        'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });

    if (response.ok) {
      const content = await response.text();
      const $ = cheerio.load(content);
      const cleanedContent = $('body').html();

      res.status(200).json({
        status: 200,
        data: {
          title: $('title').text(),
          content: cleanedContent
        }
      });

      await updateCacheAsync(queryUrl as string, cleanedContent || '');
      console.log("Page read successfully");
      return;
    } else {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
  } catch (error) {
    console.error('快速抓取页面时发生错误:', error);
  }

  try {
    const browser = await puppeteer.launch({
      ignoreDefaultArgs: ["--enable-automation"],
      headless: true,
      executablePath: "/usr/bin/chromium", // explicitly point at the Chromium binary
      pipe: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        // '--single-process'
      ]
    });
    const page = await browser.newPage();

    // check whether this site needs the hardened page setup
    if (typeof queryUrl === 'string' && detectWebsites.some(website => queryUrl.includes(website))) {
      await setupPage(page);
    } else {
      const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
      await page.setUserAgent(userAgent.toString());
    }

    const queryUrlSafe = new URL(queryUrl as string).toString();

    await page.goto(queryUrlSafe, { waitUntil: 'load' });
    await page.waitForSelector('body');

    const title = await page.title();
    let cleanedContent = await handleSpecialWebsite(page, queryUrl as string);

    if (!cleanedContent) {
      const content = await page.content();
      const $ = cheerio.load(content);
      cleanedContent = $('body').html();
    }

    await page.close();
    await browser.close();

    res.status(200).json({
      status: 200,
      data: {
        title,
        content: cleanedContent
      }
    });

    await updateCacheAsync(queryUrl as string, cleanedContent || '');
    console.log("Page read successfully");
  } catch (error) {
    console.error(error);
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "读取页面时发生内部服务器错误"
      }
    });
  }
};
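In contrast to quickFetch, this endpoint takes a queryUrl parameter and returns a title alongside the body HTML. A minimal client sketch, under the same local-host and bearer-token assumptions as above (the article URL is purely a placeholder):

import fetch from 'node-fetch';

(async () => {
  const token = process.env.ACCESS_TOKEN || ''; // assumed to match the server's ACCESS_TOKEN
  const queryUrl = encodeURIComponent('https://blog.csdn.net/some-article'); // placeholder URL
  const res = await fetch(`http://localhost:3000/api/read?queryUrl=${queryUrl}`, {
    headers: { Authorization: `Bearer ${token}` }
  });
  const body = (await res.json()) as any;
  console.log(body.data?.title, (body.data?.content || '').length);
})();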
114 plugins/webcrawler/SPIDER/src/controllers/searchController.ts Normal file
@@ -0,0 +1,114 @@
import { Request, Response } from 'express';
import { Cluster } from 'puppeteer-cluster';
import dotenv from 'dotenv';
import { performDeepSearch } from '../utils/deepSearch';
import { fetchSearchResults as fetchBaiduResults } from '../engines/baiduEngine';
import { fetchSearchResults as fetchSearchxngResults } from '../engines/searchxngEngine';

dotenv.config();

const strategies = JSON.parse(process.env.STRATEGIES || '[]');
const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);

export const search = async (req: Request, res: Response): Promise<void> => {
  const { query, pageCount = 10, needDetails = 'false', engine = 'baidu', categories = 'general' } = req.query;
  const needDetailsBool = (needDetails === 'true');

  if (!query) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: query"
      }
    });
    return;
  }

  let fetchSearchResults;
  let searchUrlBase;
  try {
    if (engine === 'baidu') {
      fetchSearchResults = fetchBaiduResults;
      searchUrlBase = process.env.ENGINE_BAIDUURL;
    } else if (engine === 'searchxng') {
      fetchSearchResults = fetchSearchxngResults;
      searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
    } else {
      res.status(400).json({
        status: 400,
        error: {
          code: "INVALID_ENGINE",
          message: "无效的搜索引擎"
        }
      });
      return;
    }

    const { resultUrls, results } = await fetchSearchResults(query as string, Number(pageCount), searchUrlBase || '', categories as string);

    // if the engine returned nothing, respond with an empty array
    if (results.size === 0) {
      console.log('No results found');
      res.status(200).json({
        status: 200,
        data: {
          results: []
        }
      });
      return;
    }

    if (!needDetailsBool) {
      console.log('Need details is false');
      results.forEach((value: any) => {
        if (value.crawlStatus === 'Pending') {
          value.crawlStatus = 'Success';
        }
      });
      res.status(200).json({
        status: 200,
        data: {
          results: Array.from(results.values())
        }
      });
    } else {
      console.log('Need details is true');

      const clusterInstance = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_CONTEXT,
        maxConcurrency: maxConcurrency,
        puppeteerOptions: {
          ignoreDefaultArgs: ["--enable-automation"],
          headless: "true",
          executablePath: "/usr/bin/chromium", // explicitly point at the Chromium binary
          pipe: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
          ]
        }
      });

      const sortedResults = await performDeepSearch(clusterInstance, resultUrls, results, strategies, detectWebsites, Number(pageCount));
      res.status(200).json({
        status: 200,
        data: {
          results: sortedResults.slice(0, Number(pageCount))
        }
      });
    }
  } catch (error) {
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "发生错误"
      }
    });
  }
};

export default { search };
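A sketch of querying the search endpoint, assuming a locally running instance and a matching bearer token; needDetails='true' would trigger the puppeteer-cluster deep search, while 'false' returns only the engine snippets. All concrete values below are illustrative, not part of this commit.

import fetch from 'node-fetch';

(async () => {
  const token = process.env.ACCESS_TOKEN || ''; // assumed bearer token
  const params = new URLSearchParams({
    query: 'FastGPT',      // search term (placeholder)
    pageCount: '10',       // number of results wanted
    needDetails: 'false',  // 'true' would crawl each result page
    engine: 'baidu',       // or 'searchxng'
    categories: 'general'
  });
  const res = await fetch(`http://localhost:3000/api/search?${params}`, {
    headers: { Authorization: `Bearer ${token}` }
  });
  const body = (await res.json()) as any;
  console.log(body.data?.results?.length);
})();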
204 plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts Normal file
@@ -0,0 +1,204 @@
import { URL } from 'url';
import { JSDOM } from 'jsdom';
import puppeteer from 'puppeteer';
import fetch from 'node-fetch';
import { setupPage } from '../utils/setupPage';
import { Cluster } from 'puppeteer-cluster';

async function randomWait(min: number, max: number) {
  // wait a random amount of time between min and max milliseconds
  const delay = Math.floor(Math.random() * (max - min + 1)) + min;
  return new Promise(resolve => setTimeout(resolve, delay));
}

export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {
  console.log(`Fetching Baidu search results for query: ${query}`);
  // if searchUrlBase is empty, return empty results
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  const pagesToFetch = Math.ceil(pageCount / 10);

  const browser = await puppeteer.launch({
    ignoreDefaultArgs: ["--enable-automation"],
    headless: true,
    executablePath: "/usr/bin/chromium", // explicitly point at the Chromium binary
    pipe: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      // '--single-process'
    ]
  });

  const page = await browser.newPage();
  await setupPage(page);

  for (let i = 0; i < pagesToFetch; i++) {
    const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
    console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);
    let retryCount = 0;
    let success = false;

    while (retryCount < 5 && !success) {
      try {
        console.time(`Page Load Time for page ${i + 1}`);
        await page.goto(searchUrl.toString(), { waitUntil: 'load' });
        console.timeEnd(`Page Load Time for page ${i + 1}`);

        let content = await page.content();
        let dom = new JSDOM(content);
        let document = dom.window.document;
        console.log(document.title);

        // if Baidu served its security-verification page, re-harden the page and retry
        if (document.title.includes('百度安全验证')) {
          console.log('Detected Baidu security verification, retrying...');
          await setupPage(page);
          retryCount++;
          // wait a random amount of time before retrying
          await randomWait(1000, 3000);
          continue;
        }

        // parse the search results
        console.time(`Link Retrieval Time for page ${i + 1}`);

        const resultContainers = document.querySelectorAll('.result.c-container');
        for (const result of resultContainers) {
          if (resultUrls.length > pageCount + 5) {
            break;
          }
          const titleElement = result.querySelector('h3 a');
          const title = titleElement ? titleElement.textContent : '';
          const url = titleElement ? titleElement.getAttribute('href') : '';
          const contentElement = result.querySelector('[class^="content"]');
          const content = contentElement ? contentElement.textContent : '';

          if (url) {
            resultUrls.push(url);
            results.set(url, {
              title,
              url,
              snippet: content,
              source: 'baidu',
              crawlStatus: 'Pending',
              score: 0
            });
          }
        }
        console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
        success = true;
      } catch (error) {
        console.error(`Error fetching page ${i + 1}:`, error);
        retryCount++;
      }
    }
  }

  await browser.close();

  console.log('fetch all fake urls');

  // quickly resolve the real URLs behind Baidu's redirect links
  const urlsToProcessWithPuppeteer = [];
  for (const url of resultUrls) {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
          'Referer': 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Connection': 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (response.ok) {
        const realUrl = response.url;
        console.log('realurl:', realUrl);
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      console.error(`Error fetching original URL for ${url}:`, error);
      urlsToProcessWithPuppeteer.push(url);
    }
  }

  console.log('pass quickfetch');

  // resolve the remaining real URLs concurrently with a puppeteer cluster
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 10,
    puppeteerOptions: {
      ignoreDefaultArgs: ["--enable-automation"],
      headless: "true",
      executablePath: "/usr/bin/chromium", // explicitly point at the Chromium binary
      pipe: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
      ]
    }
  });

  let failedUrlCount = 0;

  await cluster.task(async ({ page, data: url }) => {
    let retryUrlCount = 0;
    let urlSuccess = false;
    while (retryUrlCount < 3 && !urlSuccess) {
      console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
      try {
        await page.goto(url, { waitUntil: 'load' });
        // make sure the page has not been detached
        if (page.isClosed()) {
          throw new Error('Page has been closed');
        }
        const realUrl = page.url(); // the resolved real URL
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
        urlSuccess = true;
      } catch (error) {
        console.error(`Error fetching original URL, retrying...`, error);
        retryUrlCount++;
        await randomWait(1000, 3000);
      }
    }
    if (!urlSuccess) {
      failedUrlCount++;
    }
  });

  for (const url of urlsToProcessWithPuppeteer) {
    cluster.queue(url);
  }

  await cluster.idle();
  await cluster.close();

  console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);

  // keep only the first pageCount results
  const filteredResults = Array.from(results.values()).slice(0, pageCount);

  return { resultUrls: filteredResults.map(result => result.url), results: new Map(filteredResults.map(result => [result.url, result])) };
};
55 plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts Normal file
@@ -0,0 +1,55 @@
import axios from 'axios';
import { URL } from 'url';
import dotenv from 'dotenv';

dotenv.config();

const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];

export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {
  const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1; // maximum number of result pages to request
  // if searchUrlBase is empty, return empty results; pageCount is the number of results requested
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  let fetchedResultsCount = 0;
  let pageIndex = 0;

  while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) {
    const searchUrl = new URL(`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`);
    console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);
    const response = await axios.get(searchUrl.toString());
    const jsonResults = response.data.results;

    for (let index = 0; index < jsonResults.length; index++) {
      const result = jsonResults[index];
      const resultDomain = new URL(result.url).hostname;
      if (blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) || resultDomain.includes('zhihu')) {
        continue;
      }
      resultUrls.push(result.url);
      results.set(result.url, {
        title: result.title,
        url: result.url,
        snippet: result.content,
        source: result.engine,
        crawlStatus: 'Pending',
        score: result.score
      });
      fetchedResultsCount++;
      if (fetchedResultsCount >= pageCount) {
        break;
      }
    }
    pageIndex++;
    if (jsonResults.length === 0) {
      break; // no more results, stop paging
    }
  }

  return { resultUrls, results };
};
18 plugins/webcrawler/SPIDER/src/index.ts Normal file
@@ -0,0 +1,18 @@
import express, { Application } from 'express';
import bodyParser from 'body-parser';
import searchRoutes from './routes/searchRoutes';
import readRoutes from './routes/readRoutes';
import quickfetchRoutes from './routes/quickfetchRoutes';
import dotenv from 'dotenv';

dotenv.config();

const app: Application = express();

app.use(bodyParser.json());
app.use('/api', searchRoutes);
app.use('/api', readRoutes);
app.use('/api', quickfetchRoutes);

const PORT = process.env.PORT || 3000;
app.listen(PORT, () => console.log(`Server running on port ${PORT}`));
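The service is configured entirely through environment variables read across the files above. A sample .env sketch follows; every value is an illustrative assumption (only the variable names come from this commit), and the engine URLs in particular must point at whatever Baidu/SearXNG endpoints the deployment actually uses.

PORT=3000
ACCESS_TOKEN=change-me
ENGINE_BAIDUURL=https://www.baidu.com/s
ENGINE_SEARCHXNGURL=http://localhost:8080/search
DETECT_WEBSITES=baidu.com,zhihu.com
BLACKLIST=["example-protected.com"]
STRATEGIES=[{"waitUntil":"load","timeout":10000},{"waitUntil":"domcontentloaded","timeout":20000}]
MAX_CONCURRENCY=10
MONGODB_URI=mongodb://localhost:27017
STD_TTL=3600
EXPIRE_AFTER_SECONDS=9000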
21 plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts Normal file
@@ -0,0 +1,21 @@
import { Request, Response, NextFunction } from 'express';

const authMiddleware = (req: Request, res: Response, next: NextFunction) => {
  const bearerHeader = req.headers['authorization'];

  if (bearerHeader) {
    console.log("bearerHeader:" + bearerHeader);
    const bearer = bearerHeader.split(' ');
    const bearerToken = bearer[1];

    if (bearerToken === process.env.ACCESS_TOKEN) {
      next();
    } else {
      res.status(403).json({ message: 'Invalid token' });
    }
  } else {
    res.status(401).json({ message: 'Bearer token not found' });
  }
};

export default authMiddleware;
9 plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts Normal file
@@ -0,0 +1,9 @@
import express from 'express';
import { quickFetch } from '../controllers/quickfetchController';
import authMiddleware from '../middleware/authMiddleware';

const quickfetchRoutes = express.Router();

quickfetchRoutes.get('/quickFetch', authMiddleware, quickFetch);

export default quickfetchRoutes;
9 plugins/webcrawler/SPIDER/src/routes/readRoutes.ts Normal file
@@ -0,0 +1,9 @@
import express from 'express';
import { readPage } from '../controllers/readController';
import authMiddleware from '../middleware/authMiddleware';

const readRoutes = express.Router();

readRoutes.get('/read', authMiddleware, readPage);

export default readRoutes;
9 plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts Normal file
@@ -0,0 +1,9 @@
import express from 'express';
import searchController from '../controllers/searchController';
import authMiddleware from '../middleware/authMiddleware';

const searchRoutes = express.Router();

searchRoutes.get('/search', authMiddleware, searchController.search);

export default searchRoutes;
21 plugins/webcrawler/SPIDER/src/specialHandlers/index.ts Normal file
@@ -0,0 +1,21 @@
import { Page } from 'puppeteer';

export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => {
  if (url.includes('blog.csdn.net')) {
    await page.waitForSelector('article');
    const content = await page.$eval('article', el => el.innerHTML);
    return content;
  }
  if (url.includes('zhuanlan.zhihu.com')) {
    // Zhihu columns show a dialog that has to be dismissed before the article is readable
    console.log('是知乎,需要点击按掉!');
    console.log(await page.content());
    // Zhihu may block the request outright; bail out if its error payload is returned
    if ((await page.content()).includes('{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}')) return null;
    await page.waitForSelector('button[aria-label="关闭"]');
    await page.click('button[aria-label="关闭"]'); // close the dialog via its aria-label
    await page.waitForSelector('article');
    const content = await page.$eval('article', el => el.innerHTML);
    return content;
  }
  // handlers for more special-cased sites can be added here
  return null;
};
77 plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts Normal file
@@ -0,0 +1,77 @@
import NodeCache from 'node-cache';
import { MongoClient } from 'mongodb';
import crypto from 'crypto';
import dotenv from 'dotenv';

dotenv.config();

const cache = new NodeCache({ stdTTL: parseInt(process.env.STD_TTL || '3600') });
const mongoClient = new MongoClient(process.env.MONGODB_URI || 'mongodb://localhost:27017');
const dbName = 'pageCache';
const collectionName = 'pages';

const connectToMongo = async () => {
  await mongoClient.connect();
  return mongoClient.db(dbName);
};

const createTTLIndex = async () => {
  try {
    const db = await connectToMongo();
    await db.collection(collectionName).createIndex({ "updatedAt": 1 }, { expireAfterSeconds: parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000') });
    console.log("TTL index created successfully");
  } catch (error) {
    console.error("Error creating TTL index:", error);
  }
};

const getPageHash = (content: string) => {
  return crypto.createHash('md5').update(content).digest('hex');
};

export const getCachedPage = async (url: string) => {
  const cachedPage = cache.get(url);
  if (cachedPage) return cachedPage;

  try {
    const db = await connectToMongo();
    const page = await db.collection(collectionName).findOne({ url });
    if (page) cache.set(url, page);
    return page;
  } catch (error) {
    console.error('Error getting cached page:', error);
    throw error;
  }
};

const savePageToCache = async (url: string, content: string) => {
  const hash = getPageHash(content);
  const page = { url, content, hash, updatedAt: new Date() };

  cache.set(url, page); // update the in-memory cache

  try {
    const db = await connectToMongo();
    await db.collection(collectionName).updateOne(
      { url },
      { $set: page },
      { upsert: true }
    ); // update the persistent cache
  } catch (error) {
    console.error('Error saving page to cache:', error);
    throw error;
  }
};

export const updateCacheAsync = async (url: string, content: string) => {
  await savePageToCache(url, content);
};

process.on('SIGINT', async () => {
  await mongoClient.close();
  process.exit(0);
});

// create the TTL index at application startup
createTTLIndex();
140 plugins/webcrawler/SPIDER/src/utils/deepSearch.ts Normal file
@@ -0,0 +1,140 @@
import { Cluster } from 'puppeteer-cluster';
import * as cheerio from 'cheerio';
import UserAgent from 'user-agents';
import { setupPage } from './setupPage';
import { getCachedPage, updateCacheAsync } from './cacheUpdater';
import { handleSpecialWebsite } from '../specialHandlers';
import fetch from 'node-fetch';

interface CachedPage {
  url: string;
  content: string;
  hash: string;
  updatedAt: Date;
}

export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: string[], results: Map<string, any>, strategies: any[], detectWebsites: string[], pageCount: number) => {
  const tasks = [];

  await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
    // 1. serve from cache when possible
    try {
      const cachedPage = await getCachedPage(searchUrl) as CachedPage | null;
      if (cachedPage) {
        const result = results.get(searchUrl);
        if (result) {
          result.content = cachedPage.content;
          result.crawlStatus = 'Success';
        }
        return;
      }
    } catch (error) {
      console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
      results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' });
      return;
    }

    // 2. try a plain HTTP fetch before falling back to a full browser render
    try {
      const response = await fetch(searchUrl, {
        headers: {
          'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
          'Referer': 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Connection': 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (response.ok) {
        const content = await response.text();
        const $ = cheerio.load(content);
        const cleanedContent = $('body').html() || '';

        const result = results.get(searchUrl);
        if (result) {
          result.content = cleanedContent;
          result.crawlStatus = 'Success';
        }

        await updateCacheAsync(searchUrl, cleanedContent || '');
        return;
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      console.error(`快速抓取页面 ${searchUrl} 时发生错误:`, error);
    }

    // 3. render the page with puppeteer
    try {
      if (detectWebsites.some(website => searchUrl.includes(website))) {
        await setupPage(page);
      } else {
        const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
        await page.setUserAgent(userAgent.toString());
      }
    } catch (error) {
      console.error(`访问页面 ${searchUrl} 设置用户代理时发生错误:`, error);
    }

    // try each configured load strategy until one succeeds
    let pageLoaded = false;
    let pageLoadError: Error | null = null;
    for (const strategy of strategies) {
      try {
        await page.goto(searchUrl, { waitUntil: strategy.waitUntil, timeout: strategy.timeout });
        pageLoaded = true;
        break;
      } catch (error: any) {
        if (error.name === 'TimeoutError') {
          pageLoadError = error;
          continue;
        } else {
          pageLoadError = error;
          throw error;
        }
      }
    }
    if (!pageLoaded) {
      const result = results.get(searchUrl);
      if (result) {
        result.error = pageLoadError;
        result.crawlStatus = 'Failed';
      }
      return;
    }

    try {
      let cleanedContent = await handleSpecialWebsite(page, searchUrl);
      if (!cleanedContent) {
        const content = await page.content();
        const $ = cheerio.load(content);
        cleanedContent = $('body').html() || '';
      }

      const result = results.get(searchUrl);
      if (result) {
        result.content = cleanedContent;
        result.crawlStatus = 'Success';
      }

      await updateCacheAsync(searchUrl, cleanedContent || '');
    } catch (error) {
      results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' });
    } finally {
      await page.close().catch(() => {});
    }
  });

  for (const url of resultUrls) {
    if (tasks.length >= pageCount + 10) {
      break;
    }
    tasks.push(clusterInstance.queue({ searchUrl: url }));
  }

  await Promise.all(tasks);

  await clusterInstance.idle();
  await clusterInstance.close();

  return Array.from(results.values()).sort((a, b) => b.score - a.score);
};
88 plugins/webcrawler/SPIDER/src/utils/setupPage.ts Normal file
@@ -0,0 +1,88 @@
import { Page } from 'puppeteer';
import randomUseragent from 'random-useragent';

const getRandomUserAgent = () => {
  return randomUseragent.getRandom();
};

const getRandomPlatform = () => {
  const platforms = ["Win32", "MacIntel", "Linux x86_64"];
  return platforms[Math.floor(Math.random() * platforms.length)];
};

// proxy pool
const validateproxy = [
  { ip: "39.102.210.222", port: 8080 },
  { ip: "8.130.71.75", port: 8080 },
  { ip: "39.102.214.208", port: 9999 },
  { ip: "39.104.59.56", port: 8080 },
  { ip: "8.130.37.235", port: 3128 },
  { ip: "8.138.131.110", port: 8080 },
  { ip: "8.140.105.75", port: 8009 },
  { ip: "114.80.38.120", port: 3081 },
  { ip: "8.148.23.165", port: 8081 },
  { ip: "119.96.72.199", port: 59394 },
  { ip: "120.55.14.137", port: 80 },
  { ip: "47.116.181.146", port: 5060 },
  { ip: "39.102.214.199", port: 3128 },
  { ip: "47.121.183.107", port: 8080 },
  { ip: "39.104.16.201", port: 8080 },
  { ip: "39.102.209.163", port: 10002 },
  { ip: "101.201.76.157", port: 9090 },
  { ip: "122.224.124.26", port: 12080 },
  { ip: "180.105.244.199", port: 1080 },
  { ip: "119.3.113.150", port: 9094 }
];

const getRandomProxy = () => {
  return validateproxy[Math.floor(Math.random() * validateproxy.length)];
};

const getRandomLanguages = () => {
  const languages = [
    ["zh-CN", "zh", "en"],
    ["en-US", "en", "fr"],
    ["es-ES", "es", "en"]
  ];
  return languages[Math.floor(Math.random() * languages.length)];
};

export const setupPage = async (page: Page): Promise<void> => {
  const proxy = getRandomProxy();
  // Note: page.authenticate() only supplies HTTP auth credentials; actually routing traffic
  // through one of the pooled proxies would additionally require a --proxy-server launch arg.
  await page.authenticate({
    username: proxy.ip,
    password: proxy.port.toString()
  });

  // Pick the spoofed values here in Node and pass them into the page context:
  // helpers defined in this module are not visible inside evaluateOnNewDocument.
  const userAgent = getRandomUserAgent();
  const platform = getRandomPlatform();
  const languages = getRandomLanguages();

  await page.evaluateOnNewDocument((userAgent, platform, languages) => {
    const newProto = (navigator as any).__proto__;
    delete newProto.webdriver;
    (navigator as any).__proto__ = newProto;
    (window as any).chrome = {};
    (window as any).chrome.app = { "InstallState": "testt", "RunningState": "estt", "getDetails": "stte", "getIsInstalled": "ttes" };
    (window as any).chrome.csi = function () {};
    (window as any).chrome.loadTimes = function () {};
    (window as any).chrome.runtime = function () {};
    Object.defineProperty(navigator, 'userAgent', {
      get: () => userAgent,
    });
    Object.defineProperty(navigator, 'platform', {
      get: () => platform,
    });
    Object.defineProperty(navigator, 'plugins', {
      get: () => [{
        "description": "Shockwave Flash",
        "filename": "pepflashplayer.dll",
        "length": 1,
        "name": "Shockwave Flash"
      }]
    });
    Object.defineProperty(navigator, 'languages', {
      get: () => languages,
    });
    const originalQuery = (window.navigator.permissions as any).query;
    (window.navigator.permissions as any).query = (parameters: any) => (
      parameters.name === 'notifications' ?
        Promise.resolve({ state: Notification.permission } as PermissionStatus) :
        originalQuery(parameters)
    );
  }, userAgent, platform, languages);
};