Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-27 08:25:07 +00:00)

Commit: 1 (#3924)
plugins/webcrawler/.dockerignore (3 lines, new file)
@@ -0,0 +1,3 @@
# Ignore the .git directory and its contents
.git
.gitignore
plugins/webcrawler/.gitignore (25 lines, vendored, new file)
@@ -0,0 +1,25 @@
*~

searxng-docker.service
caddy
srv
searxng/uwsgi.ini
.env
SPIDER/.env

# Ignore the node_modules folder
SPIDER/node_modules/

# Ignore build output folders
SPIDER/dist/

# Ignore log files
*.log

# Ignore OS-generated files
.DS_Store
Thumbs.db

# Ignore IDE/editor-generated files
.vscode/
.idea/
plugins/webcrawler/.searchxng.env (14 lines, new file)
@@ -0,0 +1,14 @@
# By default listen on https://localhost
# To change this:
# * uncomment SEARXNG_HOSTNAME, and replace <host> by the SearXNG hostname
# * uncomment LETSENCRYPT_EMAIL, and replace <email> by your email (required to create a Let's Encrypt certificate)

# SEARXNG_HOSTNAME=<host>
# LETSENCRYPT_EMAIL=<email>

# Optional:
# If you run a very small or a very large instance, you might want to change the amount of used uwsgi workers and threads per worker
# More workers (= processes) means that more search requests can be handled at the same time, but it also causes more resource usage

SEARXNG_UWSGI_WORKERS=4
SEARXNG_UWSGI_THREADS=4
plugins/webcrawler/Caddyfile (91 lines, new file)
@@ -0,0 +1,91 @@
{
    admin off

    log {
        output stderr
        format filter {
            # Preserves first 8 bits from IPv4 and 32 bits from IPv6
            request>remote_ip ip_mask 8 32
            request>client_ip ip_mask 8 32

            # Remove identifiable information
            request>remote_port delete
            request>headers delete
            request>uri query {
                delete url
                delete h
                delete q
            }
        }
    }
}

{$SEARXNG_HOSTNAME}

tls {$SEARXNG_TLS}

encode zstd gzip

@api {
    path /config
    path /healthz
    path /stats/errors
    path /stats/checker
}

@search {
    path /search
}

@imageproxy {
    path /image_proxy
}

@static {
    path /static/*
}

header {
    # CSP (https://content-security-policy.com)
    Content-Security-Policy "upgrade-insecure-requests; default-src 'none'; script-src 'self'; style-src 'self' 'unsafe-inline'; form-action 'self' https://github.com/searxng/searxng/issues/new; font-src 'self'; frame-ancestors 'self'; base-uri 'self'; connect-src 'self' https://overpass-api.de; img-src * data:; frame-src https://www.youtube-nocookie.com https://player.vimeo.com https://www.dailymotion.com https://www.deezer.com https://www.mixcloud.com https://w.soundcloud.com https://embed.spotify.com;"

    # Disable some browser features
    Permissions-Policy "accelerometer=(),camera=(),geolocation=(),gyroscope=(),magnetometer=(),microphone=(),payment=(),usb=()"

    # Set referrer policy
    Referrer-Policy "no-referrer"

    # Force clients to use HTTPS
    Strict-Transport-Security "max-age=31536000"

    # Prevent MIME type sniffing from the declared Content-Type
    X-Content-Type-Options "nosniff"

    # X-Robots-Tag (comment to allow site indexing)
    X-Robots-Tag "noindex, noarchive, nofollow"

    # Remove "Server" header
    -Server
}

header @api {
    Access-Control-Allow-Methods "GET, OPTIONS"
    Access-Control-Allow-Origin "*"
}

route {
    # Cache policy
    header Cache-Control "max-age=0, no-store"
    header @search Cache-Control "max-age=5, private"
    header @imageproxy Cache-Control "max-age=604800, public"
    header @static Cache-Control "max-age=31536000, public, immutable"
}

# SearXNG (uWSGI)
reverse_proxy localhost:8080 {
    header_up X-Forwarded-Port ""
    header_up X-Real-IP ""

    # https://github.com/searx/searx-docker/issues/24
    header_up Connection "close"
}
plugins/webcrawler/Dockerfile (57 lines, new file)
@@ -0,0 +1,57 @@
FROM node:20.10.0-slim

WORKDIR /app

# Install the runtime dependencies Chromium needs
RUN apt-get update && apt-get install -y \
    ca-certificates \
    fonts-liberation \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libc6 \
    libcairo2 \
    libcups2 \
    libdbus-1-3 \
    libexpat1 \
    libfontconfig1 \
    libgbm1 \
    libgcc1 \
    libglib2.0-0 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libpango-1.0-0 \
    libpangocairo-1.0-0 \
    libstdc++6 \
    libx11-6 \
    libx11-xcb1 \
    libxcb1 \
    libxcomposite1 \
    libxcursor1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxi6 \
    libxrandr2 \
    libxrender1 \
    libxss1 \
    libxtst6 \
    lsb-release \
    wget \
    xdg-utils \
    chromium \
    && rm -rf /var/lib/apt/lists/*

# Install Chinese fonts
RUN apt-get update && apt-get install -y fonts-wqy-microhei && fc-cache -f -v

COPY SPIDER/. .

RUN test -f package.json || (echo "package.json missing" && exit 1)
RUN test -f .env || (echo ".env file missing in SPIDER directory" && exit 1)

RUN npm run build

EXPOSE 3000
CMD ["npm", "start"]
plugins/webcrawler/README.md (73 lines, new file)
@@ -0,0 +1,73 @@
# webcrawler
## Quick deployment with Docker

## Deployment from source
0. Set up Docker as described at https://github.com/searxng/searxng-docker
1. Create a .env file in the SPIDER folder, using the .env.example there as a reference
2. Enter the SPIDER folder and run pnpm install
3. Go back to the repository root and run docker compose up -d

## Development from source
1. Comment out the SPIDER-related part (nodeapp) of docker-compose.yml
2. Adjust the URLs in the .env file as indicated by its comments
3. Comment out the line that pins the browser executable path where Puppeteer is launched
4. pnpm run dev


## Test examples:
Remember to set the Auth Bearer Token, i.e. the ACCESS_TOKEN from .env (a request sketch follows the search example below).

### Read a single page (content is returned as HTML)
```
http://localhost:3000/api/read?queryUrl=<url>
```

Response structure
```json

{
  "status": 200,
  "data": {
    "title": "something here",
    "content": "something here"
  }
}
{
  "status": 400,
  "error": {
    "code": "MISSING_PARAM",
    "message": "缺少必要参数: query"
  }
}
```

### Search (content is returned as HTML)
```
http://localhost:3000/api/search?query=<something>&pageCount=5&needDetails=true&engine=baidu
```

```json
{
  "status": 200,
  "data": {
    "results": [
      {
        "title": "string",
        "url": "string",
        "snippet": "string",
        "source": "string",
        "crawlStatus": "string",
        "score": 0,
        "content": "string"
      }
    ]
  }
}
{
  "status": 400,
  "error": {
    "code": "MISSING_PARAM",
    "message": "缺少必要参数: query"
  }
}
```
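The request only differs from a plain GET by the `Authorization` header that authMiddleware checks. A minimal TypeScript sketch, assuming the service is reachable on http://localhost:3000, `ACCESS_TOKEN` holds the value from SPIDER/.env, and Node 18+ provides a global `fetch`:

```typescript
// Call /api/read with the Bearer token expected by authMiddleware.
// The localhost:3000 base URL and ACCESS_TOKEN source are assumptions for this sketch.
const ACCESS_TOKEN = process.env.ACCESS_TOKEN ?? '<your token>';

async function readPage(targetUrl: string) {
  const res = await fetch(
    `http://localhost:3000/api/read?queryUrl=${encodeURIComponent(targetUrl)}`,
    { headers: { Authorization: `Bearer ${ACCESS_TOKEN}` } }
  );
  return res.json(); // { status: 200, data: { title, content } } on success
}

readPage('https://example.com').then(console.log).catch(console.error);
```

The /api/search and /api/quickFetch routes take the same header; only the query parameters differ.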
plugins/webcrawler/SPIDER/.env.example (21 lines, new file)
@@ -0,0 +1,21 @@

ACCESS_TOKEN=114514
DETECT_WEBSITES=zhuanlan.zhihu.com
STRATEGIES=[{"waitUntil":"networkidle0","timeout":5000},{"waitUntil":"networkidle2","timeout":10000},{"waitUntil":"load","timeout":15000}]
PORT=3000
MAX_CONCURRENCY=10
NODE_ENV=development
ENGINE=[

]

ENGINE_BAIDUURL=https://www.baidu.com/s
#ENGINE_SEARCHXNGURL=http://localhost:8080/search
ENGINE_SEARCHXNGURL=http://searxng:8080/search

#MONGODB_URI=mongodb://root:example@localhost:27017
MONGODB_URI=mongodb://root:example@mongodb:27017
BLACKLIST=[".gov.cn",".edu.cn"]

STD_TTL=3600
EXPIRE_AFTER_SECONDS=9000
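As a sketch of how the SPIDER code consumes these values (based on searchController.ts and the engine modules later in this diff): STRATEGIES and BLACKLIST must be valid JSON, and DETECT_WEBSITES is read as a comma-separated list.

```typescript
// Sketch only: mirrors how the controllers and engines parse the variables above.
import dotenv from 'dotenv';
dotenv.config();

const strategies = JSON.parse(process.env.STRATEGIES || '[]');         // [{ waitUntil, timeout }, ...] tried in order by deepSearch
const blacklist: string[] = JSON.parse(process.env.BLACKLIST || '[]'); // domain suffixes to refuse, e.g. ".gov.cn"
const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];  // sites that get the stealth setupPage treatment
```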
plugins/webcrawler/SPIDER/package-lock.json (5804 lines, generated, new file)
File diff suppressed because it is too large
plugins/webcrawler/SPIDER/package.json (62 lines, new file)
@@ -0,0 +1,62 @@
{
  "name": "spider",
  "version": "1.0.0",
  "description": "",
  "main": "/dist/index.ts",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "ts-node src/index.ts",
    "build": "webpack",
    "dev": "ts-node-dev --respawn src/index.ts"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@types/node-fetch": "^2.6.12",
    "assert": "^2.1.0",
    "axios": "^1.7.9",
    "body-parser": "^1.20.3",
    "browserify-zlib": "^0.2.0",
    "buffer": "^6.0.3",
    "cheerio": "^1.0.0",
    "crypto-browserify": "^3.12.1",
    "dotenv": "^16.4.7",
    "express": "^4.21.2",
    "https-proxy-agent": "^7.0.6",
    "jsdom": "^26.0.0",
    "mongodb": "^6.13.1",
    "node-cache": "^5.1.2",
    "node-fetch": "^2.7.0",
    "os-browserify": "^0.3.0",
    "path-browserify": "^1.0.1",
    "puppeteer": "^24.2.1",
    "puppeteer-cluster": "^0.24.0",
    "querystring-es3": "^0.2.1",
    "random-useragent": "^0.5.0",
    "spider": "file:",
    "stream-browserify": "^3.0.0",
    "stream-http": "^3.2.0",
    "string_decoder": "^1.3.0",
    "turndown": "^7.2.0",
    "turndown-plugin-gfm": "^1.0.2",
    "url": "^0.11.4",
    "user-agents": "^1.1.454",
    "util": "^0.12.5",
    "vm-browserify": "^1.1.2"
  },
  "devDependencies": {
    "@types/body-parser": "^1.19.5",
    "@types/express": "^5.0.0",
    "@types/jsdom": "^21.1.7",
    "@types/node": "^22.13.4",
    "@types/random-useragent": "^0.3.3",
    "@types/user-agents": "^1.0.4",
    "ts-loader": "^9.5.2",
    "ts-node-dev": "^2.0.0",
    "typescript": "^5.7.3",
    "webpack": "^5.98.0",
    "webpack-cli": "^6.0.1",
    "webpack-node-externals": "^3.0.0"
  }
}
plugins/webcrawler/SPIDER/src/controllers/quickfetchController.ts (60 lines, new file)
@@ -0,0 +1,60 @@
import { Request, Response } from 'express';
import fetch from 'node-fetch';
import dotenv from 'dotenv';

dotenv.config();

const userAgents = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
];

export const quickFetch = async (req: Request, res: Response): Promise<void> => {
  const { url } = req.query;

  if (!url) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: url"
      }
    });
    return;
  }

  try {
    const response = await fetch(url as string, {
      headers: {
        'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    const data = await response.text();
    res.status(200).json({
      status: 200,
      data: {
        content: data
      }
    });
  } catch (error) {
    console.error('Error fetching the page:', error);
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "发生错误"
      }
    });
  }
};

export default { quickFetch };
plugins/webcrawler/SPIDER/src/controllers/readController.ts (142 lines, new file)
@@ -0,0 +1,142 @@
import { Request, Response } from 'express';
import puppeteer, { Page } from 'puppeteer';
import * as cheerio from 'cheerio';
import UserAgent from 'user-agents';
import { setupPage } from '../utils/setupPage'; // import the setupPage module
import dotenv from 'dotenv'; // import the dotenv module
import { URL } from 'url'; // import the URL module
import { handleSpecialWebsite } from '../specialHandlers'; // import the handleSpecialWebsite module
import fetch from 'node-fetch';
import { getCachedPage, updateCacheAsync } from '../utils/cacheUpdater'; // import the cache helpers

dotenv.config(); // load environment variables

const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];

export const readPage = async (req: Request, res: Response): Promise<void> => {
  const { queryUrl } = req.query;
  console.log("-------");
  console.log(queryUrl);
  console.log("-------");

  if (!queryUrl) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: queryUrl"
      }
    });
    return;
  }

  const urlDomain = new URL(queryUrl as string).hostname;
  if (blacklistDomains.some((domain: string) => urlDomain.endsWith(domain))) {
    res.status(403).json({
      status: 403,
      error: {
        code: "BLACKLISTED_DOMAIN",
        message: "该域名受到保护中"
      }
    });
    return;
  }

  try {
    const response = await fetch(queryUrl as string, {
      headers: {
        'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });

    if (response.ok) {
      const content = await response.text();
      const $ = cheerio.load(content);
      const cleanedContent = $('body').html();

      res.status(200).json({
        status: 200,
        data: {
          title: $('title').text(),
          content: cleanedContent
        }
      });

      await updateCacheAsync(queryUrl as string, cleanedContent || '');
      console.log("Page read successfully");
      return;
    } else {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
  } catch (error) {
    console.error('快速抓取页面时发生错误:', error);
  }

  try {
    const browser = await puppeteer.launch({
      ignoreDefaultArgs: ["--enable-automation"],
      headless: true,
      executablePath: "/usr/bin/chromium", // explicitly point at the Chromium binary
      pipe: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        // '--single-process'
      ]
    });
    const page = await browser.newPage();

    // check whether the site needs special handling
    if (typeof queryUrl === 'string' && detectWebsites.some(website => queryUrl.includes(website))) {
      await setupPage(page);
    } else {
      const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
      await page.setUserAgent(userAgent.toString());
    }

    const queryUrlSafe = new URL(queryUrl as string).toString();

    await page.goto(queryUrlSafe, { waitUntil: 'load' });
    await page.waitForSelector('body');

    const title = await page.title();
    let cleanedContent = await handleSpecialWebsite(page, queryUrl as string);

    if (!cleanedContent) {
      const content = await page.content();
      const $ = cheerio.load(content);
      cleanedContent = $('body').html();
    }

    await page.close();
    await browser.close();

    res.status(200).json({
      status: 200,
      data: {
        title,
        content: cleanedContent
      }
    });

    await updateCacheAsync(queryUrl as string, cleanedContent || '');
    console.log("Page read successfully");
  } catch (error) {
    console.error(error);
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "读取页面时发生内部服务器错误"
      }
    });
  }
};
plugins/webcrawler/SPIDER/src/controllers/searchController.ts (114 lines, new file)
@@ -0,0 +1,114 @@
import { Request, Response } from 'express';
import { Cluster } from 'puppeteer-cluster';
import dotenv from 'dotenv';
import { performDeepSearch } from '../utils/deepSearch';
import { fetchSearchResults as fetchBaiduResults } from '../engines/baiduEngine';
import { fetchSearchResults as fetchSearchxngResults } from '../engines/searchxngEngine';

dotenv.config();

const strategies = JSON.parse(process.env.STRATEGIES || '[]');
const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);

export const search = async (req: Request, res: Response): Promise<void> => {
  const { query, pageCount = 10, needDetails = 'false', engine = 'baidu', categories = 'general' } = req.query;
  const needDetailsBool = (needDetails === 'true');

  if (!query) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: query"
      }
    });
    return;
  }
  let fetchSearchResults;
  let searchUrlBase;
  try {
    if (engine === 'baidu') {
      fetchSearchResults = fetchBaiduResults;
      searchUrlBase = process.env.ENGINE_BAIDUURL;
    } else if (engine === 'searchxng') {
      fetchSearchResults = fetchSearchxngResults;
      searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
    } else {
      res.status(400).json({
        status: 400,
        error: {
          code: "INVALID_ENGINE",
          message: "无效的搜索引擎"
        }
      });
      return;
    }

    const { resultUrls, results } = await fetchSearchResults(query as string, Number(pageCount), searchUrlBase || '', categories as string);

    // if nothing came back, respond with an empty array
    if (results.size === 0) {
      console.log('No results found');
      res.status(200).json({
        status: 200,
        data: {
          results: []
        }
      });
      return;
    }

    if (!needDetailsBool) {
      console.log('Need details is false');
      results.forEach((value: any) => {
        if (value.crawlStatus === 'Pending') {
          value.crawlStatus = 'Success';
        }
      });
      res.status(200).json({
        status: 200,
        data: {
          results: Array.from(results.values())
        }
      });
    } else {
      console.log('Need details is true');

      const clusterInstance = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_CONTEXT,
        maxConcurrency: maxConcurrency,
        puppeteerOptions: {
          ignoreDefaultArgs: ["--enable-automation"],
          headless: "true",
          executablePath: "/usr/bin/chromium", // explicitly point at the Chromium binary
          pipe: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
          ]
        }
      });

      const sortedResults = await performDeepSearch(clusterInstance, resultUrls, results, strategies, detectWebsites, Number(pageCount));
      res.status(200).json({
        status: 200,
        data: {
          results: sortedResults.slice(0, Number(pageCount))
        }
      });
    }
  } catch (error) {
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "发生错误"
      }
    });
  }
};

export default { search };
plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts (204 lines, new file)
@@ -0,0 +1,204 @@
import { URL } from 'url';
import { JSDOM } from 'jsdom';
import puppeteer from 'puppeteer';
import { setupPage } from '../utils/setupPage';
import { Cluster } from 'puppeteer-cluster';

async function randomWait(min: number, max: number) {
  // wait for a random amount of time
  const delay = Math.floor(Math.random() * (max - min + 1)) + min;
  return new Promise(resolve => setTimeout(resolve, delay));
}

export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {
  console.log(`Fetching Baidu search results for query: ${query}`);
  // if searchUrlBase is empty, return empty results
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  const pagesToFetch = Math.ceil(pageCount / 10);

  const browser = await puppeteer.launch({
    ignoreDefaultArgs: ["--enable-automation"],
    headless: true,
    executablePath: "/usr/bin/chromium", // explicitly point at the Chromium binary
    pipe: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      // '--single-process'
    ]
  });

  const page = await browser.newPage();
  await setupPage(page);

  for (let i = 0; i < pagesToFetch; i++) {
    const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
    console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);
    let retryCount = 0;
    let success = false;

    while (retryCount < 5 && !success) {
      try {
        console.time(`Page Load Time for page ${i + 1}`);
        await page.goto(searchUrl.toString(), { waitUntil: 'load' });
        console.timeEnd(`Page Load Time for page ${i + 1}`);

        let content = await page.content();
        let dom = new JSDOM(content);
        let document = dom.window.document;
        console.log(document.title);

        // if this is the Baidu security-verification page, re-prime the page and retry
        if (document.title.includes('百度安全验证')) {
          console.log('Detected Baidu security verification, retrying...');
          await setupPage(page);
          retryCount++;
          // wait for a random amount of time
          await randomWait(1000, 3000);
          continue;
        }

        // parse the search results
        console.time(`Link Retrieval Time for page ${i + 1}`);

        const resultContainers = document.querySelectorAll('.result.c-container');
        for (const result of resultContainers) {
          if (resultUrls.length > pageCount + 5) {
            break;
          }
          const titleElement = result.querySelector('h3 a');
          const title = titleElement ? titleElement.textContent : '';
          const url = titleElement ? titleElement.getAttribute('href') : '';
          const contentElement = result.querySelector('[class^="content"]');
          const content = contentElement ? contentElement.textContent : '';

          if (url) {
            resultUrls.push(url);
            results.set(url, {
              title,
              url,
              snippet: content,
              source: 'baidu',
              crawlStatus: 'Pending',
              score: 0
            });
          }
        }
        console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
        success = true;
      } catch (error) {
        console.error(`Error fetching page ${i + 1}:`, error);
        retryCount++;
      }
    }
  }

  await browser.close();

  console.log('fetch all fake urls');

  // quickly resolve the real URLs behind Baidu's redirect links
  const urlsToProcessWithPuppeteer = [];
  for (const url of resultUrls) {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
          'Referer': 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Connection': 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (response.ok) {
        const realUrl = response.url;
        console.log('realurl:', realUrl);
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      console.error(`Error fetching original URL for ${url}:`, error);
      urlsToProcessWithPuppeteer.push(url);
    }
  }

  console.log('pass quickfetch');

  // resolve the remaining real URLs concurrently
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 10,
    puppeteerOptions: {
      ignoreDefaultArgs: ["--enable-automation"],
      headless: "true",
      executablePath: "/usr/bin/chromium", // explicitly point at the Chromium binary
      pipe: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
      ]
    }
  });

  let failedUrlCount = 0;

  await cluster.task(async ({ page, data: url }) => {
    let retryUrlCount = 0;
    let urlSuccess = false;
    while (retryUrlCount < 3 && !urlSuccess) {
      console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
      try {
        await page.goto(url, { waitUntil: 'load' });
        // check whether the page has been detached
        if (page.isClosed()) {
          throw new Error('Page has been closed');
        }
        const realUrl = page.url(); // get the real URL
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
        urlSuccess = true;
      } catch (error) {
        console.error(`Error fetching original URL, retrying...`, error);
        retryUrlCount++;
        await randomWait(1000, 3000);
      }
    }
    if (!urlSuccess) {
      failedUrlCount++;
    }
  });

  for (const url of urlsToProcessWithPuppeteer) {
    cluster.queue(url);
  }

  await cluster.idle();
  await cluster.close();

  console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);

  // filter and return the first pageCount results
  const filteredResults = Array.from(results.values()).slice(0, pageCount);

  return { resultUrls: filteredResults.map(result => result.url), results: new Map(filteredResults.map(result => [result.url, result])) };
};
plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts (55 lines, new file)
@@ -0,0 +1,55 @@
import axios from 'axios';
import { URL } from 'url';
import dotenv from 'dotenv';

dotenv.config();

const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];

export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {

  const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1; // maximum number of result pages to query
  // if searchUrlBase is empty, return empty results; pageCount is the number of results wanted
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  let fetchedResultsCount = 0;
  let pageIndex = 0;

  while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) {
    const searchUrl = new URL(`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`);
    console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);
    const response = await axios.get(searchUrl.toString());
    const jsonResults = response.data.results;

    for (let index = 0; index < jsonResults.length; index++) {
      const result = jsonResults[index];
      const resultDomain = new URL(result.url).hostname;
      if (blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) || resultDomain.includes('zhihu')) {
        continue;
      }
      resultUrls.push(result.url);
      results.set(result.url, {
        title: result.title,
        url: result.url,
        snippet: result.content,
        source: result.engine,
        crawlStatus: 'Pending',
        score: result.score
      });
      fetchedResultsCount++;
      if (fetchedResultsCount >= pageCount) {
        break;
      }
    }
    pageIndex++;
    if (jsonResults.length === 0) {
      break; // no more results, exit the loop
    }
  }

  return { resultUrls, results };
};
plugins/webcrawler/SPIDER/src/index.ts (18 lines, new file)
@@ -0,0 +1,18 @@
import express, { Application } from 'express';
import bodyParser from 'body-parser';
import searchRoutes from './routes/searchRoutes';
import readRoutes from './routes/readRoutes';
import quickfetchRoutes from './routes/quickfetchRoutes';
import dotenv from 'dotenv';

dotenv.config();

const app: Application = express();

app.use(bodyParser.json());
app.use('/api', searchRoutes);
app.use('/api', readRoutes);
app.use('/api', quickfetchRoutes);

const PORT = process.env.PORT || 3000;
app.listen(PORT, () => console.log(`Server running on port ${PORT}`));
plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts (21 lines, new file)
@@ -0,0 +1,21 @@
import { Request, Response, NextFunction } from 'express';

const authMiddleware = (req: Request, res: Response, next: NextFunction) => {
  const bearerHeader = req.headers['authorization'];

  if (bearerHeader) {
    console.log("bearerHeader:" + bearerHeader);
    const bearer = bearerHeader.split(' ');
    const bearerToken = bearer[1];

    if (bearerToken === process.env.ACCESS_TOKEN) {
      next();
    } else {
      res.status(403).json({ message: 'Invalid token' });
    }
  } else {
    res.status(401).json({ message: 'Bearer token not found' });
  }
};

export default authMiddleware;
plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts (9 lines, new file)
@@ -0,0 +1,9 @@
import express from 'express';
import { quickFetch } from '../controllers/quickfetchController';
import authMiddleware from '../middleware/authMiddleware';

const readRoutes = express.Router();

readRoutes.get('/quickFetch', authMiddleware, quickFetch);

export default readRoutes;
plugins/webcrawler/SPIDER/src/routes/readRoutes.ts (9 lines, new file)
@@ -0,0 +1,9 @@
import express from 'express';
import { readPage } from '../controllers/readController';
import authMiddleware from '../middleware/authMiddleware';

const readRoutes = express.Router();

readRoutes.get('/read', authMiddleware, readPage);

export default readRoutes;
plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts (9 lines, new file)
@@ -0,0 +1,9 @@
import express from 'express';
import searchController from '../controllers/searchController';
import authMiddleware from '../middleware/authMiddleware';

const searchRoutes = express.Router();

searchRoutes.get('/search', authMiddleware, searchController.search);

export default searchRoutes;
plugins/webcrawler/SPIDER/src/specialHandlers/index.ts (21 lines, new file)
@@ -0,0 +1,21 @@
import { Page } from 'puppeteer';

export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => {
  if (url.includes('blog.csdn.net')) {
    await page.waitForSelector('article');
    const content = await page.$eval('article', el => el.innerHTML);
    return content;
  }
  if (url.includes('zhuanlan.zhihu.com')) {
    console.log('是知乎,需要点击按掉!');
    console.log(await page.content());
    if ((await page.content()).includes('{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}')) return null;
    await page.waitForSelector('button[aria-label="关闭"]');
    await page.click('button[aria-label="关闭"]'); // select the close button via its aria-label
    await page.waitForSelector('article');
    const content = await page.$eval('article', el => el.innerHTML);
    return content;
  }
  // more site-specific handlers can be added here (one possible shape is sketched below)
  return null;
};
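handleSpecialWebsite is the extension point for site-specific extraction noted in the closing comment above. A minimal sketch of what one more handler could look like; the example.com domain and the #main-content selector are hypothetical and not part of this commit:

```typescript
// Illustrative only: the shape of an additional site handler that could be
// merged into handleSpecialWebsite. 'example.com' and '#main-content' are
// assumptions made for this sketch.
import { Page } from 'puppeteer';

export const handleExampleSite = async (page: Page, url: string): Promise<string | null> => {
  if (!url.includes('example.com')) return null;
  await page.waitForSelector('#main-content'); // assumed content container
  return page.$eval('#main-content', el => el.innerHTML);
};
```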
plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts (77 lines, new file)
@@ -0,0 +1,77 @@

import NodeCache from 'node-cache';
import { MongoClient } from 'mongodb';
import crypto from 'crypto';
import dotenv from 'dotenv';

dotenv.config();

const cache = new NodeCache({ stdTTL: parseInt(process.env.STD_TTL || '3600') });
const mongoClient = new MongoClient(process.env.MONGODB_URI || 'mongodb://localhost:27017');
const dbName = 'pageCache';
const collectionName = 'pages';

const connectToMongo = async () => {
  await mongoClient.connect();
  return mongoClient.db(dbName);
};

const createTTLIndex = async () => {
  try {
    const db = await connectToMongo();
    await db.collection(collectionName).createIndex({ "updatedAt": 1 }, { expireAfterSeconds: parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000') });
    console.log("TTL index created successfully");
  } catch (error) {
    console.error("Error creating TTL index:", error);
  }
};

const getPageHash = (content: string) => {
  return crypto.createHash('md5').update(content).digest('hex');
};

export const getCachedPage = async (url: string) => {
  const cachedPage = cache.get(url);
  if (cachedPage) return cachedPage;

  try {
    const db = await connectToMongo();
    const page = await db.collection(collectionName).findOne({ url });
    if (page) cache.set(url, page);
    return page;
  } catch (error) {
    console.error('Error getting cached page:', error);
    throw error;
  }
};

const savePageToCache = async (url: string, content: string) => {
  const hash = getPageHash(content);
  const page = { url, content, hash, updatedAt: new Date() };

  cache.set(url, page); // update the in-memory cache

  try {
    const db = await connectToMongo();
    await db.collection(collectionName).updateOne(
      { url },
      { $set: page },
      { upsert: true }
    ); // update the persistent cache
  } catch (error) {
    console.error('Error saving page to cache:', error);
    throw error;
  }
};

export const updateCacheAsync = async (url: string, content: string) => {
  await savePageToCache(url, content);
};

process.on('SIGINT', async () => {
  await mongoClient.close();
  process.exit(0);
});

// create the TTL index at application startup
createTTLIndex();
plugins/webcrawler/SPIDER/src/utils/deepSearch.ts (140 lines, new file)
@@ -0,0 +1,140 @@
import { Cluster } from 'puppeteer-cluster';
import * as cheerio from 'cheerio';
import UserAgent from 'user-agents';
import { setupPage } from './setupPage';
import { getCachedPage, updateCacheAsync } from './cacheUpdater';
import { handleSpecialWebsite } from '../specialHandlers';
import fetch from 'node-fetch';

interface CachedPage {
  url: string;
  content: string;
  hash: string;
  updatedAt: Date;
}

export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: string[], results: Map<string, any>, strategies: any[], detectWebsites: string[], pageCount: number) => {
  const tasks = [];

  await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
    try {
      const cachedPage = await getCachedPage(searchUrl) as CachedPage | null;
      if (cachedPage) {
        const result = results.get(searchUrl);
        if (result) {
          result.content = cachedPage.content;
          result.crawlStatus = 'Success';
        }
        return;
      }
    } catch (error) {
      console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
      results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' });
      return;
    }

    try {
      const response = await fetch(searchUrl, {
        headers: {
          'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
          'Referer': 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Connection': 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (response.ok) {
        const content = await response.text();
        const $ = cheerio.load(content);
        const cleanedContent = $('body').html() || '';

        const result = results.get(searchUrl);
        if (result) {
          result.content = cleanedContent;
          result.crawlStatus = 'Success';
        }

        await updateCacheAsync(searchUrl, cleanedContent || '');
        return;
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      console.error(`快速抓取页面 ${searchUrl} 时发生错误:`, error);
    }

    try {
      if (detectWebsites.some(website => searchUrl.includes(website))) {
        await setupPage(page);
      } else {
        const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
        await page.setUserAgent(userAgent.toString());
      }
    } catch (error) {
      console.error(`访问页面 ${searchUrl} 设置用户代理时发生错误:`, error);
    }

    let pageLoaded = false;
    let pageLoadError: Error | null = null;
    for (const strategy of strategies) {
      try {
        await page.goto(searchUrl, { waitUntil: strategy.waitUntil, timeout: strategy.timeout });
        pageLoaded = true;
        break;
      } catch (error: any) {
        if (error.name === 'TimeoutError') {
          pageLoadError = error;
          continue;
        } else {
          pageLoadError = error;
          throw error;
        }
      }
    }
    if (!pageLoaded) {
      const result = results.get(searchUrl);
      if (result) {
        result.error = pageLoadError;
        result.crawlStatus = 'Failed';
      }
      return;
    }

    try {
      let cleanedContent = await handleSpecialWebsite(page, searchUrl);
      if (!cleanedContent) {
        const content = await page.content();
        const $ = cheerio.load(content);
        cleanedContent = $('body').html() || '';
      }

      const result = results.get(searchUrl);
      if (result) {
        result.content = cleanedContent;
        result.crawlStatus = 'Success';
      }

      await updateCacheAsync(searchUrl, cleanedContent || '');
    } catch (error) {
      results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' });
    } finally {
      await page.close().catch(() => {});
    }
  });

  for (const url of resultUrls) {
    if (tasks.length >= pageCount + 10) {
      break;
    }
    tasks.push(clusterInstance.queue({ searchUrl: url }));
  }

  await Promise.all(tasks);

  await clusterInstance.idle();
  await clusterInstance.close();

  return Array.from(results.values()).sort((a, b) => b.score - a.score);
};
plugins/webcrawler/SPIDER/src/utils/setupPage.ts (88 lines, new file)
@@ -0,0 +1,88 @@
import { Page } from 'puppeteer';
import randomUseragent from 'random-useragent';

const getRandomUserAgent = () => {
  return randomUseragent.getRandom();
};

const getRandomPlatform = () => {
  const platforms = ["Win32", "MacIntel", "Linux x86_64"];
  return platforms[Math.floor(Math.random() * platforms.length)];
};

// proxy pool
const validateproxy = [
  { ip: "39.102.210.222", port: 8080 },
  { ip: "8.130.71.75", port: 8080 },
  { ip: "39.102.214.208", port: 9999 },
  { ip: "39.104.59.56", port: 8080 },
  { ip: "8.130.37.235", port: 3128 },
  { ip: "8.138.131.110", port: 8080 },
  { ip: "8.140.105.75", port: 8009 },
  { ip: "114.80.38.120", port: 3081 },
  { ip: "8.148.23.165", port: 8081 },
  { ip: "119.96.72.199", port: 59394 },
  { ip: "120.55.14.137", port: 80 },
  { ip: "47.116.181.146", port: 5060 },
  { ip: "39.102.214.199", port: 3128 },
  { ip: "47.121.183.107", port: 8080 },
  { ip: "39.104.16.201", port: 8080 },
  { ip: "39.102.209.163", port: 10002 },
  { ip: "101.201.76.157", port: 9090 },
  { ip: "122.224.124.26", port: 12080 },
  { ip: "180.105.244.199", port: 1080 },
  { ip: "119.3.113.150", port: 9094 }
];

const getRandomProxy = () => {
  return validateproxy[Math.floor(Math.random() * validateproxy.length)];
};

const getRandomLanguages = () => {
  const languages = [
    ["zh-CN", "zh", "en"],
    ["en-US", "en", "fr"],
    ["es-ES", "es", "en"]
  ];
  return languages[Math.floor(Math.random() * languages.length)];
};

export const setupPage = async (page: Page): Promise<void> => {
  const proxy = getRandomProxy();
  await page.authenticate({
    username: proxy.ip,
    password: proxy.port.toString()
  });

  await page.evaluateOnNewDocument(() => {
    const newProto = (navigator as any).__proto__;
    delete newProto.webdriver;
    (navigator as any).__proto__ = newProto;
    (window as any).chrome = {};
    (window as any).chrome.app = {"InstallState":"testt", "RunningState":"estt", "getDetails":"stte", "getIsInstalled":"ttes"};
    (window as any).chrome.csi = function(){};
    (window as any).chrome.loadTimes = function(){};
    (window as any).chrome.runtime = function(){};
    Object.defineProperty(navigator, 'userAgent', {
      get: () => getRandomUserAgent(),
    });
    Object.defineProperty(navigator, 'platform', {
      get: () => getRandomPlatform(),
    });
    Object.defineProperty(navigator, 'plugins', {
      get: () => [{"description": "Shockwave Flash",
        "filename": "pepflashplayer.dll",
        "length": 1,
        "name": "Shockwave Flash"}]
    });
    Object.defineProperty(navigator, 'languages', {
      get: () => getRandomLanguages(),
    });
    const originalQuery = (window.navigator.permissions as any).query;
    (window.navigator.permissions as any).query = (parameters: any) => (
      parameters.name === 'notifications' ?
        Promise.resolve({ state: Notification.permission } as PermissionStatus) :
        originalQuery(parameters)
    );
  });
};
plugins/webcrawler/SPIDER/tsconfig.json (113 lines, new file)
@@ -0,0 +1,113 @@
{
  "compilerOptions": {
    /* Visit https://aka.ms/tsconfig to read more about this file */
    /* Projects */
    // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
    // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
    // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
    // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
    // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
    // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
    "types": ["node"],
    /* Language and Environment */
    "target": "es6", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
    // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
    // "jsx": "preserve", /* Specify what JSX code is generated. */
    // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
    // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
    // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
    // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
    // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
    // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
    // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
    // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
    // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */

    /* Modules */
    //"module": "es6", /* Specify what module code is generated. */
    "rootDir": "./src", /* Specify the root folder within your source files. */
    "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
    // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
    // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
    // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ /* Specify type package names to be included without being referenced in a source file. */
    // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
    // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
    // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
    // "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */
    // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
    // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
    // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
    // "noUncheckedSideEffectImports": true, /* Check side effect imports. */
    // "resolveJsonModule": true, /* Enable importing .json files. */
    // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
    // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */

    /* JavaScript Support */
    // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
    // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
    // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */

    /* Emit */
    // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
    // "declarationMap": true, /* Create sourcemaps for d.ts files. */
    // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
    // "sourceMap": true, /* Create source map files for emitted JavaScript files. */
    // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
    // "noEmit": true, /* Disable emitting files from a compilation. */
    // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
    "outDir": "./dist", /* Specify an output folder for all emitted files. */
    // "removeComments": true, /* Disable emitting comments. */
    // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
    // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
    // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
    // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
    // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
    // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
    // "newLine": "crlf", /* Set the newline character for emitting files. */
    // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
    // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
    // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
    // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
    // "declarationDir": "./", /* Specify the output directory for generated declaration files. */

    /* Interop Constraints */
    // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
    // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
    // "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
    // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
    "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
    // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
    "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */

    /* Type Checking */
    "typeRoots": ["./node_modules/@types"],
    "strict": true, /* Enable all strict type-checking options. */
    // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
    // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
    // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
    // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
    // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
    // "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */
    // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
    // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
    // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
    // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
    // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
|
||||||
|
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
|
||||||
|
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
|
||||||
|
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
|
||||||
|
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
|
||||||
|
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
|
||||||
|
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
|
||||||
|
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
|
||||||
|
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
|
||||||
|
|
||||||
|
/* Completeness */
|
||||||
|
// "skipDefaultLibCheck": true,
|
||||||
|
// /* Skip type checking .d.ts files that are included with TypeScript. */
|
||||||
|
"skipLibCheck": true/* Skip type checking all .d.ts files. */
|
||||||
|
|
||||||
|
},
|
||||||
|
"include": ["src/**/*.ts"],
|
||||||
|
"exclude": ["node_modules"]
|
||||||
|
}
|
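Note (not part of this commit): the two options left enabled above, "strict" and "esModuleInterop", shape how the crawler's TypeScript is written. A minimal illustrative snippet, assuming nothing beyond the Node standard library:

// Illustration only; not a file from this repository.
// "esModuleInterop": true lets a CommonJS module be default-imported:
import path from 'path';

// "strict": true (which implies strictNullChecks) forces null/undefined to be handled explicitly:
function basename(p?: string): string {
  if (p === undefined) {
    return '';
  }
  return path.basename(p);
}

console.log(basename('src/index.ts')); // prints "index.ts"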
55
plugins/webcrawler/SPIDER/webpack.config.js
Normal file
@@ -0,0 +1,55 @@
// Import the path module
const path = require('path')
require('dotenv').config();
const mode = process.env.NODE_ENV || 'development'

const nodeExternals = require('webpack-node-externals');
module.exports = {
  target: 'node', // Build target is Node.js
  externals: [nodeExternals()], // Exclude node_modules from the bundle
  // Entry file
  entry: "./src/index.ts",

  // Directory for the bundled output
  output: {
    path: path.resolve(__dirname, 'dist'),
    // Name of the bundled file
    filename: "bundle.js"
  },
  resolve: {
    extensions: ['.ts', '.tsx', '.js', '.json'],
    fallback: {
      "zlib": require.resolve("browserify-zlib"),
      "querystring": require.resolve("querystring-es3"),
      "path": require.resolve("path-browserify"),
      "crypto": require.resolve("crypto-browserify"),
      "stream": require.resolve("stream-browserify"),
      "os": require.resolve("os-browserify/browser"),
      "http": require.resolve("stream-http"),
      "net": false,
      "string_decoder": require.resolve("string_decoder/"),
      "url": require.resolve("url/"),
      "buffer": require.resolve("buffer/"),
      "util": require.resolve("util/"),
      // Fallback added for assert
      "assert": require.resolve("assert/"),
      // Handle the newly reported vm warning
      "vm": require.resolve("vm-browserify"),
      "fs": false
    }
  },

  // Modules (loaders) webpack should use when bundling
  module: {
    // Rules to apply when loading files
    rules: [
      {
        // 'test' selects which files the rule applies to: use ts-loader for files ending in .ts
        test: /\.ts$/,
        use: 'ts-loader',
        exclude: /node_modules/
      }
    ]
  },
  mode,
}
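Note (not part of this commit): the entry file ./src/index.ts referenced above is not shown in this diff. A hypothetical sketch of a matching entry point, assuming Express is a dependency; the file contents, route, and port 3000 (the nodeapp mapping in docker-compose.yaml) are assumptions:

// Hypothetical SPIDER/src/index.ts; names, routes, and framework choice are assumed.
import express from 'express'; // default import works because tsconfig sets "esModuleInterop": true

const app = express();

// Placeholder health-check route; the real crawler service exposes its own API.
app.get('/healthz', (_req, res) => {
  res.json({ status: 'ok' });
});

// 3000 matches the "3000:3000" port mapping of the nodeapp service in docker-compose.yaml.
app.listen(3000, () => {
  console.log('spider service listening on port 3000');
});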
124
plugins/webcrawler/docker-compose.yaml
Normal file
@@ -0,0 +1,124 @@
name: spider
version: "0.0.1"

services:
  caddy:
    container_name: caddy
    image: docker.io/library/caddy:2-alpine
    network_mode: host
    restart: unless-stopped
    volumes:
      - ./Caddyfile:/etc/caddy/Caddyfile:ro
      - caddy-data:/data:rw
      - caddy-config:/config:rw
    environment:
      - SEARXNG_HOSTNAME=${SEARXNG_HOSTNAME:-http://localhost}
      - SEARXNG_TLS=${LETSENCRYPT_EMAIL:-internal}
    cap_add:
      - NET_BIND_SERVICE
    cap_drop:
      - ALL
    logging:
      driver: "json-file"
      options:
        max-size: "1m"
        max-file: "1"

  redis:
    container_name: redis
    image: docker.io/valkey/valkey:8-alpine
    command: valkey-server --save 30 1 --loglevel warning
    restart: unless-stopped
    networks:
      - searxng
    volumes:
      - valkey-data2:/data
    cap_drop:
      - ALL
    cap_add:
      - SETGID
      - SETUID
      - DAC_OVERRIDE
    logging:
      driver: "json-file"
      options:
        max-size: "1m"
        max-file: "1"

  searxng:
    container_name: searxng
    image: docker.io/searxng/searxng:latest
    restart: unless-stopped
    networks:
      - searxng
    ports:
      - "127.0.0.1:8080:8080"
    volumes:
      - ./searxng:/etc/searxng:rw
    environment:
      - SEARXNG_BASE_URL=https://${SEARXNG_HOSTNAME:-localhost}/
      - UWSGI_WORKERS=${SEARXNG_UWSGI_WORKERS:-4}
      - UWSGI_THREADS=${SEARXNG_UWSGI_THREADS:-4}
    env_file:
      - .searchxng.env
    cap_drop:
      - ALL
    cap_add:
      - CHOWN
      - SETGID
      - SETUID
    logging:
      driver: "json-file"
      options:
        max-size: "1m"
        max-file: "1"

  mongodb:
    container_name: mongodb
    image: mongo:4.4
    restart: unless-stopped
    networks:
      - searxng
    ports:
      - "27017:27017"
    volumes:
      - mongo-data:/data/db
    environment:
      MONGO_INITDB_ROOT_USERNAME: root
      MONGO_INITDB_ROOT_PASSWORD: example
    logging:
      driver: "json-file"
      options:
        max-size: "1m"
        max-file: "1"

  nodeapp:
    container_name: main
    build:
      context: .
    ports:
      - "3000:3000"
    networks:
      - searxng
    depends_on:
      - mongodb
    logging:
      driver: "json-file"
      options:
        max-size: "1m"
        max-file: "1"
    volumes:
      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
          memory: 4G
          cpus: '2.0'

networks:
  searxng:

volumes:
  caddy-data:
  caddy-config:
  valkey-data2:
  mongo-data:
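Note (not part of this commit): on the shared searxng network the node app can reach MongoDB by its service name. A minimal sketch using the official mongodb Node.js driver, assuming that driver is installed and the root/example credentials above are left unchanged; the database and collection names are made up for illustration:

// Illustration only; database and collection names are assumptions.
import { MongoClient } from 'mongodb';

// The service name "mongodb" resolves inside the compose network "searxng".
const uri = 'mongodb://root:example@mongodb:27017';

async function saveCrawlResult(url: string, content: string): Promise<void> {
  const client = new MongoClient(uri);
  try {
    await client.connect();
    // Store one crawled page per document.
    await client.db('spider').collection('pages').insertOne({
      url,
      content,
      fetchedAt: new Date(),
    });
  } finally {
    await client.close();
  }
}

saveCrawlResult('https://example.com', '<html>...</html>').catch(console.error);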
16
plugins/webcrawler/searxng-docker.service.template
Normal file
@@ -0,0 +1,16 @@
[Unit]
Description=SearXNG service
Requires=docker.service
After=docker.service

[Service]
Restart=on-failure

Environment=SEARXNG_DOCKERCOMPOSEFILE=docker-compose.yaml

WorkingDirectory=/usr/local/searxng-docker
ExecStart=/usr/local/bin/docker compose -f ${SEARXNG_DOCKERCOMPOSEFILE} up --remove-orphans
ExecStop=/usr/local/bin/docker compose -f ${SEARXNG_DOCKERCOMPOSEFILE} down

[Install]
WantedBy=multi-user.target
6
plugins/webcrawler/searxng/limiter.toml
Normal file
@@ -0,0 +1,6 @@
# This configuration file updates the default configuration file
# See https://github.com/searxng/searxng/blob/master/searx/limiter.toml

[botdetection.ip_limit]
# activate link_token method in the ip_limit method
link_token = true
38
plugins/webcrawler/searxng/settings.yml
Normal file
@@ -0,0 +1,38 @@
# see https://docs.searxng.org/admin/settings/settings.html#settings-use-default-settings
use_default_settings: true
server:
  # base_url is defined in the SEARXNG_BASE_URL environment variable, see .env and docker-compose.yml
  secret_key: "01042f00ae8bb522a9c03d3e7e1910318208a2c9fbdd23a6315577a9c98553a8"  # change this!
  limiter: false  # can be disabled for a private instance
  image_proxy: true
ui:
  static_use_hash: true
  # Enable the cn category
  enabled_categories: [cn, general, images]  # add other categories as needed
  # Or define the category display order
  categories_order: [cn, general, images]
redis:
  url: redis://redis:6379/0
engines:
  - name: bing
    disabled: false
    categories: cn
  #- name: bilibili
  #  engine: bilibili
  #  shortcut: bil
  #  disabled: false
  #  categories: cn
  - name: baidu
    engine: json_engine
    paging: True
    first_page_num: 0
    search_url: https://www.baidu.com/s?tn=json&wd={query}&pn={pageno}&rn=50
    url_query: url
    title_query: title
    content_query: abs
    categories: cn

search:
  formats:
    - html
    - json
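Note (not part of this commit): because search.formats includes json, the instance can be queried programmatically over SearXNG's /search endpoint. A minimal TypeScript sketch, assuming Node 18+ (global fetch) and the in-network hostname and port from docker-compose.yaml; the partial result typing is an assumption based on SearXNG's JSON output:

// Illustration only; the interface covers just a few fields of SearXNG's JSON response.
interface SearxngResult {
  url: string;
  title: string;
  content?: string;
}

async function searchCn(query: string): Promise<SearxngResult[]> {
  const params = new URLSearchParams({
    q: query,
    format: 'json',    // allowed because settings.yml lists "json" under search.formats
    categories: 'cn',  // the category the bing and baidu engines above are assigned to
  });
  // "searxng" is the compose service name; 8080 is the exposed port.
  const res = await fetch(`http://searxng:8080/search?${params.toString()}`);
  if (!res.ok) {
    throw new Error(`SearXNG request failed: ${res.status}`);
  }
  const body = (await res.json()) as { results: SearxngResult[] };
  return body.results;
}

searchCn('FastGPT').then((results) => console.log(results.slice(0, 3)));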