Mirror of https://github.com/labring/FastGPT.git, synced 2026-03-02 01:02:30 +08:00 — commit "1" (#3924)
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
import { Request, Response } from 'express';
|
||||
import fetch from 'node-fetch';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const userAgents = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
|
||||
];
|
||||
|
||||
export const quickFetch = async (req: Request, res: Response): Promise<void> => {
|
||||
const { url } = req.query;
|
||||
|
||||
if (!url) {
|
||||
res.status(400).json({
|
||||
status: 400,
|
||||
error: {
|
||||
code: "MISSING_PARAM",
|
||||
message: "缺少必要参数: url"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(url as string, {
|
||||
headers: {
|
||||
'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
|
||||
'Referer': 'https://www.google.com/',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Connection': 'keep-alive',
|
||||
'Cache-Control': 'no-cache'
|
||||
}
|
||||
});
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP error! status: ${response.status}`);
|
||||
}
|
||||
const data = await response.text();
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
content: data
|
||||
}
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Error fetching the page:', error);
|
||||
res.status(500).json({
|
||||
status: 500,
|
||||
error: {
|
||||
code: "INTERNAL_SERVER_ERROR",
|
||||
message: "发生错误"
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
export default { quickFetch };
|
||||
142 lines added in new file: plugins/webcrawler/SPIDER/src/controllers/readController.ts
@@ -0,0 +1,142 @@
|
||||
import { Request, Response } from 'express';
|
||||
import puppeteer, { Page } from 'puppeteer';
|
||||
import * as cheerio from 'cheerio';
|
||||
import UserAgent from 'user-agents';
|
||||
import { setupPage } from '../utils/setupPage'; // 导入 setupPage 模块
|
||||
import dotenv from 'dotenv'; // 导入 dotenv 模块
|
||||
import { URL } from 'url'; // 导入 URL 模块
|
||||
import { handleSpecialWebsite } from '../specialHandlers'; // 导入 handleSpecialWebsite 模块
|
||||
import fetch from 'node-fetch';
|
||||
import { getCachedPage, updateCacheAsync } from '../utils/cacheUpdater'; // 导入缓存相关模块
|
||||
|
||||
dotenv.config(); // 加载环境变量
|
||||
|
||||
const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
|
||||
const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];
|
||||
|
||||
export const readPage = async (req: Request, res: Response): Promise<void> => {
|
||||
const { queryUrl } = req.query;
|
||||
console.log("-------");
|
||||
console.log(queryUrl);
|
||||
console.log("-------");
|
||||
|
||||
if (!queryUrl) {
|
||||
res.status(400).json({
|
||||
status: 400,
|
||||
error: {
|
||||
code: "MISSING_PARAM",
|
||||
message: "缺少必要参数: queryUrl"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const urlDomain = new URL(queryUrl as string).hostname;
|
||||
if (blacklistDomains.some((domain: string) => urlDomain.endsWith(domain))) {
|
||||
res.status(403).json({
|
||||
status: 403,
|
||||
error: {
|
||||
code: "BLACKLISTED_DOMAIN",
|
||||
message: "该域名受到保护中"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(queryUrl as string, {
|
||||
headers: {
|
||||
'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
|
||||
'Referer': 'https://www.google.com/',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Connection': 'keep-alive',
|
||||
'Cache-Control': 'no-cache'
|
||||
}
|
||||
});
|
||||
|
||||
if (response.ok) {
|
||||
const content = await response.text();
|
||||
const $ = cheerio.load(content);
|
||||
const cleanedContent = $('body').html();
|
||||
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
title: $('title').text(),
|
||||
content: cleanedContent
|
||||
}
|
||||
});
|
||||
|
||||
await updateCacheAsync(queryUrl as string, cleanedContent || '');
|
||||
console.log("Page read successfully");
|
||||
return;
|
||||
} else {
|
||||
throw new Error(`HTTP error! status: ${response.status}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('快速抓取页面时发生错误:', error);
|
||||
}
|
||||
|
||||
try {
|
||||
const browser = await puppeteer.launch({
|
||||
ignoreDefaultArgs: ["--enable-automation"],
|
||||
headless: true,
|
||||
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径
|
||||
pipe: true,
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-gpu',
|
||||
// '--single-process'
|
||||
]
|
||||
});
|
||||
const page = await browser.newPage();
|
||||
|
||||
// 检测是否需要特殊处理
|
||||
if (typeof queryUrl === 'string' && detectWebsites.some(website => queryUrl.includes(website))) {
|
||||
await setupPage(page);
|
||||
} else {
|
||||
const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
|
||||
await page.setUserAgent(userAgent.toString());
|
||||
}
|
||||
|
||||
const queryUrlSafe = new URL(queryUrl as string).toString();
|
||||
|
||||
await page.goto(queryUrlSafe, { waitUntil: 'load' });
|
||||
await page.waitForSelector('body');
|
||||
|
||||
const title = await page.title();
|
||||
let cleanedContent = await handleSpecialWebsite(page, queryUrl as string);
|
||||
|
||||
if (!cleanedContent) {
|
||||
const content = await page.content();
|
||||
const $ = cheerio.load(content);
|
||||
cleanedContent = $('body').html();
|
||||
}
|
||||
|
||||
await page.close();
|
||||
await browser.close();
|
||||
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
title,
|
||||
content: cleanedContent
|
||||
}
|
||||
});
|
||||
|
||||
await updateCacheAsync(queryUrl as string, cleanedContent || '');
|
||||
console.log("Page read successfully");
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
res.status(500).json({
|
||||
status: 500,
|
||||
error: {
|
||||
code: "INTERNAL_SERVER_ERROR",
|
||||
message: "读取页面时发生内部服务器错误"
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
114 lines added in new file: plugins/webcrawler/SPIDER/src/controllers/searchController.ts
@@ -0,0 +1,114 @@
|
||||
import { Request, Response } from 'express';
|
||||
import { Cluster } from 'puppeteer-cluster';
|
||||
import dotenv from 'dotenv';
|
||||
import { performDeepSearch } from '../utils/deepSearch';
|
||||
import { fetchSearchResults as fetchBaiduResults } from '../engines/baiduEngine';
|
||||
import { fetchSearchResults as fetchSearchxngResults } from '../engines/searchxngEngine';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const strategies = JSON.parse(process.env.STRATEGIES || '[]');
|
||||
const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
|
||||
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);
|
||||
|
||||
export const search = async (req: Request, res: Response): Promise<void> => {
|
||||
const { query, pageCount = 10, needDetails = 'false', engine = 'baidu', categories = 'general' } = req.query;
|
||||
const needDetailsBool = (needDetails === 'true');
|
||||
|
||||
if (!query) {
|
||||
res.status(400).json({
|
||||
status: 400,
|
||||
error: {
|
||||
code: "MISSING_PARAM",
|
||||
message: "缺少必要参数: query"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
let fetchSearchResults;
|
||||
let searchUrlBase;
|
||||
try {
|
||||
if (engine === 'baidu') {
|
||||
fetchSearchResults = fetchBaiduResults;
|
||||
searchUrlBase = process.env.ENGINE_BAIDUURL;
|
||||
} else if (engine === 'searchxng') {
|
||||
fetchSearchResults = fetchSearchxngResults;
|
||||
searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
|
||||
} else {
|
||||
res.status(400).json({
|
||||
status: 400,
|
||||
error: {
|
||||
code: "INVALID_ENGINE",
|
||||
message: "无效的搜索引擎"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const { resultUrls, results } = await fetchSearchResults(query as string, Number(pageCount), searchUrlBase || '', categories as string);
|
||||
|
||||
//如果返回值为空,返回空数组
|
||||
if (results.size === 0) {
|
||||
console.log('No results found');
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
results: []
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if (!needDetailsBool) {
|
||||
console.log('Need details is false');
|
||||
results.forEach((value: any) => {
|
||||
if (value.crawlStatus === 'Pending') {
|
||||
value.crawlStatus = 'Success';
|
||||
}
|
||||
});
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
results: Array.from(results.values())
|
||||
}
|
||||
});
|
||||
} else {
|
||||
console.log('Need details is true');
|
||||
|
||||
const clusterInstance = await Cluster.launch({
|
||||
concurrency: Cluster.CONCURRENCY_CONTEXT,
|
||||
maxConcurrency: maxConcurrency,
|
||||
puppeteerOptions: {
|
||||
ignoreDefaultArgs: ["--enable-automation"],
|
||||
headless: "true",
|
||||
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径
|
||||
pipe: true,
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-gpu',
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
const sortedResults = await performDeepSearch(clusterInstance, resultUrls, results, strategies, detectWebsites, Number(pageCount));
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
results: sortedResults.slice(0, Number(pageCount))
|
||||
}
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
res.status(500).json({
|
||||
status: 500,
|
||||
error: {
|
||||
code: "INTERNAL_SERVER_ERROR",
|
||||
message: "发生错误"
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
export default { search };
|
||||
Reference in New Issue
Block a user