mirror of https://github.com/labring/FastGPT.git, synced 2025-12-25 02:01:34 +08:00
Commit: 1 (#3924)
21  plugins/webcrawler/SPIDER/.env.example  Normal file
@@ -0,0 +1,21 @@
ACCESS_TOKEN=114514
DETECT_WEBSITE = zhuanlan.zhihu.com
STRATEGIES=[{"waitUntil":"networkidle0","timeout":5000},{"waitUntil":"networkidle2","timeout":10000},{"waitUntil":"load","timeout":15000}]
PORT=3000
MAX_CONCURRENCY=10
NODE_ENV=development
ENGINE = [
]

ENGINE_BAIDUURL=https://www.baidu.com/s
#ENGINE_SEARCHXNGURL=http://localhost:8080/search
ENGINE_SEARCHXNGURL=http://searxng:8080/search

#MONGODB_URI=mongodb://root:example@localhost:27017
MONGODB_URI=mongodb://root:example@mongodb:27017
BLACKLIST = [".gov.cn",".edu.cn"]

STD_TTL=3600
EXPIRE_AFTER_SECONDS=9000
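For reference, a minimal sketch (not part of the commit) of how the controllers added later in this diff read this configuration after dotenv.config(): STRATEGIES and BLACKLIST are parsed with JSON.parse, MAX_CONCURRENCY sizes the puppeteer-cluster pool, and the detection list is split on commas. Note that the example file sets DETECT_WEBSITE while the controllers read DETECT_WEBSITES.

import dotenv from 'dotenv';

dotenv.config();

// Page-load strategies tried in order by deepSearch.ts
const strategies = JSON.parse(process.env.STRATEGIES || '[]');
// Domains that /api/read refuses to crawl
const blacklist: string[] = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];
// Sites that get the hardened setupPage() treatment
const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
// puppeteer-cluster pool size used by searchController.ts
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);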
5804  plugins/webcrawler/SPIDER/package-lock.json  generated  Normal file
File diff suppressed because it is too large
62  plugins/webcrawler/SPIDER/package.json  Normal file
@@ -0,0 +1,62 @@
{
  "name": "spider",
  "version": "1.0.0",
  "description": "",
  "main": "/dist/index.ts",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "ts-node src/index.ts",
    "build": "webpack",
    "dev": "ts-node-dev --respawn src/index.ts"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@types/node-fetch": "^2.6.12",
    "assert": "^2.1.0",
    "axios": "^1.7.9",
    "body-parser": "^1.20.3",
    "browserify-zlib": "^0.2.0",
    "buffer": "^6.0.3",
    "cheerio": "^1.0.0",
    "crypto-browserify": "^3.12.1",
    "dotenv": "^16.4.7",
    "express": "^4.21.2",
    "https-proxy-agent": "^7.0.6",
    "jsdom": "^26.0.0",
    "mongodb": "^6.13.1",
    "node-cache": "^5.1.2",
    "node-fetch": "^2.7.0",
    "os-browserify": "^0.3.0",
    "path-browserify": "^1.0.1",
    "puppeteer": "^24.2.1",
    "puppeteer-cluster": "^0.24.0",
    "querystring-es3": "^0.2.1",
    "random-useragent": "^0.5.0",
    "spider": "file:",
    "stream-browserify": "^3.0.0",
    "stream-http": "^3.2.0",
    "string_decoder": "^1.3.0",
    "turndown": "^7.2.0",
    "turndown-plugin-gfm": "^1.0.2",
    "url": "^0.11.4",
    "user-agents": "^1.1.454",
    "util": "^0.12.5",
    "vm-browserify": "^1.1.2"
  },
  "devDependencies": {
    "@types/body-parser": "^1.19.5",
    "@types/express": "^5.0.0",
    "@types/jsdom": "^21.1.7",
    "@types/node": "^22.13.4",
    "@types/random-useragent": "^0.3.3",
    "@types/user-agents": "^1.0.4",
    "ts-loader": "^9.5.2",
    "ts-node-dev": "^2.0.0",
    "typescript": "^5.7.3",
    "webpack": "^5.98.0",
    "webpack-cli": "^6.0.1",
    "webpack-node-externals": "^3.0.0"
  }
}
60  plugins/webcrawler/SPIDER/src/controllers/quickfetchController.ts  Normal file
@@ -0,0 +1,60 @@
import { Request, Response } from 'express';
import fetch from 'node-fetch';
import dotenv from 'dotenv';

dotenv.config();

const userAgents = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
];

export const quickFetch = async (req: Request, res: Response): Promise<void> => {
  const { url } = req.query;

  if (!url) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: url"
      }
    });
    return;
  }

  try {
    const response = await fetch(url as string, {
      headers: {
        'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    const data = await response.text();
    res.status(200).json({
      status: 200,
      data: {
        content: data
      }
    });
  } catch (error) {
    console.error('Error fetching the page:', error);
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "发生错误"
      }
    });
  }
};

export default { quickFetch };
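Illustration only (not part of the commit): a client-side sketch of calling this controller through the route that quickfetchRoutes.ts (later in this diff) mounts at GET /api/quickFetch, authenticated with the ACCESS_TOKEN from .env.example. The localhost:3000 base URL assumes the default PORT.

import fetch from 'node-fetch';

const quickFetchPage = async (target: string) => {
  const res = await fetch(
    `http://localhost:3000/api/quickFetch?url=${encodeURIComponent(target)}`,
    { headers: { Authorization: `Bearer ${process.env.ACCESS_TOKEN}` } }
  );
  // On success the controller responds with { status: 200, data: { content } }
  return res.json();
};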
142  plugins/webcrawler/SPIDER/src/controllers/readController.ts  Normal file
@@ -0,0 +1,142 @@
import { Request, Response } from 'express';
import puppeteer, { Page } from 'puppeteer';
import * as cheerio from 'cheerio';
import UserAgent from 'user-agents';
import { setupPage } from '../utils/setupPage'; // import the setupPage module
import dotenv from 'dotenv'; // import dotenv
import { URL } from 'url'; // import the URL module
import { handleSpecialWebsite } from '../specialHandlers'; // import the handleSpecialWebsite module
import fetch from 'node-fetch';
import { getCachedPage, updateCacheAsync } from '../utils/cacheUpdater'; // import the cache helpers

dotenv.config(); // load environment variables

const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];

export const readPage = async (req: Request, res: Response): Promise<void> => {
  const { queryUrl } = req.query;
  console.log("-------");
  console.log(queryUrl);
  console.log("-------");

  if (!queryUrl) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: queryUrl"
      }
    });
    return;
  }

  const urlDomain = new URL(queryUrl as string).hostname;
  if (blacklistDomains.some((domain: string) => urlDomain.endsWith(domain))) {
    res.status(403).json({
      status: 403,
      error: {
        code: "BLACKLISTED_DOMAIN",
        message: "该域名受到保护中"
      }
    });
    return;
  }

  try {
    const response = await fetch(queryUrl as string, {
      headers: {
        'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });

    if (response.ok) {
      const content = await response.text();
      const $ = cheerio.load(content);
      const cleanedContent = $('body').html();

      res.status(200).json({
        status: 200,
        data: {
          title: $('title').text(),
          content: cleanedContent
        }
      });

      await updateCacheAsync(queryUrl as string, cleanedContent || '');
      console.log("Page read successfully");
      return;
    } else {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
  } catch (error) {
    console.error('快速抓取页面时发生错误:', error);
  }

  try {
    const browser = await puppeteer.launch({
      ignoreDefaultArgs: ["--enable-automation"],
      headless: true,
      executablePath: "/usr/bin/chromium", // explicitly specify the Chromium path
      pipe: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        // '--single-process'
      ]
    });
    const page = await browser.newPage();

    // check whether this site needs special handling
    if (typeof queryUrl === 'string' && detectWebsites.some(website => queryUrl.includes(website))) {
      await setupPage(page);
    } else {
      const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
      await page.setUserAgent(userAgent.toString());
    }

    const queryUrlSafe = new URL(queryUrl as string).toString();

    await page.goto(queryUrlSafe, { waitUntil: 'load' });
    await page.waitForSelector('body');

    const title = await page.title();
    let cleanedContent = await handleSpecialWebsite(page, queryUrl as string);

    if (!cleanedContent) {
      const content = await page.content();
      const $ = cheerio.load(content);
      cleanedContent = $('body').html();
    }

    await page.close();
    await browser.close();

    res.status(200).json({
      status: 200,
      data: {
        title,
        content: cleanedContent
      }
    });

    await updateCacheAsync(queryUrl as string, cleanedContent || '');
    console.log("Page read successfully");
  } catch (error) {
    console.error(error);
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "读取页面时发生内部服务器错误"
      }
    });
  }
};
114  plugins/webcrawler/SPIDER/src/controllers/searchController.ts  Normal file
@@ -0,0 +1,114 @@
import { Request, Response } from 'express';
import { Cluster } from 'puppeteer-cluster';
import dotenv from 'dotenv';
import { performDeepSearch } from '../utils/deepSearch';
import { fetchSearchResults as fetchBaiduResults } from '../engines/baiduEngine';
import { fetchSearchResults as fetchSearchxngResults } from '../engines/searchxngEngine';

dotenv.config();

const strategies = JSON.parse(process.env.STRATEGIES || '[]');
const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);

export const search = async (req: Request, res: Response): Promise<void> => {
  const { query, pageCount = 10, needDetails = 'false', engine = 'baidu', categories = 'general' } = req.query;
  const needDetailsBool = (needDetails === 'true');

  if (!query) {
    res.status(400).json({
      status: 400,
      error: {
        code: "MISSING_PARAM",
        message: "缺少必要参数: query"
      }
    });
    return;
  }
  let fetchSearchResults;
  let searchUrlBase;
  try {
    if (engine === 'baidu') {
      fetchSearchResults = fetchBaiduResults;
      searchUrlBase = process.env.ENGINE_BAIDUURL;
    } else if (engine === 'searchxng') {
      fetchSearchResults = fetchSearchxngResults;
      searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
    } else {
      res.status(400).json({
        status: 400,
        error: {
          code: "INVALID_ENGINE",
          message: "无效的搜索引擎"
        }
      });
      return;
    }

    const { resultUrls, results } = await fetchSearchResults(query as string, Number(pageCount), searchUrlBase || '', categories as string);

    // if nothing came back, return an empty array
    if (results.size === 0) {
      console.log('No results found');
      res.status(200).json({
        status: 200,
        data: {
          results: []
        }
      });
      return;
    }

    if (!needDetailsBool) {
      console.log('Need details is false');
      results.forEach((value: any) => {
        if (value.crawlStatus === 'Pending') {
          value.crawlStatus = 'Success';
        }
      });
      res.status(200).json({
        status: 200,
        data: {
          results: Array.from(results.values())
        }
      });
    } else {
      console.log('Need details is true');

      const clusterInstance = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_CONTEXT,
        maxConcurrency: maxConcurrency,
        puppeteerOptions: {
          ignoreDefaultArgs: ["--enable-automation"],
          headless: "true",
          executablePath: "/usr/bin/chromium", // explicitly specify the Chromium path
          pipe: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
          ]
        }
      });

      const sortedResults = await performDeepSearch(clusterInstance, resultUrls, results, strategies, detectWebsites, Number(pageCount));
      res.status(200).json({
        status: 200,
        data: {
          results: sortedResults.slice(0, Number(pageCount))
        }
      });
    }
  } catch (error) {
    res.status(500).json({
      status: 500,
      error: {
        code: "INTERNAL_SERVER_ERROR",
        message: "发生错误"
      }
    });
  }
};

export default { search };
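Illustration only: a client-side sketch of querying this controller through GET /api/search (see searchRoutes.ts below). The parameters mirror the destructuring above: query, pageCount, needDetails, engine ('baidu' or 'searchxng'), and categories; the localhost:3000 base URL assumes the default PORT.

import fetch from 'node-fetch';

const runSearch = async (query: string) => {
  const params = new URLSearchParams({
    query,
    pageCount: '10',
    needDetails: 'false',
    engine: 'searchxng',
    categories: 'general'
  });
  const res = await fetch(`http://localhost:3000/api/search?${params}`, {
    headers: { Authorization: `Bearer ${process.env.ACCESS_TOKEN}` }
  });
  // On success the controller responds with { status: 200, data: { results: [...] } }
  return res.json();
};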
204  plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts  Normal file
@@ -0,0 +1,204 @@
import { URL } from 'url';
import { JSDOM } from 'jsdom';
import puppeteer from 'puppeteer';
import { setupPage } from '../utils/setupPage';
import { Cluster } from 'puppeteer-cluster';

async function randomWait(min: number, max: number) {
  // wait for a random amount of time
  const delay = Math.floor(Math.random() * (max - min + 1)) + min;
  return new Promise(resolve => setTimeout(resolve, delay));
}

export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {
  console.log(`Fetching Baidu search results for query: ${query}`);
  // if searchUrlBase is empty, return empty results
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  const pagesToFetch = Math.ceil(pageCount / 10);

  const browser = await puppeteer.launch({
    ignoreDefaultArgs: ["--enable-automation"],
    headless: true,
    executablePath: "/usr/bin/chromium", // explicitly specify the Chromium path
    pipe: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      // '--single-process'
    ]
  });

  const page = await browser.newPage();
  await setupPage(page);

  for (let i = 0; i < pagesToFetch; i++) {
    const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
    console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);
    let retryCount = 0;
    let success = false;

    while (retryCount < 5 && !success) {
      try {
        console.time(`Page Load Time for page ${i + 1}`);
        await page.goto(searchUrl.toString(), { waitUntil: 'load' });
        console.timeEnd(`Page Load Time for page ${i + 1}`);

        let content = await page.content();
        let dom = new JSDOM(content);
        let document = dom.window.document;
        console.log(document.title);

        // if this is the Baidu security verification page, re-apply setupPage and retry
        if (document.title.includes('百度安全验证')) {
          console.log('Detected Baidu security verification, retrying...');
          await setupPage(page);
          retryCount++;
          // wait for a random amount of time
          await randomWait(1000, 3000);
          continue;
        }

        // parse the search results
        console.time(`Link Retrieval Time for page ${i + 1}`);

        const resultContainers = document.querySelectorAll('.result.c-container');
        for (const result of resultContainers) {
          if (resultUrls.length > pageCount + 5) {
            break;
          }
          const titleElement = result.querySelector('h3 a');
          const title = titleElement ? titleElement.textContent : '';
          const url = titleElement ? titleElement.getAttribute('href') : '';
          const contentElement = result.querySelector('[class^="content"]');
          const content = contentElement ? contentElement.textContent : '';

          if (url) {
            resultUrls.push(url);
            results.set(url, {
              title,
              url,
              snippet: content,
              source: 'baidu',
              crawlStatus: 'Pending',
              score: 0
            });
          }
        }
        console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
        success = true;
      } catch (error) {
        console.error(`Error fetching page ${i + 1}:`, error);
        retryCount++;
      }
    }
  }

  await browser.close();

  console.log('fetch all fake urls');

  // quickly resolve the real URLs
  const urlsToProcessWithPuppeteer = [];
  for (const url of resultUrls) {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
          'Referer': 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Connection': 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (response.ok) {
        const realUrl = response.url;
        console.log('realurl:', realUrl);
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      console.error(`Error fetching original URL for ${url}:`, error);
      urlsToProcessWithPuppeteer.push(url);
    }
  }

  console.log('pass quickfetch');

  // resolve the remaining real URLs concurrently
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 10,
    puppeteerOptions: {
      ignoreDefaultArgs: ["--enable-automation"],
      headless: "true",
      executablePath: "/usr/bin/chromium", // explicitly specify the Chromium path
      pipe: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
      ]
    }
  });

  let failedUrlCount = 0;

  await cluster.task(async ({ page, data: url }) => {
    let retryUrlCount = 0;
    let urlSuccess = false;
    while (retryUrlCount < 3 && !urlSuccess) {
      console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
      try {
        await page.goto(url, { waitUntil: 'load' });
        // check whether the page has been detached
        if (page.isClosed()) {
          throw new Error('Page has been closed');
        }
        const realUrl = page.url(); // get the real URL
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
        urlSuccess = true;
      } catch (error) {
        console.error(`Error fetching original URL, retrying...`, error);
        retryUrlCount++;
        await randomWait(1000, 3000);
      }
    }
    if (!urlSuccess) {
      failedUrlCount++;
    }
  });

  for (const url of urlsToProcessWithPuppeteer) {
    cluster.queue(url);
  }

  await cluster.idle();
  await cluster.close();

  console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);

  // filter and return the first pageCount results
  const filteredResults = Array.from(results.values()).slice(0, pageCount);

  return { resultUrls: filteredResults.map(result => result.url), results: new Map(filteredResults.map(result => [result.url, result])) };
};
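For reference, the shape of each record that both engines place into the results Map (and that performDeepSearch later enriches), written out as an interface. This is an illustration only; the interface name is not part of the commit, which types these entries as any.

interface SearchResultEntry {
  title: string | null;
  url: string;
  snippet: string | null;
  source: string;                                 // 'baidu' or the SearchXNG engine name
  crawlStatus: 'Pending' | 'Success' | 'Failed';
  score: number;
  content?: string;                               // filled in by performDeepSearch when needDetails=true
  error?: string;                                 // set when a crawl fails
}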
55  plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts  Normal file
@@ -0,0 +1,55 @@
import axios from 'axios';
import { URL } from 'url';
import dotenv from 'dotenv';

dotenv.config();

const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];

export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {

  const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1; // maximum number of result pages to request
  // if searchUrlBase is empty, return empty results; pageCount is the number of results wanted
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  let fetchedResultsCount = 0;
  let pageIndex = 0;

  while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) {
    const searchUrl = new URL(`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`);
    console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);
    const response = await axios.get(searchUrl.toString());
    const jsonResults = response.data.results;

    for (let index = 0; index < jsonResults.length; index++) {
      const result = jsonResults[index];
      const resultDomain = new URL(result.url).hostname;
      if (blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) || resultDomain.includes('zhihu')) {
        continue;
      }
      resultUrls.push(result.url);
      results.set(result.url, {
        title: result.title,
        url: result.url,
        snippet: result.content,
        source: result.engine,
        crawlStatus: 'Pending',
        score: result.score
      });
      fetchedResultsCount++;
      if (fetchedResultsCount >= pageCount) {
        break;
      }
    }
    pageIndex++;
    if (jsonResults.length === 0) {
      break; // no more results, exit the loop
    }
  }

  return { resultUrls, results };
};
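A worked example (illustration only) of the MAX_PAGES cap used above, which bounds how many SearchXNG result pages are requested for a given number of desired results.

// The same formula as in fetchSearchResults, pulled out for illustration.
const maxPages = (pageCount: number) => (pageCount / 10 + 1) * 2 + 1;

maxPages(10); // 5  -> at most 5 result pages are requested for 10 desired results
maxPages(30); // 9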
18  plugins/webcrawler/SPIDER/src/index.ts  Normal file
@@ -0,0 +1,18 @@
import express, { Application } from 'express';
import bodyParser from 'body-parser';
import searchRoutes from './routes/searchRoutes';
import readRoutes from './routes/readRoutes';
import quickfetchRoutes from './routes/quickfetchRoutes';
import dotenv from 'dotenv';

dotenv.config();

const app: Application = express();

app.use(bodyParser.json());
app.use('/api', searchRoutes);
app.use('/api', readRoutes);
app.use('/api', quickfetchRoutes);

const PORT = process.env.PORT || 3000;
app.listen(PORT, () => console.log(`Server running on port ${PORT}`));
21  plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts  Normal file
@@ -0,0 +1,21 @@
import { Request, Response, NextFunction } from 'express';

const authMiddleware = (req: Request, res: Response, next: NextFunction) => {
  const bearerHeader = req.headers['authorization'];

  if (bearerHeader) {
    console.log("bearerHeader:" + bearerHeader);
    const bearer = bearerHeader.split(' ');
    const bearerToken = bearer[1];

    if (bearerToken === process.env.ACCESS_TOKEN) {
      next();
    } else {
      res.status(403).json({ message: 'Invalid token' });
    }
  } else {
    res.status(401).json({ message: 'Bearer token not found' });
  }
};

export default authMiddleware;
9  plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts  Normal file
@@ -0,0 +1,9 @@
import express from 'express';
import { quickFetch } from '../controllers/quickfetchController';
import authMiddleware from '../middleware/authMiddleware';

const readRoutes = express.Router();

readRoutes.get('/quickFetch', authMiddleware, quickFetch);

export default readRoutes;
9  plugins/webcrawler/SPIDER/src/routes/readRoutes.ts  Normal file
@@ -0,0 +1,9 @@
import express from 'express';
import { readPage } from '../controllers/readController';
import authMiddleware from '../middleware/authMiddleware';

const readRoutes = express.Router();

readRoutes.get('/read', authMiddleware, readPage);

export default readRoutes;
9  plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts  Normal file
@@ -0,0 +1,9 @@
import express from 'express';
import searchController from '../controllers/searchController';
import authMiddleware from '../middleware/authMiddleware';

const searchRoutes = express.Router();

searchRoutes.get('/search', authMiddleware, searchController.search);

export default searchRoutes;
21  plugins/webcrawler/SPIDER/src/specialHandlers/index.ts  Normal file
@@ -0,0 +1,21 @@
import { Page } from 'puppeteer';

export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => {
  if (url.includes('blog.csdn.net')) {
    await page.waitForSelector('article');
    const content = await page.$eval('article', el => el.innerHTML);
    return content;
  }
  if (url.includes('zhuanlan.zhihu.com')) {
    console.log('是知乎,需要点击按掉!');
    console.log(await page.content());
    if ((await page.content()).includes('{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}')) return null;
    await page.waitForSelector('button[aria-label="关闭"]');
    await page.click('button[aria-label="关闭"]'); // select the close button by its aria-label
    await page.waitForSelector('article');
    const content = await page.$eval('article', el => el.innerHTML);
    return content;
  }
  // more special-site handlers can be added here
  return null;
};
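Following the comment above, a hypothetical sketch (not part of the commit) of the pattern another special-site branch would use: wait for the site's main content selector, then return its inner HTML. The site and the '#content' selector here are assumptions for illustration.

import { Page } from 'puppeteer';

// Hypothetical handler following the same pattern as the branches above.
const handleExampleBlog = async (page: Page): Promise<string | null> => {
  await page.waitForSelector('#content');            // '#content' is an assumed selector
  return page.$eval('#content', el => el.innerHTML); // return the extracted article HTML
};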
77  plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts  Normal file
@@ -0,0 +1,77 @@
import NodeCache from 'node-cache';
import { MongoClient } from 'mongodb';
import crypto from 'crypto';
import dotenv from 'dotenv';

dotenv.config();

const cache = new NodeCache({ stdTTL: parseInt(process.env.STD_TTL || '3600') });
const mongoClient = new MongoClient(process.env.MONGODB_URI || 'mongodb://localhost:27017');
const dbName = 'pageCache';
const collectionName = 'pages';

const connectToMongo = async () => {
  await mongoClient.connect();
  return mongoClient.db(dbName);
};

const createTTLIndex = async () => {
  try {
    const db = await connectToMongo();
    await db.collection(collectionName).createIndex({ "updatedAt": 1 }, { expireAfterSeconds: parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000') });
    console.log("TTL index created successfully");
  } catch (error) {
    console.error("Error creating TTL index:", error);
  }
};

const getPageHash = (content: string) => {
  return crypto.createHash('md5').update(content).digest('hex');
};

export const getCachedPage = async (url: string) => {
  const cachedPage = cache.get(url);
  if (cachedPage) return cachedPage;

  try {
    const db = await connectToMongo();
    const page = await db.collection(collectionName).findOne({ url });
    if (page) cache.set(url, page);
    return page;
  } catch (error) {
    console.error('Error getting cached page:', error);
    throw error;
  }
};

const savePageToCache = async (url: string, content: string) => {
  const hash = getPageHash(content);
  const page = { url, content, hash, updatedAt: new Date() };

  cache.set(url, page); // update the in-memory cache

  try {
    const db = await connectToMongo();
    await db.collection(collectionName).updateOne(
      { url },
      { $set: page },
      { upsert: true }
    ); // update the persistent cache
  } catch (error) {
    console.error('Error saving page to cache:', error);
    throw error;
  }
};

export const updateCacheAsync = async (url: string, content: string) => {
  await savePageToCache(url, content);
};

process.on('SIGINT', async () => {
  await mongoClient.close();
  process.exit(0);
});

// create the TTL index at application startup
createTTLIndex();
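Illustration only: the check-then-update flow that deepSearch.ts (below) and readController.ts build around these helpers, reduced to a sketch. Look a URL up in the two-level cache before crawling, and write the cleaned HTML back afterwards; the readWithCache helper and its crawl callback are assumptions, not part of the commit.

import { getCachedPage, updateCacheAsync } from './cacheUpdater';

const readWithCache = async (url: string, crawl: (u: string) => Promise<string>) => {
  const cached = await getCachedPage(url) as { content: string } | null;
  if (cached) return cached.content;  // served from node-cache or MongoDB
  const content = await crawl(url);   // caller-supplied crawl function
  await updateCacheAsync(url, content); // upsert into both cache layers
  return content;
};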
140  plugins/webcrawler/SPIDER/src/utils/deepSearch.ts  Normal file
@@ -0,0 +1,140 @@
import { Cluster } from 'puppeteer-cluster';
import * as cheerio from 'cheerio';
import UserAgent from 'user-agents';
import { setupPage } from './setupPage';
import { getCachedPage, updateCacheAsync } from './cacheUpdater';
import { handleSpecialWebsite } from '../specialHandlers';
import fetch from 'node-fetch';

interface CachedPage {
  url: string;
  content: string;
  hash: string;
  updatedAt: Date;
}

export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: string[], results: Map<string, any>, strategies: any[], detectWebsites: string[], pageCount: number) => {
  const tasks = [];

  await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
    try {
      const cachedPage = await getCachedPage(searchUrl) as CachedPage | null;
      if (cachedPage) {
        const result = results.get(searchUrl);
        if (result) {
          result.content = cachedPage.content;
          result.crawlStatus = 'Success';
        }
        return;
      }
    } catch (error) {
      console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
      results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' });
      return;
    }

    try {
      const response = await fetch(searchUrl, {
        headers: {
          'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
          'Referer': 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Connection': 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (response.ok) {
        const content = await response.text();
        const $ = cheerio.load(content);
        const cleanedContent = $('body').html() || '';

        const result = results.get(searchUrl);
        if (result) {
          result.content = cleanedContent;
          result.crawlStatus = 'Success';
        }

        await updateCacheAsync(searchUrl, cleanedContent || '');
        return;
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      console.error(`快速抓取页面 ${searchUrl} 时发生错误:`, error);
    }

    try {
      if (detectWebsites.some(website => searchUrl.includes(website))) {
        await setupPage(page);
      } else {
        const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
        await page.setUserAgent(userAgent.toString());
      }
    } catch (error) {
      console.error(`访问页面 ${searchUrl} 设置用户代理时发生错误:`, error);
    }

    let pageLoaded = false;
    let pageLoadError: Error | null = null;
    for (const strategy of strategies) {
      try {
        await page.goto(searchUrl, { waitUntil: strategy.waitUntil, timeout: strategy.timeout });
        pageLoaded = true;
        break;
      } catch (error: any) {
        if (error.name === 'TimeoutError') {
          pageLoadError = error;
          continue;
        } else {
          pageLoadError = error;
          throw error;
        }
      }
    }
    if (!pageLoaded) {
      const result = results.get(searchUrl);
      if (result) {
        result.error = pageLoadError;
        result.crawlStatus = 'Failed';
      }
      return;
    }

    try {
      let cleanedContent = await handleSpecialWebsite(page, searchUrl);
      if (!cleanedContent) {
        const content = await page.content();
        const $ = cheerio.load(content);
        cleanedContent = $('body').html() || '';
      }

      const result = results.get(searchUrl);
      if (result) {
        result.content = cleanedContent;
        result.crawlStatus = 'Success';
      }

      await updateCacheAsync(searchUrl, cleanedContent || '');
    } catch (error) {
      results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' });
    } finally {
      await page.close().catch(() => {});
    }
  });

  for (const url of resultUrls) {
    if (tasks.length >= pageCount + 10) {
      break;
    }
    tasks.push(clusterInstance.queue({ searchUrl: url }));
  }

  await Promise.all(tasks);

  await clusterInstance.idle();
  await clusterInstance.close();

  return Array.from(results.values()).sort((a, b) => b.score - a.score);
};
88  plugins/webcrawler/SPIDER/src/utils/setupPage.ts  Normal file
@@ -0,0 +1,88 @@
import { Page } from 'puppeteer';
import randomUseragent from 'random-useragent';

const getRandomUserAgent = () => {
  return randomUseragent.getRandom();
};

const getRandomPlatform = () => {
  const platforms = ["Win32", "MacIntel", "Linux x86_64"];
  return platforms[Math.floor(Math.random() * platforms.length)];
};

// proxy pool
const validateproxy = [
  { ip: "39.102.210.222", port: 8080 },
  { ip: "8.130.71.75", port: 8080 },
  { ip: "39.102.214.208", port: 9999 },
  { ip: "39.104.59.56", port: 8080 },
  { ip: "8.130.37.235", port: 3128 },
  { ip: "8.138.131.110", port: 8080 },
  { ip: "8.140.105.75", port: 8009 },
  { ip: "114.80.38.120", port: 3081 },
  { ip: "8.148.23.165", port: 8081 },
  { ip: "119.96.72.199", port: 59394 },
  { ip: "120.55.14.137", port: 80 },
  { ip: "47.116.181.146", port: 5060 },
  { ip: "39.102.214.199", port: 3128 },
  { ip: "47.121.183.107", port: 8080 },
  { ip: "39.104.16.201", port: 8080 },
  { ip: "39.102.209.163", port: 10002 },
  { ip: "101.201.76.157", port: 9090 },
  { ip: "122.224.124.26", port: 12080 },
  { ip: "180.105.244.199", port: 1080 },
  { ip: "119.3.113.150", port: 9094 }
];

const getRandomProxy = () => {
  return validateproxy[Math.floor(Math.random() * validateproxy.length)];
};

const getRandomLanguages = () => {
  const languages = [
    ["zh-CN", "zh", "en"],
    ["en-US", "en", "fr"],
    ["es-ES", "es", "en"]
  ];
  return languages[Math.floor(Math.random() * languages.length)];
};

export const setupPage = async (page: Page): Promise<void> => {
  const proxy = getRandomProxy();
  await page.authenticate({
    username: proxy.ip,
    password: proxy.port.toString()
  });

  await page.evaluateOnNewDocument(() => {
    const newProto = (navigator as any).__proto__;
    delete newProto.webdriver;
    (navigator as any).__proto__ = newProto;
    (window as any).chrome = {};
    (window as any).chrome.app = { "InstallState": "testt", "RunningState": "estt", "getDetails": "stte", "getIsInstalled": "ttes" };
    (window as any).chrome.csi = function () {};
    (window as any).chrome.loadTimes = function () {};
    (window as any).chrome.runtime = function () {};
    Object.defineProperty(navigator, 'userAgent', {
      get: () => getRandomUserAgent(),
    });
    Object.defineProperty(navigator, 'platform', {
      get: () => getRandomPlatform(),
    });
    Object.defineProperty(navigator, 'plugins', {
      get: () => [{
        "description": "Shockwave Flash",
        "filename": "pepflashplayer.dll",
        "length": 1,
        "name": "Shockwave Flash"
      }]
    });
    Object.defineProperty(navigator, 'languages', {
      get: () => getRandomLanguages(),
    });
    const originalQuery = (window.navigator.permissions as any).query;
    (window.navigator.permissions as any).query = (parameters: any) => (
      parameters.name === 'notifications' ?
        Promise.resolve({ state: Notification.permission } as PermissionStatus) :
        originalQuery(parameters)
    );
  });
};
113  plugins/webcrawler/SPIDER/tsconfig.json  Normal file
@@ -0,0 +1,113 @@
{
  "compilerOptions": {
    /* Visit https://aka.ms/tsconfig to read more about this file */

    /* Projects */
    // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
    // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
    // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
    // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
    // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
    // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
    "types": ["node"],

    /* Language and Environment */
    "target": "es6", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
    // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
    // "jsx": "preserve", /* Specify what JSX code is generated. */
    // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
    // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
    // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
    // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
    // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
    // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
    // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
    // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
    // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */

    /* Modules */
    //"module": "es6", /* Specify what module code is generated. */
    "rootDir": "./src", /* Specify the root folder within your source files. */
    "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
    // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
    // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
    // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ /* Specify type package names to be included without being referenced in a source file. */
    // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
    // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
    // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
    // "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */
    // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
    // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
    // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
    // "noUncheckedSideEffectImports": true, /* Check side effect imports. */
    // "resolveJsonModule": true, /* Enable importing .json files. */
    // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
    // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */

    /* JavaScript Support */
    // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
    // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
    // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */

    /* Emit */
    // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
    // "declarationMap": true, /* Create sourcemaps for d.ts files. */
    // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
    // "sourceMap": true, /* Create source map files for emitted JavaScript files. */
    // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
    // "noEmit": true, /* Disable emitting files from a compilation. */
    // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
    "outDir": "./dist", /* Specify an output folder for all emitted files. */
    // "removeComments": true, /* Disable emitting comments. */
    // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
    // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
    // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
    // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
    // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
    // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
    // "newLine": "crlf", /* Set the newline character for emitting files. */
    // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
    // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
    // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
    // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
    // "declarationDir": "./", /* Specify the output directory for generated declaration files. */

    /* Interop Constraints */
    // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
    // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
    // "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
    // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
    "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
    // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
    "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */

    /* Type Checking */
    "typeRoots": ["./node_modules/@types"],
    "strict": true, /* Enable all strict type-checking options. */
    // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
    // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
    // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
    // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
    // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
    // "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */
    // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
    // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
    // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
    // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
    // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
    // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
    // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
    // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
    // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
    // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
    // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
    // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
    // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */

    /* Completeness */
    // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
    "skipLibCheck": true /* Skip type checking all .d.ts files. */
  },
  "include": ["src/**/*.ts"],
  "exclude": ["node_modules"]
}
55  plugins/webcrawler/SPIDER/webpack.config.js  Normal file
@@ -0,0 +1,55 @@
// import the path package
const path = require('path')
require('dotenv').config();
const mode = process.env.NODE_ENV || 'development'

const nodeExternals = require('webpack-node-externals');
module.exports = {
  target: 'node', // build for Node.js
  externals: [nodeExternals()], // exclude node_modules
  // entry file
  entry: "./src/index.ts",

  // directory for the bundled output
  output: {
    path: path.resolve(__dirname, 'dist'),
    // name of the bundled file
    filename: "bundle.js"
  },
  resolve: {
    extensions: ['.ts', '.tsx', '.js', '.json'],
    fallback: {
      "zlib": require.resolve("browserify-zlib"),
      "querystring": require.resolve("querystring-es3"),
      "path": require.resolve("path-browserify"),
      "crypto": require.resolve("crypto-browserify"),
      "stream": require.resolve("stream-browserify"),
      "os": require.resolve("os-browserify/browser"),
      "http": require.resolve("stream-http"),
      "net": false,
      "string_decoder": require.resolve("string_decoder/"),
      "url": require.resolve("url/"),
      "buffer": require.resolve("buffer/"),
      "util": require.resolve("util/"),
      // added fallback for assert
      "assert": require.resolve("assert/"),
      // handle the newer vm warning
      "vm": require.resolve("vm-browserify"),
      "fs": false
    }
  },

  // modules webpack should use when bundling
  module: {
    // rules to load
    rules: [
      {
        // test restricts the rule to matching files: use ts-loader for files ending in .ts
        test: /\.ts$/,
        use: 'ts-loader',
        exclude: /node_modules/
      }
    ]
  },
  mode,
}