Mirror of https://github.com/labring/FastGPT.git, synced 2026-03-02 01:02:30 +08:00 — commit "1" (#3924)
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
import { Request, Response } from 'express';
|
||||
import fetch from 'node-fetch';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const userAgents = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
|
||||
];
|
||||
|
||||
export const quickFetch = async (req: Request, res: Response): Promise<void> => {
|
||||
const { url } = req.query;
|
||||
|
||||
if (!url) {
|
||||
res.status(400).json({
|
||||
status: 400,
|
||||
error: {
|
||||
code: "MISSING_PARAM",
|
||||
message: "缺少必要参数: url"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(url as string, {
|
||||
headers: {
|
||||
'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
|
||||
'Referer': 'https://www.google.com/',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Connection': 'keep-alive',
|
||||
'Cache-Control': 'no-cache'
|
||||
}
|
||||
});
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP error! status: ${response.status}`);
|
||||
}
|
||||
const data = await response.text();
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
content: data
|
||||
}
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Error fetching the page:', error);
|
||||
res.status(500).json({
|
||||
status: 500,
|
||||
error: {
|
||||
code: "INTERNAL_SERVER_ERROR",
|
||||
message: "发生错误"
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
export default { quickFetch };
|
||||
142 lines added in new file: plugins/webcrawler/SPIDER/src/controllers/readController.ts
@@ -0,0 +1,142 @@
|
||||
import { Request, Response } from 'express';
|
||||
import puppeteer, { Page } from 'puppeteer';
|
||||
import * as cheerio from 'cheerio';
|
||||
import UserAgent from 'user-agents';
|
||||
import { setupPage } from '../utils/setupPage'; // 导入 setupPage 模块
|
||||
import dotenv from 'dotenv'; // 导入 dotenv 模块
|
||||
import { URL } from 'url'; // 导入 URL 模块
|
||||
import { handleSpecialWebsite } from '../specialHandlers'; // 导入 handleSpecialWebsite 模块
|
||||
import fetch from 'node-fetch';
|
||||
import { getCachedPage, updateCacheAsync } from '../utils/cacheUpdater'; // 导入缓存相关模块
|
||||
|
||||
dotenv.config(); // 加载环境变量
|
||||
|
||||
const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
|
||||
const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];
|
||||
|
||||
export const readPage = async (req: Request, res: Response): Promise<void> => {
|
||||
const { queryUrl } = req.query;
|
||||
console.log("-------");
|
||||
console.log(queryUrl);
|
||||
console.log("-------");
|
||||
|
||||
if (!queryUrl) {
|
||||
res.status(400).json({
|
||||
status: 400,
|
||||
error: {
|
||||
code: "MISSING_PARAM",
|
||||
message: "缺少必要参数: queryUrl"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const urlDomain = new URL(queryUrl as string).hostname;
|
||||
if (blacklistDomains.some((domain: string) => urlDomain.endsWith(domain))) {
|
||||
res.status(403).json({
|
||||
status: 403,
|
||||
error: {
|
||||
code: "BLACKLISTED_DOMAIN",
|
||||
message: "该域名受到保护中"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(queryUrl as string, {
|
||||
headers: {
|
||||
'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
|
||||
'Referer': 'https://www.google.com/',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Connection': 'keep-alive',
|
||||
'Cache-Control': 'no-cache'
|
||||
}
|
||||
});
|
||||
|
||||
if (response.ok) {
|
||||
const content = await response.text();
|
||||
const $ = cheerio.load(content);
|
||||
const cleanedContent = $('body').html();
|
||||
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
title: $('title').text(),
|
||||
content: cleanedContent
|
||||
}
|
||||
});
|
||||
|
||||
await updateCacheAsync(queryUrl as string, cleanedContent || '');
|
||||
console.log("Page read successfully");
|
||||
return;
|
||||
} else {
|
||||
throw new Error(`HTTP error! status: ${response.status}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('快速抓取页面时发生错误:', error);
|
||||
}
|
||||
|
||||
try {
|
||||
const browser = await puppeteer.launch({
|
||||
ignoreDefaultArgs: ["--enable-automation"],
|
||||
headless: true,
|
||||
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径
|
||||
pipe: true,
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-gpu',
|
||||
// '--single-process'
|
||||
]
|
||||
});
|
||||
const page = await browser.newPage();
|
||||
|
||||
// 检测是否需要特殊处理
|
||||
if (typeof queryUrl === 'string' && detectWebsites.some(website => queryUrl.includes(website))) {
|
||||
await setupPage(page);
|
||||
} else {
|
||||
const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
|
||||
await page.setUserAgent(userAgent.toString());
|
||||
}
|
||||
|
||||
const queryUrlSafe = new URL(queryUrl as string).toString();
|
||||
|
||||
await page.goto(queryUrlSafe, { waitUntil: 'load' });
|
||||
await page.waitForSelector('body');
|
||||
|
||||
const title = await page.title();
|
||||
let cleanedContent = await handleSpecialWebsite(page, queryUrl as string);
|
||||
|
||||
if (!cleanedContent) {
|
||||
const content = await page.content();
|
||||
const $ = cheerio.load(content);
|
||||
cleanedContent = $('body').html();
|
||||
}
|
||||
|
||||
await page.close();
|
||||
await browser.close();
|
||||
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
title,
|
||||
content: cleanedContent
|
||||
}
|
||||
});
|
||||
|
||||
await updateCacheAsync(queryUrl as string, cleanedContent || '');
|
||||
console.log("Page read successfully");
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
res.status(500).json({
|
||||
status: 500,
|
||||
error: {
|
||||
code: "INTERNAL_SERVER_ERROR",
|
||||
message: "读取页面时发生内部服务器错误"
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
114 lines added in new file: plugins/webcrawler/SPIDER/src/controllers/searchController.ts
@@ -0,0 +1,114 @@
|
||||
import { Request, Response } from 'express';
|
||||
import { Cluster } from 'puppeteer-cluster';
|
||||
import dotenv from 'dotenv';
|
||||
import { performDeepSearch } from '../utils/deepSearch';
|
||||
import { fetchSearchResults as fetchBaiduResults } from '../engines/baiduEngine';
|
||||
import { fetchSearchResults as fetchSearchxngResults } from '../engines/searchxngEngine';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const strategies = JSON.parse(process.env.STRATEGIES || '[]');
|
||||
const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
|
||||
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);
|
||||
|
||||
export const search = async (req: Request, res: Response): Promise<void> => {
|
||||
const { query, pageCount = 10, needDetails = 'false', engine = 'baidu', categories = 'general' } = req.query;
|
||||
const needDetailsBool = (needDetails === 'true');
|
||||
|
||||
if (!query) {
|
||||
res.status(400).json({
|
||||
status: 400,
|
||||
error: {
|
||||
code: "MISSING_PARAM",
|
||||
message: "缺少必要参数: query"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
let fetchSearchResults;
|
||||
let searchUrlBase;
|
||||
try {
|
||||
if (engine === 'baidu') {
|
||||
fetchSearchResults = fetchBaiduResults;
|
||||
searchUrlBase = process.env.ENGINE_BAIDUURL;
|
||||
} else if (engine === 'searchxng') {
|
||||
fetchSearchResults = fetchSearchxngResults;
|
||||
searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
|
||||
} else {
|
||||
res.status(400).json({
|
||||
status: 400,
|
||||
error: {
|
||||
code: "INVALID_ENGINE",
|
||||
message: "无效的搜索引擎"
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const { resultUrls, results } = await fetchSearchResults(query as string, Number(pageCount), searchUrlBase || '', categories as string);
|
||||
|
||||
//如果返回值为空,返回空数组
|
||||
if (results.size === 0) {
|
||||
console.log('No results found');
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
results: []
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if (!needDetailsBool) {
|
||||
console.log('Need details is false');
|
||||
results.forEach((value: any) => {
|
||||
if (value.crawlStatus === 'Pending') {
|
||||
value.crawlStatus = 'Success';
|
||||
}
|
||||
});
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
results: Array.from(results.values())
|
||||
}
|
||||
});
|
||||
} else {
|
||||
console.log('Need details is true');
|
||||
|
||||
const clusterInstance = await Cluster.launch({
|
||||
concurrency: Cluster.CONCURRENCY_CONTEXT,
|
||||
maxConcurrency: maxConcurrency,
|
||||
puppeteerOptions: {
|
||||
ignoreDefaultArgs: ["--enable-automation"],
|
||||
headless: "true",
|
||||
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径
|
||||
pipe: true,
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-gpu',
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
const sortedResults = await performDeepSearch(clusterInstance, resultUrls, results, strategies, detectWebsites, Number(pageCount));
|
||||
res.status(200).json({
|
||||
status: 200,
|
||||
data: {
|
||||
results: sortedResults.slice(0, Number(pageCount))
|
||||
}
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
res.status(500).json({
|
||||
status: 500,
|
||||
error: {
|
||||
code: "INTERNAL_SERVER_ERROR",
|
||||
message: "发生错误"
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
export default { search };
|
||||
Reference in New Issue
Block a user