mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 21:13:50 +00:00
fix: add order:true to all create transactions (#3948)
This commit is contained in:
@@ -216,7 +216,7 @@ export async function createOneCollection({
|
|||||||
nextSyncTime
|
nextSyncTime
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
return collection;
|
return collection;
|
||||||
|
@@ -97,7 +97,7 @@ export const createOrGetCollectionTags = async ({
|
|||||||
datasetId,
|
datasetId,
|
||||||
tag: tagContent
|
tag: tagContent
|
||||||
})),
|
})),
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
return [...existingTags.map((tag) => tag._id), ...newTags.map((tag) => tag._id)];
|
return [...existingTags.map((tag) => tag._id), ...newTags.map((tag) => tag._id)];
|
||||||
|
@@ -196,7 +196,8 @@ export async function syncCollaborators({
|
|||||||
permission: item.permission
|
permission: item.permission
|
||||||
})),
|
})),
|
||||||
{
|
{
|
||||||
session
|
session,
|
||||||
|
ordered: true
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@@ -100,7 +100,7 @@ export const initTeamFreePlan = async ({
|
|||||||
surplusPoints: freePoints
|
surplusPoints: freePoints
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -160,7 +160,7 @@ export const createTrainingUsage = async ({
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
return { billId: String(_id) };
|
return { billId: String(_id) };
|
||||||
|
@@ -5,56 +5,56 @@ import dotenv from 'dotenv';
|
|||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
const userAgents = [
|
const userAgents = [
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
|
||||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
|
||||||
];
|
];
|
||||||
|
|
||||||
export const quickFetch = async (req: Request, res: Response): Promise<void> => {
|
export const quickFetch = async (req: Request, res: Response): Promise<void> => {
|
||||||
const { url } = req.query;
|
const { url } = req.query;
|
||||||
|
|
||||||
if (!url) {
|
if (!url) {
|
||||||
res.status(400).json({
|
res.status(400).json({
|
||||||
status: 400,
|
status: 400,
|
||||||
error: {
|
error: {
|
||||||
code: "MISSING_PARAM",
|
code: 'MISSING_PARAM',
|
||||||
message: "缺少必要参数: url"
|
message: '缺少必要参数: url'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await fetch(url as string, {
|
const response = await fetch(url as string, {
|
||||||
headers: {
|
headers: {
|
||||||
'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
|
'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
|
||||||
'Referer': 'https://www.google.com/',
|
Referer: 'https://www.google.com/',
|
||||||
'Accept-Language': 'en-US,en;q=0.9',
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
'Connection': 'keep-alive',
|
Connection: 'keep-alive',
|
||||||
'Cache-Control': 'no-cache'
|
'Cache-Control': 'no-cache'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
throw new Error(`HTTP error! status: ${response.status}`);
|
throw new Error(`HTTP error! status: ${response.status}`);
|
||||||
}
|
|
||||||
const data = await response.text();
|
|
||||||
res.status(200).json({
|
|
||||||
status: 200,
|
|
||||||
data: {
|
|
||||||
content: data
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Error fetching the page:', error);
|
|
||||||
res.status(500).json({
|
|
||||||
status: 500,
|
|
||||||
error: {
|
|
||||||
code: "INTERNAL_SERVER_ERROR",
|
|
||||||
message: "发生错误"
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
const data = await response.text();
|
||||||
|
res.status(200).json({
|
||||||
|
status: 200,
|
||||||
|
data: {
|
||||||
|
content: data
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error fetching the page:', error);
|
||||||
|
res.status(500).json({
|
||||||
|
status: 500,
|
||||||
|
error: {
|
||||||
|
code: 'INTERNAL_SERVER_ERROR',
|
||||||
|
message: '发生错误'
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
export default { quickFetch };
|
export default { quickFetch };
|
@@ -16,16 +16,16 @@ const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIS
|
|||||||
|
|
||||||
export const readPage = async (req: Request, res: Response): Promise<void> => {
|
export const readPage = async (req: Request, res: Response): Promise<void> => {
|
||||||
const { queryUrl } = req.query;
|
const { queryUrl } = req.query;
|
||||||
console.log("-------");
|
console.log('-------');
|
||||||
console.log(queryUrl);
|
console.log(queryUrl);
|
||||||
console.log("-------");
|
console.log('-------');
|
||||||
|
|
||||||
if (!queryUrl) {
|
if (!queryUrl) {
|
||||||
res.status(400).json({
|
res.status(400).json({
|
||||||
status: 400,
|
status: 400,
|
||||||
error: {
|
error: {
|
||||||
code: "MISSING_PARAM",
|
code: 'MISSING_PARAM',
|
||||||
message: "缺少必要参数: queryUrl"
|
message: '缺少必要参数: queryUrl'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
@@ -36,8 +36,8 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
|
|||||||
res.status(403).json({
|
res.status(403).json({
|
||||||
status: 403,
|
status: 403,
|
||||||
error: {
|
error: {
|
||||||
code: "BLACKLISTED_DOMAIN",
|
code: 'BLACKLISTED_DOMAIN',
|
||||||
message: "该域名受到保护中"
|
message: '该域名受到保护中'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
@@ -46,11 +46,14 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
|
|||||||
try {
|
try {
|
||||||
const response = await fetch(queryUrl as string, {
|
const response = await fetch(queryUrl as string, {
|
||||||
headers: {
|
headers: {
|
||||||
'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
|
'User-Agent': new UserAgent({
|
||||||
'Referer': 'https://www.google.com/',
|
deviceCategory: 'desktop',
|
||||||
|
platform: 'Linux x86_64'
|
||||||
|
}).toString(),
|
||||||
|
Referer: 'https://www.google.com/',
|
||||||
'Accept-Language': 'en-US,en;q=0.9',
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
'Connection': 'keep-alive',
|
Connection: 'keep-alive',
|
||||||
'Cache-Control': 'no-cache'
|
'Cache-Control': 'no-cache'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -69,7 +72,7 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
await updateCacheAsync(queryUrl as string, cleanedContent || '');
|
await updateCacheAsync(queryUrl as string, cleanedContent || '');
|
||||||
console.log("Page read successfully");
|
console.log('Page read successfully');
|
||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
throw new Error(`HTTP error! status: ${response.status}`);
|
throw new Error(`HTTP error! status: ${response.status}`);
|
||||||
@@ -80,22 +83,25 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const browser = await puppeteer.launch({
|
const browser = await puppeteer.launch({
|
||||||
ignoreDefaultArgs: ["--enable-automation"],
|
ignoreDefaultArgs: ['--enable-automation'],
|
||||||
headless: true,
|
headless: true,
|
||||||
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径
|
executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径
|
||||||
pipe: true,
|
pipe: true,
|
||||||
args: [
|
args: [
|
||||||
'--no-sandbox',
|
'--no-sandbox',
|
||||||
'--disable-setuid-sandbox',
|
'--disable-setuid-sandbox',
|
||||||
'--disable-dev-shm-usage',
|
'--disable-dev-shm-usage',
|
||||||
'--disable-gpu',
|
'--disable-gpu'
|
||||||
// '--single-process'
|
// '--single-process'
|
||||||
]
|
]
|
||||||
});
|
});
|
||||||
const page = await browser.newPage();
|
const page = await browser.newPage();
|
||||||
|
|
||||||
// 检测是否需要特殊处理
|
// 检测是否需要特殊处理
|
||||||
if (typeof queryUrl === 'string' && detectWebsites.some(website => queryUrl.includes(website))) {
|
if (
|
||||||
|
typeof queryUrl === 'string' &&
|
||||||
|
detectWebsites.some((website) => queryUrl.includes(website))
|
||||||
|
) {
|
||||||
await setupPage(page);
|
await setupPage(page);
|
||||||
} else {
|
} else {
|
||||||
const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
|
const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
|
||||||
@@ -128,14 +134,14 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
await updateCacheAsync(queryUrl as string, cleanedContent || '');
|
await updateCacheAsync(queryUrl as string, cleanedContent || '');
|
||||||
console.log("Page read successfully");
|
console.log('Page read successfully');
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(error);
|
console.error(error);
|
||||||
res.status(500).json({
|
res.status(500).json({
|
||||||
status: 500,
|
status: 500,
|
||||||
error: {
|
error: {
|
||||||
code: "INTERNAL_SERVER_ERROR",
|
code: 'INTERNAL_SERVER_ERROR',
|
||||||
message: "读取页面时发生内部服务器错误"
|
message: '读取页面时发生内部服务器错误'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@@ -12,15 +12,21 @@ const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
|
|||||||
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);
|
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);
|
||||||
|
|
||||||
export const search = async (req: Request, res: Response): Promise<void> => {
|
export const search = async (req: Request, res: Response): Promise<void> => {
|
||||||
const { query, pageCount = 10, needDetails = 'false', engine = 'baidu', categories = 'general' } = req.query;
|
const {
|
||||||
const needDetailsBool = (needDetails === 'true');
|
query,
|
||||||
|
pageCount = 10,
|
||||||
|
needDetails = 'false',
|
||||||
|
engine = 'baidu',
|
||||||
|
categories = 'general'
|
||||||
|
} = req.query;
|
||||||
|
const needDetailsBool = needDetails === 'true';
|
||||||
|
|
||||||
if (!query) {
|
if (!query) {
|
||||||
res.status(400).json({
|
res.status(400).json({
|
||||||
status: 400,
|
status: 400,
|
||||||
error: {
|
error: {
|
||||||
code: "MISSING_PARAM",
|
code: 'MISSING_PARAM',
|
||||||
message: "缺少必要参数: query"
|
message: '缺少必要参数: query'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
@@ -28,24 +34,29 @@ export const search = async (req: Request, res: Response): Promise<void> => {
|
|||||||
let fetchSearchResults;
|
let fetchSearchResults;
|
||||||
let searchUrlBase;
|
let searchUrlBase;
|
||||||
try {
|
try {
|
||||||
if (engine === 'baidu') {
|
if (engine === 'baidu') {
|
||||||
fetchSearchResults = fetchBaiduResults;
|
fetchSearchResults = fetchBaiduResults;
|
||||||
searchUrlBase = process.env.ENGINE_BAIDUURL;
|
searchUrlBase = process.env.ENGINE_BAIDUURL;
|
||||||
} else if (engine === 'searchxng') {
|
} else if (engine === 'searchxng') {
|
||||||
fetchSearchResults = fetchSearchxngResults;
|
fetchSearchResults = fetchSearchxngResults;
|
||||||
searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
|
searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
|
||||||
} else {
|
} else {
|
||||||
res.status(400).json({
|
res.status(400).json({
|
||||||
status: 400,
|
status: 400,
|
||||||
error: {
|
error: {
|
||||||
code: "INVALID_ENGINE",
|
code: 'INVALID_ENGINE',
|
||||||
message: "无效的搜索引擎"
|
message: '无效的搜索引擎'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const { resultUrls, results } = await fetchSearchResults(query as string, Number(pageCount), searchUrlBase || '', categories as string);
|
const { resultUrls, results } = await fetchSearchResults(
|
||||||
|
query as string,
|
||||||
|
Number(pageCount),
|
||||||
|
searchUrlBase || '',
|
||||||
|
categories as string
|
||||||
|
);
|
||||||
|
|
||||||
//如果返回值为空,返回空数组
|
//如果返回值为空,返回空数组
|
||||||
if (results.size === 0) {
|
if (results.size === 0) {
|
||||||
@@ -79,20 +90,27 @@ export const search = async (req: Request, res: Response): Promise<void> => {
|
|||||||
concurrency: Cluster.CONCURRENCY_CONTEXT,
|
concurrency: Cluster.CONCURRENCY_CONTEXT,
|
||||||
maxConcurrency: maxConcurrency,
|
maxConcurrency: maxConcurrency,
|
||||||
puppeteerOptions: {
|
puppeteerOptions: {
|
||||||
ignoreDefaultArgs: ["--enable-automation"],
|
ignoreDefaultArgs: ['--enable-automation'],
|
||||||
headless: "true",
|
headless: 'true',
|
||||||
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径
|
executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径
|
||||||
pipe: true,
|
pipe: true,
|
||||||
args: [
|
args: [
|
||||||
'--no-sandbox',
|
'--no-sandbox',
|
||||||
'--disable-setuid-sandbox',
|
'--disable-setuid-sandbox',
|
||||||
'--disable-dev-shm-usage',
|
'--disable-dev-shm-usage',
|
||||||
'--disable-gpu',
|
'--disable-gpu'
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
const sortedResults = await performDeepSearch(clusterInstance, resultUrls, results, strategies, detectWebsites, Number(pageCount));
|
const sortedResults = await performDeepSearch(
|
||||||
|
clusterInstance,
|
||||||
|
resultUrls,
|
||||||
|
results,
|
||||||
|
strategies,
|
||||||
|
detectWebsites,
|
||||||
|
Number(pageCount)
|
||||||
|
);
|
||||||
res.status(200).json({
|
res.status(200).json({
|
||||||
status: 200,
|
status: 200,
|
||||||
data: {
|
data: {
|
||||||
@@ -104,8 +122,8 @@ export const search = async (req: Request, res: Response): Promise<void> => {
|
|||||||
res.status(500).json({
|
res.status(500).json({
|
||||||
status: 500,
|
status: 500,
|
||||||
error: {
|
error: {
|
||||||
code: "INTERNAL_SERVER_ERROR",
|
code: 'INTERNAL_SERVER_ERROR',
|
||||||
message: "发生错误"
|
message: '发生错误'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@@ -5,200 +5,203 @@ import { setupPage } from '../utils/setupPage';
|
|||||||
import { Cluster } from 'puppeteer-cluster';
|
import { Cluster } from 'puppeteer-cluster';
|
||||||
|
|
||||||
async function randomWait(min: number, max: number) {
|
async function randomWait(min: number, max: number) {
|
||||||
// 随机等待时间
|
// 随机等待时间
|
||||||
const delay = Math.floor(Math.random() * (max - min + 1)) + min;
|
const delay = Math.floor(Math.random() * (max - min + 1)) + min;
|
||||||
return new Promise(resolve => setTimeout(resolve, delay));
|
return new Promise((resolve) => setTimeout(resolve, delay));
|
||||||
}
|
}
|
||||||
|
|
||||||
export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {
|
export const fetchSearchResults = async (
|
||||||
console.log(`Fetching Baidu search results for query: ${query}`);
|
query: string,
|
||||||
// 如果 searchUrlBase 为空,返回空数组
|
pageCount: number,
|
||||||
if (!searchUrlBase) {
|
searchUrlBase: string,
|
||||||
return { resultUrls: [], results: new Map() };
|
categories: string
|
||||||
|
) => {
|
||||||
|
console.log(`Fetching Baidu search results for query: ${query}`);
|
||||||
|
// 如果 searchUrlBase 为空,返回空数组
|
||||||
|
if (!searchUrlBase) {
|
||||||
|
return { resultUrls: [], results: new Map() };
|
||||||
|
}
|
||||||
|
const resultUrls: string[] = [];
|
||||||
|
const results = new Map<string, any>();
|
||||||
|
|
||||||
|
const pagesToFetch = Math.ceil(pageCount / 10);
|
||||||
|
|
||||||
|
const browser = await puppeteer.launch({
|
||||||
|
ignoreDefaultArgs: ['--enable-automation'],
|
||||||
|
headless: true,
|
||||||
|
executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径
|
||||||
|
pipe: true,
|
||||||
|
args: [
|
||||||
|
'--no-sandbox',
|
||||||
|
'--disable-setuid-sandbox',
|
||||||
|
'--disable-dev-shm-usage',
|
||||||
|
'--disable-gpu'
|
||||||
|
// '--single-process'
|
||||||
|
]
|
||||||
|
});
|
||||||
|
|
||||||
|
const page = await browser.newPage();
|
||||||
|
await setupPage(page);
|
||||||
|
|
||||||
|
for (let i = 0; i < pagesToFetch; i++) {
|
||||||
|
const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
|
||||||
|
console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);
|
||||||
|
let retryCount = 0;
|
||||||
|
let success = false;
|
||||||
|
|
||||||
|
while (retryCount < 5 && !success) {
|
||||||
|
try {
|
||||||
|
console.time(`Page Load Time for page ${i + 1}`);
|
||||||
|
await page.goto(searchUrl.toString(), { waitUntil: 'load' });
|
||||||
|
console.timeEnd(`Page Load Time for page ${i + 1}`);
|
||||||
|
|
||||||
|
let content = await page.content();
|
||||||
|
let dom = new JSDOM(content);
|
||||||
|
let document = dom.window.document;
|
||||||
|
console.log(document.title);
|
||||||
|
|
||||||
|
// 如果是百度安全验证页面,重新设置页面并重新访问
|
||||||
|
if (document.title.includes('百度安全验证')) {
|
||||||
|
console.log('Detected Baidu security verification, retrying...');
|
||||||
|
await setupPage(page);
|
||||||
|
retryCount++;
|
||||||
|
//随机等待时间
|
||||||
|
await randomWait(1000, 3000);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 解析搜索结果
|
||||||
|
console.time(`Link Retrieval Time for page ${i + 1}`);
|
||||||
|
|
||||||
|
const resultContainers = document.querySelectorAll('.result.c-container');
|
||||||
|
for (const result of resultContainers) {
|
||||||
|
if (resultUrls.length > pageCount + 5) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
const titleElement = result.querySelector('h3 a');
|
||||||
|
const title = titleElement ? titleElement.textContent : '';
|
||||||
|
const url = titleElement ? titleElement.getAttribute('href') : '';
|
||||||
|
const contentElement = result.querySelector('[class^="content"]');
|
||||||
|
const content = contentElement ? contentElement.textContent : '';
|
||||||
|
|
||||||
|
if (url) {
|
||||||
|
resultUrls.push(url);
|
||||||
|
results.set(url, {
|
||||||
|
title,
|
||||||
|
url,
|
||||||
|
snippet: content,
|
||||||
|
source: 'baidu',
|
||||||
|
crawlStatus: 'Pending',
|
||||||
|
score: 0
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
|
||||||
|
success = true;
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Error fetching page ${i + 1}:`, error);
|
||||||
|
retryCount++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
const resultUrls: string[] = [];
|
}
|
||||||
const results = new Map<string, any>();
|
|
||||||
|
|
||||||
const pagesToFetch = Math.ceil(pageCount / 10);
|
await browser.close();
|
||||||
|
|
||||||
const browser = await puppeteer.launch({
|
console.log('fetch all fake urls');
|
||||||
ignoreDefaultArgs: ["--enable-automation"],
|
|
||||||
headless: true,
|
// 快速检索真实 URL
|
||||||
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径
|
const urlsToProcessWithPuppeteer = [];
|
||||||
pipe: true,
|
for (const url of resultUrls) {
|
||||||
args: [
|
try {
|
||||||
'--no-sandbox',
|
const response = await fetch(url, {
|
||||||
'--disable-setuid-sandbox',
|
headers: {
|
||||||
'--disable-dev-shm-usage',
|
'User-Agent':
|
||||||
'--disable-gpu',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
|
||||||
// '--single-process'
|
Referer: 'https://www.google.com/',
|
||||||
]
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
|
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
|
Connection: 'keep-alive',
|
||||||
|
'Cache-Control': 'no-cache'
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
const page = await browser.newPage();
|
if (response.ok) {
|
||||||
await setupPage(page);
|
const realUrl = response.url;
|
||||||
|
console.log('realurl:', realUrl);
|
||||||
for (let i = 0; i < pagesToFetch; i++) {
|
const result = results.get(url);
|
||||||
const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
|
if (result) {
|
||||||
console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);
|
result.url = realUrl;
|
||||||
let retryCount = 0;
|
result.crawlStatus = 'Success';
|
||||||
let success = false;
|
|
||||||
|
|
||||||
while (retryCount < 5 && !success) {
|
|
||||||
try {
|
|
||||||
console.time(`Page Load Time for page ${i + 1}`);
|
|
||||||
await page.goto(searchUrl.toString(), { waitUntil: 'load' });
|
|
||||||
console.timeEnd(`Page Load Time for page ${i + 1}`);
|
|
||||||
|
|
||||||
let content = await page.content();
|
|
||||||
let dom = new JSDOM(content);
|
|
||||||
let document = dom.window.document;
|
|
||||||
console.log(document.title);
|
|
||||||
|
|
||||||
// 如果是百度安全验证页面,重新设置页面并重新访问
|
|
||||||
if (document.title.includes('百度安全验证')) {
|
|
||||||
console.log('Detected Baidu security verification, retrying...');
|
|
||||||
await setupPage(page);
|
|
||||||
retryCount++;
|
|
||||||
//随机等待时间
|
|
||||||
await randomWait(1000, 3000);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 解析搜索结果
|
|
||||||
console.time(`Link Retrieval Time for page ${i + 1}`);
|
|
||||||
|
|
||||||
|
|
||||||
const resultContainers = document.querySelectorAll('.result.c-container');
|
|
||||||
for (const result of resultContainers) {
|
|
||||||
if (resultUrls.length > pageCount + 5) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
const titleElement = result.querySelector('h3 a');
|
|
||||||
const title = titleElement ? titleElement.textContent : '';
|
|
||||||
const url = titleElement ? titleElement.getAttribute('href') : '';
|
|
||||||
const contentElement = result.querySelector('[class^="content"]');
|
|
||||||
const content = contentElement ? contentElement.textContent : '';
|
|
||||||
|
|
||||||
if (url) {
|
|
||||||
resultUrls.push(url);
|
|
||||||
results.set(url, {
|
|
||||||
title,
|
|
||||||
url,
|
|
||||||
snippet: content,
|
|
||||||
source: 'baidu',
|
|
||||||
crawlStatus: 'Pending',
|
|
||||||
score: 0
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
|
|
||||||
success = true;
|
|
||||||
} catch (error) {
|
|
||||||
console.error(`Error fetching page ${i + 1}:`, error);
|
|
||||||
retryCount++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
throw new Error(`HTTP error! status: ${response.status}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Error fetching original URL for ${url}:`, error);
|
||||||
|
urlsToProcessWithPuppeteer.push(url);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
await browser.close();
|
console.log('pass quickfetch');
|
||||||
|
|
||||||
console.log('fetch all fake urls');
|
// 并发处理真实 URL
|
||||||
|
const cluster = await Cluster.launch({
|
||||||
// 快速检索真实 URL
|
concurrency: Cluster.CONCURRENCY_CONTEXT,
|
||||||
const urlsToProcessWithPuppeteer = [];
|
maxConcurrency: 10,
|
||||||
for (const url of resultUrls) {
|
puppeteerOptions: {
|
||||||
try {
|
ignoreDefaultArgs: ['--enable-automation'],
|
||||||
const response = await fetch(url, {
|
headless: 'true',
|
||||||
headers: {
|
executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
|
pipe: true,
|
||||||
'Referer': 'https://www.google.com/',
|
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
|
||||||
'Accept-Language': 'en-US,en;q=0.9',
|
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
||||||
'Connection': 'keep-alive',
|
|
||||||
'Cache-Control': 'no-cache'
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
if (response.ok) {
|
|
||||||
const realUrl = response.url;
|
|
||||||
console.log('realurl:', realUrl);
|
|
||||||
const result = results.get(url);
|
|
||||||
if (result) {
|
|
||||||
result.url = realUrl;
|
|
||||||
result.crawlStatus = 'Success';
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
throw new Error(`HTTP error! status: ${response.status}`);
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
console.error(`Error fetching original URL for ${url}:`, error);
|
|
||||||
urlsToProcessWithPuppeteer.push(url);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
});
|
||||||
|
|
||||||
console.log('pass quickfetch');
|
let failedUrlCount = 0;
|
||||||
|
|
||||||
// 并发处理真实 URL
|
await cluster.task(async ({ page, data: url }) => {
|
||||||
const cluster = await Cluster.launch({
|
let retryUrlCount = 0;
|
||||||
concurrency: Cluster.CONCURRENCY_CONTEXT,
|
let urlSuccess = false;
|
||||||
maxConcurrency: 10,
|
while (retryUrlCount < 3 && !urlSuccess) {
|
||||||
puppeteerOptions: {
|
console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
|
||||||
ignoreDefaultArgs: ["--enable-automation"],
|
try {
|
||||||
headless: "true",
|
await page.goto(url, { waitUntil: 'load' });
|
||||||
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径
|
// 检查页面是否被分离
|
||||||
pipe: true,
|
if (page.isClosed()) {
|
||||||
args: [
|
throw new Error('Page has been closed');
|
||||||
'--no-sandbox',
|
|
||||||
'--disable-setuid-sandbox',
|
|
||||||
'--disable-dev-shm-usage',
|
|
||||||
'--disable-gpu',
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
});
|
const realUrl = page.url(); // 获取真实 URL
|
||||||
|
const result = results.get(url);
|
||||||
let failedUrlCount = 0;
|
if (result) {
|
||||||
|
result.url = realUrl;
|
||||||
await cluster.task(async ({ page, data: url }) => {
|
result.crawlStatus = 'Success';
|
||||||
let retryUrlCount = 0;
|
|
||||||
let urlSuccess = false;
|
|
||||||
while (retryUrlCount < 3 && !urlSuccess) {
|
|
||||||
console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
|
|
||||||
try {
|
|
||||||
await page.goto(url, { waitUntil: 'load' });
|
|
||||||
// 检查页面是否被分离
|
|
||||||
if (page.isClosed()) {
|
|
||||||
throw new Error('Page has been closed');
|
|
||||||
}
|
|
||||||
const realUrl = page.url(); // 获取真实 URL
|
|
||||||
const result = results.get(url);
|
|
||||||
if (result) {
|
|
||||||
result.url = realUrl;
|
|
||||||
result.crawlStatus = 'Success';
|
|
||||||
}
|
|
||||||
urlSuccess = true;
|
|
||||||
} catch (error) {
|
|
||||||
console.error(`Error fetching original URL, retrying...`, error);
|
|
||||||
retryUrlCount++;
|
|
||||||
await randomWait(1000, 3000);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (!urlSuccess) {
|
urlSuccess = true;
|
||||||
failedUrlCount++;
|
} catch (error) {
|
||||||
}
|
console.error(`Error fetching original URL, retrying...`, error);
|
||||||
});
|
retryUrlCount++;
|
||||||
|
await randomWait(1000, 3000);
|
||||||
for (const url of urlsToProcessWithPuppeteer) {
|
}
|
||||||
cluster.queue(url);
|
|
||||||
}
|
}
|
||||||
|
if (!urlSuccess) {
|
||||||
|
failedUrlCount++;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
await cluster.idle();
|
for (const url of urlsToProcessWithPuppeteer) {
|
||||||
await cluster.close();
|
cluster.queue(url);
|
||||||
|
}
|
||||||
|
|
||||||
console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);
|
await cluster.idle();
|
||||||
|
await cluster.close();
|
||||||
|
|
||||||
// 过滤并返回前 pageCount 个结果
|
console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);
|
||||||
const filteredResults = Array.from(results.values()).slice(0, pageCount);
|
|
||||||
|
|
||||||
return { resultUrls: filteredResults.map(result => result.url), results: new Map(filteredResults.map(result => [result.url, result])) };
|
// 过滤并返回前 pageCount 个结果
|
||||||
|
const filteredResults = Array.from(results.values()).slice(0, pageCount);
|
||||||
|
|
||||||
|
return {
|
||||||
|
resultUrls: filteredResults.map((result) => result.url),
|
||||||
|
results: new Map(filteredResults.map((result) => [result.url, result]))
|
||||||
|
};
|
||||||
};
|
};
|
@@ -6,9 +6,13 @@ dotenv.config();
|
|||||||
|
|
||||||
const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];
|
const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];
|
||||||
|
|
||||||
export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => {
|
export const fetchSearchResults = async (
|
||||||
|
query: string,
|
||||||
const MAX_PAGES = (pageCount / 10 +1) * 2+1; // 最多搜索的页面数
|
pageCount: number,
|
||||||
|
searchUrlBase: string,
|
||||||
|
categories: string
|
||||||
|
) => {
|
||||||
|
const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1; // 最多搜索的页面数
|
||||||
//如果searchUrlBase为空,返回空数组,pagecount是需要搜索结果的数量
|
//如果searchUrlBase为空,返回空数组,pagecount是需要搜索结果的数量
|
||||||
if (!searchUrlBase) {
|
if (!searchUrlBase) {
|
||||||
return { resultUrls: [], results: new Map() };
|
return { resultUrls: [], results: new Map() };
|
||||||
@@ -20,7 +24,9 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc
|
|||||||
let pageIndex = 0;
|
let pageIndex = 0;
|
||||||
|
|
||||||
while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) {
|
while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) {
|
||||||
const searchUrl = new URL(`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`);
|
const searchUrl = new URL(
|
||||||
|
`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`
|
||||||
|
);
|
||||||
console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);
|
console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);
|
||||||
const response = await axios.get(searchUrl.toString());
|
const response = await axios.get(searchUrl.toString());
|
||||||
const jsonResults = response.data.results;
|
const jsonResults = response.data.results;
|
||||||
@@ -28,7 +34,10 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc
|
|||||||
for (let index = 0; index < jsonResults.length; index++) {
|
for (let index = 0; index < jsonResults.length; index++) {
|
||||||
const result = jsonResults[index];
|
const result = jsonResults[index];
|
||||||
const resultDomain = new URL(result.url).hostname;
|
const resultDomain = new URL(result.url).hostname;
|
||||||
if (blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) || resultDomain.includes('zhihu')) {
|
if (
|
||||||
|
blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) ||
|
||||||
|
resultDomain.includes('zhihu')
|
||||||
|
) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
resultUrls.push(result.url);
|
resultUrls.push(result.url);
|
||||||
|
@@ -4,7 +4,7 @@ const authMiddleware = (req: Request, res: Response, next: NextFunction) => {
|
|||||||
const bearerHeader = req.headers['authorization'];
|
const bearerHeader = req.headers['authorization'];
|
||||||
|
|
||||||
if (bearerHeader) {
|
if (bearerHeader) {
|
||||||
console.log("bearerHeader:" + bearerHeader);
|
console.log('bearerHeader:' + bearerHeader);
|
||||||
const bearer = bearerHeader.split(' ');
|
const bearer = bearerHeader.split(' ');
|
||||||
const bearerToken = bearer[1];
|
const bearerToken = bearer[1];
|
||||||
|
|
||||||
|
@@ -3,17 +3,22 @@ import { Page } from 'puppeteer';
|
|||||||
export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => {
|
export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => {
|
||||||
if (url.includes('blog.csdn.net')) {
|
if (url.includes('blog.csdn.net')) {
|
||||||
await page.waitForSelector('article');
|
await page.waitForSelector('article');
|
||||||
const content = await page.$eval('article', el => el.innerHTML);
|
const content = await page.$eval('article', (el) => el.innerHTML);
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
if (url.includes('zhuanlan.zhihu.com')) {
|
if (url.includes('zhuanlan.zhihu.com')) {
|
||||||
console.log('是知乎,需要点击按掉!');
|
console.log('是知乎,需要点击按掉!');
|
||||||
console.log(await page.content());
|
console.log(await page.content());
|
||||||
if((await page.content()).includes('{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}')) return null;
|
if (
|
||||||
|
(await page.content()).includes(
|
||||||
|
'{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return null;
|
||||||
await page.waitForSelector('button[aria-label="关闭"]');
|
await page.waitForSelector('button[aria-label="关闭"]');
|
||||||
await page.click('button[aria-label="关闭"]'); // 使用 aria-label 选择按钮
|
await page.click('button[aria-label="关闭"]'); // 使用 aria-label 选择按钮
|
||||||
await page.waitForSelector('article');
|
await page.waitForSelector('article');
|
||||||
const content = await page.$eval('article', el => el.innerHTML);
|
const content = await page.$eval('article', (el) => el.innerHTML);
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
// 可以添加更多特殊网站的处理逻辑
|
// 可以添加更多特殊网站的处理逻辑
|
||||||
|
@@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
import NodeCache from 'node-cache';
|
import NodeCache from 'node-cache';
|
||||||
import { MongoClient } from 'mongodb';
|
import { MongoClient } from 'mongodb';
|
||||||
import crypto from 'crypto';
|
import crypto from 'crypto';
|
||||||
@@ -19,10 +18,15 @@ const connectToMongo = async () => {
|
|||||||
const createTTLIndex = async () => {
|
const createTTLIndex = async () => {
|
||||||
try {
|
try {
|
||||||
const db = await connectToMongo();
|
const db = await connectToMongo();
|
||||||
await db.collection(collectionName).createIndex({ "updatedAt": 1 }, { expireAfterSeconds: parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000') });
|
await db
|
||||||
console.log("TTL index created successfully");
|
.collection(collectionName)
|
||||||
|
.createIndex(
|
||||||
|
{ updatedAt: 1 },
|
||||||
|
{ expireAfterSeconds: parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000') }
|
||||||
|
);
|
||||||
|
console.log('TTL index created successfully');
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Error creating TTL index:", error);
|
console.error('Error creating TTL index:', error);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -53,11 +57,7 @@ const savePageToCache = async (url: string, content: string) => {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const db = await connectToMongo();
|
const db = await connectToMongo();
|
||||||
await db.collection(collectionName).updateOne(
|
await db.collection(collectionName).updateOne({ url }, { $set: page }, { upsert: true }); // 更新持久化缓存
|
||||||
{ url },
|
|
||||||
{ $set: page },
|
|
||||||
{ upsert: true }
|
|
||||||
); // 更新持久化缓存
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error saving page to cache:', error);
|
console.error('Error saving page to cache:', error);
|
||||||
throw error;
|
throw error;
|
||||||
|
@@ -13,12 +13,19 @@ interface CachedPage {
|
|||||||
updatedAt: Date;
|
updatedAt: Date;
|
||||||
}
|
}
|
||||||
|
|
||||||
export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: string[], results: Map<string, any>, strategies: any[], detectWebsites: string[], pageCount: number) => {
|
export const performDeepSearch = async (
|
||||||
|
clusterInstance: Cluster,
|
||||||
|
resultUrls: string[],
|
||||||
|
results: Map<string, any>,
|
||||||
|
strategies: any[],
|
||||||
|
detectWebsites: string[],
|
||||||
|
pageCount: number
|
||||||
|
) => {
|
||||||
const tasks = [];
|
const tasks = [];
|
||||||
|
|
||||||
await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
|
await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
|
||||||
try {
|
try {
|
||||||
const cachedPage = await getCachedPage(searchUrl) as CachedPage | null;
|
const cachedPage = (await getCachedPage(searchUrl)) as CachedPage | null;
|
||||||
if (cachedPage) {
|
if (cachedPage) {
|
||||||
const result = results.get(searchUrl);
|
const result = results.get(searchUrl);
|
||||||
if (result) {
|
if (result) {
|
||||||
@@ -29,18 +36,25 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st
|
|||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
|
console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
|
||||||
results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' });
|
results.set(searchUrl, {
|
||||||
|
url: searchUrl,
|
||||||
|
error: (error as Error).message,
|
||||||
|
crawlStatus: 'Failed'
|
||||||
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await fetch(searchUrl, {
|
const response = await fetch(searchUrl, {
|
||||||
headers: {
|
headers: {
|
||||||
'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(),
|
'User-Agent': new UserAgent({
|
||||||
'Referer': 'https://www.google.com/',
|
deviceCategory: 'desktop',
|
||||||
|
platform: 'Linux x86_64'
|
||||||
|
}).toString(),
|
||||||
|
Referer: 'https://www.google.com/',
|
||||||
'Accept-Language': 'en-US,en;q=0.9',
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
'Connection': 'keep-alive',
|
Connection: 'keep-alive',
|
||||||
'Cache-Control': 'no-cache'
|
'Cache-Control': 'no-cache'
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -66,7 +80,7 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (detectWebsites.some(website => searchUrl.includes(website))) {
|
if (detectWebsites.some((website) => searchUrl.includes(website))) {
|
||||||
await setupPage(page);
|
await setupPage(page);
|
||||||
} else {
|
} else {
|
||||||
const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
|
const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
|
||||||
@@ -118,7 +132,11 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st
|
|||||||
|
|
||||||
await updateCacheAsync(searchUrl, cleanedContent || '');
|
await updateCacheAsync(searchUrl, cleanedContent || '');
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' });
|
results.set(searchUrl, {
|
||||||
|
url: searchUrl,
|
||||||
|
error: (error as Error).message,
|
||||||
|
crawlStatus: 'Failed'
|
||||||
|
});
|
||||||
} finally {
|
} finally {
|
||||||
await page.close().catch(() => {});
|
await page.close().catch(() => {});
|
||||||
}
|
}
|
||||||
|
@@ -8,7 +8,7 @@ const getRandomUserAgent = () => {
|
|||||||
};
|
};
|
||||||
|
|
||||||
const getRandomPlatform = () => {
|
const getRandomPlatform = () => {
|
||||||
const platforms = ["Win32", "MacIntel", "Linux x86_64"];
|
const platforms = ['Win32', 'MacIntel', 'Linux x86_64'];
|
||||||
return platforms[Math.floor(Math.random() * platforms.length)];
|
return platforms[Math.floor(Math.random() * platforms.length)];
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -16,14 +16,16 @@ const getRandomPlatform = () => {
|
|||||||
const validateproxy = process.env.VALIDATE_PROXY ? JSON.parse(process.env.VALIDATE_PROXY) : [];
|
const validateproxy = process.env.VALIDATE_PROXY ? JSON.parse(process.env.VALIDATE_PROXY) : [];
|
||||||
|
|
||||||
const getRandomProxy = () => {
|
const getRandomProxy = () => {
|
||||||
return validateproxy.length > 0 ? validateproxy[Math.floor(Math.random() * validateproxy.length)] : null;
|
return validateproxy.length > 0
|
||||||
|
? validateproxy[Math.floor(Math.random() * validateproxy.length)]
|
||||||
|
: null;
|
||||||
};
|
};
|
||||||
|
|
||||||
const getRandomLanguages = () => {
|
const getRandomLanguages = () => {
|
||||||
const languages = [
|
const languages = [
|
||||||
["zh-CN", "zh", "en"],
|
['zh-CN', 'zh', 'en'],
|
||||||
["en-US", "en", "fr"],
|
['en-US', 'en', 'fr'],
|
||||||
["es-ES", "es", "en"]
|
['es-ES', 'es', 'en']
|
||||||
];
|
];
|
||||||
return languages[Math.floor(Math.random() * languages.length)];
|
return languages[Math.floor(Math.random() * languages.length)];
|
||||||
};
|
};
|
||||||
@@ -42,30 +44,38 @@ export const setupPage = async (page: Page): Promise<void> => {
|
|||||||
delete newProto.webdriver;
|
delete newProto.webdriver;
|
||||||
(navigator as any).__proto__ = newProto;
|
(navigator as any).__proto__ = newProto;
|
||||||
(window as any).chrome = {};
|
(window as any).chrome = {};
|
||||||
(window as any).chrome.app = {"InstallState":"testt", "RunningState":"estt", "getDetails":"stte", "getIsInstalled":"ttes"};
|
(window as any).chrome.app = {
|
||||||
(window as any).chrome.csi = function(){};
|
InstallState: 'testt',
|
||||||
(window as any).chrome.loadTimes = function(){};
|
RunningState: 'estt',
|
||||||
(window as any).chrome.runtime = function(){};
|
getDetails: 'stte',
|
||||||
|
getIsInstalled: 'ttes'
|
||||||
|
};
|
||||||
|
(window as any).chrome.csi = function () {};
|
||||||
|
(window as any).chrome.loadTimes = function () {};
|
||||||
|
(window as any).chrome.runtime = function () {};
|
||||||
Object.defineProperty(navigator, 'userAgent', {
|
Object.defineProperty(navigator, 'userAgent', {
|
||||||
get: () => getRandomUserAgent(),
|
get: () => getRandomUserAgent()
|
||||||
});
|
});
|
||||||
Object.defineProperty(navigator, 'platform', {
|
Object.defineProperty(navigator, 'platform', {
|
||||||
get: () => getRandomPlatform(),
|
get: () => getRandomPlatform()
|
||||||
});
|
});
|
||||||
Object.defineProperty(navigator, 'plugins', {
|
Object.defineProperty(navigator, 'plugins', {
|
||||||
get: () => [{"description": "Shockwave Flash",
|
get: () => [
|
||||||
"filename": "pepflashplayer.dll",
|
{
|
||||||
"length": 1,
|
description: 'Shockwave Flash',
|
||||||
"name": "Shockwave Flash"}]
|
filename: 'pepflashplayer.dll',
|
||||||
|
length: 1,
|
||||||
|
name: 'Shockwave Flash'
|
||||||
|
}
|
||||||
|
]
|
||||||
});
|
});
|
||||||
Object.defineProperty(navigator, 'languages', {
|
Object.defineProperty(navigator, 'languages', {
|
||||||
get: () => getRandomLanguages(),
|
get: () => getRandomLanguages()
|
||||||
});
|
});
|
||||||
const originalQuery = (window.navigator.permissions as any).query;
|
const originalQuery = (window.navigator.permissions as any).query;
|
||||||
(window.navigator.permissions as any).query = (parameters: any) => (
|
(window.navigator.permissions as any).query = (parameters: any) =>
|
||||||
parameters.name === 'notifications' ?
|
parameters.name === 'notifications'
|
||||||
Promise.resolve({ state: Notification.permission } as PermissionStatus) :
|
? Promise.resolve({ state: Notification.permission } as PermissionStatus)
|
||||||
originalQuery(parameters)
|
: originalQuery(parameters);
|
||||||
);
|
|
||||||
});
|
});
|
||||||
};
|
};
|
@@ -63,7 +63,7 @@ async function initHttp(teamId?: string): Promise<any> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
/* 批量创建子插件 */
|
/* 批量创建子插件 */
|
||||||
@@ -88,7 +88,7 @@ async function initHttp(teamId?: string): Promise<any> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
if (item.version === 'v2') {
|
if (item.version === 'v2') {
|
||||||
await MongoAppVersion.create(
|
await MongoAppVersion.create(
|
||||||
@@ -100,7 +100,7 @@ async function initHttp(teamId?: string): Promise<any> {
|
|||||||
edges: item.edges
|
edges: item.edges
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -160,7 +160,7 @@ async function initPlugin(teamId?: string): Promise<any> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
if (plugin.version === 'v2') {
|
if (plugin.version === 'v2') {
|
||||||
@@ -173,7 +173,7 @@ async function initPlugin(teamId?: string): Promise<any> {
|
|||||||
edges: plugin.edges
|
edges: plugin.edges
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -98,7 +98,8 @@ async function handler(
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
{
|
{
|
||||||
session
|
session,
|
||||||
|
ordered: true
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@@ -126,7 +126,7 @@ export const onCreateApp = async ({
|
|||||||
'pluginData.nodeVersion': defaultNodeVersion
|
'pluginData.nodeVersion': defaultNodeVersion
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!AppFolderTypeList.includes(type!)) {
|
if (!AppFolderTypeList.includes(type!)) {
|
||||||
@@ -144,7 +144,7 @@ export const onCreateApp = async ({
|
|||||||
isPublish: true
|
isPublish: true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -89,7 +89,8 @@ async function handler(req: ApiRequestProps<CreateAppFolderBody>) {
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
{
|
{
|
||||||
session
|
session,
|
||||||
|
ordered: true
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@@ -45,7 +45,7 @@ async function handler(req: ApiRequestProps<PostPublishAppProps>, res: NextApiRe
|
|||||||
tmbId
|
tmbId
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
// update app
|
// update app
|
||||||
|
@@ -88,7 +88,7 @@ async function handler(
|
|||||||
yuqueServer
|
yuqueServer
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
await refreshSourceAvatar(avatar, undefined, session);
|
await refreshSourceAvatar(avatar, undefined, session);
|
||||||
|
|
||||||
|
@@ -87,7 +87,7 @@ async function handler(
|
|||||||
permission: OwnerPermissionVal
|
permission: OwnerPermissionVal
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@@ -122,7 +122,8 @@ async function handler(req: ApiRequestProps<rebuildEmbeddingBody>): Promise<Resp
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
{
|
{
|
||||||
session
|
session,
|
||||||
|
ordered: true
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@@ -98,7 +98,7 @@ export async function insertData2Dataset({
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
// 3. Create mongo data text
|
// 3. Create mongo data text
|
||||||
@@ -112,7 +112,7 @@ export async function insertData2Dataset({
|
|||||||
fullTextToken: jiebaSplit({ text: qaStr })
|
fullTextToken: jiebaSplit({ text: qaStr })
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@@ -192,7 +192,7 @@ const rebuildData = async ({
|
|||||||
retryCount: 50
|
retryCount: 50
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@@ -37,7 +37,7 @@ export async function initRootUser(retry = 3): Promise<any> {
|
|||||||
password: hashStr(psw)
|
password: hashStr(psw)
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
rootId = _id;
|
rootId = _id;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user