fix: add order:true to all create transactions (#3948)

This commit is contained in:
Finley Ge
2025-03-03 11:37:51 +08:00
committed by GitHub
parent 113e8f711f
commit 4bc7f21182
30 changed files with 425 additions and 352 deletions

View File

@@ -216,7 +216,7 @@ export async function createOneCollection({
nextSyncTime nextSyncTime
} }
], ],
{ session } { session, ordered: true }
); );
return collection; return collection;

View File

@@ -97,7 +97,7 @@ export const createOrGetCollectionTags = async ({
datasetId, datasetId,
tag: tagContent tag: tagContent
})), })),
{ session } { session, ordered: true }
); );
return [...existingTags.map((tag) => tag._id), ...newTags.map((tag) => tag._id)]; return [...existingTags.map((tag) => tag._id), ...newTags.map((tag) => tag._id)];

View File

@@ -196,7 +196,8 @@ export async function syncCollaborators({
permission: item.permission permission: item.permission
})), })),
{ {
session session,
ordered: true
} }
); );
} }

View File

@@ -100,7 +100,7 @@ export const initTeamFreePlan = async ({
surplusPoints: freePoints surplusPoints: freePoints
} }
], ],
{ session } { session, ordered: true }
); );
}; };

View File

@@ -160,7 +160,7 @@ export const createTrainingUsage = async ({
] ]
} }
], ],
{ session } { session, ordered: true }
); );
return { billId: String(_id) }; return { billId: String(_id) };

View File

@@ -5,56 +5,56 @@ import dotenv from 'dotenv';
dotenv.config(); dotenv.config();
const userAgents = [ const userAgents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0' 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
]; ];
export const quickFetch = async (req: Request, res: Response): Promise<void> => { export const quickFetch = async (req: Request, res: Response): Promise<void> => {
const { url } = req.query; const { url } = req.query;
if (!url) { if (!url) {
res.status(400).json({ res.status(400).json({
status: 400, status: 400,
error: { error: {
code: "MISSING_PARAM", code: 'MISSING_PARAM',
message: "缺少必要参数: url" message: '缺少必要参数: url'
} }
}); });
return; return;
} }
try { try {
const response = await fetch(url as string, { const response = await fetch(url as string, {
headers: { headers: {
'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)], 'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
'Referer': 'https://www.google.com/', Referer: 'https://www.google.com/',
'Accept-Language': 'en-US,en;q=0.9', 'Accept-Language': 'en-US,en;q=0.9',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Connection': 'keep-alive', Connection: 'keep-alive',
'Cache-Control': 'no-cache' 'Cache-Control': 'no-cache'
} }
}); });
if (!response.ok) { if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`); throw new Error(`HTTP error! status: ${response.status}`);
}
const data = await response.text();
res.status(200).json({
status: 200,
data: {
content: data
}
});
} catch (error) {
console.error('Error fetching the page:', error);
res.status(500).json({
status: 500,
error: {
code: "INTERNAL_SERVER_ERROR",
message: "发生错误"
}
});
} }
const data = await response.text();
res.status(200).json({
status: 200,
data: {
content: data
}
});
} catch (error) {
console.error('Error fetching the page:', error);
res.status(500).json({
status: 500,
error: {
code: 'INTERNAL_SERVER_ERROR',
message: '发生错误'
}
});
}
}; };
export default { quickFetch }; export default { quickFetch };

View File

@@ -16,16 +16,16 @@ const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIS
export const readPage = async (req: Request, res: Response): Promise<void> => { export const readPage = async (req: Request, res: Response): Promise<void> => {
const { queryUrl } = req.query; const { queryUrl } = req.query;
console.log("-------"); console.log('-------');
console.log(queryUrl); console.log(queryUrl);
console.log("-------"); console.log('-------');
if (!queryUrl) { if (!queryUrl) {
res.status(400).json({ res.status(400).json({
status: 400, status: 400,
error: { error: {
code: "MISSING_PARAM", code: 'MISSING_PARAM',
message: "缺少必要参数: queryUrl" message: '缺少必要参数: queryUrl'
} }
}); });
return; return;
@@ -36,8 +36,8 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
res.status(403).json({ res.status(403).json({
status: 403, status: 403,
error: { error: {
code: "BLACKLISTED_DOMAIN", code: 'BLACKLISTED_DOMAIN',
message: "该域名受到保护中" message: '该域名受到保护中'
} }
}); });
return; return;
@@ -46,11 +46,14 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
try { try {
const response = await fetch(queryUrl as string, { const response = await fetch(queryUrl as string, {
headers: { headers: {
'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(), 'User-Agent': new UserAgent({
'Referer': 'https://www.google.com/', deviceCategory: 'desktop',
platform: 'Linux x86_64'
}).toString(),
Referer: 'https://www.google.com/',
'Accept-Language': 'en-US,en;q=0.9', 'Accept-Language': 'en-US,en;q=0.9',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Connection': 'keep-alive', Connection: 'keep-alive',
'Cache-Control': 'no-cache' 'Cache-Control': 'no-cache'
} }
}); });
@@ -69,7 +72,7 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
}); });
await updateCacheAsync(queryUrl as string, cleanedContent || ''); await updateCacheAsync(queryUrl as string, cleanedContent || '');
console.log("Page read successfully"); console.log('Page read successfully');
return; return;
} else { } else {
throw new Error(`HTTP error! status: ${response.status}`); throw new Error(`HTTP error! status: ${response.status}`);
@@ -80,22 +83,25 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
try { try {
const browser = await puppeteer.launch({ const browser = await puppeteer.launch({
ignoreDefaultArgs: ["--enable-automation"], ignoreDefaultArgs: ['--enable-automation'],
headless: true, headless: true,
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径 executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径
pipe: true, pipe: true,
args: [ args: [
'--no-sandbox', '--no-sandbox',
'--disable-setuid-sandbox', '--disable-setuid-sandbox',
'--disable-dev-shm-usage', '--disable-dev-shm-usage',
'--disable-gpu', '--disable-gpu'
// '--single-process' // '--single-process'
] ]
}); });
const page = await browser.newPage(); const page = await browser.newPage();
// 检测是否需要特殊处理 // 检测是否需要特殊处理
if (typeof queryUrl === 'string' && detectWebsites.some(website => queryUrl.includes(website))) { if (
typeof queryUrl === 'string' &&
detectWebsites.some((website) => queryUrl.includes(website))
) {
await setupPage(page); await setupPage(page);
} else { } else {
const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }); const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
@@ -128,14 +134,14 @@ export const readPage = async (req: Request, res: Response): Promise<void> => {
}); });
await updateCacheAsync(queryUrl as string, cleanedContent || ''); await updateCacheAsync(queryUrl as string, cleanedContent || '');
console.log("Page read successfully"); console.log('Page read successfully');
} catch (error) { } catch (error) {
console.error(error); console.error(error);
res.status(500).json({ res.status(500).json({
status: 500, status: 500,
error: { error: {
code: "INTERNAL_SERVER_ERROR", code: 'INTERNAL_SERVER_ERROR',
message: "读取页面时发生内部服务器错误" message: '读取页面时发生内部服务器错误'
} }
}); });
} }

View File

@@ -12,15 +12,21 @@ const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || [];
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10); const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);
export const search = async (req: Request, res: Response): Promise<void> => { export const search = async (req: Request, res: Response): Promise<void> => {
const { query, pageCount = 10, needDetails = 'false', engine = 'baidu', categories = 'general' } = req.query; const {
const needDetailsBool = (needDetails === 'true'); query,
pageCount = 10,
needDetails = 'false',
engine = 'baidu',
categories = 'general'
} = req.query;
const needDetailsBool = needDetails === 'true';
if (!query) { if (!query) {
res.status(400).json({ res.status(400).json({
status: 400, status: 400,
error: { error: {
code: "MISSING_PARAM", code: 'MISSING_PARAM',
message: "缺少必要参数: query" message: '缺少必要参数: query'
} }
}); });
return; return;
@@ -28,24 +34,29 @@ export const search = async (req: Request, res: Response): Promise<void> => {
let fetchSearchResults; let fetchSearchResults;
let searchUrlBase; let searchUrlBase;
try { try {
if (engine === 'baidu') { if (engine === 'baidu') {
fetchSearchResults = fetchBaiduResults; fetchSearchResults = fetchBaiduResults;
searchUrlBase = process.env.ENGINE_BAIDUURL; searchUrlBase = process.env.ENGINE_BAIDUURL;
} else if (engine === 'searchxng') { } else if (engine === 'searchxng') {
fetchSearchResults = fetchSearchxngResults; fetchSearchResults = fetchSearchxngResults;
searchUrlBase = process.env.ENGINE_SEARCHXNGURL; searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
} else { } else {
res.status(400).json({ res.status(400).json({
status: 400, status: 400,
error: { error: {
code: "INVALID_ENGINE", code: 'INVALID_ENGINE',
message: "无效的搜索引擎" message: '无效的搜索引擎'
} }
}); });
return; return;
} }
const { resultUrls, results } = await fetchSearchResults(query as string, Number(pageCount), searchUrlBase || '', categories as string); const { resultUrls, results } = await fetchSearchResults(
query as string,
Number(pageCount),
searchUrlBase || '',
categories as string
);
//如果返回值为空,返回空数组 //如果返回值为空,返回空数组
if (results.size === 0) { if (results.size === 0) {
@@ -79,20 +90,27 @@ export const search = async (req: Request, res: Response): Promise<void> => {
concurrency: Cluster.CONCURRENCY_CONTEXT, concurrency: Cluster.CONCURRENCY_CONTEXT,
maxConcurrency: maxConcurrency, maxConcurrency: maxConcurrency,
puppeteerOptions: { puppeteerOptions: {
ignoreDefaultArgs: ["--enable-automation"], ignoreDefaultArgs: ['--enable-automation'],
headless: "true", headless: 'true',
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径 executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径
pipe: true, pipe: true,
args: [ args: [
'--no-sandbox', '--no-sandbox',
'--disable-setuid-sandbox', '--disable-setuid-sandbox',
'--disable-dev-shm-usage', '--disable-dev-shm-usage',
'--disable-gpu', '--disable-gpu'
] ]
} }
}); });
const sortedResults = await performDeepSearch(clusterInstance, resultUrls, results, strategies, detectWebsites, Number(pageCount)); const sortedResults = await performDeepSearch(
clusterInstance,
resultUrls,
results,
strategies,
detectWebsites,
Number(pageCount)
);
res.status(200).json({ res.status(200).json({
status: 200, status: 200,
data: { data: {
@@ -104,8 +122,8 @@ export const search = async (req: Request, res: Response): Promise<void> => {
res.status(500).json({ res.status(500).json({
status: 500, status: 500,
error: { error: {
code: "INTERNAL_SERVER_ERROR", code: 'INTERNAL_SERVER_ERROR',
message: "发生错误" message: '发生错误'
} }
}); });
} }

View File

@@ -5,200 +5,203 @@ import { setupPage } from '../utils/setupPage';
import { Cluster } from 'puppeteer-cluster'; import { Cluster } from 'puppeteer-cluster';
async function randomWait(min: number, max: number) { async function randomWait(min: number, max: number) {
// 随机等待时间 // 随机等待时间
const delay = Math.floor(Math.random() * (max - min + 1)) + min; const delay = Math.floor(Math.random() * (max - min + 1)) + min;
return new Promise(resolve => setTimeout(resolve, delay)); return new Promise((resolve) => setTimeout(resolve, delay));
} }
export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => { export const fetchSearchResults = async (
console.log(`Fetching Baidu search results for query: ${query}`); query: string,
// 如果 searchUrlBase 为空,返回空数组 pageCount: number,
if (!searchUrlBase) { searchUrlBase: string,
return { resultUrls: [], results: new Map() }; categories: string
) => {
console.log(`Fetching Baidu search results for query: ${query}`);
// 如果 searchUrlBase 为空,返回空数组
if (!searchUrlBase) {
return { resultUrls: [], results: new Map() };
}
const resultUrls: string[] = [];
const results = new Map<string, any>();
const pagesToFetch = Math.ceil(pageCount / 10);
const browser = await puppeteer.launch({
ignoreDefaultArgs: ['--enable-automation'],
headless: true,
executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径
pipe: true,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu'
// '--single-process'
]
});
const page = await browser.newPage();
await setupPage(page);
for (let i = 0; i < pagesToFetch; i++) {
const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);
let retryCount = 0;
let success = false;
while (retryCount < 5 && !success) {
try {
console.time(`Page Load Time for page ${i + 1}`);
await page.goto(searchUrl.toString(), { waitUntil: 'load' });
console.timeEnd(`Page Load Time for page ${i + 1}`);
let content = await page.content();
let dom = new JSDOM(content);
let document = dom.window.document;
console.log(document.title);
// 如果是百度安全验证页面,重新设置页面并重新访问
if (document.title.includes('百度安全验证')) {
console.log('Detected Baidu security verification, retrying...');
await setupPage(page);
retryCount++;
//随机等待时间
await randomWait(1000, 3000);
continue;
}
// 解析搜索结果
console.time(`Link Retrieval Time for page ${i + 1}`);
const resultContainers = document.querySelectorAll('.result.c-container');
for (const result of resultContainers) {
if (resultUrls.length > pageCount + 5) {
break;
}
const titleElement = result.querySelector('h3 a');
const title = titleElement ? titleElement.textContent : '';
const url = titleElement ? titleElement.getAttribute('href') : '';
const contentElement = result.querySelector('[class^="content"]');
const content = contentElement ? contentElement.textContent : '';
if (url) {
resultUrls.push(url);
results.set(url, {
title,
url,
snippet: content,
source: 'baidu',
crawlStatus: 'Pending',
score: 0
});
}
}
console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
success = true;
} catch (error) {
console.error(`Error fetching page ${i + 1}:`, error);
retryCount++;
}
} }
const resultUrls: string[] = []; }
const results = new Map<string, any>();
const pagesToFetch = Math.ceil(pageCount / 10); await browser.close();
const browser = await puppeteer.launch({ console.log('fetch all fake urls');
ignoreDefaultArgs: ["--enable-automation"],
headless: true, // 快速检索真实 URL
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径 const urlsToProcessWithPuppeteer = [];
pipe: true, for (const url of resultUrls) {
args: [ try {
'--no-sandbox', const response = await fetch(url, {
'--disable-setuid-sandbox', headers: {
'--disable-dev-shm-usage', 'User-Agent':
'--disable-gpu', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
// '--single-process' Referer: 'https://www.google.com/',
] 'Accept-Language': 'en-US,en;q=0.9',
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
Connection: 'keep-alive',
'Cache-Control': 'no-cache'
}
}); });
const page = await browser.newPage(); if (response.ok) {
await setupPage(page); const realUrl = response.url;
console.log('realurl:', realUrl);
for (let i = 0; i < pagesToFetch; i++) { const result = results.get(url);
const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`); if (result) {
console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`); result.url = realUrl;
let retryCount = 0; result.crawlStatus = 'Success';
let success = false;
while (retryCount < 5 && !success) {
try {
console.time(`Page Load Time for page ${i + 1}`);
await page.goto(searchUrl.toString(), { waitUntil: 'load' });
console.timeEnd(`Page Load Time for page ${i + 1}`);
let content = await page.content();
let dom = new JSDOM(content);
let document = dom.window.document;
console.log(document.title);
// 如果是百度安全验证页面,重新设置页面并重新访问
if (document.title.includes('百度安全验证')) {
console.log('Detected Baidu security verification, retrying...');
await setupPage(page);
retryCount++;
//随机等待时间
await randomWait(1000, 3000);
continue;
}
// 解析搜索结果
console.time(`Link Retrieval Time for page ${i + 1}`);
const resultContainers = document.querySelectorAll('.result.c-container');
for (const result of resultContainers) {
if (resultUrls.length > pageCount + 5) {
break;
}
const titleElement = result.querySelector('h3 a');
const title = titleElement ? titleElement.textContent : '';
const url = titleElement ? titleElement.getAttribute('href') : '';
const contentElement = result.querySelector('[class^="content"]');
const content = contentElement ? contentElement.textContent : '';
if (url) {
resultUrls.push(url);
results.set(url, {
title,
url,
snippet: content,
source: 'baidu',
crawlStatus: 'Pending',
score: 0
});
}
}
console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
success = true;
} catch (error) {
console.error(`Error fetching page ${i + 1}:`, error);
retryCount++;
}
} }
} else {
throw new Error(`HTTP error! status: ${response.status}`);
}
} catch (error) {
console.error(`Error fetching original URL for ${url}:`, error);
urlsToProcessWithPuppeteer.push(url);
} }
}
await browser.close(); console.log('pass quickfetch');
console.log('fetch all fake urls'); // 并发处理真实 URL
const cluster = await Cluster.launch({
// 快速检索真实 URL concurrency: Cluster.CONCURRENCY_CONTEXT,
const urlsToProcessWithPuppeteer = []; maxConcurrency: 10,
for (const url of resultUrls) { puppeteerOptions: {
try { ignoreDefaultArgs: ['--enable-automation'],
const response = await fetch(url, { headless: 'true',
headers: { executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', pipe: true,
'Referer': 'https://www.google.com/', args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
'Accept-Language': 'en-US,en;q=0.9',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Connection': 'keep-alive',
'Cache-Control': 'no-cache'
}
});
if (response.ok) {
const realUrl = response.url;
console.log('realurl:', realUrl);
const result = results.get(url);
if (result) {
result.url = realUrl;
result.crawlStatus = 'Success';
}
} else {
throw new Error(`HTTP error! status: ${response.status}`);
}
} catch (error) {
console.error(`Error fetching original URL for ${url}:`, error);
urlsToProcessWithPuppeteer.push(url);
}
} }
});
console.log('pass quickfetch'); let failedUrlCount = 0;
// 并发处理真实 URL await cluster.task(async ({ page, data: url }) => {
const cluster = await Cluster.launch({ let retryUrlCount = 0;
concurrency: Cluster.CONCURRENCY_CONTEXT, let urlSuccess = false;
maxConcurrency: 10, while (retryUrlCount < 3 && !urlSuccess) {
puppeteerOptions: { console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
ignoreDefaultArgs: ["--enable-automation"], try {
headless: "true", await page.goto(url, { waitUntil: 'load' });
executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径 // 检查页面是否被分离
pipe: true, if (page.isClosed()) {
args: [ throw new Error('Page has been closed');
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
]
} }
}); const realUrl = page.url(); // 获取真实 URL
const result = results.get(url);
let failedUrlCount = 0; if (result) {
result.url = realUrl;
await cluster.task(async ({ page, data: url }) => { result.crawlStatus = 'Success';
let retryUrlCount = 0;
let urlSuccess = false;
while (retryUrlCount < 3 && !urlSuccess) {
console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
try {
await page.goto(url, { waitUntil: 'load' });
// 检查页面是否被分离
if (page.isClosed()) {
throw new Error('Page has been closed');
}
const realUrl = page.url(); // 获取真实 URL
const result = results.get(url);
if (result) {
result.url = realUrl;
result.crawlStatus = 'Success';
}
urlSuccess = true;
} catch (error) {
console.error(`Error fetching original URL, retrying...`, error);
retryUrlCount++;
await randomWait(1000, 3000);
}
} }
if (!urlSuccess) { urlSuccess = true;
failedUrlCount++; } catch (error) {
} console.error(`Error fetching original URL, retrying...`, error);
}); retryUrlCount++;
await randomWait(1000, 3000);
for (const url of urlsToProcessWithPuppeteer) { }
cluster.queue(url);
} }
if (!urlSuccess) {
failedUrlCount++;
}
});
await cluster.idle(); for (const url of urlsToProcessWithPuppeteer) {
await cluster.close(); cluster.queue(url);
}
console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`); await cluster.idle();
await cluster.close();
// 过滤并返回前 pageCount 个结果 console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);
const filteredResults = Array.from(results.values()).slice(0, pageCount);
return { resultUrls: filteredResults.map(result => result.url), results: new Map(filteredResults.map(result => [result.url, result])) }; // 过滤并返回前 pageCount 个结果
const filteredResults = Array.from(results.values()).slice(0, pageCount);
return {
resultUrls: filteredResults.map((result) => result.url),
results: new Map(filteredResults.map((result) => [result.url, result]))
};
}; };

View File

@@ -6,9 +6,13 @@ dotenv.config();
const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : []; const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];
export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => { export const fetchSearchResults = async (
query: string,
const MAX_PAGES = (pageCount / 10 +1) * 2+1; // 最多搜索的页面数 pageCount: number,
searchUrlBase: string,
categories: string
) => {
const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1; // 最多搜索的页面数
//如果searchUrlBase为空返回空数组pagecount是需要搜索结果的数量 //如果searchUrlBase为空返回空数组pagecount是需要搜索结果的数量
if (!searchUrlBase) { if (!searchUrlBase) {
return { resultUrls: [], results: new Map() }; return { resultUrls: [], results: new Map() };
@@ -20,7 +24,9 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc
let pageIndex = 0; let pageIndex = 0;
while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) { while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) {
const searchUrl = new URL(`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`); const searchUrl = new URL(
`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`
);
console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`); console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);
const response = await axios.get(searchUrl.toString()); const response = await axios.get(searchUrl.toString());
const jsonResults = response.data.results; const jsonResults = response.data.results;
@@ -28,7 +34,10 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc
for (let index = 0; index < jsonResults.length; index++) { for (let index = 0; index < jsonResults.length; index++) {
const result = jsonResults[index]; const result = jsonResults[index];
const resultDomain = new URL(result.url).hostname; const resultDomain = new URL(result.url).hostname;
if (blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) || resultDomain.includes('zhihu')) { if (
blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) ||
resultDomain.includes('zhihu')
) {
continue; continue;
} }
resultUrls.push(result.url); resultUrls.push(result.url);

View File

@@ -4,7 +4,7 @@ const authMiddleware = (req: Request, res: Response, next: NextFunction) => {
const bearerHeader = req.headers['authorization']; const bearerHeader = req.headers['authorization'];
if (bearerHeader) { if (bearerHeader) {
console.log("bearerHeader:" + bearerHeader); console.log('bearerHeader:' + bearerHeader);
const bearer = bearerHeader.split(' '); const bearer = bearerHeader.split(' ');
const bearerToken = bearer[1]; const bearerToken = bearer[1];

View File

@@ -3,17 +3,22 @@ import { Page } from 'puppeteer';
export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => { export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => {
if (url.includes('blog.csdn.net')) { if (url.includes('blog.csdn.net')) {
await page.waitForSelector('article'); await page.waitForSelector('article');
const content = await page.$eval('article', el => el.innerHTML); const content = await page.$eval('article', (el) => el.innerHTML);
return content; return content;
} }
if (url.includes('zhuanlan.zhihu.com')) { if (url.includes('zhuanlan.zhihu.com')) {
console.log('是知乎,需要点击按掉!'); console.log('是知乎,需要点击按掉!');
console.log(await page.content()); console.log(await page.content());
if((await page.content()).includes('{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}')) return null; if (
(await page.content()).includes(
'{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}'
)
)
return null;
await page.waitForSelector('button[aria-label="关闭"]'); await page.waitForSelector('button[aria-label="关闭"]');
await page.click('button[aria-label="关闭"]'); // 使用 aria-label 选择按钮 await page.click('button[aria-label="关闭"]'); // 使用 aria-label 选择按钮
await page.waitForSelector('article'); await page.waitForSelector('article');
const content = await page.$eval('article', el => el.innerHTML); const content = await page.$eval('article', (el) => el.innerHTML);
return content; return content;
} }
// 可以添加更多特殊网站的处理逻辑 // 可以添加更多特殊网站的处理逻辑

View File

@@ -1,4 +1,3 @@
import NodeCache from 'node-cache'; import NodeCache from 'node-cache';
import { MongoClient } from 'mongodb'; import { MongoClient } from 'mongodb';
import crypto from 'crypto'; import crypto from 'crypto';
@@ -19,10 +18,15 @@ const connectToMongo = async () => {
const createTTLIndex = async () => { const createTTLIndex = async () => {
try { try {
const db = await connectToMongo(); const db = await connectToMongo();
await db.collection(collectionName).createIndex({ "updatedAt": 1 }, { expireAfterSeconds: parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000') }); await db
console.log("TTL index created successfully"); .collection(collectionName)
.createIndex(
{ updatedAt: 1 },
{ expireAfterSeconds: parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000') }
);
console.log('TTL index created successfully');
} catch (error) { } catch (error) {
console.error("Error creating TTL index:", error); console.error('Error creating TTL index:', error);
} }
}; };
@@ -53,11 +57,7 @@ const savePageToCache = async (url: string, content: string) => {
try { try {
const db = await connectToMongo(); const db = await connectToMongo();
await db.collection(collectionName).updateOne( await db.collection(collectionName).updateOne({ url }, { $set: page }, { upsert: true }); // 更新持久化缓存
{ url },
{ $set: page },
{ upsert: true }
); // 更新持久化缓存
} catch (error) { } catch (error) {
console.error('Error saving page to cache:', error); console.error('Error saving page to cache:', error);
throw error; throw error;

View File

@@ -13,12 +13,19 @@ interface CachedPage {
updatedAt: Date; updatedAt: Date;
} }
export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: string[], results: Map<string, any>, strategies: any[], detectWebsites: string[], pageCount: number) => { export const performDeepSearch = async (
clusterInstance: Cluster,
resultUrls: string[],
results: Map<string, any>,
strategies: any[],
detectWebsites: string[],
pageCount: number
) => {
const tasks = []; const tasks = [];
await clusterInstance.task(async ({ page, data: { searchUrl } }) => { await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
try { try {
const cachedPage = await getCachedPage(searchUrl) as CachedPage | null; const cachedPage = (await getCachedPage(searchUrl)) as CachedPage | null;
if (cachedPage) { if (cachedPage) {
const result = results.get(searchUrl); const result = results.get(searchUrl);
if (result) { if (result) {
@@ -29,18 +36,25 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st
} }
} catch (error) { } catch (error) {
console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error); console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' }); results.set(searchUrl, {
url: searchUrl,
error: (error as Error).message,
crawlStatus: 'Failed'
});
return; return;
} }
try { try {
const response = await fetch(searchUrl, { const response = await fetch(searchUrl, {
headers: { headers: {
'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(), 'User-Agent': new UserAgent({
'Referer': 'https://www.google.com/', deviceCategory: 'desktop',
platform: 'Linux x86_64'
}).toString(),
Referer: 'https://www.google.com/',
'Accept-Language': 'en-US,en;q=0.9', 'Accept-Language': 'en-US,en;q=0.9',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Connection': 'keep-alive', Connection: 'keep-alive',
'Cache-Control': 'no-cache' 'Cache-Control': 'no-cache'
} }
}); });
@@ -66,7 +80,7 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st
} }
try { try {
if (detectWebsites.some(website => searchUrl.includes(website))) { if (detectWebsites.some((website) => searchUrl.includes(website))) {
await setupPage(page); await setupPage(page);
} else { } else {
const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }); const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
@@ -118,7 +132,11 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st
await updateCacheAsync(searchUrl, cleanedContent || ''); await updateCacheAsync(searchUrl, cleanedContent || '');
} catch (error) { } catch (error) {
results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' }); results.set(searchUrl, {
url: searchUrl,
error: (error as Error).message,
crawlStatus: 'Failed'
});
} finally { } finally {
await page.close().catch(() => {}); await page.close().catch(() => {});
} }

View File

@@ -8,7 +8,7 @@ const getRandomUserAgent = () => {
}; };
const getRandomPlatform = () => { const getRandomPlatform = () => {
const platforms = ["Win32", "MacIntel", "Linux x86_64"]; const platforms = ['Win32', 'MacIntel', 'Linux x86_64'];
return platforms[Math.floor(Math.random() * platforms.length)]; return platforms[Math.floor(Math.random() * platforms.length)];
}; };
@@ -16,14 +16,16 @@ const getRandomPlatform = () => {
const validateproxy = process.env.VALIDATE_PROXY ? JSON.parse(process.env.VALIDATE_PROXY) : []; const validateproxy = process.env.VALIDATE_PROXY ? JSON.parse(process.env.VALIDATE_PROXY) : [];
const getRandomProxy = () => { const getRandomProxy = () => {
return validateproxy.length > 0 ? validateproxy[Math.floor(Math.random() * validateproxy.length)] : null; return validateproxy.length > 0
? validateproxy[Math.floor(Math.random() * validateproxy.length)]
: null;
}; };
const getRandomLanguages = () => { const getRandomLanguages = () => {
const languages = [ const languages = [
["zh-CN", "zh", "en"], ['zh-CN', 'zh', 'en'],
["en-US", "en", "fr"], ['en-US', 'en', 'fr'],
["es-ES", "es", "en"] ['es-ES', 'es', 'en']
]; ];
return languages[Math.floor(Math.random() * languages.length)]; return languages[Math.floor(Math.random() * languages.length)];
}; };
@@ -42,30 +44,38 @@ export const setupPage = async (page: Page): Promise<void> => {
delete newProto.webdriver; delete newProto.webdriver;
(navigator as any).__proto__ = newProto; (navigator as any).__proto__ = newProto;
(window as any).chrome = {}; (window as any).chrome = {};
(window as any).chrome.app = {"InstallState":"testt", "RunningState":"estt", "getDetails":"stte", "getIsInstalled":"ttes"}; (window as any).chrome.app = {
(window as any).chrome.csi = function(){}; InstallState: 'testt',
(window as any).chrome.loadTimes = function(){}; RunningState: 'estt',
(window as any).chrome.runtime = function(){}; getDetails: 'stte',
getIsInstalled: 'ttes'
};
(window as any).chrome.csi = function () {};
(window as any).chrome.loadTimes = function () {};
(window as any).chrome.runtime = function () {};
Object.defineProperty(navigator, 'userAgent', { Object.defineProperty(navigator, 'userAgent', {
get: () => getRandomUserAgent(), get: () => getRandomUserAgent()
}); });
Object.defineProperty(navigator, 'platform', { Object.defineProperty(navigator, 'platform', {
get: () => getRandomPlatform(), get: () => getRandomPlatform()
}); });
Object.defineProperty(navigator, 'plugins', { Object.defineProperty(navigator, 'plugins', {
get: () => [{"description": "Shockwave Flash", get: () => [
"filename": "pepflashplayer.dll", {
"length": 1, description: 'Shockwave Flash',
"name": "Shockwave Flash"}] filename: 'pepflashplayer.dll',
length: 1,
name: 'Shockwave Flash'
}
]
}); });
Object.defineProperty(navigator, 'languages', { Object.defineProperty(navigator, 'languages', {
get: () => getRandomLanguages(), get: () => getRandomLanguages()
}); });
const originalQuery = (window.navigator.permissions as any).query; const originalQuery = (window.navigator.permissions as any).query;
(window.navigator.permissions as any).query = (parameters: any) => ( (window.navigator.permissions as any).query = (parameters: any) =>
parameters.name === 'notifications' ? parameters.name === 'notifications'
Promise.resolve({ state: Notification.permission } as PermissionStatus) : ? Promise.resolve({ state: Notification.permission } as PermissionStatus)
originalQuery(parameters) : originalQuery(parameters);
);
}); });
}; };

View File

@@ -63,7 +63,7 @@ async function initHttp(teamId?: string): Promise<any> {
} }
} }
], ],
{ session } { session, ordered: true }
); );
/* 批量创建子插件 */ /* 批量创建子插件 */
@@ -88,7 +88,7 @@ async function initHttp(teamId?: string): Promise<any> {
} }
} }
], ],
{ session } { session, ordered: true }
); );
if (item.version === 'v2') { if (item.version === 'v2') {
await MongoAppVersion.create( await MongoAppVersion.create(
@@ -100,7 +100,7 @@ async function initHttp(teamId?: string): Promise<any> {
edges: item.edges edges: item.edges
} }
], ],
{ session } { session, ordered: true }
); );
} }
} }
@@ -160,7 +160,7 @@ async function initPlugin(teamId?: string): Promise<any> {
} }
} }
], ],
{ session } { session, ordered: true }
); );
if (plugin.version === 'v2') { if (plugin.version === 'v2') {
@@ -173,7 +173,7 @@ async function initPlugin(teamId?: string): Promise<any> {
edges: plugin.edges edges: plugin.edges
} }
], ],
{ session } { session, ordered: true }
); );
} }

View File

@@ -98,7 +98,8 @@ async function handler(
} }
], ],
{ {
session session,
ordered: true
} }
); );
} }

View File

@@ -126,7 +126,7 @@ export const onCreateApp = async ({
'pluginData.nodeVersion': defaultNodeVersion 'pluginData.nodeVersion': defaultNodeVersion
} }
], ],
{ session } { session, ordered: true }
); );
if (!AppFolderTypeList.includes(type!)) { if (!AppFolderTypeList.includes(type!)) {
@@ -144,7 +144,7 @@ export const onCreateApp = async ({
isPublish: true isPublish: true
} }
], ],
{ session } { session, ordered: true }
); );
} }

View File

@@ -89,7 +89,8 @@ async function handler(req: ApiRequestProps<CreateAppFolderBody>) {
} }
], ],
{ {
session session,
ordered: true
} }
); );
} }

View File

@@ -45,7 +45,7 @@ async function handler(req: ApiRequestProps<PostPublishAppProps>, res: NextApiRe
tmbId tmbId
} }
], ],
{ session } { session, ordered: true }
); );
// update app // update app

View File

@@ -88,7 +88,7 @@ async function handler(
yuqueServer yuqueServer
} }
], ],
{ session } { session, ordered: true }
); );
await refreshSourceAvatar(avatar, undefined, session); await refreshSourceAvatar(avatar, undefined, session);

View File

@@ -87,7 +87,7 @@ async function handler(
permission: OwnerPermissionVal permission: OwnerPermissionVal
} }
], ],
{ session } { session, ordered: true }
); );
} }
}); });

View File

@@ -122,7 +122,8 @@ async function handler(req: ApiRequestProps<rebuildEmbeddingBody>): Promise<Resp
} }
], ],
{ {
session session,
ordered: true
} }
); );
} }

View File

@@ -98,7 +98,7 @@ export async function insertData2Dataset({
})) }))
} }
], ],
{ session } { session, ordered: true }
); );
// 3. Create mongo data text // 3. Create mongo data text
@@ -112,7 +112,7 @@ export async function insertData2Dataset({
fullTextToken: jiebaSplit({ text: qaStr }) fullTextToken: jiebaSplit({ text: qaStr })
} }
], ],
{ session } { session, ordered: true }
); );
return { return {

View File

@@ -192,7 +192,7 @@ const rebuildData = async ({
retryCount: 50 retryCount: 50
} }
], ],
{ session } { session, ordered: true }
); );
} }
}); });

View File

@@ -37,7 +37,7 @@ export async function initRootUser(retry = 3): Promise<any> {
password: hashStr(psw) password: hashStr(psw)
} }
], ],
{ session } { session, ordered: true }
); );
rootId = _id; rootId = _id;
} }