From 4bc7f211829337f755f2be93ca5014d3ef2b1c56 Mon Sep 17 00:00:00 2001 From: Finley Ge <32237950+FinleyGe@users.noreply.github.com> Date: Mon, 3 Mar 2025 11:37:51 +0800 Subject: [PATCH] fix: add order:true to all create transactions (#3948) --- .../core/dataset/collection/controller.ts | 2 +- .../service/core/dataset/collection/utils.ts | 2 +- .../support/permission/inheritPermission.ts | 3 +- packages/service/support/wallet/sub/utils.ts | 2 +- .../support/wallet/usage/controller.ts | 2 +- .../src/controllers/quickfetchController.ts | 90 ++--- .../SPIDER/src/controllers/readController.ts | 50 +-- .../src/controllers/searchController.ts | 76 ++-- .../SPIDER/src/engines/baiduEngine.ts | 355 +++++++++--------- .../SPIDER/src/engines/searchxngEngine.ts | 21 +- plugins/webcrawler/SPIDER/src/index.ts | 2 +- .../SPIDER/src/middleware/authMiddleware.ts | 4 +- .../SPIDER/src/routes/quickfetchRoutes.ts | 2 +- .../SPIDER/src/routes/readRoutes.ts | 2 +- .../SPIDER/src/routes/searchRoutes.ts | 2 +- .../SPIDER/src/specialHandlers/index.ts | 13 +- .../SPIDER/src/utils/cacheUpdater.ts | 20 +- .../webcrawler/SPIDER/src/utils/deepSearch.ts | 38 +- .../webcrawler/SPIDER/src/utils/setupPage.ts | 54 +-- projects/app/src/pages/api/admin/initv485.ts | 10 +- .../app/src/pages/api/admin/resetMilvus.ts | 3 +- projects/app/src/pages/api/core/app/create.ts | 4 +- .../src/pages/api/core/app/folder/create.ts | 3 +- .../src/pages/api/core/app/version/publish.ts | 2 +- .../app/src/pages/api/core/dataset/create.ts | 2 +- .../pages/api/core/dataset/folder/create.ts | 2 +- .../core/dataset/training/rebuildEmbedding.ts | 3 +- .../service/core/dataset/data/controller.ts | 4 +- .../app/src/service/events/generateVector.ts | 2 +- projects/app/src/service/mongo.ts | 2 +- 30 files changed, 425 insertions(+), 352 deletions(-) diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index 5aa63f4fe..02686ab21 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -216,7 +216,7 @@ export async function createOneCollection({ nextSyncTime } ], - { session } + { session, ordered: true } ); return collection; diff --git a/packages/service/core/dataset/collection/utils.ts b/packages/service/core/dataset/collection/utils.ts index e1a9e4632..4f674a9dd 100644 --- a/packages/service/core/dataset/collection/utils.ts +++ b/packages/service/core/dataset/collection/utils.ts @@ -97,7 +97,7 @@ export const createOrGetCollectionTags = async ({ datasetId, tag: tagContent })), - { session } + { session, ordered: true } ); return [...existingTags.map((tag) => tag._id), ...newTags.map((tag) => tag._id)]; diff --git a/packages/service/support/permission/inheritPermission.ts b/packages/service/support/permission/inheritPermission.ts index 4f9993f5a..76de2b78d 100644 --- a/packages/service/support/permission/inheritPermission.ts +++ b/packages/service/support/permission/inheritPermission.ts @@ -196,7 +196,8 @@ export async function syncCollaborators({ permission: item.permission })), { - session + session, + ordered: true } ); } diff --git a/packages/service/support/wallet/sub/utils.ts b/packages/service/support/wallet/sub/utils.ts index d81072d1a..5eb8255de 100644 --- a/packages/service/support/wallet/sub/utils.ts +++ b/packages/service/support/wallet/sub/utils.ts @@ -100,7 +100,7 @@ export const initTeamFreePlan = async ({ surplusPoints: freePoints } ], - { session } + { session, ordered: true } ); }; diff --git a/packages/service/support/wallet/usage/controller.ts b/packages/service/support/wallet/usage/controller.ts index 9c1410785..7eee2f0f1 100644 --- a/packages/service/support/wallet/usage/controller.ts +++ b/packages/service/support/wallet/usage/controller.ts @@ -160,7 +160,7 @@ export const createTrainingUsage = async ({ ] } ], - { session } + { session, ordered: true } ); return { billId: String(_id) }; diff --git a/plugins/webcrawler/SPIDER/src/controllers/quickfetchController.ts b/plugins/webcrawler/SPIDER/src/controllers/quickfetchController.ts index e5e836d03..ce5230fc9 100644 --- a/plugins/webcrawler/SPIDER/src/controllers/quickfetchController.ts +++ b/plugins/webcrawler/SPIDER/src/controllers/quickfetchController.ts @@ -5,56 +5,56 @@ import dotenv from 'dotenv'; dotenv.config(); const userAgents = [ - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0' ]; export const quickFetch = async (req: Request, res: Response): Promise => { - const { url } = req.query; + const { url } = req.query; - if (!url) { - res.status(400).json({ - status: 400, - error: { - code: "MISSING_PARAM", - message: "缺少必要参数: url" - } - }); - return; - } + if (!url) { + res.status(400).json({ + status: 400, + error: { + code: 'MISSING_PARAM', + message: '缺少必要参数: url' + } + }); + return; + } - try { - const response = await fetch(url as string, { - headers: { - 'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)], - 'Referer': 'https://www.google.com/', - 'Accept-Language': 'en-US,en;q=0.9', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Connection': 'keep-alive', - 'Cache-Control': 'no-cache' - } - }); - if (!response.ok) { - throw new Error(`HTTP error! status: ${response.status}`); - } - const data = await response.text(); - res.status(200).json({ - status: 200, - data: { - content: data - } - }); - } catch (error) { - console.error('Error fetching the page:', error); - res.status(500).json({ - status: 500, - error: { - code: "INTERNAL_SERVER_ERROR", - message: "发生错误" - } - }); + try { + const response = await fetch(url as string, { + headers: { + 'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)], + Referer: 'https://www.google.com/', + 'Accept-Language': 'en-US,en;q=0.9', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + Connection: 'keep-alive', + 'Cache-Control': 'no-cache' + } + }); + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); } + const data = await response.text(); + res.status(200).json({ + status: 200, + data: { + content: data + } + }); + } catch (error) { + console.error('Error fetching the page:', error); + res.status(500).json({ + status: 500, + error: { + code: 'INTERNAL_SERVER_ERROR', + message: '发生错误' + } + }); + } }; -export default { quickFetch }; \ No newline at end of file +export default { quickFetch }; diff --git a/plugins/webcrawler/SPIDER/src/controllers/readController.ts b/plugins/webcrawler/SPIDER/src/controllers/readController.ts index 0758a5b77..62b26e5b6 100644 --- a/plugins/webcrawler/SPIDER/src/controllers/readController.ts +++ b/plugins/webcrawler/SPIDER/src/controllers/readController.ts @@ -16,16 +16,16 @@ const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIS export const readPage = async (req: Request, res: Response): Promise => { const { queryUrl } = req.query; - console.log("-------"); + console.log('-------'); console.log(queryUrl); - console.log("-------"); + console.log('-------'); if (!queryUrl) { res.status(400).json({ status: 400, error: { - code: "MISSING_PARAM", - message: "缺少必要参数: queryUrl" + code: 'MISSING_PARAM', + message: '缺少必要参数: queryUrl' } }); return; @@ -36,8 +36,8 @@ export const readPage = async (req: Request, res: Response): Promise => { res.status(403).json({ status: 403, error: { - code: "BLACKLISTED_DOMAIN", - message: "该域名受到保护中" + code: 'BLACKLISTED_DOMAIN', + message: '该域名受到保护中' } }); return; @@ -46,11 +46,14 @@ export const readPage = async (req: Request, res: Response): Promise => { try { const response = await fetch(queryUrl as string, { headers: { - 'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(), - 'Referer': 'https://www.google.com/', + 'User-Agent': new UserAgent({ + deviceCategory: 'desktop', + platform: 'Linux x86_64' + }).toString(), + Referer: 'https://www.google.com/', 'Accept-Language': 'en-US,en;q=0.9', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Connection': 'keep-alive', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + Connection: 'keep-alive', 'Cache-Control': 'no-cache' } }); @@ -69,7 +72,7 @@ export const readPage = async (req: Request, res: Response): Promise => { }); await updateCacheAsync(queryUrl as string, cleanedContent || ''); - console.log("Page read successfully"); + console.log('Page read successfully'); return; } else { throw new Error(`HTTP error! status: ${response.status}`); @@ -79,23 +82,26 @@ export const readPage = async (req: Request, res: Response): Promise => { } try { - const browser = await puppeteer.launch({ - ignoreDefaultArgs: ["--enable-automation"], - headless: true, - executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径 + const browser = await puppeteer.launch({ + ignoreDefaultArgs: ['--enable-automation'], + headless: true, + executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径 pipe: true, args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', - '--disable-gpu', - // '--single-process' + '--disable-gpu' + // '--single-process' ] }); const page = await browser.newPage(); // 检测是否需要特殊处理 - if (typeof queryUrl === 'string' && detectWebsites.some(website => queryUrl.includes(website))) { + if ( + typeof queryUrl === 'string' && + detectWebsites.some((website) => queryUrl.includes(website)) + ) { await setupPage(page); } else { const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }); @@ -128,15 +134,15 @@ export const readPage = async (req: Request, res: Response): Promise => { }); await updateCacheAsync(queryUrl as string, cleanedContent || ''); - console.log("Page read successfully"); + console.log('Page read successfully'); } catch (error) { console.error(error); res.status(500).json({ status: 500, error: { - code: "INTERNAL_SERVER_ERROR", - message: "读取页面时发生内部服务器错误" + code: 'INTERNAL_SERVER_ERROR', + message: '读取页面时发生内部服务器错误' } }); } -}; \ No newline at end of file +}; diff --git a/plugins/webcrawler/SPIDER/src/controllers/searchController.ts b/plugins/webcrawler/SPIDER/src/controllers/searchController.ts index 80db8ac06..26c84c670 100644 --- a/plugins/webcrawler/SPIDER/src/controllers/searchController.ts +++ b/plugins/webcrawler/SPIDER/src/controllers/searchController.ts @@ -12,15 +12,21 @@ const detectWebsites = process.env.DETECT_WEBSITES?.split(',') || []; const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10); export const search = async (req: Request, res: Response): Promise => { - const { query, pageCount = 10, needDetails = 'false', engine = 'baidu', categories = 'general' } = req.query; - const needDetailsBool = (needDetails === 'true'); + const { + query, + pageCount = 10, + needDetails = 'false', + engine = 'baidu', + categories = 'general' + } = req.query; + const needDetailsBool = needDetails === 'true'; if (!query) { res.status(400).json({ status: 400, error: { - code: "MISSING_PARAM", - message: "缺少必要参数: query" + code: 'MISSING_PARAM', + message: '缺少必要参数: query' } }); return; @@ -28,24 +34,29 @@ export const search = async (req: Request, res: Response): Promise => { let fetchSearchResults; let searchUrlBase; try { - if (engine === 'baidu') { - fetchSearchResults = fetchBaiduResults; - searchUrlBase = process.env.ENGINE_BAIDUURL; - } else if (engine === 'searchxng') { - fetchSearchResults = fetchSearchxngResults; - searchUrlBase = process.env.ENGINE_SEARCHXNGURL; - } else { - res.status(400).json({ - status: 400, - error: { - code: "INVALID_ENGINE", - message: "无效的搜索引擎" - } - }); - return; - } + if (engine === 'baidu') { + fetchSearchResults = fetchBaiduResults; + searchUrlBase = process.env.ENGINE_BAIDUURL; + } else if (engine === 'searchxng') { + fetchSearchResults = fetchSearchxngResults; + searchUrlBase = process.env.ENGINE_SEARCHXNGURL; + } else { + res.status(400).json({ + status: 400, + error: { + code: 'INVALID_ENGINE', + message: '无效的搜索引擎' + } + }); + return; + } - const { resultUrls, results } = await fetchSearchResults(query as string, Number(pageCount), searchUrlBase || '', categories as string); + const { resultUrls, results } = await fetchSearchResults( + query as string, + Number(pageCount), + searchUrlBase || '', + categories as string + ); //如果返回值为空,返回空数组 if (results.size === 0) { @@ -79,20 +90,27 @@ export const search = async (req: Request, res: Response): Promise => { concurrency: Cluster.CONCURRENCY_CONTEXT, maxConcurrency: maxConcurrency, puppeteerOptions: { - ignoreDefaultArgs: ["--enable-automation"], - headless: "true", - executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径 + ignoreDefaultArgs: ['--enable-automation'], + headless: 'true', + executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径 pipe: true, args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', - '--disable-gpu', + '--disable-gpu' ] } }); - const sortedResults = await performDeepSearch(clusterInstance, resultUrls, results, strategies, detectWebsites, Number(pageCount)); + const sortedResults = await performDeepSearch( + clusterInstance, + resultUrls, + results, + strategies, + detectWebsites, + Number(pageCount) + ); res.status(200).json({ status: 200, data: { @@ -104,11 +122,11 @@ export const search = async (req: Request, res: Response): Promise => { res.status(500).json({ status: 500, error: { - code: "INTERNAL_SERVER_ERROR", - message: "发生错误" + code: 'INTERNAL_SERVER_ERROR', + message: '发生错误' } }); } }; -export default { search }; \ No newline at end of file +export default { search }; diff --git a/plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts b/plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts index e2f11f3b4..ca75f0c1b 100644 --- a/plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts +++ b/plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts @@ -5,200 +5,203 @@ import { setupPage } from '../utils/setupPage'; import { Cluster } from 'puppeteer-cluster'; async function randomWait(min: number, max: number) { - // 随机等待时间 - const delay = Math.floor(Math.random() * (max - min + 1)) + min; - return new Promise(resolve => setTimeout(resolve, delay)); + // 随机等待时间 + const delay = Math.floor(Math.random() * (max - min + 1)) + min; + return new Promise((resolve) => setTimeout(resolve, delay)); } -export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => { - console.log(`Fetching Baidu search results for query: ${query}`); - // 如果 searchUrlBase 为空,返回空数组 - if (!searchUrlBase) { - return { resultUrls: [], results: new Map() }; +export const fetchSearchResults = async ( + query: string, + pageCount: number, + searchUrlBase: string, + categories: string +) => { + console.log(`Fetching Baidu search results for query: ${query}`); + // 如果 searchUrlBase 为空,返回空数组 + if (!searchUrlBase) { + return { resultUrls: [], results: new Map() }; + } + const resultUrls: string[] = []; + const results = new Map(); + + const pagesToFetch = Math.ceil(pageCount / 10); + + const browser = await puppeteer.launch({ + ignoreDefaultArgs: ['--enable-automation'], + headless: true, + executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径 + pipe: true, + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu' + // '--single-process' + ] + }); + + const page = await browser.newPage(); + await setupPage(page); + + for (let i = 0; i < pagesToFetch; i++) { + const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`); + console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`); + let retryCount = 0; + let success = false; + + while (retryCount < 5 && !success) { + try { + console.time(`Page Load Time for page ${i + 1}`); + await page.goto(searchUrl.toString(), { waitUntil: 'load' }); + console.timeEnd(`Page Load Time for page ${i + 1}`); + + let content = await page.content(); + let dom = new JSDOM(content); + let document = dom.window.document; + console.log(document.title); + + // 如果是百度安全验证页面,重新设置页面并重新访问 + if (document.title.includes('百度安全验证')) { + console.log('Detected Baidu security verification, retrying...'); + await setupPage(page); + retryCount++; + //随机等待时间 + await randomWait(1000, 3000); + continue; + } + + // 解析搜索结果 + console.time(`Link Retrieval Time for page ${i + 1}`); + + const resultContainers = document.querySelectorAll('.result.c-container'); + for (const result of resultContainers) { + if (resultUrls.length > pageCount + 5) { + break; + } + const titleElement = result.querySelector('h3 a'); + const title = titleElement ? titleElement.textContent : ''; + const url = titleElement ? titleElement.getAttribute('href') : ''; + const contentElement = result.querySelector('[class^="content"]'); + const content = contentElement ? contentElement.textContent : ''; + + if (url) { + resultUrls.push(url); + results.set(url, { + title, + url, + snippet: content, + source: 'baidu', + crawlStatus: 'Pending', + score: 0 + }); + } + } + console.timeEnd(`Link Retrieval Time for page ${i + 1}`); + success = true; + } catch (error) { + console.error(`Error fetching page ${i + 1}:`, error); + retryCount++; + } } - const resultUrls: string[] = []; - const results = new Map(); + } - const pagesToFetch = Math.ceil(pageCount / 10); + await browser.close(); - const browser = await puppeteer.launch({ - ignoreDefaultArgs: ["--enable-automation"], - headless: true, - executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径 - pipe: true, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - // '--single-process' - ] + console.log('fetch all fake urls'); + + // 快速检索真实 URL + const urlsToProcessWithPuppeteer = []; + for (const url of resultUrls) { + try { + const response = await fetch(url, { + headers: { + 'User-Agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', + Referer: 'https://www.google.com/', + 'Accept-Language': 'en-US,en;q=0.9', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + Connection: 'keep-alive', + 'Cache-Control': 'no-cache' + } }); - const page = await browser.newPage(); - await setupPage(page); - - for (let i = 0; i < pagesToFetch; i++) { - const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`); - console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`); - let retryCount = 0; - let success = false; - - while (retryCount < 5 && !success) { - try { - console.time(`Page Load Time for page ${i + 1}`); - await page.goto(searchUrl.toString(), { waitUntil: 'load' }); - console.timeEnd(`Page Load Time for page ${i + 1}`); - - let content = await page.content(); - let dom = new JSDOM(content); - let document = dom.window.document; - console.log(document.title); - - // 如果是百度安全验证页面,重新设置页面并重新访问 - if (document.title.includes('百度安全验证')) { - console.log('Detected Baidu security verification, retrying...'); - await setupPage(page); - retryCount++; - //随机等待时间 - await randomWait(1000, 3000); - continue; - } - - // 解析搜索结果 - console.time(`Link Retrieval Time for page ${i + 1}`); - - - const resultContainers = document.querySelectorAll('.result.c-container'); - for (const result of resultContainers) { - if (resultUrls.length > pageCount + 5) { - break; - } - const titleElement = result.querySelector('h3 a'); - const title = titleElement ? titleElement.textContent : ''; - const url = titleElement ? titleElement.getAttribute('href') : ''; - const contentElement = result.querySelector('[class^="content"]'); - const content = contentElement ? contentElement.textContent : ''; - - if (url) { - resultUrls.push(url); - results.set(url, { - title, - url, - snippet: content, - source: 'baidu', - crawlStatus: 'Pending', - score: 0 - }); - } - } - console.timeEnd(`Link Retrieval Time for page ${i + 1}`); - success = true; - } catch (error) { - console.error(`Error fetching page ${i + 1}:`, error); - retryCount++; - } + if (response.ok) { + const realUrl = response.url; + console.log('realurl:', realUrl); + const result = results.get(url); + if (result) { + result.url = realUrl; + result.crawlStatus = 'Success'; } + } else { + throw new Error(`HTTP error! status: ${response.status}`); + } + } catch (error) { + console.error(`Error fetching original URL for ${url}:`, error); + urlsToProcessWithPuppeteer.push(url); } + } - await browser.close(); + console.log('pass quickfetch'); - console.log('fetch all fake urls'); - - // 快速检索真实 URL - const urlsToProcessWithPuppeteer = []; - for (const url of resultUrls) { - try { - const response = await fetch(url, { - headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', - 'Referer': 'https://www.google.com/', - 'Accept-Language': 'en-US,en;q=0.9', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Connection': 'keep-alive', - 'Cache-Control': 'no-cache' - } - }); - - if (response.ok) { - const realUrl = response.url; - console.log('realurl:', realUrl); - const result = results.get(url); - if (result) { - result.url = realUrl; - result.crawlStatus = 'Success'; - } - } else { - throw new Error(`HTTP error! status: ${response.status}`); - } - } catch (error) { - console.error(`Error fetching original URL for ${url}:`, error); - urlsToProcessWithPuppeteer.push(url); - } + // 并发处理真实 URL + const cluster = await Cluster.launch({ + concurrency: Cluster.CONCURRENCY_CONTEXT, + maxConcurrency: 10, + puppeteerOptions: { + ignoreDefaultArgs: ['--enable-automation'], + headless: 'true', + executablePath: '/usr/bin/chromium', // 明确指定 Chromium 路径 + pipe: true, + args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'] } + }); - console.log('pass quickfetch'); + let failedUrlCount = 0; - // 并发处理真实 URL - const cluster = await Cluster.launch({ - concurrency: Cluster.CONCURRENCY_CONTEXT, - maxConcurrency: 10, - puppeteerOptions: { - ignoreDefaultArgs: ["--enable-automation"], - headless: "true", - executablePath: "/usr/bin/chromium", // 明确指定 Chromium 路径 - pipe: true, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - ] + await cluster.task(async ({ page, data: url }) => { + let retryUrlCount = 0; + let urlSuccess = false; + while (retryUrlCount < 3 && !urlSuccess) { + console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`); + try { + await page.goto(url, { waitUntil: 'load' }); + // 检查页面是否被分离 + if (page.isClosed()) { + throw new Error('Page has been closed'); } - }); - - let failedUrlCount = 0; - - await cluster.task(async ({ page, data: url }) => { - let retryUrlCount = 0; - let urlSuccess = false; - while (retryUrlCount < 3 && !urlSuccess) { - console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`); - try { - await page.goto(url, { waitUntil: 'load' }); - // 检查页面是否被分离 - if (page.isClosed()) { - throw new Error('Page has been closed'); - } - const realUrl = page.url(); // 获取真实 URL - const result = results.get(url); - if (result) { - result.url = realUrl; - result.crawlStatus = 'Success'; - } - urlSuccess = true; - } catch (error) { - console.error(`Error fetching original URL, retrying...`, error); - retryUrlCount++; - await randomWait(1000, 3000); - } + const realUrl = page.url(); // 获取真实 URL + const result = results.get(url); + if (result) { + result.url = realUrl; + result.crawlStatus = 'Success'; } - if (!urlSuccess) { - failedUrlCount++; - } - }); - - for (const url of urlsToProcessWithPuppeteer) { - cluster.queue(url); + urlSuccess = true; + } catch (error) { + console.error(`Error fetching original URL, retrying...`, error); + retryUrlCount++; + await randomWait(1000, 3000); + } } + if (!urlSuccess) { + failedUrlCount++; + } + }); - await cluster.idle(); - await cluster.close(); + for (const url of urlsToProcessWithPuppeteer) { + cluster.queue(url); + } - console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`); + await cluster.idle(); + await cluster.close(); - // 过滤并返回前 pageCount 个结果 - const filteredResults = Array.from(results.values()).slice(0, pageCount); + console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`); - return { resultUrls: filteredResults.map(result => result.url), results: new Map(filteredResults.map(result => [result.url, result])) }; -}; \ No newline at end of file + // 过滤并返回前 pageCount 个结果 + const filteredResults = Array.from(results.values()).slice(0, pageCount); + + return { + resultUrls: filteredResults.map((result) => result.url), + results: new Map(filteredResults.map((result) => [result.url, result])) + }; +}; diff --git a/plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts b/plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts index 27bc53876..305fb8114 100644 --- a/plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts +++ b/plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts @@ -6,9 +6,13 @@ dotenv.config(); const blacklistDomains = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : []; -export const fetchSearchResults = async (query: string, pageCount: number, searchUrlBase: string, categories: string) => { - - const MAX_PAGES = (pageCount / 10 +1) * 2+1; // 最多搜索的页面数 +export const fetchSearchResults = async ( + query: string, + pageCount: number, + searchUrlBase: string, + categories: string +) => { + const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1; // 最多搜索的页面数 //如果searchUrlBase为空,返回空数组,pagecount是需要搜索结果的数量 if (!searchUrlBase) { return { resultUrls: [], results: new Map() }; @@ -20,7 +24,9 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc let pageIndex = 0; while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) { - const searchUrl = new URL(`${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}`); + const searchUrl = new URL( + `${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${categories}` + ); console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`); const response = await axios.get(searchUrl.toString()); const jsonResults = response.data.results; @@ -28,7 +34,10 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc for (let index = 0; index < jsonResults.length; index++) { const result = jsonResults[index]; const resultDomain = new URL(result.url).hostname; - if (blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) || resultDomain.includes('zhihu')) { + if ( + blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) || + resultDomain.includes('zhihu') + ) { continue; } resultUrls.push(result.url); @@ -52,4 +61,4 @@ export const fetchSearchResults = async (query: string, pageCount: number, searc } return { resultUrls, results }; -}; \ No newline at end of file +}; diff --git a/plugins/webcrawler/SPIDER/src/index.ts b/plugins/webcrawler/SPIDER/src/index.ts index cf3c64170..bea94a7a6 100644 --- a/plugins/webcrawler/SPIDER/src/index.ts +++ b/plugins/webcrawler/SPIDER/src/index.ts @@ -15,4 +15,4 @@ app.use('/api', readRoutes); app.use('/api', quickfetchRoutes); const PORT = process.env.PORT || 3000; -app.listen(PORT, () => console.log(`Server running on port ${PORT}`)); \ No newline at end of file +app.listen(PORT, () => console.log(`Server running on port ${PORT}`)); diff --git a/plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts b/plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts index 2bf2a1765..76ca9ff4a 100644 --- a/plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts +++ b/plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts @@ -4,7 +4,7 @@ const authMiddleware = (req: Request, res: Response, next: NextFunction) => { const bearerHeader = req.headers['authorization']; if (bearerHeader) { - console.log("bearerHeader:" + bearerHeader); + console.log('bearerHeader:' + bearerHeader); const bearer = bearerHeader.split(' '); const bearerToken = bearer[1]; @@ -18,4 +18,4 @@ const authMiddleware = (req: Request, res: Response, next: NextFunction) => { } }; -export default authMiddleware; \ No newline at end of file +export default authMiddleware; diff --git a/plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts b/plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts index 8617b8442..139c9b24e 100644 --- a/plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts +++ b/plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts @@ -6,4 +6,4 @@ const readRoutes = express.Router(); readRoutes.get('/quickFetch', authMiddleware, quickFetch); -export default readRoutes; \ No newline at end of file +export default readRoutes; diff --git a/plugins/webcrawler/SPIDER/src/routes/readRoutes.ts b/plugins/webcrawler/SPIDER/src/routes/readRoutes.ts index 2fdbc1c12..c50447d6a 100644 --- a/plugins/webcrawler/SPIDER/src/routes/readRoutes.ts +++ b/plugins/webcrawler/SPIDER/src/routes/readRoutes.ts @@ -6,4 +6,4 @@ const readRoutes = express.Router(); readRoutes.get('/read', authMiddleware, readPage); -export default readRoutes; \ No newline at end of file +export default readRoutes; diff --git a/plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts b/plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts index e074d2213..b3b92035f 100644 --- a/plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts +++ b/plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts @@ -6,4 +6,4 @@ const searchRoutes = express.Router(); searchRoutes.get('/search', authMiddleware, searchController.search); -export default searchRoutes; \ No newline at end of file +export default searchRoutes; diff --git a/plugins/webcrawler/SPIDER/src/specialHandlers/index.ts b/plugins/webcrawler/SPIDER/src/specialHandlers/index.ts index aac874753..9b8f4000b 100644 --- a/plugins/webcrawler/SPIDER/src/specialHandlers/index.ts +++ b/plugins/webcrawler/SPIDER/src/specialHandlers/index.ts @@ -3,19 +3,24 @@ import { Page } from 'puppeteer'; export const handleSpecialWebsite = async (page: Page, url: string): Promise => { if (url.includes('blog.csdn.net')) { await page.waitForSelector('article'); - const content = await page.$eval('article', el => el.innerHTML); + const content = await page.$eval('article', (el) => el.innerHTML); return content; } if (url.includes('zhuanlan.zhihu.com')) { console.log('是知乎,需要点击按掉!'); console.log(await page.content()); - if((await page.content()).includes('{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}')) return null; + if ( + (await page.content()).includes( + '{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}' + ) + ) + return null; await page.waitForSelector('button[aria-label="关闭"]'); await page.click('button[aria-label="关闭"]'); // 使用 aria-label 选择按钮 await page.waitForSelector('article'); - const content = await page.$eval('article', el => el.innerHTML); + const content = await page.$eval('article', (el) => el.innerHTML); return content; } // 可以添加更多特殊网站的处理逻辑 return null; -}; \ No newline at end of file +}; diff --git a/plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts b/plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts index ac0aa3f69..3ceb6c219 100644 --- a/plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts +++ b/plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts @@ -1,4 +1,3 @@ - import NodeCache from 'node-cache'; import { MongoClient } from 'mongodb'; import crypto from 'crypto'; @@ -19,10 +18,15 @@ const connectToMongo = async () => { const createTTLIndex = async () => { try { const db = await connectToMongo(); - await db.collection(collectionName).createIndex({ "updatedAt": 1 }, { expireAfterSeconds: parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000') }); - console.log("TTL index created successfully"); + await db + .collection(collectionName) + .createIndex( + { updatedAt: 1 }, + { expireAfterSeconds: parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000') } + ); + console.log('TTL index created successfully'); } catch (error) { - console.error("Error creating TTL index:", error); + console.error('Error creating TTL index:', error); } }; @@ -53,11 +57,7 @@ const savePageToCache = async (url: string, content: string) => { try { const db = await connectToMongo(); - await db.collection(collectionName).updateOne( - { url }, - { $set: page }, - { upsert: true } - ); // 更新持久化缓存 + await db.collection(collectionName).updateOne({ url }, { $set: page }, { upsert: true }); // 更新持久化缓存 } catch (error) { console.error('Error saving page to cache:', error); throw error; @@ -74,4 +74,4 @@ process.on('SIGINT', async () => { }); // 在应用启动时创建 TTL 索引 -createTTLIndex(); \ No newline at end of file +createTTLIndex(); diff --git a/plugins/webcrawler/SPIDER/src/utils/deepSearch.ts b/plugins/webcrawler/SPIDER/src/utils/deepSearch.ts index 7ef86901b..1053c91a5 100644 --- a/plugins/webcrawler/SPIDER/src/utils/deepSearch.ts +++ b/plugins/webcrawler/SPIDER/src/utils/deepSearch.ts @@ -13,12 +13,19 @@ interface CachedPage { updatedAt: Date; } -export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: string[], results: Map, strategies: any[], detectWebsites: string[], pageCount: number) => { +export const performDeepSearch = async ( + clusterInstance: Cluster, + resultUrls: string[], + results: Map, + strategies: any[], + detectWebsites: string[], + pageCount: number +) => { const tasks = []; await clusterInstance.task(async ({ page, data: { searchUrl } }) => { try { - const cachedPage = await getCachedPage(searchUrl) as CachedPage | null; + const cachedPage = (await getCachedPage(searchUrl)) as CachedPage | null; if (cachedPage) { const result = results.get(searchUrl); if (result) { @@ -29,18 +36,25 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st } } catch (error) { console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error); - results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' }); + results.set(searchUrl, { + url: searchUrl, + error: (error as Error).message, + crawlStatus: 'Failed' + }); return; } try { const response = await fetch(searchUrl, { headers: { - 'User-Agent': new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }).toString(), - 'Referer': 'https://www.google.com/', + 'User-Agent': new UserAgent({ + deviceCategory: 'desktop', + platform: 'Linux x86_64' + }).toString(), + Referer: 'https://www.google.com/', 'Accept-Language': 'en-US,en;q=0.9', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Connection': 'keep-alive', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + Connection: 'keep-alive', 'Cache-Control': 'no-cache' } }); @@ -66,7 +80,7 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st } try { - if (detectWebsites.some(website => searchUrl.includes(website))) { + if (detectWebsites.some((website) => searchUrl.includes(website))) { await setupPage(page); } else { const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' }); @@ -118,7 +132,11 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st await updateCacheAsync(searchUrl, cleanedContent || ''); } catch (error) { - results.set(searchUrl, { url: searchUrl, error: (error as Error).message, crawlStatus: 'Failed' }); + results.set(searchUrl, { + url: searchUrl, + error: (error as Error).message, + crawlStatus: 'Failed' + }); } finally { await page.close().catch(() => {}); } @@ -137,4 +155,4 @@ export const performDeepSearch = async (clusterInstance: Cluster, resultUrls: st await clusterInstance.close(); return Array.from(results.values()).sort((a, b) => b.score - a.score); -}; \ No newline at end of file +}; diff --git a/plugins/webcrawler/SPIDER/src/utils/setupPage.ts b/plugins/webcrawler/SPIDER/src/utils/setupPage.ts index e95efa1ec..cae1f8878 100644 --- a/plugins/webcrawler/SPIDER/src/utils/setupPage.ts +++ b/plugins/webcrawler/SPIDER/src/utils/setupPage.ts @@ -8,7 +8,7 @@ const getRandomUserAgent = () => { }; const getRandomPlatform = () => { - const platforms = ["Win32", "MacIntel", "Linux x86_64"]; + const platforms = ['Win32', 'MacIntel', 'Linux x86_64']; return platforms[Math.floor(Math.random() * platforms.length)]; }; @@ -16,14 +16,16 @@ const getRandomPlatform = () => { const validateproxy = process.env.VALIDATE_PROXY ? JSON.parse(process.env.VALIDATE_PROXY) : []; const getRandomProxy = () => { - return validateproxy.length > 0 ? validateproxy[Math.floor(Math.random() * validateproxy.length)] : null; + return validateproxy.length > 0 + ? validateproxy[Math.floor(Math.random() * validateproxy.length)] + : null; }; const getRandomLanguages = () => { const languages = [ - ["zh-CN", "zh", "en"], - ["en-US", "en", "fr"], - ["es-ES", "es", "en"] + ['zh-CN', 'zh', 'en'], + ['en-US', 'en', 'fr'], + ['es-ES', 'es', 'en'] ]; return languages[Math.floor(Math.random() * languages.length)]; }; @@ -42,30 +44,38 @@ export const setupPage = async (page: Page): Promise => { delete newProto.webdriver; (navigator as any).__proto__ = newProto; (window as any).chrome = {}; - (window as any).chrome.app = {"InstallState":"testt", "RunningState":"estt", "getDetails":"stte", "getIsInstalled":"ttes"}; - (window as any).chrome.csi = function(){}; - (window as any).chrome.loadTimes = function(){}; - (window as any).chrome.runtime = function(){}; + (window as any).chrome.app = { + InstallState: 'testt', + RunningState: 'estt', + getDetails: 'stte', + getIsInstalled: 'ttes' + }; + (window as any).chrome.csi = function () {}; + (window as any).chrome.loadTimes = function () {}; + (window as any).chrome.runtime = function () {}; Object.defineProperty(navigator, 'userAgent', { - get: () => getRandomUserAgent(), + get: () => getRandomUserAgent() }); Object.defineProperty(navigator, 'platform', { - get: () => getRandomPlatform(), + get: () => getRandomPlatform() }); Object.defineProperty(navigator, 'plugins', { - get: () => [{"description": "Shockwave Flash", - "filename": "pepflashplayer.dll", - "length": 1, - "name": "Shockwave Flash"}] + get: () => [ + { + description: 'Shockwave Flash', + filename: 'pepflashplayer.dll', + length: 1, + name: 'Shockwave Flash' + } + ] }); Object.defineProperty(navigator, 'languages', { - get: () => getRandomLanguages(), + get: () => getRandomLanguages() }); const originalQuery = (window.navigator.permissions as any).query; - (window.navigator.permissions as any).query = (parameters: any) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission } as PermissionStatus) : - originalQuery(parameters) - ); + (window.navigator.permissions as any).query = (parameters: any) => + parameters.name === 'notifications' + ? Promise.resolve({ state: Notification.permission } as PermissionStatus) + : originalQuery(parameters); }); -}; \ No newline at end of file +}; diff --git a/projects/app/src/pages/api/admin/initv485.ts b/projects/app/src/pages/api/admin/initv485.ts index 5aac3723e..53c2b7d88 100644 --- a/projects/app/src/pages/api/admin/initv485.ts +++ b/projects/app/src/pages/api/admin/initv485.ts @@ -63,7 +63,7 @@ async function initHttp(teamId?: string): Promise { } } ], - { session } + { session, ordered: true } ); /* 批量创建子插件 */ @@ -88,7 +88,7 @@ async function initHttp(teamId?: string): Promise { } } ], - { session } + { session, ordered: true } ); if (item.version === 'v2') { await MongoAppVersion.create( @@ -100,7 +100,7 @@ async function initHttp(teamId?: string): Promise { edges: item.edges } ], - { session } + { session, ordered: true } ); } } @@ -160,7 +160,7 @@ async function initPlugin(teamId?: string): Promise { } } ], - { session } + { session, ordered: true } ); if (plugin.version === 'v2') { @@ -173,7 +173,7 @@ async function initPlugin(teamId?: string): Promise { edges: plugin.edges } ], - { session } + { session, ordered: true } ); } diff --git a/projects/app/src/pages/api/admin/resetMilvus.ts b/projects/app/src/pages/api/admin/resetMilvus.ts index 2d5e47b01..c465ff3f5 100644 --- a/projects/app/src/pages/api/admin/resetMilvus.ts +++ b/projects/app/src/pages/api/admin/resetMilvus.ts @@ -98,7 +98,8 @@ async function handler( } ], { - session + session, + ordered: true } ); } diff --git a/projects/app/src/pages/api/core/app/create.ts b/projects/app/src/pages/api/core/app/create.ts index d9f725f50..1eb64b840 100644 --- a/projects/app/src/pages/api/core/app/create.ts +++ b/projects/app/src/pages/api/core/app/create.ts @@ -126,7 +126,7 @@ export const onCreateApp = async ({ 'pluginData.nodeVersion': defaultNodeVersion } ], - { session } + { session, ordered: true } ); if (!AppFolderTypeList.includes(type!)) { @@ -144,7 +144,7 @@ export const onCreateApp = async ({ isPublish: true } ], - { session } + { session, ordered: true } ); } diff --git a/projects/app/src/pages/api/core/app/folder/create.ts b/projects/app/src/pages/api/core/app/folder/create.ts index eb05ee1b4..c27b5269d 100644 --- a/projects/app/src/pages/api/core/app/folder/create.ts +++ b/projects/app/src/pages/api/core/app/folder/create.ts @@ -89,7 +89,8 @@ async function handler(req: ApiRequestProps) { } ], { - session + session, + ordered: true } ); } diff --git a/projects/app/src/pages/api/core/app/version/publish.ts b/projects/app/src/pages/api/core/app/version/publish.ts index ad052820b..048a41b95 100644 --- a/projects/app/src/pages/api/core/app/version/publish.ts +++ b/projects/app/src/pages/api/core/app/version/publish.ts @@ -45,7 +45,7 @@ async function handler(req: ApiRequestProps, res: NextApiRe tmbId } ], - { session } + { session, ordered: true } ); // update app diff --git a/projects/app/src/pages/api/core/dataset/create.ts b/projects/app/src/pages/api/core/dataset/create.ts index 1bf40c2c3..1d356b6c8 100644 --- a/projects/app/src/pages/api/core/dataset/create.ts +++ b/projects/app/src/pages/api/core/dataset/create.ts @@ -88,7 +88,7 @@ async function handler( yuqueServer } ], - { session } + { session, ordered: true } ); await refreshSourceAvatar(avatar, undefined, session); diff --git a/projects/app/src/pages/api/core/dataset/folder/create.ts b/projects/app/src/pages/api/core/dataset/folder/create.ts index 5f9a7de26..117367131 100644 --- a/projects/app/src/pages/api/core/dataset/folder/create.ts +++ b/projects/app/src/pages/api/core/dataset/folder/create.ts @@ -87,7 +87,7 @@ async function handler( permission: OwnerPermissionVal } ], - { session } + { session, ordered: true } ); } }); diff --git a/projects/app/src/pages/api/core/dataset/training/rebuildEmbedding.ts b/projects/app/src/pages/api/core/dataset/training/rebuildEmbedding.ts index 063ae04a1..a1ac81d34 100644 --- a/projects/app/src/pages/api/core/dataset/training/rebuildEmbedding.ts +++ b/projects/app/src/pages/api/core/dataset/training/rebuildEmbedding.ts @@ -122,7 +122,8 @@ async function handler(req: ApiRequestProps): Promise { password: hashStr(psw) } ], - { session } + { session, ordered: true } ); rootId = _id; }