fix: 网页抓取:正确处理//开头的超链接 (#2803)

Co-authored-by: zhenyiwang <zhenyiwang@intl.zju.edu.cn>
This commit is contained in:
Zhenyi-Wang
2024-09-26 14:53:24 +08:00
committed by GitHub
parent cb6fe9d0da
commit 5ad8c81ef3

View File

@@ -14,6 +14,7 @@ export const cheerioToHtml = ({
}) => {
// get origin url
const originUrl = new URL(fetchUrl).origin;
const protocol = new URL(fetchUrl).protocol; // http: or https:
const usedSelector = selector || 'body';
const selectDom = $(usedSelector);
@@ -32,14 +33,22 @@ export const cheerioToHtml = ({
// if a link/img URL starts with //, prepend the protocol; if it starts with /, prepend the origin url
selectDom.find('a').each((i, el) => {
const href = $(el).attr('href');
if (href && href.startsWith('/')) {
$(el).attr('href', originUrl + href);
if (href) {
if (href.startsWith('//')) {
$(el).attr('href', protocol + href);
} else if (href.startsWith('/')) {
$(el).attr('href', originUrl + href);
}
}
});
selectDom.find('img').each((i, el) => {
const src = $(el).attr('src');
if (src && src.startsWith('/')) {
$(el).attr('src', originUrl + src);
if (src) {
if (src.startsWith('//')) {
$(el).attr('src', protocol + src);
} else if (src.startsWith('/')) {
$(el).attr('src', originUrl + src);
}
}
});