4.6.3-website dataset (#532)

This commit is contained in:
Archer
2023-12-03 20:45:57 +08:00
committed by GitHub
parent b916183848
commit a9ae270335
122 changed files with 3793 additions and 1360 deletions

8
packages/global/common/file/api.d.ts vendored Normal file
View File

@@ -0,0 +1,8 @@
/** Input for a batch page fetch. */
export type UrlFetchParams = {
/** Addresses of the pages to fetch. */
urlList: string[];
/** Optional selector narrowing which part of each page is extracted. */
selector?: string;
};
/** Result of a batch page fetch: a list of per-page entries. */
export type UrlFetchResponse = {
/** Address the entry was fetched from. */
url: string;
/** Content extracted from that page, as a string. */
content: string;
}[];

View File

@@ -1,3 +1,8 @@
import axios from 'axios';
import { UrlFetchParams, UrlFetchResponse } from './api.d';
import { htmlToMarkdown } from '../string/markdown';
import * as cheerio from 'cheerio';
export const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B';
@@ -7,3 +12,84 @@ export const formatFileSize = (bytes: number): string => {
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
};
/**
 * Cleans a loaded cheerio document and returns the inner HTML of one region of it.
 *
 * The document behind `$` is modified in place:
 *  - every `<i>` and `<script>` element is removed;
 *  - `<a>` elements with no text (after trimming) and no child elements are removed;
 *  - `href` on `<a>` and `src` on `<img>` are made absolute when they start with `/`:
 *    root-relative values (`/path`) get the page origin prepended, and
 *    protocol-relative values (`//host/path`) get the page scheme prepended.
 *    Any other value (absolute URLs, `relative/path`, `#hash`, ...) is left untouched.
 *
 * @param fetchUrl - Absolute URL the document was fetched from; supplies origin and scheme.
 * @param $ - Loaded cheerio document (mutated, see above).
 * @param selector - Selector of the element whose inner HTML is returned; `'body'` when omitted or empty.
 * @returns Whatever `.html()` yields for the selected element.
 * @throws TypeError from the `URL` constructor when `fetchUrl` is not a valid absolute URL.
 */
export const cheerioToHtml = ({
  fetchUrl,
  $,
  selector
}: {
  fetchUrl: string;
  $: cheerio.CheerioAPI;
  selector?: string;
}) => {
  const { origin, protocol } = new URL(fetchUrl);

  // Returns the absolute form of a reference starting with "/", or undefined when
  // the reference should stay as it is.
  const toAbsolute = (ref: string | undefined): string | undefined => {
    if (!ref || !ref.startsWith('/')) return undefined;
    // "//host/path" already names its host and only lacks a scheme; prepending the
    // origin here would produce a broken "https://site//host/path" link.
    return ref.startsWith('//') ? `${protocol}${ref}` : `${origin}${ref}`;
  };

  // drop <i> and <script> elements
  $('i,script').remove();

  // drop anchors that carry neither text nor child elements
  $('a')
    .filter((_, el) => $(el).text().trim() === '' && $(el).children().length === 0)
    .remove();

  // make link targets and image sources absolute
  $('a').each((_, el) => {
    const absolute = toAbsolute($(el).attr('href'));
    if (absolute !== undefined) {
      $(el).attr('href', absolute);
    }
  });
  $('img').each((_, el) => {
    const absolute = toAbsolute($(el).attr('src'));
    if (absolute !== undefined) {
      $(el).attr('src', absolute);
    }
  });

  return $(selector || 'body').html();
};
/**
 * Downloads a list of pages and converts each one to Markdown.
 *
 * Entries of `urlList` that are not `http://` / `https://` addresses free of spaces and
 * double quotes are ignored. The remaining pages are requested concurrently (30 s timeout
 * each), cleaned with `cheerioToHtml` and passed through `htmlToMarkdown`.
 * A page whose download or conversion throws is logged and contributes no entry, as does
 * any page whose converted content is empty.
 *
 * @param urlList - Candidate page addresses.
 * @param selector - Optional selector forwarded to `cheerioToHtml`.
 * @returns One `{ url, content }` entry per page that produced non-empty content,
 *          in the order of the accepted URLs.
 */
export const urlsFetch = async ({
  urlList,
  selector
}: UrlFetchParams): Promise<UrlFetchResponse> => {
  const acceptedUrls = urlList.filter((url) => /^(http|https):\/\/[^ "]+$/.test(url));

  // Fetch + convert a single page; never rejects, failures yield empty content.
  const fetchPage = async (url: string) => {
    try {
      const { data } = await axios.get(url, {
        timeout: 30000
      });
      const html = cheerioToHtml({
        fetchUrl: url,
        $: cheerio.load(data),
        selector
      });
      return { url, content: htmlToMarkdown(html) };
    } catch (error) {
      console.log(error, 'fetch error');
      return { url, content: '' };
    }
  };

  const pages = await Promise.all(acceptedUrls.map((url) => fetchPage(url)));

  // keep only pages that actually yielded content
  return pages.filter((page) => page.content);
};