Files
FastGPT/packages/service/common/string/cheerio.ts
Finley Ge b4238257b6 feat: dynamic website crawler (#2609)
* feat: cheerio returns cheerio.load for pro version website sync

* chore: rename
2024-09-05 14:49:57 +08:00

106 lines
2.3 KiB
TypeScript

import { UrlFetchParams, UrlFetchResponse } from '@fastgpt/global/common/file/api';
import * as cheerio from 'cheerio';
import axios from 'axios';
import { htmlToMarkdown } from './utils';
/**
 * Extract the inner HTML of the selected region of a loaded page, together
 * with a title for the page.
 *
 * The selected DOM is cleaned up in place before serialization:
 * - `<i>` and `<script>` descendants are removed;
 * - `<a>` descendants with no text and no child elements are removed;
 * - `href` of `<a>` and `src` of `<img>` that start with `/` are made absolute
 *   (root-relative paths get the page origin, protocol-relative `//host/...`
 *   values get the page protocol).
 *
 * @param fetchUrl - Absolute URL the document was fetched from; used to resolve links.
 * @param $ - Cheerio instance loaded with the document. It is mutated.
 * @param selector - Selector of the region to extract; defaults to `body`.
 * @returns The joined inner HTML of every matched element, the page title
 *   (`<title>` text, else the first `<h1>` text, else `fetchUrl`) and the
 *   selector that was actually used.
 * @throws TypeError if `fetchUrl` is not a valid absolute URL.
 */
export const cheerioToHtml = ({
  fetchUrl,
  $,
  selector
}: {
  fetchUrl: string;
  $: cheerio.CheerioAPI;
  selector?: string;
}) => {
  // get origin url and protocol of the fetched page
  const { origin: originUrl, protocol } = new URL(fetchUrl);

  // Turn a value starting with "/" into an absolute URL.
  // "//host/path" is protocol-relative and must NOT get the origin prepended,
  // otherwise it becomes "https://site//host/path" and the link breaks.
  const toAbsolute = (value: string): string => {
    if (value.startsWith('//')) return protocol + value;
    if (value.startsWith('/')) return originUrl + value;
    return value;
  };

  const usedSelector = selector || 'body';
  const selectDom = $(usedSelector);

  // remove i and script elements
  selectDom.find('i,script').remove();

  // remove empty a element
  selectDom
    .find('a')
    .filter((i, el) => {
      return $(el).text().trim() === '' && $(el).children().length === 0;
    })
    .remove();

  // if link,img startWith /, make the url absolute
  selectDom.find('a').each((i, el) => {
    const href = $(el).attr('href');
    if (href && href.startsWith('/')) {
      $(el).attr('href', toAbsolute(href));
    }
  });
  selectDom.find('img').each((i, el) => {
    const src = $(el).attr('src');
    if (src && src.startsWith('/')) {
      $(el).attr('src', toAbsolute(src));
    }
  });

  // the selector may match several elements: serialize each and join them
  const html = selectDom
    .map((item, dom) => {
      return $(dom).html();
    })
    .get()
    .join('\n');

  const title = $('head title').text() || $('h1:first').text() || fetchUrl;

  return {
    html,
    title,
    usedSelector
  };
};
/**
 * Fetch a list of pages and convert the selected region of each to markdown.
 *
 * Entries of `urlList` that do not look like an http(s) URL are dropped.
 * The remaining URLs are fetched concurrently; a URL whose fetch or
 * conversion fails is logged and reported with empty `title`, `content`
 * and `selector` instead of rejecting the whole call.
 *
 * @param urlList - Candidate URLs to fetch.
 * @param selector - Optional selector forwarded to `cheerioToHtml`.
 * @returns One result per accepted URL, in the order of the accepted URLs.
 */
export const urlsFetch = async ({
  urlList,
  selector
}: UrlFetchParams): Promise<UrlFetchResponse> => {
  const httpUrlPattern = /^(http|https):\/\/[^ "]+$/;
  const acceptedUrls = urlList.filter((candidate) => httpUrlPattern.test(candidate));

  // Fetch and convert a single page; never rejects.
  const fetchOne = async (url: string) => {
    try {
      const { data } = await axios.get(url, {
        timeout: 30000
      });

      const page = cheerioToHtml({
        fetchUrl: url,
        $: cheerio.load(data),
        selector
      });
      const content = await htmlToMarkdown(page.html);

      return {
        url,
        title: page.title,
        content,
        selector: page.usedSelector
      };
    } catch (error) {
      console.log(error, 'fetch error');

      return {
        url,
        title: '',
        content: '',
        selector: ''
      };
    }
  };

  return Promise.all(acceptedUrls.map((url) => fetchOne(url)));
};
/**
 * Parse a markup string with cheerio.
 *
 * @param content - Markup to load.
 * @returns A promise resolving to the value returned by `cheerio.load(content)`.
 */
export const loadContentByCheerio = async (content: string) => {
  const loaded = cheerio.load(content);
  return loaded;
};