mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
4.6.3-website dataset (#532)
This commit is contained in:
8
packages/global/common/file/api.d.ts
vendored
Normal file
8
packages/global/common/file/api.d.ts
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
export type UrlFetchParams = {
|
||||
urlList: string[];
|
||||
selector?: string;
|
||||
};
|
||||
export type UrlFetchResponse = {
|
||||
url: string;
|
||||
content: string;
|
||||
}[];
|
@@ -1,3 +1,8 @@
|
||||
import axios from 'axios';
|
||||
import { UrlFetchParams, UrlFetchResponse } from './api.d';
|
||||
import { htmlToMarkdown } from '../string/markdown';
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
export const formatFileSize = (bytes: number): string => {
|
||||
if (bytes === 0) return '0 B';
|
||||
|
||||
@@ -7,3 +12,84 @@ export const formatFileSize = (bytes: number): string => {
|
||||
|
||||
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
|
||||
};
|
||||
|
||||
export const cheerioToHtml = ({
|
||||
fetchUrl,
|
||||
$,
|
||||
selector
|
||||
}: {
|
||||
fetchUrl: string;
|
||||
$: cheerio.CheerioAPI;
|
||||
selector?: string;
|
||||
}) => {
|
||||
// get origin url
|
||||
const originUrl = new URL(fetchUrl).origin;
|
||||
|
||||
// remove i element
|
||||
$('i,script').remove();
|
||||
|
||||
// remove empty a element
|
||||
$('a')
|
||||
.filter((i, el) => {
|
||||
return $(el).text().trim() === '' && $(el).children().length === 0;
|
||||
})
|
||||
.remove();
|
||||
|
||||
// if link,img startWith /, add origin url
|
||||
$('a').each((i, el) => {
|
||||
const href = $(el).attr('href');
|
||||
if (href && href.startsWith('/')) {
|
||||
$(el).attr('href', originUrl + href);
|
||||
}
|
||||
});
|
||||
$('img').each((i, el) => {
|
||||
const src = $(el).attr('src');
|
||||
if (src && src.startsWith('/')) {
|
||||
$(el).attr('src', originUrl + src);
|
||||
}
|
||||
});
|
||||
|
||||
return $(selector || 'body').html();
|
||||
};
|
||||
export const urlsFetch = async ({
|
||||
urlList,
|
||||
selector
|
||||
}: UrlFetchParams): Promise<UrlFetchResponse> => {
|
||||
urlList = urlList.filter((url) => /^(http|https):\/\/[^ "]+$/.test(url));
|
||||
|
||||
const response = (
|
||||
await Promise.all(
|
||||
urlList.map(async (url) => {
|
||||
try {
|
||||
const fetchRes = await axios.get(url, {
|
||||
timeout: 30000
|
||||
});
|
||||
|
||||
const $ = cheerio.load(fetchRes.data);
|
||||
|
||||
const md = htmlToMarkdown(
|
||||
cheerioToHtml({
|
||||
fetchUrl: url,
|
||||
$,
|
||||
selector
|
||||
})
|
||||
);
|
||||
|
||||
return {
|
||||
url,
|
||||
content: md
|
||||
};
|
||||
} catch (error) {
|
||||
console.log(error, 'fetch error');
|
||||
|
||||
return {
|
||||
url,
|
||||
content: ''
|
||||
};
|
||||
}
|
||||
})
|
||||
)
|
||||
).filter((item) => item.content);
|
||||
|
||||
return response;
|
||||
};
|
||||
|
Reference in New Issue
Block a user