Files
FastGPT/python/api/services/fetch.py
不做了睡大觉 9f889d8806 Create Python API (#457)
* 更新镜像

* 更新镜像信息

* 更新镜像信息

* Create openai_api.py

* Create requirements.txt

* Create README.md

* 添加python接口

* Delete python directory

* Create README.md

* Create Python API

* 文件结构化

* 文件结构化
2023-11-09 11:52:53 +08:00

91 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import math
import time
from collections import Counter
from time import sleep
from urllib.parse import urljoin

import bs4
import nltk
import requests
# 全局变量来记录开始时间
start_time = time.time()
# 你可以设定一个最大运行时长比如60秒
max_run_time = 20
# 添加一个简单的IDF计算器
class SimpleIDFCalculator:
def __init__(self):
self.doc_freq = {}
self.num_docs = 0
def add_document(self, doc):
self.num_docs += 1
words = set(nltk.word_tokenize(doc))
for word in words:
if word in self.doc_freq:
self.doc_freq[word] += 1
else:
self.doc_freq[word] = 1
def idf(self, word):
return math.log(self.num_docs / (1 + self.doc_freq.get(word, 0)))
# 定义一个函数,用于获取网页的内容,并进行总结
def get_summary(url, level):
result = []
visited = set()
idf_calculator = SimpleIDFCalculator()
helper(url, level, result, visited, idf_calculator)
return result
# 辅助函数
def helper(url, level, result, visited, idf_calculator):
# # 检查是否超出运行时间限制
# if time.time() - start_time > max_run_time:
# print("Reached max run time, exiting...")
# return
if level == 0 or url in visited or not url.startswith("http"):
return
visited.add(url)
try:
response = requests.get(url)
if response.status_code != 200:
return
soup = bs4.BeautifulSoup(response.text, "html.parser")
title = soup.title.string if soup.title else 'No Title'
text = soup.get_text().strip()
idf_calculator.add_document(text)
sentences = nltk.sent_tokenize(text)
words = nltk.word_tokenize(text)
scores = {}
for sentence in sentences:
for word in nltk.word_tokenize(sentence):
tf = words.count(word) / len(words)
idf = idf_calculator.idf(word)
scores[sentence] = scores.get(sentence, 0) + (tf * idf)
summary = " ".join(sorted(scores, key=scores.get, reverse=True)[:10])
result.append((url, title, summary))
sleep(1) # Simple delay to prevent aggressive crawling
links = soup.find_all("a")
for link in links:
href = link.get("href")
if href:
# Handle relative links
next_url = urljoin(url, href)
helper(next_url, level - 1, result, visited, idf_calculator)
except Exception as e:
print(f"Error processing {url}: {e}")
# # 主程序部分,仅作为函数调用示例:
# summary = get_summary('https://zhihu.com', 2)
# print(summary)