Create Python API (#457)

* Update image

* Update image info

* Update image info

* Create openai_api.py

* Create requirements.txt

* Create README.md

* Add Python interface

* Delete python directory

* Create README.md

* Create Python API

* Restructure files

* Restructure files
Authored by 不做了睡大觉 on 2023-11-09 11:52:53 +08:00; committed by GitHub
parent 8bb5588305
commit 9f889d8806
9 changed files with 467 additions and 0 deletions


@@ -0,0 +1,90 @@
import requests
import bs4
import nltk
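# nltk.sent_tokenize / nltk.word_tokenize rely on the "punkt" tokenizer data;
# run nltk.download("punkt") once if it is not already installed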
from urllib.parse import urljoin
import time
import math
# Global variable recording the script's start time
start_time = time.time()
# Maximum allowed run time in seconds (used by the commented-out check in helper)
max_run_time = 20
# A simple IDF (inverse document frequency) calculator
class SimpleIDFCalculator:
    def __init__(self):
        self.doc_freq = {}  # word -> number of documents containing it
        self.num_docs = 0

    def add_document(self, doc):
        self.num_docs += 1
        # A set ensures each word counts at most once per document
        words = set(nltk.word_tokenize(doc))
        for word in words:
            if word in self.doc_freq:
                self.doc_freq[word] += 1
            else:
                self.doc_freq[word] = 1

    def idf(self, word):
        # Smoothed IDF: the +1 in the denominator avoids division by zero for unseen words
        return math.log(self.num_docs / (1 + self.doc_freq.get(word, 0)))
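# A hand-worked example of the formula above: with 10 documents indexed and a
# word appearing in 4 of them, idf = log(10 / (1 + 4)) = log(2) ≈ 0.693, while a
# word present in every document gets log(10 / 11) ≈ -0.095, so ubiquitous words
# are effectively penalized.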
# Fetch pages starting from url, summarize each one, and follow links up to `level` hops
def get_summary(url, level):
result = []
visited = set()
idf_calculator = SimpleIDFCalculator()
helper(url, level, result, visited, idf_calculator)
return result
# Recursive helper: summarize url, then recurse into its links with level - 1
def helper(url, level, result, visited, idf_calculator):
    # Optional: stop once the run-time budget is exhausted (currently disabled)
    # if time.time() - start_time > max_run_time:
    #     print("Reached max run time, exiting...")
    #     return
    # Stop at depth 0, on already-visited pages, and on non-http(s) links
    if level == 0 or url in visited or not url.startswith("http"):
        return
    visited.add(url)
try:
        response = requests.get(url, timeout=10)  # bound the request so a slow host cannot hang the crawl
if response.status_code != 200:
return
soup = bs4.BeautifulSoup(response.text, "html.parser")
title = soup.title.string if soup.title else 'No Title'
text = soup.get_text().strip()
idf_calculator.add_document(text)
        sentences = nltk.sent_tokenize(text)
        words = nltk.word_tokenize(text)
        if not words:  # avoid a division by zero on pages with no tokenizable text
            return
        # Score each sentence by the summed TF-IDF of its words
        scores = {}
        for sentence in sentences:
            for word in nltk.word_tokenize(sentence):
                tf = words.count(word) / len(words)
                idf = idf_calculator.idf(word)
                scores[sentence] = scores.get(sentence, 0) + (tf * idf)
        # Join the ten highest-scoring sentences into the summary
        summary = " ".join(sorted(scores, key=scores.get, reverse=True)[:10])
        result.append((url, title, summary))
        time.sleep(1)  # simple delay to prevent aggressive crawling
links = soup.find_all("a")
for link in links:
href = link.get("href")
if href:
# Handle relative links
next_url = urljoin(url, href)
helper(next_url, level - 1, result, visited, idf_calculator)
except Exception as e:
print(f"Error processing {url}: {e}")
# Example invocation, kept commented out:
# summary = get_summary('https://zhihu.com', 2)
# print(summary)
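# A minimal usage sketch (the URL and depth are illustrative): get_summary
# returns a list of (url, title, summary) tuples, one per page visited.
# if __name__ == "__main__":
#     for url, title, summary in get_summary("https://example.com", 1):
#         print(f"{title} ({url}):\n{summary}\n")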