Files
hot-news-api/app/services/sites/hackernews.py
2026-03-26 15:04:59 +08:00

236 lines
9.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import datetime
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
# 禁用SSL警告
urllib3.disable_warnings()
class HackerNewsCrawler(Crawler):
    """Crawler for the Hacker News front page (https://news.ycombinator.com/).

    Tries a lightweight ``requests`` + BeautifulSoup scrape first and falls
    back to a Selenium-driven browser when that yields nothing.  A successful
    result is cached under the supplied date key via ``cache.hset``.
    """

    # Front-page URL shared by both fetch strategies.
    BASE_URL = "https://news.ycombinator.com/"
    # Maximum number of stories returned by a single fetch.
    MAX_ITEMS = 30

    def fetch(self, date_str):
        """Fetch front-page stories and cache them.

        Args:
            date_str: date string used as the cache hash key.

        Returns:
            list[dict]: stories with ``title``/``url``/``content``/``source``/
            ``publish_time`` keys, or an empty list when every strategy failed.
        """
        try:
            # Strategy 1: plain HTTP request (cheap, no browser needed).
            result = self._fetch_with_requests()
            if result:
                self._cache_result(date_str, result)
                return result
            # Strategy 2: browser automation, for when requests is blocked.
            result = self._fetch_with_browser(BrowserManager())
            if result:
                self._cache_result(date_str, result)
                return result
        except Exception:
            # Best-effort crawler: unexpected failures must not propagate.
            return []
        # Every strategy failed.
        return []

    def _cache_result(self, date_str, result):
        """Store *result* in the cache hash keyed by date and crawler name."""
        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))

    def _fetch_with_requests(self):
        """Scrape the front page with ``requests`` and BeautifulSoup.

        Returns:
            list[dict]: up to ``MAX_ITEMS`` parsed stories; ``[]`` on failure.
        """
        try:
            response = requests.get(self.BASE_URL, headers=self.header, timeout=self.timeout)
            if response.status_code != 200:
                return []
            soup = BeautifulSoup(response.text, 'html.parser')
            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            for item in soup.select("tr.athing"):
                try:
                    news = self._parse_soup_item(item, current_time)
                except Exception:
                    # A malformed row must not abort the whole page.
                    continue
                if news is None:
                    continue
                result.append(news)
                if len(result) >= self.MAX_ITEMS:
                    break
            return result
        except Exception:
            return []

    def _parse_soup_item(self, item, current_time):
        """Parse one ``tr.athing`` BeautifulSoup row into a news dict.

        Returns None when a required piece (id, title link, metadata row)
        is missing from the markup.
        """
        # The id attribute links the title row to its metadata sibling.
        if not item.get('id'):
            return None
        title_element = item.select_one(".titleline a")
        if not title_element:
            return None
        title = title_element.text.strip()
        item_url = title_element.get('href')
        # Self-hosted items use relative hrefs (e.g. "item?id=..."); absolutize.
        if item_url and not item_url.startswith('http'):
            item_url = f"https://news.ycombinator.com/{item_url}"
        site_element = item.select_one(".sitestr")
        site = site_element.text.strip() if site_element else ""
        # Score/author/comments live in the <tr> immediately after the title row.
        metadata = item.find_next_sibling('tr')
        if not metadata:
            return None
        score_element = metadata.select_one(".score")
        score = score_element.text.strip() if score_element else "0 points"
        user_element = metadata.select_one(".hnuser")
        user = user_element.text.strip() if user_element else "unknown"
        comments_element = metadata.select_one("a:last-child")
        comments = comments_element.text.strip() if comments_element else "0 comments"
        # HN renders "discuss" for stories without comments yet.
        if "discuss" in comments:
            comments = "0 comments"
        content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"
        return {
            'title': title,
            'url': item_url,
            'content': content,
            'source': 'hackernews',
            'publish_time': current_time,
        }

    def _fetch_with_browser(self, browser_manager):
        """Scrape the front page through a Selenium-managed browser.

        Args:
            browser_manager: BrowserManager providing the shared driver.

        Returns:
            list[dict]: up to ``MAX_ITEMS`` parsed stories; ``[]`` on failure.
        """
        try:
            page_source, driver = browser_manager.get_page_content(self.BASE_URL, wait_time=5)
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".athing"))
                )
            except Exception:
                # Wait timed out; the rows may still be present, keep going.
                pass
            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            for item in driver.find_elements(By.CSS_SELECTOR, "tr.athing"):
                try:
                    news = self._parse_driver_item(driver, item, current_time)
                except Exception:
                    # Skip rows that fail to parse instead of aborting.
                    continue
                if news is None:
                    continue
                result.append(news)
                if len(result) >= self.MAX_ITEMS:
                    break
            return result
        except Exception:
            return []

    def _parse_driver_item(self, driver, item, current_time):
        """Parse one Selenium ``tr.athing`` element into a news dict.

        Returns None when the row carries no id attribute.
        """
        item_id = item.get_attribute("id")
        if not item_id:
            return None
        title_element = item.find_element(By.CSS_SELECTOR, ".titleline a")
        title = title_element.text.strip()
        item_url = title_element.get_attribute("href")
        site = ""
        try:
            site = item.find_element(By.CSS_SELECTOR, ".sitestr").text.strip()
        except Exception:
            # External-source label is optional (missing for self posts).
            pass
        try:
            # Metadata is the sibling <tr> directly after the title row.
            metadata = driver.find_element(By.XPATH, f"//tr[@id='{item_id}']/following-sibling::tr[1]")
            score = "0 points"
            try:
                score = metadata.find_element(By.CSS_SELECTOR, ".score").text.strip()
            except Exception:
                pass
            user = "unknown"
            try:
                user = metadata.find_element(By.CSS_SELECTOR, ".hnuser").text.strip()
            except Exception:
                pass
            comments = "0 comments"
            try:
                comments = metadata.find_element(By.XPATH, ".//a[last()]").text.strip()
                # HN renders "discuss" for stories without comments yet.
                if "discuss" in comments:
                    comments = "0 comments"
            except Exception:
                pass
            content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"
        except Exception:
            # Metadata row missing: fall back to a source-only summary.
            content = f"来源: {site}"
        return {
            'title': title,
            'url': item_url,
            'content': content,
            'source': 'hackernews',
            'publish_time': current_time,
        }

    def crawler_name(self):
        """Return the unique key identifying this crawler in the cache hash."""
        return "hackernews"