This commit is contained in:
2026-03-26 15:04:59 +08:00
commit e0af97ac7f
65 changed files with 7366 additions and 0 deletions

View File

@@ -0,0 +1,235 @@
import datetime
import json
import logging
import time
from urllib.parse import urljoin

import requests
import urllib3
from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
# Disable urllib3's SSL warnings.
urllib3.disable_warnings()
logger = logging.getLogger(__name__)


class HackerNewsCrawler(Crawler):
    """Crawler for the Hacker News front page.

    ``fetch`` first tries a plain HTTP request parsed with BeautifulSoup and
    only falls back to a Selenium-driven browser when that yields nothing.
    Both strategies produce the same list-of-dicts shape.
    """

    _BASE_URL = "https://news.ycombinator.com/"
    # Upper bound on the number of stories returned by one fetch.
    _MAX_ITEMS = 30
    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
    _DEFAULT_SCORE = "0 points"
    _DEFAULT_USER = "unknown"
    _DEFAULT_COMMENTS = "0 comments"

    def fetch(self, date_str):
        """Fetch front-page stories and cache them.

        Args:
            date_str: First argument passed to ``cache.hset``; the field is
                ``crawler_name()`` and the value is the JSON-encoded result.

        Returns:
            A list of news dicts with the keys ``title``, ``url``,
            ``content``, ``source`` and ``publish_time``; ``[]`` when both
            strategies return nothing or any error is raised.
        """
        try:
            # Cheap path first; the browser is only started when it fails.
            result = self._fetch_with_requests()
            if not result:
                result = self._fetch_with_browser(BrowserManager())
            if result:
                cache.hset(
                    date_str,
                    self.crawler_name(),
                    json.dumps(result, ensure_ascii=False),
                )
                return result
        except Exception:
            # Best-effort contract: callers always get a list back.
            logger.exception("hackernews: fetch failed")
            return []
        return []

    def _fetch_with_requests(self):
        """Download the front page with ``requests`` and parse it.

        Returns:
            Up to ``_MAX_ITEMS`` news dicts, or ``[]`` on a non-200 response
            or any error.
        """
        try:
            response = requests.get(
                self._BASE_URL, headers=self.header, timeout=self.timeout
            )
            if response.status_code != 200:
                logger.warning(
                    "hackernews: unexpected HTTP status %s", response.status_code
                )
                return []

            soup = BeautifulSoup(response.text, 'html.parser')
            fetched_at = datetime.datetime.now().strftime(self._TIME_FORMAT)
            result = []
            for item in soup.select("tr.athing"):
                try:
                    news = self._parse_soup_item(item, fetched_at)
                except Exception:
                    # One malformed row must not discard the whole page.
                    logger.debug("hackernews: skipping row", exc_info=True)
                    continue
                if news is None:
                    continue
                result.append(news)
                if len(result) >= self._MAX_ITEMS:
                    break
            return result
        except Exception:
            logger.exception("hackernews: requests-based fetch failed")
            return []

    def _parse_soup_item(self, item, fetched_at):
        """Turn one ``tr.athing`` BeautifulSoup row into a news dict.

        Returns ``None`` when the row has no id, no title link, or no
        following metadata row.
        """
        if not item.get('id'):
            return None
        title_element = item.select_one(".titleline a")
        if not title_element:
            return None

        title = title_element.text.strip()
        link = title_element.get('href')
        if link:
            # Relative hrefs become absolute; absolute ones pass through
            # unchanged.
            link = urljoin(self._BASE_URL, link)

        site_element = item.select_one(".sitestr")
        site = site_element.text.strip() if site_element else ""

        # Score, author and comment count live in the next table row.
        metadata = item.find_next_sibling('tr')
        if not metadata:
            return None

        score_element = metadata.select_one(".score")
        score = score_element.text.strip() if score_element else self._DEFAULT_SCORE
        user_element = metadata.select_one(".hnuser")
        user = user_element.text.strip() if user_element else self._DEFAULT_USER

        # Take the last anchor of the whole row. ``a:last-child`` would match
        # the first anchor that is the last child of *its own parent*, which
        # can be a link nested in a wrapper element (e.g. a timestamp link)
        # rather than the trailing comments link.
        anchors = metadata.select("a")
        comments = self._comments_label(anchors[-1].text if anchors else "")

        content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"
        return self._build_news(title, link, content, fetched_at)

    def _fetch_with_browser(self, browser_manager):
        """Load the front page through ``browser_manager`` and parse it.

        Args:
            browser_manager: Object whose ``get_page_content(url, wait_time=...)``
                returns a ``(page_source, driver)`` pair.

        Returns:
            Up to ``_MAX_ITEMS`` news dicts, or ``[]`` on any error.
        """
        # NOTE(review): the driver is not closed here; presumably the
        # BrowserManager owns its lifecycle -- confirm.
        try:
            # The page source is unused; elements are read from the driver.
            _page_source, driver = browser_manager.get_page_content(
                self._BASE_URL, wait_time=5
            )
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".athing"))
                )
            except WebDriverException:
                # On timeout still try to read whatever has been rendered.
                logger.debug("hackernews: wait for .athing failed", exc_info=True)

            fetched_at = datetime.datetime.now().strftime(self._TIME_FORMAT)
            result = []
            for item in driver.find_elements(By.CSS_SELECTOR, "tr.athing"):
                try:
                    news = self._parse_browser_item(item, fetched_at)
                except Exception:
                    # One malformed row must not discard the whole page.
                    logger.debug("hackernews: skipping row", exc_info=True)
                    continue
                if news is None:
                    continue
                result.append(news)
                if len(result) >= self._MAX_ITEMS:
                    break
            return result
        except Exception:
            logger.exception("hackernews: browser-based fetch failed")
            return []

    def _parse_browser_item(self, item, fetched_at):
        """Turn one ``tr.athing`` WebElement into a news dict.

        Returns ``None`` when the row has no id. A missing title link raises
        and is handled (row skipped) by the caller.
        """
        if not item.get_attribute("id"):
            return None

        title_element = item.find_element(By.CSS_SELECTOR, ".titleline a")
        title = title_element.text.strip()
        link = title_element.get_attribute("href")
        site = self._element_text(item, ".sitestr", "")

        try:
            # Score, author and comment count live in the next table row.
            metadata = item.find_element(By.XPATH, "./following-sibling::tr[1]")
        except WebDriverException:
            # No metadata row: fall back to the shorter summary.
            content = f"来源: {site}"
        else:
            score = self._element_text(metadata, ".score", self._DEFAULT_SCORE)
            user = self._element_text(metadata, ".hnuser", self._DEFAULT_USER)
            comments = self._comments_label(self._last_link_text(metadata))
            content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"

        return self._build_news(title, link, content, fetched_at)

    @staticmethod
    def _element_text(parent, css_selector, default):
        """Return the stripped text of the first match, or ``default``."""
        try:
            return parent.find_element(By.CSS_SELECTOR, css_selector).text.strip()
        except WebDriverException:
            return default

    @staticmethod
    def _last_link_text(metadata):
        """Return the text of the last anchor under ``metadata``, or ``""``."""
        try:
            anchors = metadata.find_elements(By.CSS_SELECTOR, "a")
            return anchors[-1].text if anchors else ""
        except WebDriverException:
            return ""

    @classmethod
    def _comments_label(cls, link_text):
        """Normalize the trailing metadata link into a comment-count label.

        Whitespace (including non-breaking spaces) is collapsed. Anything
        that does not mention "comment" -- for instance "discuss" on stories
        without comments -- maps to ``"0 comments"``.
        """
        label = " ".join(link_text.split())
        return label if "comment" in label else cls._DEFAULT_COMMENTS

    @staticmethod
    def _build_news(title, url, content, fetched_at):
        """Assemble the news dict shared by both fetch strategies."""
        return {
            'title': title,
            'url': url,
            'content': content,
            'source': 'hackernews',
            'publish_time': fetched_at,
        }

    def crawler_name(self):
        """Return the identifier used as the cache field for this crawler."""
        return "hackernews"