import json
import datetime
import requests
from bs4 import BeautifulSoup
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from ...core import cache
from .crawler import Crawler
from ..browser_manager import BrowserManager

# Disable SSL warnings
urllib3.disable_warnings()
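
# Front-page markup, simplified for reference (each story row "tr.athing" is
# followed by a metadata row; the selectors below rely on this pairing):
#
#   <tr class="athing" id="39123456">
#     <td class="title"><span class="titleline"><a href="https://example.com/post">Title</a>
#       <span class="sitebit comhead">(<span class="sitestr">example.com</span>)</span></span></td>
#   </tr>
#   <tr><td class="subtext">
#     <span class="score">123 points</span> by <a class="hnuser">someuser</a> | <a>45 comments</a>
#   </td></tr>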


class HackerNewsCrawler(Crawler):
    """Crawler for the Hacker News front page (https://news.ycombinator.com/)."""

    def fetch(self, date_str):
        try:
            # Try a direct HTTP request first; it is faster and lighter than a browser.
            result = self._fetch_with_requests()
            if result:
                # Cache the day's results, keyed by crawler name.
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

            # The direct request returned nothing; fall back to a simulated browser.
            browser_manager = BrowserManager()
            result = self._fetch_with_browser(browser_manager)
            if result:
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

        except Exception:
            # On any error, return an empty list.
            return []

        # Both strategies failed; return an empty list.
        return []
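
    # Cache layout sketch (assumes cache exposes a Redis-style hash API, as the
    # hset call above suggests): one hash per date, one field per crawler, with
    # the JSON-encoded story list as the value, e.g.
    #   cache.hset("2024-05-01", "hackernews", '[{"title": ...}, ...]')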

    def _fetch_with_requests(self):
        """Fetch Hacker News content with a direct HTTP request."""
        url = "https://news.ycombinator.com/"

        try:
            # Send the HTTP request.
            response = requests.get(url, headers=self.header, timeout=self.timeout)
            if response.status_code != 200:
                return []

            # Parse the HTML.
            soup = BeautifulSoup(response.text, 'html.parser')

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Collect all story rows.
            items = soup.select("tr.athing")

            for item in items:
                try:
                    # The row id ties a story to its metadata row.
                    item_id = item.get('id')
                    if not item_id:
                        continue

                    # Title and link.
                    title_element = item.select_one(".titleline a")
                    if not title_element:
                        continue

                    title = title_element.text.strip()
                    link = title_element.get('href')

                    # Self posts (e.g. Ask HN) use relative links such as "item?id=...".
                    if link and not link.startswith('http'):
                        link = f"https://news.ycombinator.com/{link}"

                    # Source site.
                    site_element = item.select_one(".sitestr")
                    site = site_element.text.strip() if site_element else ""

                    # The next tr holds the metadata (score, user, time, comments).
                    metadata = item.find_next_sibling('tr')
                    if not metadata:
                        continue

                    # Score.
                    score_element = metadata.select_one(".score")
                    score = score_element.text.strip() if score_element else "0 points"

                    # Author.
                    user_element = metadata.select_one(".hnuser")
                    user = user_element.text.strip() if user_element else "unknown"

                    # Comment count; stories with no comments link as "discuss".
                    comments_element = metadata.select_one("a:last-child")
                    comments = comments_element.text.strip() if comments_element else "0 comments"
                    if "discuss" in comments:
                        comments = "0 comments"

                    # Build the content summary.
                    content = f"Source: {site} | Score: {score} | Author: {user} | Comments: {comments}"

                    news = {
                        'title': title,
                        'url': link,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Keep only the first 30 stories.
                    if len(result) >= 30:
                        break

                except Exception:
                    continue

            return result

        except Exception:
            return []
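
    # Each returned item is a plain dict; the values below are illustrative only:
    #   {'title': 'Show HN: ...',
    #    'url': 'https://example.com/post',
    #    'content': 'Source: example.com | Score: 123 points | Author: someuser | Comments: 45 comments',
    #    'source': 'hackernews',
    #    'publish_time': '2024-05-01 12:00:00'}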

    def _fetch_with_browser(self, browser_manager):
        """Fetch Hacker News content with a simulated browser."""
        url = "https://news.ycombinator.com/"

        try:
            # Load the page; get_page_content returns the page source and the driver.
            page_source, driver = browser_manager.get_page_content(url, wait_time=5)

            # Wait for the story rows to load.
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".athing"))
                )
            except Exception:
                # Even if the wait times out, still try to scrape what loaded.
                pass

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Collect all story rows.
            items = driver.find_elements(By.CSS_SELECTOR, "tr.athing")

            for item in items:
                try:
                    # The row id ties a story to its metadata row.
                    item_id = item.get_attribute("id")
                    if not item_id:
                        continue

                    # Title and link (Selenium resolves relative hrefs to absolute URLs).
                    title_element = item.find_element(By.CSS_SELECTOR, ".titleline a")
                    title = title_element.text.strip()
                    link = title_element.get_attribute("href")

                    # Source site.
                    site = ""
                    try:
                        site_element = item.find_element(By.CSS_SELECTOR, ".sitestr")
                        site = site_element.text.strip()
                    except Exception:
                        pass

                    # The next tr holds the metadata (score, user, time, comments).
                    try:
                        metadata = driver.find_element(
                            By.XPATH, f"//tr[@id='{item_id}']/following-sibling::tr[1]"
                        )

                        # Score.
                        score = "0 points"
                        try:
                            score_element = metadata.find_element(By.CSS_SELECTOR, ".score")
                            score = score_element.text.strip()
                        except Exception:
                            pass

                        # Author.
                        user = "unknown"
                        try:
                            user_element = metadata.find_element(By.CSS_SELECTOR, ".hnuser")
                            user = user_element.text.strip()
                        except Exception:
                            pass

                        # Comment count; stories with no comments link as "discuss".
                        comments = "0 comments"
                        try:
                            comments_element = metadata.find_element(By.XPATH, ".//a[last()]")
                            comments = comments_element.text.strip()
                            if "discuss" in comments:
                                comments = "0 comments"
                        except Exception:
                            pass

                        # Build the content summary.
                        content = f"Source: {site} | Score: {score} | Author: {user} | Comments: {comments}"
                    except Exception:
                        content = f"Source: {site}"

                    news = {
                        'title': title,
                        'url': link,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Keep only the first 30 stories.
                    if len(result) >= 30:
                        break

                except Exception:
                    continue

            return result

        except Exception:
            return []

    def crawler_name(self):
        return "hackernews"
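

# Usage sketch (assumes the Crawler base class supplies self.header and
# self.timeout, that cache is configured, and that fetch expects an
# ISO-formatted date string; this is a guess, not a confirmed contract):
#
#   crawler = HackerNewsCrawler()
#   stories = crawler.fetch(datetime.date.today().isoformat())
#   for story in stories[:3]:
#       print(story['title'], '->', story['url'])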