init
This commit is contained in:
235
app/services/sites/hackernews.py
Normal file
235
app/services/sites/hackernews.py
Normal file
@@ -0,0 +1,235 @@
|
||||
import json
import datetime
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager

# Silence urllib3's SSL warnings globally for this module.
urllib3.disable_warnings()
|
||||
|
||||
class HackerNewsCrawler(Crawler):
    """Crawler for the Hacker News front page (https://news.ycombinator.com/).

    Tries a plain HTTP request first and falls back to a Selenium-driven
    browser when the direct request yields nothing. Successful (non-empty)
    results are cached under the crawl date and this crawler's name.
    """

    # Maximum number of front-page stories collected per fetch.
    MAX_ITEMS = 30

    def fetch(self, date_str):
        """Fetch the current Hacker News front page.

        Args:
            date_str: Cache key (date string) under which results are stored.

        Returns:
            A list of news dicts with keys 'title', 'url', 'content',
            'source' and 'publish_time'; an empty list on failure.
        """
        try:
            # Strategy 1: direct HTTP request (cheap, no browser involved).
            result = self._fetch_with_requests()
            if result:
                self._cache_result(date_str, result)
                return result

            # Strategy 2: fall back to driving a real browser, for cases
            # where the plain request was blocked or returned no items.
            result = self._fetch_with_browser(BrowserManager())
            if result:
                self._cache_result(date_str, result)
                return result
        except Exception:
            # Best-effort crawler: any unexpected error degrades to "no news".
            return []

        # Both strategies produced nothing.
        return []

    def _cache_result(self, date_str, result):
        """Store *result* in the shared cache keyed by date and crawler name."""
        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))

    def _fetch_with_requests(self):
        """Scrape the front page with requests + BeautifulSoup.

        Returns:
            A list of news dicts (at most MAX_ITEMS), or [] when the request
            fails, returns a non-200 status, or nothing could be parsed.
        """
        url = "https://news.ycombinator.com/"

        try:
            response = requests.get(url, headers=self.header, timeout=self.timeout)
            if response.status_code != 200:
                return []

            soup = BeautifulSoup(response.text, 'html.parser')

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Each story row is a <tr class="athing">; its metadata (score,
            # author, comment count) lives in the immediately following <tr>.
            for item in soup.select("tr.athing"):
                try:
                    # The row id links the story to its metadata row.
                    item_id = item.get('id')
                    if not item_id:
                        continue

                    title_element = item.select_one(".titleline a")
                    if not title_element:
                        continue

                    title = title_element.text.strip()
                    link = title_element.get('href')
                    # Self-posts link relatively ("item?id=..."); make absolute.
                    if link and not link.startswith('http'):
                        link = f"https://news.ycombinator.com/{link}"

                    # Source site label, e.g. "example.com" (absent for self-posts).
                    site_element = item.select_one(".sitestr")
                    site = site_element.text.strip() if site_element else ""

                    metadata = item.find_next_sibling('tr')
                    if not metadata:
                        continue

                    score_element = metadata.select_one(".score")
                    score = score_element.text.strip() if score_element else "0 points"

                    user_element = metadata.select_one(".hnuser")
                    user = user_element.text.strip() if user_element else "unknown"

                    # The last link in the metadata row is the comments link;
                    # HN shows "discuss" when a story has no comments yet.
                    comments_element = metadata.select_one("a:last-child")
                    comments = comments_element.text.strip() if comments_element else "0 comments"
                    if "discuss" in comments:
                        comments = "0 comments"

                    content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"

                    result.append({
                        'title': title,
                        'url': link,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    })

                    if len(result) >= self.MAX_ITEMS:
                        break

                except Exception:
                    # Skip malformed rows; keep parsing the rest of the page.
                    continue

            return result

        except Exception:
            return []

    def _fetch_with_browser(self, browser_manager):
        """Scrape the front page by driving a real browser via Selenium.

        Args:
            browser_manager: BrowserManager supplying a Selenium driver.

        Returns:
            A list of news dicts (at most MAX_ITEMS), or [] on failure.
        """
        url = "https://news.ycombinator.com/"

        try:
            # The returned page source is unused: all scraping below queries
            # the live DOM through the driver.
            _, driver = browser_manager.get_page_content(url, wait_time=5)

            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".athing"))
                )
            except Exception:
                # Wait timed out — still attempt to read whatever has loaded.
                pass

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            for item in driver.find_elements(By.CSS_SELECTOR, "tr.athing"):
                try:
                    # The row id links the story to its metadata row.
                    item_id = item.get_attribute("id")
                    if not item_id:
                        continue

                    title_element = item.find_element(By.CSS_SELECTOR, ".titleline a")
                    title = title_element.text.strip()
                    link = title_element.get_attribute("href")

                    # Source site label (absent for self-posts).
                    site = ""
                    try:
                        site = item.find_element(By.CSS_SELECTOR, ".sitestr").text.strip()
                    except Exception:
                        pass

                    try:
                        # Metadata (score/author/comments) is in the sibling
                        # row immediately after the story row.
                        metadata = driver.find_element(
                            By.XPATH, f"//tr[@id='{item_id}']/following-sibling::tr[1]"
                        )

                        score = "0 points"
                        try:
                            score = metadata.find_element(By.CSS_SELECTOR, ".score").text.strip()
                        except Exception:
                            pass

                        user = "unknown"
                        try:
                            user = metadata.find_element(By.CSS_SELECTOR, ".hnuser").text.strip()
                        except Exception:
                            pass

                        comments = "0 comments"
                        try:
                            comments = metadata.find_element(By.XPATH, ".//a[last()]").text.strip()
                            # "discuss" means the story has no comments yet.
                            if "discuss" in comments:
                                comments = "0 comments"
                        except Exception:
                            pass

                        content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"
                    except Exception:
                        # Metadata row missing — fall back to source site only.
                        content = f"来源: {site}"

                    result.append({
                        'title': title,
                        'url': link,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    })

                    if len(result) >= self.MAX_ITEMS:
                        break

                except Exception:
                    # Skip items whose elements disappeared or failed to parse.
                    continue

            return result

        except Exception:
            return []

    def crawler_name(self):
        """Stable identifier used as the cache field for this crawler."""
        return "hackernews"
|
||||
Reference in New Issue
Block a user