# hot-news-api/app/services/sites/weixin.py
import json
import datetime
import time

import requests
from bs4 import BeautifulSoup
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager

# Suppress SSL warnings
urllib3.disable_warnings()


class WeiXinCrawler(Crawler):
    """
    Crawler for trending WeChat content.

    Scrapes the WeChat "Kan Yi Kan" (看一看) trending page, falling back
    to popular book reviews on WeRead (微信读书) when that fails.
    """
    def fetch(self, date_str):
        """Fetch trending WeChat content."""
        browser_manager = BrowserManager()
        try:
            # First, try the WeChat Kan Yi Kan trending page
            result = self._fetch_from_weixin_kankan(browser_manager)
            if result and len(result) > 0:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result
            # If Kan Yi Kan fails, fall back to popular book reviews on WeRead
            result = self._fetch_from_weixin_dushu(browser_manager)
            if result and len(result) > 0:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result
        except Exception:
            # On any error, return an empty list
            return []
        # All sources failed; return an empty list
        return []
    def _fetch_from_weixin_kankan(self, browser_manager):
        """Fetch trending content from the WeChat Kan Yi Kan page."""
        url = "https://k.weixin.qq.com/"
        try:
            # Load the page
            page_source, driver = browser_manager.get_page_content(url, wait_time=10)
            # Wait for the trending section to load
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
                )
            except Exception:
                # If the wait times out, still try to scrape the page
                pass
            # Click the "热点" (trending) tab to switch to trending content
            try:
                hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']")
                hot_tab.click()
                time.sleep(3)  # Wait for the content to load
            except Exception:
                # If the trending tab is missing, scrape the current page as-is
                pass
            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # Collect the article list, trying progressively more generic selectors
            articles = driver.find_elements(By.CSS_SELECTOR, ".article-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".item")
            for article in articles:
                try:
                    # Extract the article title
                    title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title")
                    title = title_elem.text.strip()
                    # Try to extract the link
                    link = None
                    try:
                        link_elem = article.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        # No direct link; build one from the article id instead
                        try:
                            article_id = article.get_attribute("data-id") or article.get_attribute("id")
                            link = f"https://k.weixin.qq.com/article?id={article_id}"
                        except Exception:
                            link = "https://k.weixin.qq.com/"
                    # Extract the source account
                    source = ""
                    try:
                        source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source")
                        source = source_elem.text.strip()
                    except Exception:
                        pass
                    # Extract the summary
                    summary = ""
                    try:
                        summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p")
                        summary = summary_elem.text.strip()
                    except Exception:
                        pass
                    news = {
                        'title': title,
                        'url': link,
                        'content': f"来源: {source} | 摘要: {summary[:50] if summary else '无摘要'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    }
                    result.append(news)
                    # Keep at most the first 20 items
                    if len(result) >= 20:
                        break
                except Exception:
                    continue
            return result
        except Exception:
            return []
    def _fetch_from_weixin_dushu(self, browser_manager):
        """Fetch popular book reviews from WeRead."""
        url = "https://weread.qq.com/web/category/all"
        try:
            # Load the page
            page_source, driver = browser_manager.get_page_content(url, wait_time=8)
            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # Try to click the "排行榜" (rankings) tab
            try:
                rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]")
                rank_tab.click()
                time.sleep(3)  # Wait for the content to load
            except Exception:
                # If the rankings tab is missing, scrape the current page as-is
                pass
            # Collect the popular book list
            books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")
            for book in books:
                try:
                    # Extract the book title
                    title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3")
                    title = title_elem.text.strip()
                    # Try to extract the link, falling back to one built from the book id
                    link = "https://weread.qq.com/web/category/all"
                    try:
                        link_elem = book.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        book_id = book.get_attribute("data-bid") or book.get_attribute("id")
                        if book_id:
                            link = f"https://weread.qq.com/web/reader/{book_id}"
                    # Extract the author
                    author = ""
                    try:
                        author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer")
                        author = author_elem.text.strip()
                    except Exception:
                        pass
                    # Extract the blurb
                    intro = ""
                    try:
                        intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc")
                        intro = intro_elem.text.strip()
                    except Exception:
                        pass
                    news = {
                        'title': f"热门书籍: {title}",
                        'url': link,
                        'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    }
                    result.append(news)
                    # Keep at most the first 20 items
                    if len(result) >= 20:
                        break
                except Exception:
                    continue
            return result
        except Exception:
            return []
    def crawler_name(self):
        return "weixin"