This commit is contained in:
2026-03-26 15:04:59 +08:00
commit e0af97ac7f
65 changed files with 7366 additions and 0 deletions

View File

@@ -0,0 +1,228 @@
import datetime
import json
import logging
import time

import requests
import urllib3
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
# Suppress the warnings urllib3 emits (the original note says: "disable SSL warnings").
urllib3.disable_warnings()
class WeiXinCrawler(Crawler):
    """Crawler for trending WeChat content.

    The primary source is the WeChat "Kan Yi Kan" (看一看) page. When that
    yields nothing, the WeRead (微信读书) book listing is used as a fallback.

    Every scraped item is a dict with the keys ``title``, ``url``,
    ``content``, ``source`` (always ``'weixin'``) and ``publish_time``
    (the local scrape time formatted as ``YYYY-MM-DD HH:MM:SS``).
    """

    _log = logging.getLogger(__name__)

    _KANKAN_URL = "https://k.weixin.qq.com/"
    _WEREAD_URL = "https://weread.qq.com/web/category/all"
    _MAX_ITEMS = 20           # keep at most this many items per source
    _SNIPPET_CHARS = 50       # summary / intro text is truncated to this length
    _TAB_SETTLE_SECONDS = 3   # pause after clicking a tab so content can load
    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def fetch(self, date_str):
        """Fetch trending WeChat items and cache them.

        The sources are tried in order (Kan Yi Kan first, then WeRead); the
        first non-empty result is JSON-encoded and written with
        ``cache.hset(date_str, <crawler name>, <json>)`` before being
        returned.

        Args:
            date_str: Passed to ``cache.hset`` as the hash name.

        Returns:
            A list of item dicts, or ``[]`` when every source came back
            empty or any step (including the cache write) raised.
        """
        browser_manager = BrowserManager()
        sources = (
            self._fetch_from_weixin_kankan,
            self._fetch_from_weixin_dushu,
        )
        try:
            for fetch_source in sources:
                result = fetch_source(browser_manager)
                if result:
                    cache.hset(
                        date_str,
                        self.crawler_name(),
                        json.dumps(result, ensure_ascii=False),
                    )
                    return result
        except Exception:
            # Best-effort crawler: callers get an empty list rather than an
            # exception, but the failure is no longer invisible.
            self._log.exception("WeChat crawl failed for %s", date_str)
        return []

    def _fetch_from_weixin_kankan(self, browser_manager):
        """Scrape trending articles from the WeChat Kan Yi Kan page.

        Args:
            browser_manager: Object whose ``get_page_content(url, wait_time=...)``
                returns a ``(page_source, driver)`` pair.

        Returns:
            Up to ``_MAX_ITEMS`` item dicts; ``[]`` if the page could not be
            loaded or queried.
        """
        try:
            # Only the live driver is used; the page source is not needed.
            _, driver = browser_manager.get_page_content(self._KANKAN_URL, wait_time=10)

            # Wait for the hot section, but carry on if it never appears.
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
                )
            except Exception:
                self._log.debug("'.hot' element did not appear on %s", self._KANKAN_URL)

            # Switch to the "热点" (hot topics) tab when it exists.
            self._click_tab(driver, "//div[contains(text(), '热点') and @class='tab']")

            scraped_at = datetime.datetime.now().strftime(self._TIME_FORMAT)

            # Try progressively more generic selectors until one matches.
            articles = []
            for selector in (".article-item", ".doc-item", ".item"):
                articles = driver.find_elements(By.CSS_SELECTOR, selector)
                if articles:
                    break

            result = []
            for article in articles:
                try:
                    # An element without a title node is not an article.
                    title = article.find_element(By.CSS_SELECTOR, "h3, .title").text.strip()
                    link = self._resolve_link(
                        article,
                        "data-id",
                        "https://k.weixin.qq.com/article?id={}",
                        self._KANKAN_URL,
                    )
                    source = self._text_or_empty(article, ".account, .source")
                    summary = self._text_or_empty(article, ".desc, .summary, p")
                except Exception:
                    self._log.debug("Skipping unreadable Kan Yi Kan item", exc_info=True)
                    continue

                result.append({
                    'title': title,
                    'url': link,
                    'content': f"来源: {source} | 摘要: {summary[:self._SNIPPET_CHARS] if summary else '无摘要'}",
                    'source': 'weixin',
                    'publish_time': scraped_at,
                })
                if len(result) >= self._MAX_ITEMS:
                    break
            return result
        except Exception:
            self._log.warning("Failed to scrape %s", self._KANKAN_URL, exc_info=True)
            return []

    def _fetch_from_weixin_dushu(self, browser_manager):
        """Scrape popular books from the WeRead category page.

        Args:
            browser_manager: Object whose ``get_page_content(url, wait_time=...)``
                returns a ``(page_source, driver)`` pair.

        Returns:
            Up to ``_MAX_ITEMS`` item dicts whose titles are prefixed with
            ``热门书籍: ``; ``[]`` if the page could not be loaded or queried.
        """
        try:
            _, driver = browser_manager.get_page_content(self._WEREAD_URL, wait_time=8)
            scraped_at = datetime.datetime.now().strftime(self._TIME_FORMAT)

            # Switch to the "排行榜" (ranking) tab when it exists.
            self._click_tab(driver, "//a[contains(text(), '排行榜')]")

            books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")

            result = []
            for book in books:
                try:
                    title = book.find_element(By.CSS_SELECTOR, ".title, h3").text.strip()
                    link = self._resolve_link(
                        book,
                        "data-bid",
                        "https://weread.qq.com/web/reader/{}",
                        self._WEREAD_URL,
                    )
                    author = self._text_or_empty(book, ".author, .writer")
                    intro = self._text_or_empty(book, ".intro, .desc")
                except Exception:
                    self._log.debug("Skipping unreadable WeRead item", exc_info=True)
                    continue

                result.append({
                    'title': f"热门书籍: {title}",
                    'url': link,
                    'content': f"作者: {author} | 简介: {intro[:self._SNIPPET_CHARS] if intro else '无简介'}",
                    'source': 'weixin',
                    'publish_time': scraped_at,
                })
                if len(result) >= self._MAX_ITEMS:
                    break
            return result
        except Exception:
            self._log.warning("Failed to scrape %s", self._WEREAD_URL, exc_info=True)
            return []

    def _click_tab(self, driver, xpath):
        """Click the element matching ``xpath`` and wait for content to load.

        Returns ``True`` if the click happened, ``False`` if the element was
        missing or not clickable (the caller then scrapes the current view).
        """
        try:
            driver.find_element(By.XPATH, xpath).click()
        except Exception:
            self._log.debug("Tab %s not found or not clickable", xpath)
            return False
        time.sleep(self._TAB_SETTLE_SECONDS)
        return True

    def _text_or_empty(self, element, css_selector):
        """Return the stripped text of the first match, or ``""`` if none."""
        try:
            return element.find_element(By.CSS_SELECTOR, css_selector).text.strip()
        except Exception:
            return ""

    def _resolve_link(self, element, id_attribute, link_template, fallback_url):
        """Work out the best URL for a scraped list element.

        Preference order: the ``href`` of the first ``<a>`` inside the
        element; then ``link_template`` filled with the element's
        ``id_attribute`` (or plain ``id``) value; then ``fallback_url``.
        A missing attribute never produces a link containing ``None``.
        """
        try:
            href = element.find_element(By.TAG_NAME, "a").get_attribute("href")
        except Exception:
            href = None
        if href:
            return href

        try:
            item_id = element.get_attribute(id_attribute) or element.get_attribute("id")
        except Exception:
            item_id = None
        if item_id:
            return link_template.format(item_id)
        return fallback_url

    def crawler_name(self):
        """Return the identifier used as this crawler's cache field name."""
        return "weixin"