import json
import datetime
import time
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from ...core import cache
from .crawler import Crawler
from ..browser_manager import BrowserManager

# Suppress warnings about unverified HTTPS requests
urllib3.disable_warnings()

class WeiXinCrawler(Crawler):
    """
    WeChat trending-content crawler.

    Scrapes the WeChat Top Stories (看一看) page for trending articles,
    falling back to popular books on WeChat Reading (微信读书).
    """

    def fetch(self, date_str):
        """Fetch trending WeChat content and cache it under date_str."""
        browser_manager = BrowserManager()

        try:
            # First try the WeChat Top Stories page
            result = self._fetch_from_weixin_kankan(browser_manager)
            if result:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

            # If Top Stories failed, try popular book reviews on WeChat Reading
            result = self._fetch_from_weixin_dushu(browser_manager)
            if result:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

        except Exception:
            # Swallow any error and return an empty list
            return []

        # Every source failed; return an empty list
        return []

    def _fetch_from_weixin_kankan(self, browser_manager):
        """Fetch trending articles from the WeChat Top Stories (看一看) page."""
        url = "https://k.weixin.qq.com/"

        try:
            # Load the page; get_page_content returns the HTML and a live driver
            page_source, driver = browser_manager.get_page_content(url, wait_time=10)

            # Wait for the trending section to render
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
                )
            except Exception:
                # If the wait times out, still try to scrape what rendered
                pass

            # Click the "热点" (hot topics) tab to switch to trending content;
            # the Chinese literal must match the on-page tab label
            try:
                hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']")
                hot_tab.click()
                time.sleep(3)  # wait for the tab content to load
            except Exception:
                # Tab not found: scrape whatever the current page shows
                pass

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Collect the article list, falling back to looser selectors
            articles = driver.find_elements(By.CSS_SELECTOR, ".article-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".item")

            for article in articles:
                try:
                    # Article title
                    title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title")
                    title = title_elem.text.strip()

                    # Article link
                    link = None
                    try:
                        link_elem = article.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        # No direct link: fall back to building one from the article id
                        try:
                            article_id = article.get_attribute("data-id") or article.get_attribute("id")
                            link = f"https://k.weixin.qq.com/article?id={article_id}"
                        except Exception:
                            link = "https://k.weixin.qq.com/"

                    # Source account
                    source = ""
                    try:
                        source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source")
                        source = source_elem.text.strip()
                    except Exception:
                        pass

                    # Summary
                    summary = ""
                    try:
                        summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p")
                        summary = summary_elem.text.strip()
                    except Exception:
                        pass

                    # Chinese field labels ("来源" = source, "摘要" = summary) are
                    # part of the stored output format and are kept as-is
                    news = {
                        'title': title,
                        'url': link,
                        'content': f"来源: {source} | 摘要: {summary[:50] if summary else '无摘要'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Keep only the first 20 items
                    if len(result) >= 20:
                        break

                except Exception:
                    continue

            return result

        except Exception:
            return []

    def _fetch_from_weixin_dushu(self, browser_manager):
        """Fetch popular books from WeChat Reading (微信读书)."""
        url = "https://weread.qq.com/web/category/all"

        try:
            # Load the page; get_page_content returns the HTML and a live driver
            page_source, driver = browser_manager.get_page_content(url, wait_time=8)

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Try to switch to the "排行榜" (rankings) tab; the Chinese literal
            # must match the on-page link text
            try:
                rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]")
                rank_tab.click()
                time.sleep(3)  # wait for the tab content to load
            except Exception:
                # Tab not found: scrape whatever the current page shows
                pass

            # Collect the list of popular books
            books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")

            for book in books:
                try:
                    # Book title
                    title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3")
                    title = title_elem.text.strip()

                    # Book link, with a fallback built from the book id
                    link = "https://weread.qq.com/web/category/all"
                    try:
                        link_elem = book.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        book_id = book.get_attribute("data-bid") or book.get_attribute("id")
                        if book_id:
                            link = f"https://weread.qq.com/web/reader/{book_id}"

                    # Author
                    author = ""
                    try:
                        author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer")
                        author = author_elem.text.strip()
                    except Exception:
                        pass

                    # Blurb / introduction
                    intro = ""
                    try:
                        intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc")
                        intro = intro_elem.text.strip()
                    except Exception:
                        pass

                    # Chinese labels ("热门书籍" = popular book, "作者" = author,
                    # "简介" = introduction) are part of the stored output format
                    news = {
                        'title': f"热门书籍: {title}",
                        'url': link,
                        'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Keep only the first 20 items
                    if len(result) >= 20:
                        break

                except Exception:
                    continue

            return result

        except Exception:
            return []

    def crawler_name(self):
        return "weixin"
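

# A minimal usage sketch (illustrative, not part of the original module).
# It assumes WeiXinCrawler can be constructed with no arguments and that
# this file is executed as a module inside its package (e.g. via
# `python -m`), since the relative imports above fail when the file is
# run directly as a script. Both points are assumptions, not guarantees.
if __name__ == "__main__":
    crawler = WeiXinCrawler()
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    items = crawler.fetch(today)  # list of news dicts, or [] on failure
    for item in items[:5]:
        print(item['title'], '->', item['url'])
    # The cached payload could later be read back roughly like this,
    # assuming the cache object exposes an hget() counterpart to hset():
    # raw = cache.hget(today, crawler.crawler_name())
    # cached_items = json.loads(raw) if raw else []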