import json
import datetime
import time
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from ...core import cache
from .crawler import Crawler
from ..browser_manager import BrowserManager

# Suppress warnings about unverified HTTPS requests
urllib3.disable_warnings()

class WeiXinCrawler(Crawler):
    """
    WeChat trending-content crawler.

    Scrapes the WeChat Top Stories (看一看) page for trending articles,
    falling back to popular books on WeChat Reading (微信读书).
    """

    def fetch(self, date_str):
        """Fetch trending WeChat content and cache it under date_str."""
        browser_manager = BrowserManager()

        try:
            # First try the WeChat Top Stories page
            result = self._fetch_from_weixin_kankan(browser_manager)
            if result:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

            # If Top Stories failed, try popular book reviews on WeChat Reading
            result = self._fetch_from_weixin_dushu(browser_manager)
            if result:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

        except Exception:
            # Swallow any error and return an empty list
            return []

        # Every source failed; return an empty list
        return []

    def _fetch_from_weixin_kankan(self, browser_manager):
        """Fetch trending articles from the WeChat Top Stories (看一看) page."""
        url = "https://k.weixin.qq.com/"

        try:
            # Load the page; get_page_content returns the HTML and a live driver
            page_source, driver = browser_manager.get_page_content(url, wait_time=10)

            # Wait for the trending section to render
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
                )
            except Exception:
                # If the wait times out, still try to scrape what rendered
                pass

            # Click the "热点" (hot topics) tab to switch to trending content;
            # the Chinese literal must match the on-page tab label
            try:
                hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']")
                hot_tab.click()
                time.sleep(3)  # wait for the tab content to load
            except Exception:
                # Tab not found: scrape whatever the current page shows
                pass

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Collect the article list, falling back to looser selectors
            articles = driver.find_elements(By.CSS_SELECTOR, ".article-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".item")

            for article in articles:
                try:
                    # Article title
                    title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title")
                    title = title_elem.text.strip()

                    # Article link
                    link = None
                    try:
                        link_elem = article.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        # No direct link: fall back to building one from the article id
                        try:
                            article_id = article.get_attribute("data-id") or article.get_attribute("id")
                            link = f"https://k.weixin.qq.com/article?id={article_id}"
                        except Exception:
                            link = "https://k.weixin.qq.com/"

                    # Source account
                    source = ""
                    try:
                        source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source")
                        source = source_elem.text.strip()
                    except Exception:
                        pass

                    # Summary
                    summary = ""
                    try:
                        summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p")
                        summary = summary_elem.text.strip()
                    except Exception:
                        pass

                    # Chinese field labels ("来源" = source, "摘要" = summary) are
                    # part of the stored output format and are kept as-is
                    news = {
                        'title': title,
                        'url': link,
                        'content': f"来源: {source} | 摘要: {summary[:50] if summary else '无摘要'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Keep only the first 20 items
                    if len(result) >= 20:
                        break

                except Exception:
                    continue

            return result

        except Exception:
            return []

    def _fetch_from_weixin_dushu(self, browser_manager):
        """Fetch popular books from WeChat Reading (微信读书)."""
        url = "https://weread.qq.com/web/category/all"

        try:
            # Load the page; get_page_content returns the HTML and a live driver
            page_source, driver = browser_manager.get_page_content(url, wait_time=8)

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Try to switch to the "排行榜" (rankings) tab; the Chinese literal
            # must match the on-page link text
            try:
                rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]")
                rank_tab.click()
                time.sleep(3)  # wait for the tab content to load
            except Exception:
                # Tab not found: scrape whatever the current page shows
                pass

            # Collect the list of popular books
            books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")

            for book in books:
                try:
                    # Book title
                    title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3")
                    title = title_elem.text.strip()

                    # Book link, with a fallback built from the book id
                    link = "https://weread.qq.com/web/category/all"
                    try:
                        link_elem = book.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        book_id = book.get_attribute("data-bid") or book.get_attribute("id")
                        if book_id:
                            link = f"https://weread.qq.com/web/reader/{book_id}"

                    # Author
                    author = ""
                    try:
                        author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer")
                        author = author_elem.text.strip()
                    except Exception:
                        pass

                    # Blurb / introduction
                    intro = ""
                    try:
                        intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc")
                        intro = intro_elem.text.strip()
                    except Exception:
                        pass

                    # Chinese labels ("热门书籍" = popular book, "作者" = author,
                    # "简介" = introduction) are part of the stored output format
                    news = {
                        'title': f"热门书籍: {title}",
                        'url': link,
                        'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Keep only the first 20 items
                    if len(result) >= 20:
                        break

                except Exception:
                    continue

            return result

        except Exception:
            return []

    def crawler_name(self):
        return "weixin"
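

# A minimal usage sketch (illustrative, not part of the original module).
# It assumes WeiXinCrawler can be constructed with no arguments and that
# this file is executed as a module inside its package (e.g. via
# `python -m`), since the relative imports above fail when the file is
# run directly as a script. Both points are assumptions, not guarantees.
if __name__ == "__main__":
    crawler = WeiXinCrawler()
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    items = crawler.fetch(today)  # list of news dicts, or [] on failure
    for item in items[:5]:
        print(item['title'], '->', item['url'])
    # The cached payload could later be read back roughly like this,
    # assuming the cache object exposes an hget() counterpart to hset():
    # raw = cache.hget(today, crawler.crawler_name())
    # cached_items = json.loads(raw) if raw else []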