init

2026-03-26 15:04:59 +08:00
commit e0af97ac7f
65 changed files with 7366 additions and 0 deletions
--- a/app/services/sites/weixin.py
+++ b/app/services/sites/weixin.py
@@ -0,0 +1,228 @@
+import datetime
+import json
+import logging
+import time
+
+import requests
+import urllib3
+from bs4 import BeautifulSoup
+from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from ...core import cache
+from ...db.mysql import News
+from .crawler import Crawler
+from ..browser_manager import BrowserManager
+
+# Disable urllib3's SSL warnings.
+urllib3.disable_warnings()
+
+class WeiXinCrawler(Crawler):
+    """
+    微信热门内容爬虫
+    使用微信看一看热门页面获取数据
+    """
+    
+    def fetch(self, date_str):
+        """获取微信热门内容"""
+        current_time = datetime.datetime.now()
+        browser_manager = BrowserManager()
+        
+        try:
+            # 首先尝试从微信看一看获取热门内容
+            result = self._fetch_from_weixin_kankan(browser_manager)
+            
+            if result and len(result) > 0:
+                # 缓存数据
+                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
+                return result
+                
+            # 如果看一看失败，尝试从微信读书获取热门书评
+            result = self._fetch_from_weixin_dushu(browser_manager)
+            if result and len(result) > 0:
+                # 缓存数据
+                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
+                return result
+                
+        except Exception as e:
+            # 如果遇到错误，返回空列表
+            return []
+            
+        # 所有方法都失败，返回空列表
+        return []
+    
+    def _fetch_from_weixin_kankan(self, browser_manager):
+        """从微信看一看页面获取热门内容"""
+        url = "https://k.weixin.qq.com/"
+        
+        try:
+            # 获取页面内容
+            page_source, driver = browser_manager.get_page_content(url, wait_time=10)
+            
+            # 等待热门内容加载
+            try:
+                WebDriverWait(driver, 15).until(
+                    EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
+                )
+            except:
+                # 如果等待超时，仍然尝试获取内容
+                pass
+                
+            # 点击"热点"标签切换到热门内容
+            try:
+                hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']")
+                hot_tab.click()
+                time.sleep(3)  # 等待内容加载
+            except:
+                # 如果找不到热点标签，继续尝试获取当前页面内容
+                pass
+                
+            result = []
+            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+            
+            # 获取文章列表
+            articles = driver.find_elements(By.CSS_SELECTOR, ".article-item")
+            
+            if not articles:
+                # 尝试其他可能的选择器
+                articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item")
+                
+            if not articles:
+                # 再尝试其他可能的选择器
+                articles = driver.find_elements(By.CSS_SELECTOR, ".item")
+                
+            for article in articles:
+                try:
+                    # 获取文章标题和链接
+                    title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title")
+                    title = title_elem.text.strip()
+                    
+                    # 尝试获取链接
+                    link = None
+                    try:
+                        link_elem = article.find_element(By.TAG_NAME, "a")
+                        link = link_elem.get_attribute("href")
+                    except:
+                        # 如果直接获取链接失败，则记录文章id，以后可以构建链接
+                        try:
+                            article_id = article.get_attribute("data-id") or article.get_attribute("id")
+                            link = f"https://k.weixin.qq.com/article?id={article_id}"
+                        except:
+                            link = "https://k.weixin.qq.com/"
+                    
+                    # 获取来源
+                    source = ""
+                    try:
+                        source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source")
+                        source = source_elem.text.strip()
+                    except:
+                        pass
+                    
+                    # 获取摘要
+                    summary = ""
+                    try:
+                        summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p")
+                        summary = summary_elem.text.strip()
+                    except:
+                        pass
+                    
+                    news = {
+                        'title': title,
+                        'url': link,
+                        'content': f"来源: {source} | 摘要: {summary[:50] if summary else '无摘要'}",
+                        'source': 'weixin',
+                        'publish_time': current_time
+                    }
+                    
+                    result.append(news)
+                    
+                    # 限制获取前20条
+                    if len(result) >= 20:
+                        break
+                        
+                except Exception as e:
+                    continue
+                    
+            return result
+            
+        except Exception as e:
+            return []
+    
+    def _fetch_from_weixin_dushu(self, browser_manager):
+        """从微信读书获取热门书评"""
+        url = "https://weread.qq.com/web/category/all"
+        
+        try:
+            # 获取页面内容
+            page_source, driver = browser_manager.get_page_content(url, wait_time=8)
+            
+            result = []
+            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+            
+            # 尝试点击排行榜标签
+            try:
+                rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]")
+                rank_tab.click()
+                time.sleep(3)  # 等待内容加载
+            except:
+                # 如果找不到排行榜标签，继续尝试获取当前页面内容
+                pass
+            
+            # 获取热门书籍列表
+            books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")
+            
+            for book in books:
+                try:
+                    # 获取书籍标题和链接
+                    title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3")
+                    title = title_elem.text.strip()
+                    
+                    # 尝试获取链接
+                    link = "https://weread.qq.com/web/category/all"
+                    try:
+                        link_elem = book.find_element(By.TAG_NAME, "a")
+                        link = link_elem.get_attribute("href")
+                    except:
+                        book_id = book.get_attribute("data-bid") or book.get_attribute("id")
+                        if book_id:
+                            link = f"https://weread.qq.com/web/reader/{book_id}"
+                    
+                    # 获取作者
+                    author = ""
+                    try:
+                        author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer")
+                        author = author_elem.text.strip()
+                    except:
+                        pass
+                    
+                    # 获取摘要/简介
+                    intro = ""
+                    try:
+                        intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc")
+                        intro = intro_elem.text.strip()
+                    except:
+                        pass
+                    
+                    news = {
+                        'title': f"热门书籍: {title}",
+                        'url': link,
+                        'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}",
+                        'source': 'weixin',
+                        'publish_time': current_time
+                    }
+                    
+                    result.append(news)
+                    
+                    # 限制获取前20条
+                    if len(result) >= 20:
+                        break
+                        
+                except Exception as e:
+                    continue
+                    
+            return result
+            
+        except Exception as e:
+            return []
+    
+    def crawler_name(self):
+        return "weixin"