init
This commit is contained in:
228
app/services/sites/weixin.py
Normal file
228
app/services/sites/weixin.py
Normal file
@@ -0,0 +1,228 @@
|
||||
import datetime
import json
import logging
import time

import requests
import urllib3
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
|
||||
|
||||
# 禁用SSL警告
|
||||
urllib3.disable_warnings()
|
||||
|
||||
class WeiXinCrawler(Crawler):
|
||||
"""
|
||||
微信热门内容爬虫
|
||||
使用微信看一看热门页面获取数据
|
||||
"""
|
||||
|
||||
def fetch(self, date_str):
|
||||
"""获取微信热门内容"""
|
||||
current_time = datetime.datetime.now()
|
||||
browser_manager = BrowserManager()
|
||||
|
||||
try:
|
||||
# 首先尝试从微信看一看获取热门内容
|
||||
result = self._fetch_from_weixin_kankan(browser_manager)
|
||||
|
||||
if result and len(result) > 0:
|
||||
# 缓存数据
|
||||
cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
|
||||
return result
|
||||
|
||||
# 如果看一看失败,尝试从微信读书获取热门书评
|
||||
result = self._fetch_from_weixin_dushu(browser_manager)
|
||||
if result and len(result) > 0:
|
||||
# 缓存数据
|
||||
cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
# 如果遇到错误,返回空列表
|
||||
return []
|
||||
|
||||
# 所有方法都失败,返回空列表
|
||||
return []
|
||||
|
||||
def _fetch_from_weixin_kankan(self, browser_manager):
|
||||
"""从微信看一看页面获取热门内容"""
|
||||
url = "https://k.weixin.qq.com/"
|
||||
|
||||
try:
|
||||
# 获取页面内容
|
||||
page_source, driver = browser_manager.get_page_content(url, wait_time=10)
|
||||
|
||||
# 等待热门内容加载
|
||||
try:
|
||||
WebDriverWait(driver, 15).until(
|
||||
EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
|
||||
)
|
||||
except:
|
||||
# 如果等待超时,仍然尝试获取内容
|
||||
pass
|
||||
|
||||
# 点击"热点"标签切换到热门内容
|
||||
try:
|
||||
hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']")
|
||||
hot_tab.click()
|
||||
time.sleep(3) # 等待内容加载
|
||||
except:
|
||||
# 如果找不到热点标签,继续尝试获取当前页面内容
|
||||
pass
|
||||
|
||||
result = []
|
||||
current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
# 获取文章列表
|
||||
articles = driver.find_elements(By.CSS_SELECTOR, ".article-item")
|
||||
|
||||
if not articles:
|
||||
# 尝试其他可能的选择器
|
||||
articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item")
|
||||
|
||||
if not articles:
|
||||
# 再尝试其他可能的选择器
|
||||
articles = driver.find_elements(By.CSS_SELECTOR, ".item")
|
||||
|
||||
for article in articles:
|
||||
try:
|
||||
# 获取文章标题和链接
|
||||
title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title")
|
||||
title = title_elem.text.strip()
|
||||
|
||||
# 尝试获取链接
|
||||
link = None
|
||||
try:
|
||||
link_elem = article.find_element(By.TAG_NAME, "a")
|
||||
link = link_elem.get_attribute("href")
|
||||
except:
|
||||
# 如果直接获取链接失败,则记录文章id,以后可以构建链接
|
||||
try:
|
||||
article_id = article.get_attribute("data-id") or article.get_attribute("id")
|
||||
link = f"https://k.weixin.qq.com/article?id={article_id}"
|
||||
except:
|
||||
link = "https://k.weixin.qq.com/"
|
||||
|
||||
# 获取来源
|
||||
source = ""
|
||||
try:
|
||||
source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source")
|
||||
source = source_elem.text.strip()
|
||||
except:
|
||||
pass
|
||||
|
||||
# 获取摘要
|
||||
summary = ""
|
||||
try:
|
||||
summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p")
|
||||
summary = summary_elem.text.strip()
|
||||
except:
|
||||
pass
|
||||
|
||||
news = {
|
||||
'title': title,
|
||||
'url': link,
|
||||
'content': f"来源: {source} | 摘要: {summary[:50] if summary else '无摘要'}",
|
||||
'source': 'weixin',
|
||||
'publish_time': current_time
|
||||
}
|
||||
|
||||
result.append(news)
|
||||
|
||||
# 限制获取前20条
|
||||
if len(result) >= 20:
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
continue
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
return []
|
||||
|
||||
def _fetch_from_weixin_dushu(self, browser_manager):
|
||||
"""从微信读书获取热门书评"""
|
||||
url = "https://weread.qq.com/web/category/all"
|
||||
|
||||
try:
|
||||
# 获取页面内容
|
||||
page_source, driver = browser_manager.get_page_content(url, wait_time=8)
|
||||
|
||||
result = []
|
||||
current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
# 尝试点击排行榜标签
|
||||
try:
|
||||
rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]")
|
||||
rank_tab.click()
|
||||
time.sleep(3) # 等待内容加载
|
||||
except:
|
||||
# 如果找不到排行榜标签,继续尝试获取当前页面内容
|
||||
pass
|
||||
|
||||
# 获取热门书籍列表
|
||||
books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")
|
||||
|
||||
for book in books:
|
||||
try:
|
||||
# 获取书籍标题和链接
|
||||
title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3")
|
||||
title = title_elem.text.strip()
|
||||
|
||||
# 尝试获取链接
|
||||
link = "https://weread.qq.com/web/category/all"
|
||||
try:
|
||||
link_elem = book.find_element(By.TAG_NAME, "a")
|
||||
link = link_elem.get_attribute("href")
|
||||
except:
|
||||
book_id = book.get_attribute("data-bid") or book.get_attribute("id")
|
||||
if book_id:
|
||||
link = f"https://weread.qq.com/web/reader/{book_id}"
|
||||
|
||||
# 获取作者
|
||||
author = ""
|
||||
try:
|
||||
author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer")
|
||||
author = author_elem.text.strip()
|
||||
except:
|
||||
pass
|
||||
|
||||
# 获取摘要/简介
|
||||
intro = ""
|
||||
try:
|
||||
intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc")
|
||||
intro = intro_elem.text.strip()
|
||||
except:
|
||||
pass
|
||||
|
||||
news = {
|
||||
'title': f"热门书籍: {title}",
|
||||
'url': link,
|
||||
'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}",
|
||||
'source': 'weixin',
|
||||
'publish_time': current_time
|
||||
}
|
||||
|
||||
result.append(news)
|
||||
|
||||
# 限制获取前20条
|
||||
if len(result) >= 20:
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
continue
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
return []
|
||||
|
||||
def crawler_name(self):
|
||||
return "weixin"
|
||||
Reference in New Issue
Block a user