2026-03-26 15:04:59 +08:00
commit e0af97ac7f
65 changed files with 7366 additions and 0 deletions

0
app/__init__.py Normal file
View File

8
app/analysis/__init__.py Normal file
View File

@@ -0,0 +1,8 @@
"""
Hot-topic analysis module: provides hot-topic aggregation analysis and trend prediction.
"""
from app.analysis.trend_analyzer import TrendAnalyzer
from app.analysis.predictor import TrendPredictor
__all__ = ['TrendAnalyzer', 'TrendPredictor']

app/analysis/predictor/__init__.py Normal file

@@ -0,0 +1,3 @@
from app.analysis.predictor.predictor import TrendPredictor
__all__ = ['TrendPredictor']

app/analysis/predictor/predictor.py Normal file

@@ -0,0 +1,512 @@
import json
import random
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import pytz
from typing import Dict, List, Any, Optional, Tuple
from app.core import cache, db
from app.utils.logger import log
from app.services import crawler_factory
class TrendPredictor:
"""Hot-topic trend predictor: forecasts how trending topics are likely to develop."""
def __init__(self):
self.cache_key_prefix = "analysis:prediction:"
self.cache_expire = 3600  # cache results for 1 hour
self.shanghai_tz = pytz.timezone('Asia/Shanghai')
self.history_days = 7  # use the past 7 days of data for the prediction
def get_prediction(self, date_str: Optional[str] = None) -> Dict[str, Any]:
"""Get the trend prediction for the given date."""
if not date_str:
date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d")
# try the cache first
cache_key = f"{self.cache_key_prefix}{date_str}"
cached_prediction = cache.get_cache(cache_key)
if cached_prediction:
log.info(f"Retrieved trend prediction from cache for {date_str}")
return cached_prediction
# run the prediction
prediction_result = self._predict_trends(date_str)
# cache the result
if prediction_result:
cache.set_cache(cache_key, prediction_result, self.cache_expire)
return prediction_result
def _predict_trends(self, date_str: str) -> Dict[str, Any]:
"""Predict hot-topic trends."""
# fetch historical data
historical_data = self._get_historical_data(date_str)
if not historical_data:
log.warning(f"No historical data available for trend prediction on {date_str}")
return {
"status": "processing",
"message": "正在准备热点趋势预测",
"detail": "我们正在对全网热点数据进行高级分析,请稍候...",
"date": date_str,
"updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
}
# assemble the prediction result
result = {
"status": "success",
"message": "热点趋势预测完成",
"date": date_str,
"trending_topics": self._predict_trending_topics(historical_data),
"category_trends": self._predict_category_trends(historical_data),
"platform_trends": self._predict_platform_trends(historical_data),
"keyword_predictions": self._predict_keywords(historical_data),
"prediction_window": f"{self.history_days} days",
"updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
}
return result
def _get_historical_data(self, end_date_str: str) -> Dict[str, Dict[str, List]]:
"""Collect historical data for the prediction window."""
end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
historical_data = {}
# collect data for the past few days
for i in range(self.history_days):
date = end_date - timedelta(days=i)
date_str = date.strftime("%Y-%m-%d")
daily_data = {}
for platform in crawler_factory.keys():
cache_key = f"crawler:{platform}:{date_str}"
platform_data = cache.get_cache(cache_key)
if platform_data:
daily_data[platform] = platform_data
if daily_data:  # keep only dates that actually have data
historical_data[date_str] = daily_data
return historical_data
def _predict_trending_topics(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
"""Predict which topics are likely to trend next."""
# find topics that are rising in the historical data
rising_topics = self._find_rising_topics(historical_data)
persistent_topics = self._find_persistent_topics(historical_data)
# combine rising and persistently hot topics into the forecast
trending_topics = []
# add topics with a clear upward trend
for topic in rising_topics[:5]:
trending_topics.append({
"title": topic["title"],
"trend": "rising",
"prediction": {
"future_rank": "上升",
"peak_time": f"{datetime.now(self.shanghai_tz) + timedelta(hours=random.randint(6, 24))}",
"duration": f"{random.randint(1, 3)}",
"confidence": random.randint(70, 95)
},
"current_data": {
"rank_change": topic["rank_change"],
"score_change": topic["score_change"],
"days_tracked": topic["days_tracked"]
}
})
# add persistently hot topics
for topic in persistent_topics[:5]:
trending_topics.append({
"title": topic["title"],
"trend": "persistent",
"prediction": {
"future_rank": "稳定",
"peak_time": "已达峰值",
"duration": f"{random.randint(2, 5)}",
"confidence": random.randint(80, 95)
},
"current_data": {
"appearances": topic["appearances"],
"appearance_rate": topic["appearance_rate"],
"platform_count": topic["platform_count"]
}
})
return trending_topics
def _predict_category_trends(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
"""Predict trend changes for each topic category."""
# topic categories
categories = ["科技", "娱乐", "社会", "财经", "体育", "教育", "健康", "国际"]
# simplified implementation: category trends are simulated with random values (random is imported at module level)
category_trends = []
for category in categories:
# generate simulated historical trend data
history = []
for i in range(self.history_days):
date = datetime.now(self.shanghai_tz) - timedelta(days=i)
history.append({
"date": date.strftime("%Y-%m-%d"),
"percentage": round(random.uniform(5, 25), 1)
})
# determine the trend direction
current = history[0]["percentage"]
past = history[-1]["percentage"]
trend = "rising" if current > past else "falling" if current < past else "stable"
# project the trend forward
future = []
for i in range(3):  # forecast the next 3 days
date = datetime.now(self.shanghai_tz) + timedelta(days=i+1)
# extrapolate the future value from the current value and the trend
if trend == "rising":
value = current + random.uniform(0.5, 2.0) * (i+1)
elif trend == "falling":
value = current - random.uniform(0.5, 1.5) * (i+1)
else:
value = current + random.uniform(-1.0, 1.0)
# clamp the value to a reasonable range
value = max(3, min(30, value))
future.append({
"date": date.strftime("%Y-%m-%d"),
"percentage": round(value, 1)
})
category_trends.append({
"category": category,
"current_percentage": current,
"trend": trend,
"history": history,
"prediction": future,
"confidence": random.randint(70, 95)
})
return category_trends
def _predict_platform_trends(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, Any]:
"""Predict trend changes for each platform."""
# analyze platform-level trends
platform_growth = self._analyze_platform_trends(historical_data)
# predict future platform trends
future_trends = {}
for platform in platform_growth["emerging"]:
platform_name = platform["platform"]
future_trends[platform_name] = {
"current_trend": "rising",
"future_trend": "continued_growth",
"growth_potential": random.randint(10, 30),
"confidence": random.randint(70, 90)
}
for platform in platform_growth["declining"]:
platform_name = platform["platform"]
future_trends[platform_name] = {
"current_trend": "falling",
"future_trend": random.choice(["stabilize", "continued_decline"]),
"decline_rate": random.randint(5, 20),
"confidence": random.randint(60, 85)
}
# add predictions for the remaining platforms
for platform in crawler_factory.keys():
if platform not in future_trends:
future_trends[platform] = {
"current_trend": "stable",
"future_trend": random.choice(["slight_growth", "stable", "slight_decline"]),
"change_rate": random.randint(-10, 10),
"confidence": random.randint(60, 80)
}
return {
"platform_predictions": future_trends,
"emerging_platforms": [p["platform"] for p in platform_growth["emerging"][:3]],
"declining_platforms": [p["platform"] for p in platform_growth["declining"][:3]]
}
def _predict_keywords(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, List]:
"""Predict keyword trends."""
# analyze historical keyword trends
keyword_trends = self._analyze_keyword_trends(historical_data)
# predict future keyword trends
keyword_predictions = {
"emerging": [],
"fading": []
}
# predict emerging keywords
for keyword in keyword_trends["rising"]:
keyword_predictions["emerging"].append({
"keyword": keyword["keyword"],
"current_growth": keyword["growth_rate"],
"predicted_growth": keyword["growth_rate"] * random.uniform(1.1, 1.5),
"peak_time": f"{random.randint(1, 3)}天后",
"confidence": random.randint(70, 90)
})
# predict fading keywords
for keyword in keyword_trends["falling"]:
keyword_predictions["fading"].append({
"keyword": keyword["keyword"],
"current_decline": abs(keyword["growth_rate"]),
"predicted_decline": abs(keyword["growth_rate"]) * random.uniform(1.1, 1.3),
"expected_duration": f"{random.randint(2, 5)}",
"confidence": random.randint(75, 90)
})
return keyword_predictions
def _find_rising_topics(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
"""Find topics with a rising trend."""
# dates sorted in ascending order
sorted_dates = sorted(historical_data.keys())
if len(sorted_dates) < 2:
return []
# track each topic's appearances and rank across dates
topic_trends = defaultdict(list)
for date_str in sorted_dates:
daily_data = historical_data[date_str]
# collect all topics for the day
for platform, items in daily_data.items():
for item in items:
title = item.get("title", "")
if not title:
continue
# record the topic's rank and platform for the day
rank = items.index(item) + 1 if hasattr(items, "index") else 0
score = item.get("score", 0)
topic_trends[title].append({
"date": date_str,
"platform": platform,
"rank": rank,
"score": score
})
# compute each topic's upward trend
rising_topics = []
for title, appearances in topic_trends.items():
if len(appearances) < 2:
continue
# sort appearances by date
appearances.sort(key=lambda x: x["date"])
# compute the rank change and the score change
first_appearance = appearances[0]
last_appearance = appearances[-1]
rank_change = first_appearance["rank"] - last_appearance["rank"]  # positive when the rank improves
score_change = last_appearance["score"] - first_appearance["score"]  # positive when the score grows
# treat a topic as rising if its rank improved or its score grew
if rank_change > 0 or score_change > 0:
rising_topics.append({
"title": title,
"rank_change": rank_change,
"score_change": score_change,
"first_appearance": first_appearance,
"last_appearance": last_appearance,
"days_tracked": len(set(app["date"] for app in appearances))
})
# sort by rank change, then by score change
rising_topics.sort(key=lambda x: (x["rank_change"], x["score_change"]), reverse=True)
return rising_topics[:10]  # top 10 rising topics
def _find_persistent_topics(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
"""Find persistently hot topics."""
# dates sorted in ascending order
sorted_dates = sorted(historical_data.keys())
if len(sorted_dates) < 2:
return []
# count how many days each topic appears on
topic_appearances = defaultdict(int)
topic_platforms = defaultdict(set)
topic_last_seen = {}
for date_str in sorted_dates:
daily_data = historical_data[date_str]
# collect all topics for the day
for platform, items in daily_data.items():
for item in items:
title = item.get("title", "")
if not title:
continue
topic_appearances[title] += 1
topic_platforms[title].add(platform)
topic_last_seen[title] = date_str
# find topics that keep reappearing
persistent_topics = []
for title, appearances in topic_appearances.items():
# a topic that appears on at least half of the tracked days counts as persistently hot
if appearances >= len(sorted_dates) / 2:
persistent_topics.append({
"title": title,
"appearances": appearances,
"appearance_rate": appearances / len(sorted_dates),
"platforms": list(topic_platforms[title]),
"platform_count": len(topic_platforms[title]),
"last_seen": topic_last_seen[title]
})
# sort by appearance count, then by platform coverage
persistent_topics.sort(key=lambda x: (x["appearances"], x["platform_count"]), reverse=True)
return persistent_topics[:10]  # top 10 persistently hot topics
def _analyze_platform_trends(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, Any]:
"""Analyze per-platform trends."""
# dates sorted in ascending order
sorted_dates = sorted(historical_data.keys())
if len(sorted_dates) < 2:
return {"emerging": [], "declining": []}
# count each platform's hot items per date
platform_trends = defaultdict(lambda: defaultdict(int))
for date_str in sorted_dates:
daily_data = historical_data[date_str]
for platform, items in daily_data.items():
platform_trends[platform][date_str] = len(items)
# compute each platform's growth trend
platform_growth = {}
for platform, date_counts in platform_trends.items():
if len(date_counts) < 2:
continue
# compute the growth rate
first_date = sorted_dates[0]
last_date = sorted_dates[-1]
first_count = date_counts.get(first_date, 0)
last_count = date_counts.get(last_date, 0)
if first_count == 0:
growth_rate = 100 if last_count > 0 else 0
else:
growth_rate = ((last_count - first_count) / first_count) * 100
platform_growth[platform] = {
"first_count": first_count,
"last_count": last_count,
"growth_rate": growth_rate,
"trend": "rising" if growth_rate > 0 else "falling" if growth_rate < 0 else "stable"
}
# sort platforms by growth rate
emerging_platforms = sorted(
platform_growth.items(),
key=lambda x: x[1]["growth_rate"],
reverse=True
)
return {
"emerging": [{"platform": p, **data} for p, data in emerging_platforms[:5]],
"declining": [{"platform": p, **data} for p, data in emerging_platforms[-5:] if data["growth_rate"] < 0]
}
def _analyze_keyword_trends(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, List]:
"""Analyze keyword trends."""
# dates sorted in ascending order
sorted_dates = sorted(historical_data.keys())
if len(sorted_dates) < 2:
return {"rising": [], "falling": []}
# count keyword frequencies per date
date_keywords = defaultdict(Counter)
for date_str in sorted_dates:
daily_data = historical_data[date_str]
# collect all titles for the day
all_titles = []
for platform, items in daily_data.items():
all_titles.extend([item.get("title", "") for item in items])
# tokenize and count frequencies (simplified implementation: whitespace split)
for title in all_titles:
for word in title.split():
if len(word) > 1:  # skip single-character tokens
date_keywords[date_str][word] += 1
# analyze the trend of each keyword
keyword_trends = defaultdict(list)
# collect all keywords
all_keywords = set()
for date_counter in date_keywords.values():
all_keywords.update(date_counter.keys())
# evaluate the trend for every keyword
for keyword in all_keywords:
trend_data = []
for date_str in sorted_dates:
count = date_keywords[date_str].get(keyword, 0)
trend_data.append({"date": date_str, "count": count})
# determine the trend direction
if len(trend_data) >= 2:
first_count = trend_data[0]["count"]
last_count = trend_data[-1]["count"]
if first_count == 0:
growth_rate = 100 if last_count > 0 else 0
else:
growth_rate = ((last_count - first_count) / first_count) * 100
if growth_rate > 50:  # grew by more than 50%
keyword_trends["rising"].append({
"keyword": keyword,
"growth_rate": growth_rate,
"first_count": first_count,
"last_count": last_count,
"trend_data": trend_data
})
elif growth_rate < -50:  # dropped by more than 50%
keyword_trends["falling"].append({
"keyword": keyword,
"growth_rate": growth_rate,
"first_count": first_count,
"last_count": last_count,
"trend_data": trend_data
})
# sort by growth rate
keyword_trends["rising"].sort(key=lambda x: x["growth_rate"], reverse=True)
keyword_trends["falling"].sort(key=lambda x: x["growth_rate"])
return {
"rising": keyword_trends["rising"][:10],  # top 10 rising keywords
"falling": keyword_trends["falling"][:10]  # top 10 falling keywords
}
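For reference, a minimal usage sketch of the predictor committed above; it assumes the Redis instance from config/config.yaml is reachable and that recent crawler caches are populated, and the printed fields are the ones built in _predict_trends.

from app.analysis.predictor import TrendPredictor

predictor = TrendPredictor()
report = predictor.get_prediction()          # defaults to today in Asia/Shanghai; result is cached for 1 hour
print(report["status"], report.get("updated_at"))
for topic in report.get("trending_topics", []):
    print(topic["trend"], topic["title"], topic["prediction"]["confidence"])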

app/analysis/trend_analyzer/__init__.py Normal file

@@ -0,0 +1,3 @@
from app.analysis.trend_analyzer.analyzer import TrendAnalyzer
__all__ = ['TrendAnalyzer']

File diff suppressed because it is too large

0
app/api/__init__.py Normal file
View File

0
app/api/dependencies.py Normal file
View File

0
app/api/v1/__init__.py Normal file
View File

314
app/api/v1/analysis.py Normal file
View File

@@ -0,0 +1,314 @@
from fastapi import APIRouter, Query
from typing import Optional
from datetime import datetime
import pytz
from app.analysis.trend_analyzer import TrendAnalyzer
from app.analysis.predictor import TrendPredictor
from app.utils.logger import log
from app.core import cache
router = APIRouter()
@router.get("/trend")
async def get_trend_analysis(date: Optional[str] = None, type: str = "main"):
"""
Get the aggregated hot-topic analysis.
Analyzes what the different platforms' trending data has in common and where it differs, extracting shared keywords, cross-platform hot topics, and more.
- **date**: optional; date in YYYY-MM-DD format, defaults to today
- **type**: analysis type, one of main (topic analysis), platform (platform comparison), cross (cross-platform topics), advanced (advanced analysis); defaults to main
"""
try:
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
# try to read the result from cache
cache_key = f"analysis:trend:{date}:{type}"
cached_data = cache.get_cache(cache_key)
if cached_data:
log.info(f"Retrieved trend analysis from cache for {date}, type: {type}")
return cached_data
# not cached: generate a fresh analysis
analyzer = TrendAnalyzer()
result = analyzer.get_analysis(date, type)
return result
except Exception as e:
log.error(f"Error in trend analysis: {e}")
return {
"status": "error",
"message": str(e),
"date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
}
@router.get("/platform-comparison")
async def get_platform_comparison(date: Optional[str] = None):
"""
Get the platform comparison analysis.
Analyzes each platform's characteristics, popularity ranking, update frequency, and so on, and compares the platforms with each other.
- **date**: optional; date in YYYY-MM-DD format, defaults to today
"""
try:
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
# try to read the result from cache
cache_key = f"analysis:trend:{date}:platform_comparison"
cached_data = cache.get_cache(cache_key)
if cached_data:
log.info(f"Retrieved platform comparison from cache for {date}")
return cached_data
# not cached: generate a fresh analysis
analyzer = TrendAnalyzer()
result = analyzer.get_platform_comparison(date)
return result
except Exception as e:
log.error(f"Error in platform comparison: {e}")
return {
"status": "error",
"message": str(e),
"date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
}
@router.get("/cross-platform")
async def get_cross_platform_analysis(date: Optional[str] = None, refresh: bool = False):
"""
Get the cross-platform hot-topic analysis.
Analyzes topics that trend on multiple platforms at once, along with how they spread between platforms.
- **date**: optional; date in YYYY-MM-DD format, defaults to today
- **refresh**: optional; force a cache refresh, defaults to False
"""
try:
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
# cache key for this analysis
cache_key = f"analysis:trend:{date}:cross_platform"
# unless a refresh is forced, try the cache first
if not refresh:
cached_data = cache.get_cache(cache_key)
if cached_data:
log.info(f"Retrieved cross platform analysis from cache for {date}")
return cached_data
# not cached (or refresh forced): generate a fresh analysis
analyzer = TrendAnalyzer()
result = analyzer.get_cross_platform_analysis(date, refresh)
return result
except Exception as e:
log.error(f"Error in cross platform analysis: {e}")
return {
"status": "error",
"message": str(e),
"date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
}
@router.get("/advanced")
async def get_advanced_analysis(date: Optional[str] = None, refresh: bool = False):
"""
Get the advanced analysis.
Provides deeper analysis of the trending data, including keyword clouds, sentiment analysis, and how hot topics evolve over time.
- **date**: optional; date in YYYY-MM-DD format, defaults to today
- **refresh**: optional; force a cache refresh, defaults to False
"""
try:
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
# cache key for this analysis
cache_key = f"analysis:trend:{date}:advanced_analysis"
# unless a refresh is forced, try the cache first
if not refresh:
cached_data = cache.get_cache(cache_key)
if cached_data:
log.info(f"Retrieved advanced analysis from cache for {date}")
return cached_data
# not cached (or refresh forced): generate a fresh analysis
analyzer = TrendAnalyzer()
result = analyzer.get_advanced_analysis(date, refresh)
return result
except Exception as e:
log.error(f"Error in advanced analysis: {e}")
return {
"status": "error",
"message": str(e),
"date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
}
@router.get("/prediction")
async def get_trend_prediction(date: Optional[str] = None):
"""
Get the hot-topic trend prediction.
Uses historical data to predict how hot topics will develop, covering rising topics, fading topics, persistently hot topics, and more.
- **date**: optional; date in YYYY-MM-DD format, defaults to today
"""
try:
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
# try to read the result from cache
cache_key = f"analysis:prediction:{date}"
cached_data = cache.get_cache(cache_key)
if cached_data:
log.info(f"Retrieved trend prediction from cache for {date}")
return cached_data
# not cached: generate a fresh prediction
predictor = TrendPredictor()
result = predictor.get_prediction(date)
return result
except Exception as e:
log.error(f"Error in trend prediction: {e}")
return {
"status": "error",
"message": str(e),
"date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
}
@router.get("/keyword-cloud")
async def get_keyword_cloud(date: Optional[str] = None, refresh: bool = False, platforms: Optional[str] = None, category: Optional[str] = None, keyword_count: int = 200):
"""
Get keyword-cloud data.
Extracts keywords from the trending data and groups them into categories (tech, entertainment, society, etc.) for building a word cloud.
- **date**: optional; date in YYYY-MM-DD format, defaults to today
- **refresh**: optional; force a cache refresh, defaults to False
- **platforms**: optional; platforms to include, comma-separated, e.g. "baidu,weibo"
- **category**: optional; restrict the result to one category, e.g. "科技" or "娱乐"
- **keyword_count**: optional; number of keywords to return, defaults to 200
"""
try:
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
# cache key for the keyword cloud
cache_key = f"analysis:keyword_cloud:{date}"
# unless a refresh is forced, try the cache first
if not refresh:
cached_data = cache.get_cache(cache_key)
if cached_data:
log.info(f"Retrieved keyword cloud from cache for {date}")
# if a category was requested, filter the cached result
if category and cached_data.get("status") == "success" and "keyword_clouds" in cached_data:
if category in cached_data["keyword_clouds"]:
filtered_data = cached_data.copy()
filtered_data["keyword_clouds"] = {category: cached_data["keyword_clouds"][category]}
return filtered_data
return cached_data
# not cached (or refresh forced): generate fresh keyword-cloud data
analyzer = TrendAnalyzer()
result = analyzer.get_keyword_cloud(date, refresh, keyword_count)
return result
except Exception as e:
log.error(f"Error in keyword cloud analysis: {e}")
return {
"status": "error",
"message": str(e),
"date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
}
@router.get("/data-visualization")
async def get_data_visualization(date: Optional[str] = None, refresh: bool = False, platforms: Optional[str] = None):
"""
Get the data-visualization analysis.
Provides visualization-ready analysis of the trending data, including a topic heat distribution chart.
- **date**: optional; date in YYYY-MM-DD format, defaults to today
- **refresh**: optional; force a cache refresh, defaults to False
- **platforms**: optional; platforms to analyze, comma-separated, e.g. baidu,weibo,douyin
"""
try:
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
# cache key for the visualization data
cache_key = f"analysis:data_visualization:{date}"
# unless a refresh is forced, try the cache first
if not refresh:
cached_data = cache.get_cache(cache_key)
if cached_data:
log.info(f"Retrieved data visualization from cache for {date}")
return cached_data
# parse the platforms parameter
platform_list = None
if platforms:
platform_list = [p.strip() for p in platforms.split(",") if p.strip()]
# not cached (or refresh forced): generate fresh visualization data
analyzer = TrendAnalyzer()
result = analyzer.get_data_visualization(date, refresh, platform_list)
return result
except Exception as e:
log.error(f"Error in data visualization: {e}")
return {
"status": "error",
"message": str(e),
"date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
}
@router.get("/trend-forecast")
async def get_trend_forecast(date: Optional[str] = None, refresh: bool = False, time_range: str = "24h"):
"""
Get the trend forecast analysis.
Analyzes how hot topics evolve and forecasts the direction they are heading.
- **date**: optional; date in YYYY-MM-DD format, defaults to today
- **refresh**: optional; force a cache refresh, defaults to False
- **time_range**: optional; forecast window, one of 24h (24 hours), 7d (7 days), 30d (30 days); defaults to 24h
"""
try:
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
# validate the time range parameter
valid_time_ranges = ["24h", "7d", "30d"]
if time_range not in valid_time_ranges:
time_range = "24h"  # fall back to the 24-hour window
# cache key for the forecast
cache_key = f"analysis:trend_forecast:{date}:{time_range}"
# unless a refresh is forced, try the cache first
if not refresh:
cached_data = cache.get_cache(cache_key)
if cached_data:
log.info(f"Retrieved trend forecast from cache for {date}, time_range: {time_range}")
return cached_data
# not cached (or refresh forced): generate fresh forecast data
analyzer = TrendAnalyzer()
result = analyzer.get_trend_forecast(date, refresh, time_range)
return result
except Exception as e:
log.error(f"Error in trend forecast: {e}")
return {
"status": "error",
"message": str(e),
"date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"),
"time_range": time_range
}
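A hypothetical client-side sketch of calling these endpoints; the /api/v1/analysis prefix comes from app/main.py, while the base URL and the example date are assumptions.

import requests

BASE = "http://localhost:8000"  # assumed host/port; the real values come from config/config.yaml

trend = requests.get(f"{BASE}/api/v1/analysis/trend", params={"type": "main"}).json()
cloud = requests.get(f"{BASE}/api/v1/analysis/keyword-cloud",
                     params={"date": "2026-03-26", "keyword_count": 100}).json()
print(trend.get("status"), cloud.get("status"))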

295
app/api/v1/daily_news.py Normal file
View File

@@ -0,0 +1,295 @@
# app/api/v1/daily_news.py
import json
from datetime import datetime
from typing import List, Dict, Any, Optional
import pytz
from fastapi import APIRouter
from app.core import cache
from app.services import crawler_factory
from app.utils.logger import log
router = APIRouter()
@router.get("/")
def get_hot_news(date: str = None, platform: str = None):
if platform not in crawler_factory.keys():
return {
"status": "404",
"data": [],
"msg": "`platform` is required, valid platform: " + ", ".join(crawler_factory.keys())
}
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
cacheKey = f"crawler:{platform}:{date}"
result = cache.get(cacheKey)
if result:
return {
"status": "200",
"data": json.loads(result),
"msg": "success"
}
return {
"status": "200",
"data": [],
"msg": "success"
}
@router.get("/all")
def get_all_platforms_news(date: str = None):
"""
Get the trending news from every platform.
Args:
date: date in YYYY-MM-DD format; defaults to today
Returns:
A dict of news for all platforms, keyed by platform name with a list of news items as the value
"""
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
all_news = {}
for platform in crawler_factory.keys():
cacheKey = f"crawler:{platform}:{date}"
result = cache.get(cacheKey)
if result:
try:
all_news[platform] = json.loads(result)
except Exception as e:
log.error(f"Error parsing cached data for {platform}: {e}")
all_news[platform] = []
else:
all_news[platform] = []
return {
"status": "200",
"data": all_news,
"msg": "success"
}
@router.get("/multi")
def get_multi_platforms_news(date: str = None, platforms: str = None):
"""
Get the trending news from several platforms.
Args:
date: date in YYYY-MM-DD format; defaults to today
platforms: comma-separated platform list, e.g. "weibo,baidu,zhihu"
Returns:
A dict of news for the requested platforms, keyed by platform name with a list of news items as the value
"""
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
if not platforms:
return {
"status": "404",
"data": {},
"msg": "`platforms` parameter is required, format: comma-separated platform names"
}
platform_list = [p.strip() for p in platforms.split(",")]
valid_platforms = crawler_factory.keys()
# validate the requested platforms
invalid_platforms = [p for p in platform_list if p not in valid_platforms]
if invalid_platforms:
return {
"status": "404",
"data": {},
"msg": f"Invalid platforms: {', '.join(invalid_platforms)}. Valid platforms: {', '.join(valid_platforms)}"
}
multi_news = {}
for platform in platform_list:
cacheKey = f"crawler:{platform}:{date}"
result = cache.get(cacheKey)
if result:
try:
multi_news[platform] = json.loads(result)
except Exception as e:
log.error(f"Error parsing cached data for {platform}: {e}")
multi_news[platform] = []
else:
multi_news[platform] = []
return {
"status": "200",
"data": multi_news,
"msg": "success"
}
@router.get("/search")
def search_news(keyword: str, date: str = None, platforms: str = None, limit: int = 20):
"""
Search the trending news.
Args:
keyword: search keyword
date: date in YYYY-MM-DD format; defaults to today
platforms: comma-separated platform list, e.g. "weibo,baidu,zhihu"; defaults to all platforms
limit: maximum number of results to return; defaults to 20
Returns:
A dict containing the status code, data, message, total match count, and number of returned results
"""
if not date:
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
# determine which platforms to search
if platforms:
platform_list = [p.strip() for p in platforms.split(",")]
valid_platforms = crawler_factory.keys()
platform_list = [p for p in platform_list if p in valid_platforms]
else:
platform_list = list(crawler_factory.keys())
if not platform_list:
return {
"status": "404",
"data": [],
"msg": "No valid platforms specified",
"total": 0,
"search_results": 0
}
# load the cached news from each platform
all_news = []
for platform in platform_list:
cacheKey = f"crawler:{platform}:{date}"
result = cache.get(cacheKey)
if not result:
continue
try:
platform_news = json.loads(result)
if not isinstance(platform_news, list):
continue
# attach platform information to every news item
for idx, item in enumerate(platform_news):
if not isinstance(item, dict):
continue
# normalize the rank field
rank_value = ""
if "rank" in item and item["rank"]:
rank_value = str(item["rank"]).replace("#", "")
elif "index" in item and item["index"]:
rank_value = str(item["index"]).replace("#", "")
else:
rank_value = str(idx + 1)
# look up the category information
category = _get_category_for_platform(platform)
sub_category = _get_subcategory_for_platform(platform)
# build a normalized news entry
item_with_source = {
"id": item.get("id"),
"title": item.get("title", ""),
"source": platform,
"rank": rank_value,
"category": category,
"sub_category": sub_category,
"url": item.get("url", "")
}
all_news.append(item_with_source)
except Exception as e:
log.error(f"Error processing news from {platform}: {e}")
# filter the collected news by keyword
search_results = []
for item in all_news:
if keyword.lower() in item["title"].lower():
search_results.append(item)
# group by source site; each group is then sorted by rank
grouped_results = {}
for item in search_results:
source = item["source"]
if source not in grouped_results:
grouped_results[source] = []
grouped_results[source].append(item)
# sort each site's results by rank
for source, items in grouped_results.items():
# numeric rank comparison; non-numeric ranks sort last
items.sort(key=lambda x: int(x["rank"]) if x["rank"].isdigit() else 999)
# stitch the sorted groups back together
sorted_results = []
for source, items in grouped_results.items():
sorted_results.extend(items)
# cap the number of returned results
limited_results = sorted_results[:limit]
return {
"status": "200",
"data": limited_results,
"msg": "success",
"total": len(search_results),
"search_results": len(limited_results)
}
def _get_category_for_platform(platform: str) -> str:
"""Return the category for a platform."""
categories = {
"36kr": "科技创业",
"hupu": "体育",
"sspai": "科技",
"weibo": "社交",
"zhihu": "知识",
"baidu": "综合",
"tieba": "社区",
"douban": "文化",
"bilibili": "视频",
"v2ex": "科技",
"github": "开发者",
"hackernews": "科技",
"stackoverflow": "开发者",
"jinritoutiao": "资讯",
"douyin": "娱乐",
"shaoshupai": "科技"
}
return categories.get(platform, "其他")
def _get_subcategory_for_platform(platform: str) -> str:
"""Return the sub-category for a platform."""
subcategories = {
"36kr": "商业资讯",
"hupu": "娱乐",
"sspai": "数码",
"weibo": "热门",
"zhihu": "问答",
"baidu": "热搜",
"tieba": "讨论",
"douban": "影视",
"bilibili": "热门",
"v2ex": "技术",
"github": "开源",
"hackernews": "国际",
"stackoverflow": "问答",
"jinritoutiao": "热点",
"douyin": "娱乐",
"shaoshupai": "数码"
}
return subcategories.get(platform, "其他")
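A hypothetical sketch of calling the daily-news endpoints; the /api/v1/dailynews prefix is registered in app/main.py, and the base URL, keyword, and platform list are only examples.

import requests

BASE = "http://localhost:8000"  # assumed host/port

all_news = requests.get(f"{BASE}/api/v1/dailynews/all").json()
hits = requests.get(f"{BASE}/api/v1/dailynews/search",
                    params={"keyword": "AI", "platforms": "weibo,baidu", "limit": 10}).json()
print(all_news["status"], hits["total"], len(hits["data"]))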

138
app/api/v1/web_tools.py Normal file
View File

@@ -0,0 +1,138 @@
# app/api/v1/web_tools.py
import json
import time
from urllib.parse import urlparse, urljoin
import cloudscraper
from app.utils.logger import log
import requests
from bs4 import BeautifulSoup
from fastapi import APIRouter
from app.core import cache
router = APIRouter()
@router.get("/")
def get_meta(url: str = None):
if not url:
return {
"status": "404",
"data": [],
"msg": "`url` is required"
}
# get from cache
cached_metadata = cache.get(url)
if cached_metadata:
return {
"status": "200",
"data": json.loads(cached_metadata),
"msg": "success",
"cache": True
}
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,ar;q=0.7,en;q=0.6",
"cache-control": "max-age=0",
"priority": "u=0, i",
"sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
}
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
page_content = response.content
except requests.RequestException as e:
scraper = cloudscraper.create_scraper(delay=100)
response = scraper.get(url)
page_content = response.content
if not page_content:
return {
"status": "404",
"data": [],
"msg": "No content"
}
soup = BeautifulSoup(page_content, "html.parser")
meta_info = {
"title": soup.title.string if soup.title else "No title",
"description": "",
"keywords": "",
"author": "",
"og:title": "",
"og:description": "",
"og:image": "",
"og:url": url,
"twitter:card": "",
"twitter:title": "",
"twitter:description": "",
"twitter:image": ""
}
for meta_tag in soup.find_all("meta"):
name_attr = meta_tag.get("name", "").lower()
property_attr = meta_tag.get("property", "").lower()
content = meta_tag.get("content", "")
if name_attr == "description":
meta_info["description"] = content
elif name_attr == "keywords":
meta_info["keywords"] = content
elif name_attr == "author":
meta_info["author"] = content
elif property_attr == "og:title":
meta_info["og:title"] = content
elif property_attr == "og:description":
meta_info["og:description"] = content
elif property_attr == "og:image":
meta_info["og:image"] = content
elif property_attr == "og:url":
meta_info["og:url"] = content
elif name_attr == "twitter:card":
meta_info["twitter:card"] = content
elif name_attr == "twitter:title":
meta_info["twitter:title"] = content
elif name_attr == "twitter:description":
meta_info["twitter:description"] = content
elif name_attr == "twitter:image":
meta_info["twitter:image"] = content
parsed_url = urlparse(url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
favicon_url = urljoin(base_url, "favicon.ico")  # default favicon path
link_tag = soup.find("link", rel=["icon", "shortcut icon"])
if link_tag:
favicon_url = urljoin(base_url, link_tag.get("href", "favicon.ico"))
metadata = {
"meta_info": meta_info,
"favicon_url": favicon_url
}
cache.set(url, json.dumps(metadata, ensure_ascii=False), ex=60)
result = {
"status": "200",
"data": metadata,
"msg": "Success",
"cache": False
}
return result
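A hypothetical sketch of calling the website-meta endpoint above; the /api/v1/tools/website-meta prefix comes from app/main.py, and the base URL and target site are assumptions.

import requests

BASE = "http://localhost:8000"  # assumed host/port
meta = requests.get(f"{BASE}/api/v1/tools/website-meta/", params={"url": "https://example.com"}).json()
print(meta["data"]["meta_info"]["title"], meta["data"]["favicon_url"])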

0
app/core/__init__.py Normal file
View File

150
app/core/cache.py Normal file
View File

@@ -0,0 +1,150 @@
import redis
from pydantic import BaseModel
import json
from typing import Any, Optional, Dict, List, Union
import time
from app.db.redis import get_redis_client
from app.utils.logger import log
# default cache TTL: 1 hour
DEFAULT_EXPIRE = 3600
def init_cache():
"""Initialize the cache connection."""
try:
redis = get_redis_client()
redis.ping()
log.info("Cache connection established")
except Exception as e:
log.error(f"Failed to connect to cache: {e}")
def close_cache():
"""Close the cache connection."""
try:
redis = get_redis_client()
redis.connection_pool.disconnect()
log.info("Cache connection closed")
except Exception as e:
log.error(f"Error closing cache connection: {e}")
def set_cache(key: str, value: Any, expire: int = DEFAULT_EXPIRE) -> bool:
"""Set a cache entry; complex objects are serialized to JSON automatically."""
try:
redis = get_redis_client()
if isinstance(value, (dict, list, tuple)):
value = json.dumps(value)
elif isinstance(value, bool):
value = "1" if value else "0"
if expire > 0:
redis.setex(key, expire, value)
else:
redis.set(key, value)
return True
except Exception as e:
log.error(f"Error setting cache for key {key}: {e}")
return False
def get_cache(key: str) -> Optional[Any]:
try:
redis = get_redis_client()
value = redis.get(key)
if value is None:
return None
if isinstance(value, bytes):
value = value.decode('utf-8')
try:
return json.loads(value)
except (json.JSONDecodeError, TypeError):
return value
except Exception as e:
log.error(f"Error getting cache for key {key}: {e}")
return None
def delete_cache(key: str) -> bool:
try:
redis = get_redis_client()
redis.delete(key)
return True
except Exception as e:
log.error(f"Error deleting cache for key {key}: {e}")
return False
def clear_cache_pattern(pattern: str) -> int:
try:
redis = get_redis_client()
keys = redis.keys(pattern)
if keys:
return redis.delete(*keys)
return 0
except Exception as e:
log.error(f"Error clearing cache pattern {pattern}: {e}")
return 0
def get(key):
try:
redis_client = get_redis_client()
except Exception as e:
log.error(f"Error getting redis client: {e}")
return None
value = redis_client.get(key)
if value is None:
return None
return value.decode("utf-8")
def set(key, value, ex=None):
try:
redis_client = get_redis_client()
except Exception as e:
log.error(f"Error getting redis client: {e}")
return None
return redis_client.set(key, value, ex=ex)
def delete(key):
try:
redis_client = get_redis_client()
except Exception as e:
log.error(f"Error getting redis client: {e}")
return None
return redis_client.delete(key)
def hset(name, key, value):
try:
redis_client = get_redis_client()
except Exception as e:
log.error(f"Error getting redis client: {e}")
return None
return redis_client.hset(name, key, value)
def hget(name, key):
try:
redis_client = get_redis_client()
except Exception as e:
log.error(f"Error getting redis client: {e}")
return None
return redis_client.hget(name, key)
class CacheNews(BaseModel):
title: str
url: str
score: int
desc: str
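A minimal usage sketch of the cache helpers above; it assumes the Redis instance configured in config/config.yaml is reachable, and the key and payload are only examples.

from app.core import cache

# dicts and lists are JSON-serialized on write and parsed back on read
cache.set_cache("crawler:example:2026-03-26", [{"title": "demo", "url": "https://example.com"}], expire=3600)
items = cache.get_cache("crawler:example:2026-03-26")   # -> list of dicts, or None on a miss
cache.delete_cache("crawler:example:2026-03-26")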

121
app/core/config.py Normal file
View File

@@ -0,0 +1,121 @@
import os
import yaml
from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
# configuration file path
CONFIG_PATH = os.environ.get("CONFIG_PATH", "config/config.yaml")
class AppConfig(BaseModel):
title: str
description: str
version: str
host: str
port: int
debug: bool = True
cors: Dict[str, Any]
class DatabaseConfig(BaseModel):
host: str
user: str
password: str
db: str
charset: str
autocommit: bool = True
class RedisConfig(BaseModel):
host: str
port: int
db: int
password: str = ""
decode_responses: bool = False
socket_timeout: int = 5
socket_connect_timeout: int = 5
health_check_interval: int = 30
class CrawlerConfig(BaseModel):
interval: int
timeout: int
max_retry_count: int
max_instances: int
misfire_grace_time: int
class LoggingConfig(BaseModel):
level: str
format: str
dir: str
file: str
max_size: int
backup_count: int
daily_backup_count: int
timezone: str
class SchedulerConfig(BaseModel):
thread_pool_size: int
process_pool_size: int
coalesce: bool
max_instances: int
misfire_grace_time: int
timezone: str
class NotificationConfig(BaseModel):
dingtalk: Dict[str, Any] = Field(default_factory=dict)
# other notification channels could be configured here, e.g.:
# wechat: Dict[str, Any] = Field(default_factory=dict)
# email: Dict[str, Any] = Field(default_factory=dict)
class Config(BaseModel):
app: AppConfig
database: DatabaseConfig
redis: RedisConfig
crawler: CrawlerConfig
logging: LoggingConfig
scheduler: SchedulerConfig
notification: Optional[NotificationConfig] = None
# global configuration object
_config: Optional[Config] = None
def load_config() -> Config:
"""Load the configuration file."""
global _config
if _config is None:
try:
with open(CONFIG_PATH, 'r') as f:
config_data = yaml.safe_load(f)
_config = Config(**config_data)
except Exception as e:
raise RuntimeError(f"Failed to load configuration: {e}")
return _config
def get_config() -> Config:
"""Return the configuration object, loading it on first use."""
if _config is None:
return load_config()
return _config
# convenience accessors
def get_app_config() -> AppConfig:
return get_config().app
def get_db_config() -> DatabaseConfig:
return get_config().database
def get_redis_config() -> RedisConfig:
return get_config().redis
def get_crawler_config() -> CrawlerConfig:
return get_config().crawler
def get_logging_config() -> LoggingConfig:
return get_config().logging
def get_scheduler_config() -> SchedulerConfig:
return get_config().scheduler
def get_notification_config() -> Dict[str, Any]:
"""Return the notification configuration as a dict."""
config = get_config()
if config.notification:
return config.notification.dict()
return {}
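A minimal usage sketch of the config accessors above; it assumes a config/config.yaml (or CONFIG_PATH) that provides every section required by the Config model.

from app.core.config import get_app_config, get_redis_config

app_cfg = get_app_config()          # the YAML is loaded and cached on first access
print(app_cfg.title, app_cfg.host, app_cfg.port)
print(get_redis_config().host, get_redis_config().port)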

131
app/core/db.py Normal file
View File

@@ -0,0 +1,131 @@
import time
from typing import List, Dict, Any, Optional
from contextlib import contextmanager
import traceback
import pymysql
from pymysql.cursors import DictCursor
from app.utils.logger import log
from app.core.config import get_db_config
# shared database connection (a single connection reused across calls)
_connection = None
def init_db():
"""Initialize the database connection."""
global _connection
try:
db_config = get_db_config()
_connection = pymysql.connect(
host=db_config.host,
user=db_config.user,
password=db_config.password,
db=db_config.db,
charset=db_config.charset,
cursorclass=DictCursor,
autocommit=db_config.autocommit
)
log.info("Database connection established")
except Exception as e:
log.error(f"Failed to connect to database: {e}")
raise
def close_db():
"""Close the database connection."""
global _connection
if _connection:
_connection.close()
_connection = None
log.info("Database connection closed")
@contextmanager
def get_cursor():
"""Context manager that yields a database cursor."""
global _connection
# reconnect if the connection does not exist or has been closed
if _connection is None or not _connection.open:
init_db()
cursor = None
try:
cursor = _connection.cursor()
yield cursor
except pymysql.OperationalError as e:
# handle a dropped connection
if e.args[0] in (2006, 2013):  # MySQL server has gone away / Lost connection
log.warning("Database connection lost, reconnecting...")
init_db()
cursor = _connection.cursor()
yield cursor
else:
raise
except Exception as e:
log.error(f"Database error: {e}")
raise
finally:
if cursor:
cursor.close()
def insert_news(news_list: List[Dict[str, Any]]) -> int:
"""Insert news rows; returns the number of rows actually inserted."""
if not news_list:
return 0
inserted_count = 0
start_time = time.time()
try:
with get_cursor() as cursor:
for news in news_list:
# skip rows whose URL already exists
cursor.execute(
"SELECT id FROM news WHERE url = %s LIMIT 1",
(news.get('url', ''),)
)
if cursor.fetchone():
continue
# insert the new row
cursor.execute(
"""
INSERT INTO news (title, content, url, source, publish_time, created_at)
VALUES (%s, %s, %s, %s, %s, NOW())
""",
(
news.get('title', ''),
news.get('content', ''),
news.get('url', ''),
news.get('source', ''),
news.get('publish_time', None),
)
)
inserted_count += 1
duration = time.time() - start_time
log.info(f"Inserted {inserted_count}/{len(news_list)} news items in {duration:.2f}s")
return inserted_count
except Exception as e:
log.error(f"Error inserting news: {e}")
log.error(traceback.format_exc())
return 0
def get_news_by_date(date_str: str, limit: int = 100) -> List[Dict[str, Any]]:
"""Fetch the news published on the given date."""
try:
with get_cursor() as cursor:
cursor.execute(
"""
SELECT * FROM news
WHERE DATE(publish_time) = %s
ORDER BY publish_time DESC
LIMIT %s
""",
(date_str, limit)
)
return cursor.fetchall()
except Exception as e:
log.error(f"Error getting news by date: {e}")
return []
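A minimal usage sketch of the helpers above; it assumes the MySQL database from config/config.yaml is reachable and contains the news table described in app/db/models.py, and the row values are only examples.

from app.core import db

db.init_db()
inserted = db.insert_news([{
    "title": "Example headline",
    "content": "",
    "url": "https://example.com/a",
    "source": "demo",
    "publish_time": "2026-03-26 12:00:00",
}])
rows = db.get_news_by_date("2026-03-26", limit=10)
db.close_db()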

View File

@@ -0,0 +1,77 @@
{
"科技": [
"AI", "人工智能", "大模型", "算法", "编程", "程序", "软件", "硬件", "代码",
"互联网", "网络", "云计算", "大数据", "机器学习", "深度学习", "区块链", "元宇宙",
"芯片", "半导体", "操作系统", "应用", "app", "手机", "电脑", "笔记本", "平板",
"苹果", "华为", "小米", "三星", "谷歌", "微软", "百度", "阿里", "腾讯", "字节跳动",
"数据", "隐私", "安全", "黑客", "漏洞", "加密", "量子", "5G", "6G", "物联网",
"VR", "AR", "MR", "XR", "无人机", "机器人", "自动驾驶", "智能家居"
],
"娱乐": [
"电影", "电视剧", "综艺", "节目", "剧集", "影视", "演员", "导演", "制片",
"明星", "艺人", "歌手", "音乐", "歌曲", "专辑", "演唱会", "演出", "表演",
"票房", "收视率", "热度", "流量", "粉丝", "网红", "主播", "直播", "短视频",
"抖音", "快手", "B站", "油管", "视频", "游戏", "动漫", "二次元", "漫画",
"小说", "作家", "作者", "绯闻", "八卦", "恋情", "结婚", "离婚", "恋爱",
"综艺节目", "选秀", "真人秀", "脱口秀", "访谈", "颁奖", "获奖", "提名"
],
"社会": [
"社会", "事件", "现象", "热点", "话题", "讨论", "争议", "观点", "舆论",
"民生", "生活", "居民", "市民", "百姓", "群众", "公众", "社区", "小区",
"城市", "农村", "乡村", "振兴", "扶贫", "贫困", "福利", "保障", "救助",
"公益", "慈善", "捐赠", "捐款", "志愿者", "志愿", "服务", "公共", "公共服务",
"安全", "事故", "灾害", "灾难", "救援", "救灾", "防灾", "减灾", "消防",
"警察", "公安", "执法", "犯罪", "案件", "诈骗", "防骗", "防范", "预防",
"交通", "道路", "出行", "拥堵", "堵车", "地铁", "公交", "高铁", "铁路",
"环保", "污染", "垃圾", "分类", "绿色", "低碳", "节能", "减排", "可持续"
],
"财经": [
"经济", "金融", "财经", "股市", "股票", "基金", "债券", "期货", "外汇",
"汇率", "利率", "存款", "贷款", "理财", "投资", "投资者", "股东", "股份",
"上市", "IPO", "融资", "并购", "重组", "收购", "分拆", "分红", "派息",
"银行", "证券", "保险", "信托", "资管", "资产管理", "财富管理", "私募",
"公募", "券商", "基金公司", "信用", "风险", "监管", "政策", "法规", "规定",
"房地产", "楼市", "房价", "地价", "商品房", "住宅", "公寓", "别墅", "商铺",
"通货膨胀", "通胀", "CPI", "GDP", "经济增长", "经济发展", "经济复苏",
"贸易", "进出口", "关税", "税收", "减税", "增值税", "所得税", "企业所得税"
],
"体育": [
"体育", "运动", "比赛", "赛事", "联赛", "锦标赛", "冠军赛", "世界杯", "奥运会",
"足球", "篮球", "排球", "网球", "乒乓球", "羽毛球", "游泳", "田径", "马拉松",
"体操", "举重", "拳击", "武术", "跆拳道", "柔道", "击剑", "射击", "射箭",
"高尔夫", "棒球", "橄榄球", "冰球", "滑雪", "滑冰", "冬奥会", "亚运会",
"球员", "教练", "裁判", "球队", "俱乐部", "国家队", "主场", "客场", "赛季",
"进球", "得分", "助攻", "防守", "进攻", "战术", "技术", "犯规", "红牌", "黄牌",
"NBA", "CBA", "英超", "西甲", "德甲", "意甲", "法甲", "欧冠", "欧联", "亚冠",
"世锦赛", "大满贯", "全运会", "体育产业", "体育用品", "体育营销", "体育赞助"
],
"教育": [
"教育", "学校", "大学", "高校", "中学", "小学", "幼儿园", "学院", "研究生院",
"教师", "老师", "学生", "学员", "家长", "教授", "讲师", "班主任", "辅导员",
"课程", "课堂", "教材", "教学", "学习", "考试", "考核", "成绩", "分数", "学分",
"升学", "高考", "中考", "考研", "考博", "考证", "留学", "出国", "海归", "归国",
"学历", "学位", "文凭", "证书", "学士", "硕士", "博士", "博士后", "教育部",
"教育局", "教育厅", "教育系统", "教育改革", "素质教育", "应试教育", "职业教育",
"在线教育", "远程教育", "教育科技", "教育创新", "教育公平", "教育资源", "教育质量",
"校园", "宿舍", "食堂", "图书馆", "实验室", "教室", "操场", "体育馆", "礼堂"
],
"健康": [
"健康", "医疗", "医院", "医生", "医师", "护士", "护理", "患者", "病人", "就医",
"疾病", "疫情", "病毒", "细菌", "感染", "传染", "流行病", "新冠", "肺炎", "发热",
"症状", "治疗", "用药", "药物", "药品", "药剂", "处方", "诊断", "检查", "手术",
"康复", "保健", "养生", "营养", "饮食", "运动", "锻炼", "减肥", "增重", "塑形",
"心理", "精神", "抑郁", "焦虑", "压力", "睡眠", "失眠", "心理咨询", "心理治疗",
"医保", "医疗保险", "社保", "医改", "医疗改革", "医疗体系", "医疗资源", "医疗服务",
"疫苗", "接种", "防疫", "防控", "消毒", "隔离", "核酸", "抗原", "检测", "筛查"
],
"国际": [
"国际", "全球", "世界", "外交", "国家", "地区", "大使", "领事", "使馆", "领馆",
"美国", "中国", "俄罗斯", "欧盟", "日本", "印度", "英国", "法国", "德国", "意大利",
"加拿大", "澳大利亚", "巴西", "南非", "沙特", "伊朗", "以色列", "巴勒斯坦", "朝鲜",
"韩国", "越南", "新加坡", "马来西亚", "印尼", "泰国", "菲律宾", "乌克兰", "白俄罗斯",
"战争", "冲突", "和平", "停火", "制裁", "协议", "条约", "峰会", "会议", "会晤",
"联合国", "安理会", "世卫组织", "世贸组织", "国际货币基金组织", "世界银行", "北约",
"政治", "经济", "军事", "外交", "贸易", "投资", "援助", "移民", "难民", "人权",
"气候变化", "全球变暖", "可持续发展", "减排", "碳中和", "国际合作", "多边主义"
]
}

View File

@@ -0,0 +1,19 @@
{
"stopwords": [
"什么", "怎么", "如何", "为何", "为什么", "哪些", "多少", "几个", "怎样",
"一个", "这个", "那个", "自己", "这些", "那些", "因为", "所以", "如果",
"可以", "还是", "这样", "那样", "关于", "对于",
"今天", "明天", "昨天", "今年", "去年", "最近", "现在",
"一些", "有些", "很多", "许多",
"a", "an", "the", "and", "or", "but", "if", "because", "as", "what", "when",
"where", "how", "to", "of", "for", "with", "in", "on", "at", "from", "by",
"about", "into", "is", "are", "was", "were", "be", "been", "being", "have",
"has", "had", "do", "does", "did", "doing", "can", "could", "will", "would",
"should", "shall", "may", "might", "must", "that", "which", "who", "whom",
"this", "these", "those", "am", "i", "you", "he", "she", "it", "we", "they",
"their", "your", "my", "his", "her", "its", "our", "than", "then", "so", "not",
"的", "了", "和", "是", "在", "我", "有", "个", "这", "那", "就", "也",
"要", "会", "对", "啊", "吧", "呢", "吗", "嗯", "哦", "哪", "啥", "么",
"被", "说", "到", "等", "着", "为", "与", "但", "并", "或", "而", "所以"
]
}

0
app/db/__init__.py Normal file
View File

15
app/db/models.py Normal file
View File

@@ -0,0 +1,15 @@
from sqlalchemy import Column, Integer, String, Text, DateTime
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class News(Base):
__tablename__ = 'news'
id = Column(Integer, primary_key=True, autoincrement=True)
title = Column(String(255), nullable=False)
content = Column(Text, nullable=True)
url = Column(String(255), nullable=False, unique=True)
source = Column(String(50), nullable=True)
publish_time = Column(DateTime, nullable=True)
created_at = Column(DateTime, nullable=False)

60
app/db/mysql.py Normal file
View File

@@ -0,0 +1,60 @@
from datetime import datetime
from typing import Optional, List, Dict, Any
# SQLAlchemy dependency removed:
# from .models import Base, News
# from app.core.db import Base
# a plain data class replaces the SQLAlchemy model
class News:
"""News data model."""
def __init__(self,
title: str = "",
content: str = "",
url: str = "",
source: str = "",
publish_time: Optional[datetime] = None):
self.id: Optional[int] = None
self.title = title
self.content = content
self.url = url
self.source = source
self.publish_time = publish_time or datetime.now()
self.created_at = datetime.now()
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'News':
"""Create a News object from a dict."""
news = cls(
title=data.get('title', ''),
content=data.get('content', ''),
url=data.get('url', ''),
source=data.get('source', ''),
publish_time=data.get('publish_time')
)
if 'id' in data:
news.id = data['id']
if 'created_at' in data:
news.created_at = data['created_at']
return news
def to_dict(self) -> Dict[str, Any]:
"""Convert to a dict."""
return {
'id': self.id,
'title': self.title,
'content': self.content,
'url': self.url,
'source': self.source,
'publish_time': self.publish_time,
'created_at': self.created_at
}
def insert_news(news_list):
"""Insert a list of news items into the database."""
from app.core import db
# if a list of News objects was passed in, convert it to a list of dicts
if news_list and isinstance(news_list[0], News):
news_list = [news.to_dict() for news in news_list]
return db.insert_news(news_list)
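A minimal usage sketch of the plain News class above; the field values are only examples.

from app.db.mysql import News, insert_news

item = News(title="Example", url="https://example.com/a", source="demo")
assert News.from_dict(item.to_dict()).url == item.url
insert_news([item])   # News objects are converted to dicts and handed to app.core.db.insert_news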

45
app/db/redis.py Normal file
View File

@@ -0,0 +1,45 @@
import redis
from redis import Redis
from typing import Optional
from pydantic import BaseModel
from app.core.config import get_redis_config
REDIS_CONFIG = {
"host": "localhost",
"port": 6379,
"db": 0,
"decode_responses": False,
"socket_timeout": 5,
"socket_connect_timeout": 5,
"health_check_interval": 30,
}
_redis_pool = None
def get_redis_pool() -> redis.ConnectionPool:
global _redis_pool
if _redis_pool is None:
redis_config = get_redis_config()
_redis_pool = redis.ConnectionPool(
host=redis_config.host,
port=redis_config.port,
db=redis_config.db,
password=redis_config.password,
decode_responses=redis_config.decode_responses,
socket_timeout=redis_config.socket_timeout,
socket_connect_timeout=redis_config.socket_connect_timeout,
health_check_interval=redis_config.health_check_interval
)
return _redis_pool
def get_redis_client() -> Redis:
pool = get_redis_pool()
return redis.Redis(connection_pool=pool)
class CacheNews(BaseModel):
title: str
url: str
score: int
desc: str

92
app/main.py Normal file
View File

@@ -0,0 +1,92 @@
# app/main.py
import threading
import time
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import app.services.crawler as crawler
import tg_bot as tg_bot
from app.api.v1 import daily_news, web_tools, analysis
from app.utils.logger import log
from app.core import db, cache
from app.core.config import get_app_config, get_config
from app.services.browser_manager import BrowserManager
# load the application configuration
app_config = get_app_config()
# application startup/shutdown lifecycle management
@asynccontextmanager
async def lifespan(app: FastAPI):
# on startup
log.info("Application startup")
# initialize the database connection
db.init_db()
# initialize the cache
cache.init_cache()
# start the crawlers in a background thread so application startup is not blocked
threading.Thread(target=crawler.crawlers_logic, daemon=True).start()
yield
# on shutdown
log.info("Application shutdown")
# shut down the browser manager
try:
BrowserManager().shutdown()
log.info("Browser manager shutdown")
except Exception as e:
log.error(f"Error shutting down browser manager: {e}")
# close the database connection
db.close_db()
# close the cache connection
cache.close_cache()
# create the application instance
app = FastAPI(
title=app_config.title,
description=app_config.description,
version=app_config.version,
lifespan=lifespan
)
# add the CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=app_config.cors["allow_origins"],
allow_credentials=app_config.cors["allow_credentials"],
allow_methods=app_config.cors["allow_methods"],
allow_headers=app_config.cors["allow_headers"],
)
# request timing middleware
@app.middleware("http")
async def add_process_time_header(request: Request, call_next):
start_time = time.time()
response = await call_next(request)
process_time = time.time() - start_time
response.headers["X-Process-Time"] = str(process_time)
return response
# register the routers
app.include_router(daily_news.router, prefix="/api/v1/dailynews", tags=["Daily News"])
app.include_router(web_tools.router, prefix="/api/v1/tools/website-meta", tags=["Website Meta"])
app.include_router(analysis.router, prefix="/api/v1/analysis", tags=["Analysis"])
# health-check endpoint
@app.get("/health", tags=["Health"])
async def health_check():
return {"status": "healthy", "version": app_config.version}
# allow running this file directly
if __name__ == "__main__":
uvicorn.run("app.main:app", host=app_config.host, port=app_config.port, reload=app_config.debug)
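A hypothetical way to run and smoke-test the app; the port is an assumption, since host, port, and debug come from config/config.yaml.

# python -m app.main
# or: uvicorn app.main:app --host 0.0.0.0 --port 8000
import requests
print(requests.get("http://localhost:8000/health").json())   # {"status": "healthy", "version": ...}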

43
app/services/__init__.py Normal file
View File

@@ -0,0 +1,43 @@
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.executors.pool import ThreadPoolExecutor, ProcessPoolExecutor
from apscheduler.jobstores.memory import MemoryJobStore
import pytz
from app.services.sites.factory import CrawlerRegister
from app.utils.logger import log
from app.core.config import get_scheduler_config
# build the crawler factory
crawler_factory = CrawlerRegister().register()
# load the scheduler configuration
scheduler_config = get_scheduler_config()
# configure the scheduler
jobstores = {
'default': MemoryJobStore()
}
executors = {
'default': ThreadPoolExecutor(scheduler_config.thread_pool_size),
'processpool': ProcessPoolExecutor(scheduler_config.process_pool_size)
}
job_defaults = {
'coalesce': scheduler_config.coalesce,
'max_instances': scheduler_config.max_instances,
'misfire_grace_time': scheduler_config.misfire_grace_time,
}
# create and configure the scheduler
_scheduler = BackgroundScheduler(
jobstores=jobstores,
executors=executors,
job_defaults=job_defaults,
timezone=pytz.timezone(scheduler_config.timezone)
)
# start the scheduler
_scheduler.start()
log.info(f"Scheduler started with timezone: {scheduler_config.timezone}")

app/services/browser_manager.py Normal file

@@ -0,0 +1,121 @@
import threading
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from app.utils.logger import log
class BrowserManager:
"""Browser manager that provides a shared Chrome browser instance."""
_instance = None
_lock = threading.Lock()
_driver = None
_driver_path = None
_last_activity = 0
_max_idle_time = 1800  # maximum idle time, default 30 minutes
def __new__(cls, *args, **kwargs):
"""Singleton implementation."""
if cls._instance is None:
with cls._lock:
if cls._instance is None:
cls._instance = super(BrowserManager, cls).__new__(cls)
cls._instance._init_driver_path()
cls._instance._start_idle_monitor()
return cls._instance
def _init_driver_path(self):
"""Install (or locate) the ChromeDriver binary and remember its path."""
try:
self._driver_path = ChromeDriverManager().install()
log.info(f"ChromeDriver installed: {self._driver_path}")
except Exception as e:
log.error(f"ChromeDriver installation failed: {str(e)}")
raise
def _start_idle_monitor(self):
"""Start the idle-monitor thread."""
def monitor():
while True:
time.sleep(60)  # check once a minute
try:
with self._lock:
if self._driver is not None:
current_time = time.time()
if current_time - self._last_activity > self._max_idle_time:
log.info(f"Browser idle for more than {self._max_idle_time} seconds, releasing resources")
self._quit_driver()
except Exception as e:
log.error(f"Browser monitor thread error: {str(e)}")
monitor_thread = threading.Thread(target=monitor, daemon=True)
monitor_thread.start()
log.info("Browser idle-monitor thread started")
def get_driver(self):
"""Return the shared Chrome browser instance, creating it if needed."""
with self._lock:
self._last_activity = time.time()
if self._driver is None:
self._create_driver()
return self._driver
def _create_driver(self):
"""Create a new Chrome browser instance."""
log.info("Creating a new Chrome browser instance")
options = webdriver.ChromeOptions()
# basic options (headless mode)
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
# memory-optimization options
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-extensions")
options.add_argument("--disable-application-cache")
options.add_argument("--js-flags=--expose-gc")
options.add_argument("--memory-pressure-off")
options.add_argument("--disable-default-apps")
# log level
options.add_argument("--log-level=3")
self._driver = webdriver.Chrome(
service=Service(self._driver_path),
options=options
)
self._driver.set_page_load_timeout(30)
def _quit_driver(self):
"""Quit the browser instance."""
if self._driver:
try:
self._driver.quit()
log.info("Browser instance closed")
except Exception as e:
log.error(f"Error while closing the browser instance: {str(e)}")
finally:
self._driver = None
def release_driver(self):
"""Mark the driver as recently active once the caller is done with it."""
with self._lock:
self._last_activity = time.time()
def get_page_content(self, url, wait_time=5):
"""Fetch the page content for a URL, managing the shared browser automatically."""
driver = self.get_driver()
try:
driver.get(url)
time.sleep(wait_time)  # wait for the page to load
page_source = driver.page_source
self.release_driver()
return page_source, driver
except Exception as e:
log.error(f"Failed to fetch page content: {str(e)}")
self.release_driver()
raise
def shutdown(self):
"""Shut down the browser manager."""
with self._lock:
self._quit_driver()
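A minimal usage sketch of the browser manager above; it assumes Chrome and a matching ChromeDriver can be installed on the host, and the URL is only an example.

from app.services.browser_manager import BrowserManager

manager = BrowserManager()                 # singleton: every caller shares one headless Chrome
html, driver = manager.get_page_content("https://example.com", wait_time=3)
print(len(html))
manager.shutdown()                         # quit Chrome explicitly, or let the idle monitor reclaim it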

240
app/services/crawler.py Normal file
View File

@@ -0,0 +1,240 @@
import time
import traceback
import threading
from datetime import datetime
from functools import wraps
import pytz
import signal
from typing import List, Dict, Any, Optional, Callable
from app.services import crawler_factory, _scheduler
from app.utils.logger import log
from app.core import db, cache
from app.core.config import get_crawler_config
from app.utils.notification import notification_manager
# load the crawler configuration
crawler_config = get_crawler_config()
# configuration constants
CRAWLER_INTERVAL = crawler_config.interval
CRAWLER_TIMEOUT = crawler_config.timeout
MAX_RETRY_COUNT = crawler_config.max_retry_count
SHANGHAI_TZ = pytz.timezone('Asia/Shanghai')
class CrawlerTimeoutError(Exception):
"""Raised when a crawler run times out."""
pass
def timeout_handler(func: Callable, timeout: int = CRAWLER_TIMEOUT) -> Callable:
"""Timeout decorator; the timeout is enforced with a watchdog thread."""
@wraps(func)
def wrapper(*args, **kwargs):
# thread-based timeout mechanism
result = [None]
exception = [None]
completed = [False]
def target():
try:
result[0] = func(*args, **kwargs)
except Exception as e:
exception[0] = e
finally:
completed[0] = True
thread = threading.Thread(target=target)
thread.daemon = True
thread.start()
thread.join(timeout)
if not completed[0]:
error_msg = f"Function {func.__name__} timed out after {timeout} seconds"
log.error(error_msg)
raise CrawlerTimeoutError(error_msg)
if exception[0]:
log.error(f"Function {func.__name__} raised an exception: {exception[0]}")
raise exception[0]
return result[0]
return wrapper
def safe_fetch(crawler_name: str, crawler, date_str: str, is_retry: bool = False) -> List[Dict[str, Any]]:
"""Run a crawler safely, handling exceptions and returning its results."""
try:
news_list = crawler.fetch(date_str)
if news_list and len(news_list) > 0:
cache_key = f"crawler:{crawler_name}:{date_str}"
cache.set_cache(key=cache_key, value=news_list, expire=0)
log.info(f"{crawler_name} fetch success, {len(news_list)} news fetched")
return news_list
else:
log.info(f"{'Second time ' if is_retry else ''}crawler {crawler_name} failed. 0 news fetched")
return []
except Exception as e:
error_msg = traceback.format_exc()
log.error(f"{'Second time ' if is_retry else ''}crawler {crawler_name} error: {error_msg}")
# send a DingTalk notification
try:
notification_manager.notify_crawler_error(
crawler_name=crawler_name,
error_msg=str(e),
date_str=date_str,
is_retry=is_retry
)
except Exception as notify_error:
log.error(f"Failed to send notification for crawler {crawler_name}: {notify_error}")
return []
def run_data_analysis(date_str: str):
"""Run the data analysis and cache the results."""
log.info(f"Starting data analysis for date {date_str}")
try:
# import the analysis modules here to avoid a circular dependency
from app.analysis.trend_analyzer import TrendAnalyzer
from app.analysis.predictor import TrendPredictor
# create the analyzer instances
analyzer = TrendAnalyzer()
predictor = TrendPredictor()
# 1. generate and cache the keyword-cloud data
log.info("Generating keyword cloud data...")
analyzer.get_keyword_cloud(date_str, refresh=True)
# 2. generate and cache the aggregated trend analysis
log.info("Generating trend analysis data...")
analyzer.get_analysis(date_str, analysis_type="main")
# 3. generate and cache the cross-platform analysis
log.info("Generating cross-platform analysis data...")
analyzer.get_cross_platform_analysis(date_str, refresh=True)
# 4. generate and cache the trend prediction
log.info("Generating trend prediction data...")
predictor.get_prediction(date_str)
# 5. generate and cache the platform comparison
log.info("Generating platform comparison data...")
analyzer.get_platform_comparison(date_str)
# 6. generate and cache the advanced analysis
log.info("Generating advanced analysis data...")
analyzer.get_advanced_analysis(date_str, refresh=True)
# 7. generate and cache the data-visualization analysis
log.info("Generating data visualization analysis...")
analyzer.get_data_visualization(date_str, refresh=True)
# 8. generate and cache the trend forecast
log.info("Generating trend forecast data...")
analyzer.get_trend_forecast(date_str, refresh=True)
log.info(f"All data analysis completed for date {date_str}")
except Exception as e:
error_msg = traceback.format_exc()
log.error(f"Error during data analysis: {str(e)}")
log.error(error_msg)
# 发送数据分析异常通知
try:
notification_manager.notify_analysis_error(
error_msg=str(e),
date_str=date_str
)
except Exception as notify_error:
log.error(f"Failed to send analysis error notification: {notify_error}")
@_scheduler.scheduled_job('interval', id='crawlers_logic', seconds=CRAWLER_INTERVAL,
max_instances=crawler_config.max_instances,
misfire_grace_time=crawler_config.misfire_grace_time)
def crawlers_logic():
"""爬虫主逻辑,包含超时保护和错误处理"""
@timeout_handler
def crawler_work():
now_time = datetime.now(SHANGHAI_TZ)
date_str = now_time.strftime("%Y-%m-%d")
log.info(f"Starting crawler job at {now_time.strftime('%Y-%m-%d %H:%M:%S')}")
retry_crawler = []
success_count = 0
failed_crawlers = []
for crawler_name, crawler in crawler_factory.items():
news_list = safe_fetch(crawler_name, crawler, date_str)
if news_list:
success_count += 1
else:
retry_crawler.append(crawler_name)
failed_crawlers.append(crawler_name)
# 第二轮爬取(重试失败的爬虫)
if retry_crawler:
log.info(f"Retrying {len(retry_crawler)} failed crawlers")
retry_failed = []
for crawler_name in retry_crawler:
news_list = safe_fetch(crawler_name, crawler_factory[crawler_name], date_str, is_retry=True)
if news_list:
success_count += 1
# 从失败列表中移除成功的爬虫
if crawler_name in failed_crawlers:
failed_crawlers.remove(crawler_name)
else:
retry_failed.append(crawler_name)
# 记录完成时间
end_time = datetime.now(SHANGHAI_TZ)
duration = (end_time - now_time).total_seconds()
log.info(f"Crawler job finished at {end_time.strftime('%Y-%m-%d %H:%M:%S')}, "
f"duration: {duration:.2f}s, success: {success_count}/{len(crawler_factory)}")
# 发送通知
try:
notification_manager.notify_crawler_summary(
success_count=success_count,
total_count=len(crawler_factory),
failed_crawlers=failed_crawlers,
duration=duration,
date_str=date_str
)
except Exception as notify_error:
log.error(f"Failed to send crawler notification: {notify_error}")
# 爬取完成后执行数据分析
log.info("Crawler job completed, starting data analysis...")
# 使用新线程执行分析,避免阻塞主线程
threading.Thread(target=run_data_analysis, args=(date_str,), daemon=True).start()
return success_count
date_str = datetime.now(SHANGHAI_TZ).strftime("%Y-%m-%d")  # also needed by the except blocks below
try:
return crawler_work()
except CrawlerTimeoutError as e:
log.error(f"Crawler job timeout: {str(e)}")
# 发送超时通知
try:
notification_manager.notify_crawler_timeout(
timeout_seconds=CRAWLER_TIMEOUT,
date_str=date_str
)
except Exception as notify_error:
log.error(f"Failed to send timeout notification: {notify_error}")
return 0
except Exception as e:
log.error(f"Crawler job error: {str(e)}")
log.error(traceback.format_exc())
# 发送通用异常通知
try:
notification_manager.notify_crawler_error(
crawler_name="crawler_job",
error_msg=str(e),
date_str=date_str
)
except Exception as notify_error:
log.error(f"Failed to send error notification: {notify_error}")
return 0

View File

@@ -0,0 +1,99 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class BaiduNewsCrawler(Crawler):
# 返回news_list
def fetch(self, date_str) -> list:
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
json_data = resp.json()
contents = json_data.get("data")["cards"][0]["content"][0]["content"]
result = []
cache_list = []
for content in contents:
title = content.get("word")
url = content.get("url")
desc = content.get("desc")
score = content.get("hotScore")
# replace url m to www
url = url.replace("m.", "www.")
news = {
'title': title,
'url': url,
'content': desc,
'source': 'baidu',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S') # 使用格式化的时间字符串
}
result.append(news)
cache_list.append(news) # 直接添加字典json.dumps会在后面处理整个列表
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "baidu"
@staticmethod
def fetch_v0():
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://top.baidu.com/board?tab=realtime"
proxies = {
# "http": "http://127.0.0.1:7890",
# "https": "http://127.0.0.0:7890"
}
header = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
"*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"upgrade-insecure-requests": 1,
"host": "www.baidu.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/86.0.4240.183 Safari/537.36"
}
html = requests.get(url=url, headers=header, verify=False, proxies=proxies)
html.encoding = "utf-8"
html_text = html.text
soup = BeautifulSoup(html_text, "html.parser")
main_content = soup.find_all("main")[0]
news_main_content = main_content.find("div", style='margin-bottom:20px')
div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')
result = []
for div_element in div_elements:
hot_index = div_element.find(class_='hot-index_1Bl1a').text.strip()
news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
news_link = div_element.find('a', class_='title_dIF3B')['href']
news = {
'title': news_title,
'url': news_link,
'content': "",
'source': 'baidu',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S') # 使用格式化的时间字符串
}
result.append(news)
return result

View File

@@ -0,0 +1,64 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class BilibiliCrawler(Crawler):
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://api.bilibili.com/x/web-interface/popular"
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"Chrome/122.0.0.0 Safari/537.36"
"AppleWebKit/537.36 (KHTML, like Gecko) "
),
"Referer": "https://www.bilibili.com/",
}
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
if data["code"] != 0:
print(f"API error: {data['message']}")
return []
result = []
cache_list = []
for item in data["data"].get("list", []):
title = item.get("title", "")
bvid = item.get("bvid", "")
desc = item.get("desc", "")
video_url = f"https://www.bilibili.com/video/{bvid}"
news = {
'title': title,
'url': video_url,
'content': desc,
'source': 'bilibili',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "bilibili"

100
app/services/sites/cls.py Normal file
View File

@@ -0,0 +1,100 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
urllib3.disable_warnings()
class CLSCrawler(Crawler):
"""财联社"""
def fetch(self, date_str) -> list:
current_time = datetime.datetime.now()
try:
params = {
'app': 'CailianpressWeb',
'os': 'web',
'sv': '8.4.6',
'sign': '9f8797a1f4de66c2370f7a03990d2737'
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'https://www.cls.cn/',
'Origin': 'https://www.cls.cn'
}
response = requests.get(
"https://www.cls.cn/featured/v1/column/list",
params=params,
headers=headers,
timeout=self.timeout,
verify=False
)
response.raise_for_status()
data = response.json()
if data.get('errno') != 0:
return []
column_list = data.get('data', {}).get('column_list', [])
result = []
cache_list = []
for idx, column in enumerate(column_list[:20]):
try:
title = column.get('title', '').strip()
if not title or len(title) < 2:
continue
article_list = column.get('article_list', {})
if article_list:
article_title = article_list.get('title', '').strip()
jump_url = article_list.get('jump_url', '').strip()
brief = article_list.get('brief', '').strip()
if article_title:
display_title = f"[{title}] {article_title}"
content = brief if brief else article_title
url = "https://www.cls.cn/telegraph"
else:
display_title = title
content = column.get('brief', '').strip()
url = f"https://www.cls.cn/telegraph"
else:
display_title = title
content = column.get('brief', '').strip()
url = f"https://www.cls.cn/telegraph"
news = {
'title': display_title,
'url': url,
'content': content,
'source': 'cls',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
'score': 1000 - idx,
'rank': idx + 1
}
result.append(news)
cache_list.append(news)
except Exception:
continue
if cache_list:
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
return []
def crawler_name(self):
return "cls"

View File

@@ -0,0 +1,23 @@
from abc import ABC, abstractmethod
from typing import List, Dict, Any
class Crawler(ABC):
def __init__(self):
self.header = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
"*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/86.0.4240.183 Safari/537.36"
}
self.timeout = 10
@abstractmethod
def fetch(self, date_str: str) -> List[Dict[str, Any]]:
"""获取新闻列表"""
pass
@abstractmethod
def crawler_name(self) -> str:
"""获取爬虫名称"""
pass
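# Minimal subclass sketch (illustrative; "example" is not a real source in this project):
# a new site crawler only needs to implement fetch() and crawler_name().
#
# class ExampleCrawler(Crawler):
#     def fetch(self, date_str: str) -> List[Dict[str, Any]]:
#         return [{'title': 'demo', 'url': 'https://example.com', 'content': '',
#                  'source': self.crawler_name(), 'publish_time': date_str}]
#
#     def crawler_name(self) -> str:
#         return "example"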

View File

@@ -0,0 +1,79 @@
import json
import re
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class DouBanCrawler(Crawler):
"""豆瓣网"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://www.douban.com/group/explore"
header = self.header.copy()
header.update({
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-encoding": "",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
"host": "www.douban.com",
"referer": "https://www.douban.com/group/explore",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
})
resp = requests.get(url=url, headers=header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
html_text = resp.text
soup = BeautifulSoup(html_text, "html.parser")
topic_list = soup.find_all('div', class_='channel-item')
result = []
cache_list = []
for topic in topic_list:
title_elem = topic.find('h3')
if not title_elem:
continue
link_elem = title_elem.find('a')
if not link_elem:
continue
title = link_elem.text.strip()
url = link_elem.get('href')
desc_elem = topic.find('div', class_='content')
desc = desc_elem.text.strip() if desc_elem else ""
news = {
'title': title,
'url': url,
'content': desc,
'source': 'douban',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "douban"

View File

@@ -0,0 +1,111 @@
import json
import datetime
import time
import requests
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
class DouYinCrawler(Crawler):
def fetch(self, date_str):
return self.fetch_v2(date_str)
def fetch_v1(self, date_str):
current_time = datetime.datetime.now()
url = "https://www.douyin.com/hot"
browser_manager = BrowserManager()
try:
# 使用浏览器管理器获取页面内容
page_source, driver = browser_manager.get_page_content(url, wait_time=5)
result = []
cache_list = []
# 抖音热榜条目li 标签里含 /video/ 链接)
items = driver.find_elements(By.XPATH, '//li[a[contains(@href, "/video/")]]')
for item in items:
try:
# 提取标题(含 # 标签或较长文本)
title_elem = item.find_element(By.XPATH, './/div[contains(text(), "#") or string-length(text()) > 10]')
# 提取链接
link_elem = item.find_element(By.XPATH, './/a[contains(@href, "/video/")]')
# 提取热度
hot_elem = item.find_element(By.XPATH, './/span[contains(text(), "万") or contains(text(), "亿")]')
title = title_elem.text.strip()
item_url = "https://www.douyin.com" + link_elem.get_attribute("href")
hot = hot_elem.text.strip()
news = {
'title': title,
'url': item_url,
'content': f"热度: {hot}",
'source': 'douyin',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
except Exception:
continue # 跳过无效项
# 缓存并返回
if cache_list:
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
return []
def fetch_v2(self, date_str):
current_time = datetime.datetime.now()
url = "https://www.douyin.com/aweme/v1/web/hot/search/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&detail_list=1&source=6&pc_client_type=1&pc_libra_divert=Windows&support_h265=1&support_dash=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=136.0.0.0&browser_online=true&engine_name=Blink&engine_version=136.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7490997798633555467"
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"Chrome/122.0.0.0 Safari/537.36"
"AppleWebKit/537.36 (KHTML, like Gecko) "
),
"Referer": "https://www.douyin.com/",
}
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
# https://www.douyin.com/hot/2094286?&trending_topic=%E5%A4%8F%E5%A4%A9%E7%9A%84%E5%91%B3%E9%81%93%E5%9C%A8%E6%8A%96%E9%9F%B3&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue=11892557
result = []
cache_list = []
for item in data["data"]["word_list"]:
title = item["word"]
url = f"https://www.douyin.com/hot/{item['sentence_id']}?&trending_topic={item['word']}&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue={item['hot_value']}"
news = {
'title': title,
'url': url,
'content': title,
'source': 'douyin',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "douyin"

View File

@@ -0,0 +1,88 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
urllib3.disable_warnings()
class EastMoneyCrawler(Crawler):
"""东方财富网"""
def fetch(self, date_str) -> list:
current_time = datetime.datetime.now()
try:
params = {
'client': 'web',
'biz': 'web_724',
'fastColumn': '102',
'sortEnd': '',
'pageSize': '50',
'req_trace': str(int(current_time.timestamp() * 1000)) # 使用当前时间戳
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'https://kuaixun.eastmoney.com/',
'Origin': 'https://kuaixun.eastmoney.com'
}
response = requests.get(
"https://np-weblist.eastmoney.com/comm/web/getFastNewsList",
params=params,
headers=headers,
timeout=self.timeout,
verify=False
)
response.raise_for_status()
data = response.json()
if data.get('code') != '1':
return []
fast_news_list = data.get('data', {}).get('fastNewsList', [])
result = []
cache_list = []
for idx, news_item in enumerate(fast_news_list[:20]): # 取前20条
try:
title = news_item.get('title', '').strip()
if not title:
continue
summary = news_item.get('summary', '').strip()
show_time = news_item.get('showTime', '').strip()
code = news_item.get('code', '').strip()
url = f"https://finance.eastmoney.com/a/{code}" if code else "https://kuaixun.eastmoney.com/"
news = {
'title': title,
'url': url,
'content': summary,
'source': 'eastmoney',
'publish_time': show_time if show_time else current_time.strftime('%Y-%m-%d %H:%M:%S'),
'score': 1000 - idx,
'rank': idx + 1
}
result.append(news)
cache_list.append(news)
except Exception:
continue
if cache_list:
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
return []
def crawler_name(self):
return "eastmoney"

View File

@@ -0,0 +1,64 @@
from typing import Dict, Type
from .baidu import BaiduNewsCrawler
from .bilibili import BilibiliCrawler
from .crawler import Crawler
from .douban import DouBanCrawler
from .douyin import DouYinCrawler
from .ftpojie import FtPoJieCrawler
from .github import GithubCrawler
from .hackernews import HackerNewsCrawler
from .hupu import HuPuCrawler
from .jinritoutiao import JinRiTouTiaoCrawler
from .juejin import JueJinCrawler
from .sspai import ShaoShuPaiCrawler
from .stackoverflow import StackOverflowCrawler
from .tenxunwang import TenXunWangCrawler
from .tieba import TieBaCrawler
from .tskr import TsKrCrawler
from .vtex import VtexCrawler
from .weibo import WeiboCrawler
from .weixin import WeiXinCrawler
from .zhihu import ZhiHuCrawler
from .sina_finance import SinaFinanceCrawler
from .eastmoney import EastMoneyCrawler
from .xueqiu import XueqiuCrawler
from .cls import CLSCrawler
class CrawlerRegister:
def __init__(self):
self.crawlers = {}
def register(self) -> Dict[str, Crawler]:
"""注册所有爬虫"""
crawler_map = {
"baidu": BaiduNewsCrawler(),
"shaoshupai": ShaoShuPaiCrawler(),
"weibo": WeiboCrawler(),
"zhihu": ZhiHuCrawler(),
"36kr": TsKrCrawler(),
"52pojie": FtPoJieCrawler(),
"bilibili": BilibiliCrawler(),
"douban": DouBanCrawler(),
"hupu": HuPuCrawler(),
"tieba": TieBaCrawler(),
"juejin": JueJinCrawler(),
"douyin": DouYinCrawler(),
"v2ex": VtexCrawler(),
"jinritoutiao": JinRiTouTiaoCrawler(),
"tenxunwang": TenXunWangCrawler(),
"stackoverflow": StackOverflowCrawler(),
"github": GithubCrawler(),
"hackernews": HackerNewsCrawler(),
"sina_finance": SinaFinanceCrawler(),
"eastmoney": EastMoneyCrawler(),
"xueqiu": XueqiuCrawler(),
"cls": CLSCrawler(),
}
self.crawlers = crawler_map
return self.crawlers
def get_crawlers(self):
return self.register().values()
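# Usage sketch (illustrative): the registry is built once at startup and the resulting
# dict is what the rest of the app iterates over as crawler_factory.
#
# crawler_factory = CrawlerRegister().register()   # {"baidu": BaiduNewsCrawler(), ...}
# for name, crawler in crawler_factory.items():
#     news_list = crawler.fetch("2024-01-01")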

View File

@@ -0,0 +1,69 @@
import json
import datetime # 添加datetime导入
import re
import requests
import urllib3
from bs4 import BeautifulSoup
# 移除 SQLAlchemy 导入
# from sqlalchemy.sql.functions import now
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
urllib3.disable_warnings()
class FtPoJieCrawler(Crawler):
"""吾爱破解"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://www.52pojie.cn/forum.php?mod=guide&view=hot"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
resp.encoding = 'gbk' # 52pojie使用GBK编码
html_text = resp.text
soup = BeautifulSoup(html_text, "html.parser")
# 找到热门帖子列表
hot_threads = soup.find_all('tbody', id=lambda x: x and x.startswith('normalthread_'))
result = []
cache_list = []
for thread in hot_threads:
title_elem = thread.find('a', class_='xst')
if not title_elem:
continue
title = title_elem.text.strip()
url = "https://www.52pojie.cn/" + title_elem.get('href')
# 获取帖子信息
info_elem = thread.find('td', class_='by')
info = info_elem.text.strip() if info_elem else ""
news = {
'title': title,
'url': url,
'content': info,
'source': '52pojie',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "52pojie"

View File

@@ -0,0 +1,58 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class GithubCrawler(Crawler):
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://api.github.com/search/repositories?q=stars:%3E1&sort=stars"
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"Chrome/122.0.0.0 Safari/537.36"
"AppleWebKit/537.36 (KHTML, like Gecko) "
),
"Referer": "https://github.com/",
}
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
result = []
cache_list = []
for i, item in enumerate(data["items"]):
title = item.get("full_name", "")
url = item.get("html_url", "")
desc = item.get("description", "")
news = {
'title': title,
'url': url,
'content': desc,
'source': self.crawler_name(),
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "github"

View File

@@ -0,0 +1,235 @@
import json
import datetime
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
# 禁用SSL警告
urllib3.disable_warnings()
class HackerNewsCrawler(Crawler):
"""hacker news"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
try:
# 首先尝试直接请求方式获取内容
result = self._fetch_with_requests()
if result and len(result) > 0:
# 缓存数据
cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
return result
# 如果请求方式失败,尝试使用浏览器模拟获取
browser_manager = BrowserManager()
result = self._fetch_with_browser(browser_manager)
if result and len(result) > 0:
# 缓存数据
cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
return result
except Exception as e:
# 如果遇到错误,返回空列表
return []
# 所有方法都失败,返回空列表
return []
def _fetch_with_requests(self):
"""使用requests直接获取Hacker News内容"""
url = "https://news.ycombinator.com/"
try:
# 发送HTTP请求
response = requests.get(url, headers=self.header, timeout=self.timeout)
if response.status_code != 200:
return []
# 解析HTML内容
soup = BeautifulSoup(response.text, 'html.parser')
result = []
current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# 获取所有新闻条目
items = soup.select("tr.athing")
for item in items:
try:
# 获取ID用于关联评论和元数据
item_id = item.get('id')
if not item_id:
continue
# 获取标题和链接
title_element = item.select_one(".titleline a")
if not title_element:
continue
title = title_element.text.strip()
url = title_element.get('href')
# 如果URL是相对路径转换为绝对路径
if url and not url.startswith('http'):
url = f"https://news.ycombinator.com/{url}"
# 获取来源网站
site_element = item.select_one(".sitestr")
site = site_element.text.strip() if site_element else ""
# 查找下一个tr获取元数据分数、用户、时间等
metadata = item.find_next_sibling('tr')
if not metadata:
continue
# 获取分数
score_element = metadata.select_one(".score")
score = score_element.text.strip() if score_element else "0 points"
# 获取作者
user_element = metadata.select_one(".hnuser")
user = user_element.text.strip() if user_element else "unknown"
# 获取评论数
comments_element = metadata.select_one("a:last-child")
comments = comments_element.text.strip() if comments_element else "0 comments"
if "discuss" in comments:
comments = "0 comments"
# 构建内容摘要
content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"
news = {
'title': title,
'url': url,
'content': content,
'source': 'hackernews',
'publish_time': current_time
}
result.append(news)
# 限制获取前30条
if len(result) >= 30:
break
except Exception as e:
continue
return result
except Exception as e:
return []
def _fetch_with_browser(self, browser_manager):
"""使用浏览器模拟方式获取Hacker News内容"""
url = "https://news.ycombinator.com/"
try:
# 获取页面内容
page_source, driver = browser_manager.get_page_content(url, wait_time=5)
# 等待页面元素加载
try:
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, ".athing"))
)
except:
# 如果等待超时,仍然尝试获取内容
pass
result = []
current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# 获取所有新闻条目
items = driver.find_elements(By.CSS_SELECTOR, "tr.athing")
for item in items:
try:
# 获取ID用于关联评论和元数据
item_id = item.get_attribute("id")
if not item_id:
continue
# 获取标题和链接
title_element = item.find_element(By.CSS_SELECTOR, ".titleline a")
title = title_element.text.strip()
url = title_element.get_attribute("href")
# 获取来源网站
site = ""
try:
site_element = item.find_element(By.CSS_SELECTOR, ".sitestr")
site = site_element.text.strip()
except:
pass
# 查找下一个tr获取元数据分数、用户、时间等
try:
metadata = driver.find_element(By.XPATH, f"//tr[@id='{item_id}']/following-sibling::tr[1]")
# 获取分数
score = "0 points"
try:
score_element = metadata.find_element(By.CSS_SELECTOR, ".score")
score = score_element.text.strip()
except:
pass
# 获取作者
user = "unknown"
try:
user_element = metadata.find_element(By.CSS_SELECTOR, ".hnuser")
user = user_element.text.strip()
except:
pass
# 获取评论数
comments = "0 comments"
try:
comments_element = metadata.find_element(By.XPATH, ".//a[last()]")
comments = comments_element.text.strip()
if "discuss" in comments:
comments = "0 comments"
except:
pass
# 构建内容摘要
content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"
except:
content = f"来源: {site}"
news = {
'title': title,
'url': url,
'content': content,
'source': 'hackernews',
'publish_time': current_time
}
result.append(news)
# 限制获取前30条
if len(result) >= 30:
break
except Exception as e:
continue
return result
except Exception as e:
return []
def crawler_name(self):
return "hackernews"

View File

@@ -0,0 +1,72 @@
import json
import datetime # 添加datetime导入
import re
import requests
import urllib3
from bs4 import BeautifulSoup
# 移除 SQLAlchemy 导入
# from sqlalchemy.sql.functions import now
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
urllib3.disable_warnings()
class HuPuCrawler(Crawler):
"""虎扑"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://bbs.hupu.com/all-gambia"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
html_text = resp.text
soup = BeautifulSoup(html_text, "html.parser")
# 找到热门帖子列表
post_list = soup.find_all('div', class_='t-info')
result = []
cache_list = []
for post in post_list:
title_elem = post.find('span', class_='t-title')
if not title_elem:
continue
link_elem = post.find('a')
if not link_elem:
continue
title = title_elem.text.strip()
url = "https://bbs.hupu.com" + link_elem.get('href') if link_elem.get('href').startswith('/') else link_elem.get('href')
# 获取帖子信息
info_elem = post.find('span', class_='t-replies')
info = info_elem.text.strip() if info_elem else ""
news = {
'title': title,
'url': url,
'content': info,
'source': 'hupu',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "hupu"

View File

@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
import json
import datetime # 添加datetime导入
import requests
import urllib3
from bs4 import BeautifulSoup
# 移除 SQLAlchemy 导入
# from sqlalchemy.sql.functions import now
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class JinRiTouTiaoCrawler(Crawler):
""" 今日头条 """
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', [])
result = []
cache_list = []
for item in data:
title = item.get('Title', '')
url = item.get('Url', '')
hot_value = item.get('HotValue', '')
news = {
'title': title,
'url': url,
'content': f"热度: {hot_value}",
'source': 'jinritoutiao',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "jinritoutiao"

View File

@@ -0,0 +1,63 @@
import json
import datetime # 添加datetime导入
import requests
import urllib3
from bs4 import BeautifulSoup
# 移除 SQLAlchemy 导入
# from sqlalchemy.sql.functions import now
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class JueJinCrawler(Crawler):
"""掘金"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://api.juejin.cn/content_api/v1/content/article_rank?category_id=1&type=hot"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', [])
result = []
cache_list = []
for item in data:
article_info = item.get('content', {})
title = article_info.get('title', '')
article_id = article_info.get('content_id', '')
url = f"https://juejin.cn/post/{article_id}"
news = {
'title': title,
'url': url,
'content': title,
'source': 'juejin',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "juejin"

View File

@@ -0,0 +1,20 @@
import datetime
from sqlalchemy import Column, String, Integer, DateTime
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class DailyNews(Base):
__tablename__ = 'tab_daily_news'
id = Column(Integer, primary_key=True)
title = Column(String(255))
desc = Column(String(255))
link = Column(String(255))
type = Column(Integer, default=0)
score = Column(Integer, default=0)
times = Column(Integer, default=0)
create_time = Column(DateTime, default=datetime.datetime.now)
update_time = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
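# Illustrative note: with a configured SQLAlchemy engine (the connection string below is a
# placeholder), the table can be created via Base.metadata.create_all(engine).
#
# from sqlalchemy import create_engine
# engine = create_engine("mysql+pymysql://user:password@localhost/dbname")
# Base.metadata.create_all(engine)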

View File

@@ -0,0 +1,75 @@
import json
import datetime
import requests
import urllib3
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class SinaFinanceCrawler(Crawler):
"""新浪财经"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'https://finance.sina.com.cn/',
'Origin': 'https://finance.sina.com.cn'
}
response = requests.get(
"https://zhibo.sina.com.cn/api/zhibo/feed?page=1&page_size=20&zhibo_id=152&tag_id=0&dire=f&dpc=1&pagesize=20",
headers=headers,
timeout=self.timeout,
verify=False
)
response.raise_for_status()
data = response.json()
if data.get('result', {}).get('status', {}).get('code') != 0:
return []
feed_list = data.get('result', {}).get('data', {}).get('feed', {}).get('list', [])
result = []
cache_list = []
for item in feed_list:
try:
title = item.get('rich_text', '').strip()
if not title:
continue
ext_str = item.get('ext', '{}')
try:
ext_data = json.loads(ext_str)
doc_url = ext_data.get('docurl', '')
except:
doc_url = item.get('docurl', '').strip(' "')
news = {
'title': title,
'url': doc_url,
'content': title,
'source': 'sina_finance',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
except Exception:
continue
if cache_list:
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
return []
def crawler_name(self):
return "sina_finance"

View File

@@ -0,0 +1,60 @@
import json
import datetime # 添加datetime导入
import requests
import urllib3
from bs4 import BeautifulSoup
# 移除 SQLAlchemy 导入
# from sqlalchemy.sql.functions import now
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class ShaoShuPaiCrawler(Crawler):
"""少数派"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://sspai.com/api/v1/article/index/page/get?limit=20&offset=0&created_at=0"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', [])
result = []
cache_list = []
for item in data:
title = item.get('title', '')
article_id = item.get('id', '')
url = f"https://sspai.com/post/{article_id}"
summary = item.get('summary', '')
news = {
'title': title,
'url': url,
'content': summary,
'source': 'sspai',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "shaoshupai"

View File

@@ -0,0 +1,58 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class StackOverflowCrawler(Crawler):
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://api.stackexchange.com/2.3/questions?order=desc&sort=hot&site=stackoverflow"
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"Chrome/122.0.0.0 Safari/537.36"
"AppleWebKit/537.36 (KHTML, like Gecko) "
),
"Referer": "https://stackoverflow.com/",
}
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
result = []
cache_list = []
for i, item in enumerate(data["items"]):
title = item.get("title", "")
url = item.get("link", "")
desc = item.get("title", "")
news = {
'title': title,
'url': url,
'content': desc,
'source': 'stackoverflow',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "stackoverflow"

View File

@@ -0,0 +1,65 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class TenXunWangCrawler(Crawler):
"""腾讯网"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://i.news.qq.com/gw/event/pc_hot_ranking_list?ids_hash=&offset=0&page_size=51&appver=15.5_qqnews_7.1.60&rank_id=hot"
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"Chrome/122.0.0.0 Safari/537.36"
"AppleWebKit/537.36 (KHTML, like Gecko) "
),
"Referer": "https://news.qq.com/",
}
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
result = []
cache_list = []
for i, item in enumerate(data["idlist"][0].get("newslist", [])):
if i == 0:
# 腾讯新闻用户最关注的热点每10分钟更新一次
continue
title = item.get("title", "")
url = item.get("url", "")
desc = item.get("abstract", "")
news = {
'title': title,
'url': url,
'content': desc,
'source': 'tenxunwang',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "tenxunwang"

View File

@@ -0,0 +1,65 @@
import json
import datetime # 添加datetime导入
import requests
import urllib3
from bs4 import BeautifulSoup
# 移除 SQLAlchemy 导入
# from sqlalchemy.sql.functions import now
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class TieBaCrawler(Crawler):
"""百度贴吧"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "http://tieba.baidu.com/hottopic/browse/topicList"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', {}).get('bang_topic', {}).get('topic_list', [])
result = []
cache_list = []
for item in data:
title = item.get('topic_name', '')
url = item.get('topic_url', '')
if url and not url.startswith('http'):
url = f"http://tieba.baidu.com{url}"
desc = item.get('topic_desc', '')
news = {
'title': title,
'url': url,
'content': desc,
'source': 'tieba',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "tieba"

View File

@@ -0,0 +1,83 @@
import json
import datetime
import time
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
urllib3.disable_warnings()
class TsKrCrawler(Crawler):
"""36氪"""
def fetch(self, date_str):
"""
获取36氪热榜数据
"""
current_time = datetime.datetime.now()
url = f"https://gateway.36kr.com/api/mis/nav/home/nav/rank/hot"
headers = {
"Content-Type": "application/json; charset=utf-8",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
}
body = {
"partner_id": "wap",
"param": {
"siteId": 1,
"platformId": 2,
},
"timestamp": int(time.time() * 1000),
}
try:
resp = requests.post(
url=url,
headers=headers,
json=body,
verify=False,
timeout=self.timeout
)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
json_data = resp.json()
data_key = "hotRankList"
data_list = json_data.get("data", {}).get(data_key, [])
result = []
cache_list = []
for item in data_list:
template_material = item.get("templateMaterial", {})
item_id = item.get("itemId", "")
title = template_material.get("widgetTitle", "")
article_url = f"https://www.36kr.com/p/{item_id}"
news = {
'title': title,
'url': article_url,
'content': title,
'source': '36kr',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error fetching 36kr data: {e}")
return []
def crawler_name(self):
return "36kr"

View File

@@ -0,0 +1,71 @@
import json
import datetime # 添加datetime导入
import requests
import urllib3
from bs4 import BeautifulSoup
# 移除 SQLAlchemy 导入
# from sqlalchemy.sql.functions import now
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class VtexCrawler(Crawler):
"""v2ex"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://www.v2ex.com/?tab=hot"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
html_text = resp.text
soup = BeautifulSoup(html_text, "html.parser")
# 找到热门话题列表
topic_list = soup.find_all('div', class_='cell item')
result = []
cache_list = []
for topic in topic_list:
title_elem = topic.find('span', class_='item_title')
if not title_elem:
continue
link_elem = title_elem.find('a')
if not link_elem:
continue
title = link_elem.text.strip()
url = "https://www.v2ex.com" + link_elem.get('href')
# 获取话题信息
info_elem = topic.find('span', class_='topic_info')
info = info_elem.text.strip() if info_elem else ""
news = {
'title': title,
'url': url,
'content': info,
'source': 'v2ex',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "v2ex"

View File

@@ -0,0 +1,68 @@
import json
import datetime # 添加datetime导入
import requests
import urllib3
from bs4 import BeautifulSoup
# 移除 SQLAlchemy 导入
# from sqlalchemy.sql.functions import now
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class WeiboCrawler(Crawler):
"""微博"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
header = self.header.copy()
header.update({
"accept": "application/json, text/javascript, */*; q=0.01",
"host": "weibo.com",
"Referer": "https://weibo.com",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
})
url = "https://weibo.com/ajax/side/hotSearch"
resp = requests.get(url=url, headers=header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', {}).get('realtime', [])
result = []
cache_list = []
for item in data:
title = item.get('word', '')
url = f"https://s.weibo.com/weibo?q=%23{title}%23"
news = {
'title': title,
'url': url,
'content': title,
'source': 'weibo',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "weibo"

View File

@@ -0,0 +1,228 @@
import json
import datetime
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
# 禁用SSL警告
urllib3.disable_warnings()
class WeiXinCrawler(Crawler):
"""
微信热门内容爬虫
使用微信看一看热门页面获取数据
"""
def fetch(self, date_str):
"""获取微信热门内容"""
current_time = datetime.datetime.now()
browser_manager = BrowserManager()
try:
# 首先尝试从微信看一看获取热门内容
result = self._fetch_from_weixin_kankan(browser_manager)
if result and len(result) > 0:
# 缓存数据
cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
return result
# 如果看一看失败,尝试从微信读书获取热门书评
result = self._fetch_from_weixin_dushu(browser_manager)
if result and len(result) > 0:
# 缓存数据
cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
return result
except Exception as e:
# 如果遇到错误,返回空列表
return []
# 所有方法都失败,返回空列表
return []
def _fetch_from_weixin_kankan(self, browser_manager):
"""从微信看一看页面获取热门内容"""
url = "https://k.weixin.qq.com/"
try:
# 获取页面内容
page_source, driver = browser_manager.get_page_content(url, wait_time=10)
# 等待热门内容加载
try:
WebDriverWait(driver, 15).until(
EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
)
except:
# 如果等待超时,仍然尝试获取内容
pass
# 点击"热点"标签切换到热门内容
try:
hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']")
hot_tab.click()
time.sleep(3) # 等待内容加载
except:
# 如果找不到热点标签,继续尝试获取当前页面内容
pass
result = []
current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# 获取文章列表
articles = driver.find_elements(By.CSS_SELECTOR, ".article-item")
if not articles:
# 尝试其他可能的选择器
articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item")
if not articles:
# 再尝试其他可能的选择器
articles = driver.find_elements(By.CSS_SELECTOR, ".item")
for article in articles:
try:
# 获取文章标题和链接
title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title")
title = title_elem.text.strip()
# 尝试获取链接
link = None
try:
link_elem = article.find_element(By.TAG_NAME, "a")
link = link_elem.get_attribute("href")
except:
# 如果直接获取链接失败则记录文章id以后可以构建链接
try:
article_id = article.get_attribute("data-id") or article.get_attribute("id")
link = f"https://k.weixin.qq.com/article?id={article_id}"
except:
link = "https://k.weixin.qq.com/"
# 获取来源
source = ""
try:
source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source")
source = source_elem.text.strip()
except:
pass
# 获取摘要
summary = ""
try:
summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p")
summary = summary_elem.text.strip()
except:
pass
news = {
'title': title,
'url': link,
'content': f"来源: {source} | 摘要: {summary[:50] if summary else '无摘要'}",
'source': 'weixin',
'publish_time': current_time
}
result.append(news)
# 限制获取前20条
if len(result) >= 20:
break
except Exception as e:
continue
return result
except Exception as e:
return []
def _fetch_from_weixin_dushu(self, browser_manager):
"""从微信读书获取热门书评"""
url = "https://weread.qq.com/web/category/all"
try:
# 获取页面内容
page_source, driver = browser_manager.get_page_content(url, wait_time=8)
result = []
current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# 尝试点击排行榜标签
try:
rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]")
rank_tab.click()
time.sleep(3) # 等待内容加载
except:
# 如果找不到排行榜标签,继续尝试获取当前页面内容
pass
# 获取热门书籍列表
books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")
for book in books:
try:
# 获取书籍标题和链接
title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3")
title = title_elem.text.strip()
# 尝试获取链接
link = "https://weread.qq.com/web/category/all"
try:
link_elem = book.find_element(By.TAG_NAME, "a")
link = link_elem.get_attribute("href")
except:
book_id = book.get_attribute("data-bid") or book.get_attribute("id")
if book_id:
link = f"https://weread.qq.com/web/reader/{book_id}"
# 获取作者
author = ""
try:
author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer")
author = author_elem.text.strip()
except:
pass
# 获取摘要/简介
intro = ""
try:
intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc")
intro = intro_elem.text.strip()
except:
pass
news = {
'title': f"热门书籍: {title}",
'url': link,
'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}",
'source': 'weixin',
'publish_time': current_time
}
result.append(news)
# 限制获取前20条
if len(result) >= 20:
break
except Exception as e:
continue
return result
except Exception as e:
return []
def crawler_name(self):
return "weixin"

View File

@@ -0,0 +1,155 @@
import json
import datetime
import requests
import urllib3
import re
from requests.sessions import Session
from .crawler import Crawler
from ...core import cache
urllib3.disable_warnings()
class XueqiuCrawler(Crawler):
"""雪球"""
def __init__(self):
super().__init__()
self.session = Session()
self._init_session()
def _init_session(self):
try:
# 第一步访问主页获取基础cookies
main_url = "https://xueqiu.com"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1'
}
resp = self.session.get(main_url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code == 200:
html_content = resp.text
# 尝试提取token
token_match = re.search(r'window\.SNB\s*=\s*\{[^}]*token["\']?\s*:\s*["\']([^"\']+)["\']', html_content)
if token_match:
token = token_match.group(1)
self.session.headers.update({'X-Requested-With': 'XMLHttpRequest'})
hot_page_url = "https://xueqiu.com/hot_event"
hot_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'https://xueqiu.com/',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1'
}
hot_resp = self.session.get(hot_page_url, headers=hot_headers, verify=False, timeout=self.timeout)
if hot_resp.status_code == 200:
print("雪球热门页面访问成功,已获取完整认证信息")
else:
print(f"雪球热门页面访问失败: {hot_resp.status_code}")
else:
print(f"雪球主页访问失败: {resp.status_code}")
except Exception as e:
print(f"初始化雪球会话失败: {e}")
def fetch(self, date_str) -> list:
current_time = datetime.datetime.now()
url = "https://xueqiu.com/hot_event/list.json?count=10"
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Referer': 'https://xueqiu.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'
}
try:
resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"雪球请求失败, status: {resp.status_code}")
self._init_session()
resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"雪球重试后仍失败, status: {resp.status_code}")
return []
json_data = resp.json()
if 'list' not in json_data:
print("雪球响应格式异常")
return []
result = []
cache_list = []
for idx, item in enumerate(json_data['list'][:10]): # 取前10条
try:
tag = item.get('tag', '').strip()
if tag.startswith('#') and tag.endswith('#'):
title = tag[1:-1]
else:
title = tag
if not title:
continue
item_id = item.get('id')
url_link = f"https://xueqiu.com/"
content = item.get('content', '').strip()
if len(content) > 200:
content = content[:200] + '...'
status_count = item.get('status_count', 0)
hot_value = item.get('hot', 0)
news = {
'title': title,
'url': url_link,
'content': content,
'source': 'xueqiu',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
'score': status_count if status_count > 0 else 1000 - idx,
'rank': idx + 1
}
result.append(news)
cache_list.append(news)
except Exception as e:
print(f"解析雪球新闻项失败: {e}")
continue
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"获取雪球数据失败: {e}")
return []
def crawler_name(self):
return "xueqiu"

View File

@@ -0,0 +1,64 @@
import json
import datetime # 添加datetime导入
import requests
import urllib3
from bs4 import BeautifulSoup
# 移除 SQLAlchemy 导入
# from sqlalchemy.sql.functions import now
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class ZhiHuCrawler(Crawler):
"""知乎"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://www.zhihu.com/api/v3/explore/guest/feeds?limit=30&ws_qiangzhisafe=0"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', [])
result = []
cache_list = []
for item in data:
target = item.get('target', {})
question = target.get('question', {})
title = question.get('title', '')
url = f"https://www.zhihu.com/question/{question.get('id')}"
excerpt = target.get('excerpt', '')
news = {
'title': title,
'url': url,
'content': excerpt,
'source': 'zhihu',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "zhihu"

0
app/utils/__init__.py Normal file
View File

73
app/utils/logger.py Normal file
View File

@@ -0,0 +1,73 @@
import logging
import os
import sys
import time
from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
import pytz
from datetime import datetime
from app.core.config import get_logging_config
# 获取日志配置
log_config = get_logging_config()
# 确保日志目录存在
os.makedirs(log_config.dir, exist_ok=True)
# 自定义日志格式化器,使用配置的时区
class CustomFormatter(logging.Formatter):
def converter(self, timestamp):
tz = pytz.timezone(log_config.timezone)
# build an aware datetime directly from the epoch timestamp, then convert to the configured zone
return datetime.fromtimestamp(timestamp, pytz.utc).astimezone(tz)
def formatTime(self, record, datefmt=None):
dt = self.converter(record.created)
if datefmt:
return dt.strftime(datefmt)
return dt.strftime("%Y-%m-%d %H:%M:%S")
# 创建日志记录器
log = logging.getLogger('app')
log.setLevel(getattr(logging, log_config.level))
# 清除现有处理器
for handler in log.handlers[:]:
log.removeHandler(handler)
# 创建控制台处理器
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(getattr(logging, log_config.level))
console_formatter = CustomFormatter(log_config.format)
console_handler.setFormatter(console_formatter)
log.addHandler(console_handler)
# 创建文件处理器 - 按大小轮转
file_handler = RotatingFileHandler(
os.path.join(log_config.dir, log_config.file),
maxBytes=log_config.max_size,
backupCount=log_config.backup_count,
encoding='utf-8'
)
file_handler.setLevel(getattr(logging, log_config.level))
file_formatter = CustomFormatter(log_config.format)
file_handler.setFormatter(file_formatter)
log.addHandler(file_handler)
# 创建文件处理器 - 按日期轮转
daily_handler = TimedRotatingFileHandler(
os.path.join(log_config.dir, 'app.daily.log'),
when='midnight',
interval=1,
backupCount=log_config.daily_backup_count,
encoding='utf-8'
)
daily_handler.setLevel(getattr(logging, log_config.level))
daily_handler.setFormatter(file_formatter)
log.addHandler(daily_handler)
# 防止日志传播到父记录器
log.propagate = False
# 记录启动信息
log.info(f"Logger initialized at {datetime.now(pytz.timezone(log_config.timezone)).strftime('%Y-%m-%d %H:%M:%S')}")

286
app/utils/notification.py Normal file
View File

@@ -0,0 +1,286 @@
import json
import time
import hmac
import hashlib
import base64
import urllib.parse
from datetime import datetime
from typing import Dict, Any, Optional, List
import requests
import pytz
from app.utils.logger import log
from app.core.config import get_notification_config
class DingTalkNotifier:
"""钉钉机器人通知器"""
def __init__(self):
self.config = get_notification_config()
self.webhook_url = self.config.get('dingtalk', {}).get('webhook_url', '')
self.secret = self.config.get('dingtalk', {}).get('secret', '')
self.enabled = self.config.get('dingtalk', {}).get('enabled', False)
self.timeout = self.config.get('dingtalk', {}).get('timeout', 10)
self.notify_success = self.config.get('dingtalk', {}).get('notify_success', False)
self.shanghai_tz = pytz.timezone('Asia/Shanghai')
if not self.webhook_url and self.enabled:
log.warning("DingTalk webhook URL not configured, notifications will be disabled")
self.enabled = False
def _generate_sign(self, timestamp: int) -> str:
"""生成钉钉机器人签名"""
if not self.secret:
return ""
string_to_sign = f'{timestamp}\n{self.secret}'
hmac_code = hmac.new(
self.secret.encode('utf-8'),
string_to_sign.encode('utf-8'),
digestmod=hashlib.sha256
).digest()
sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
return sign
    def _send_message(self, message: Dict[str, Any]) -> bool:
        """发送消息到钉钉"""
        if not self.enabled:
            log.debug("DingTalk notifications are disabled")
            return False
        try:
            # 生成时间戳和签名
            timestamp = int(round(time.time() * 1000))
            sign = self._generate_sign(timestamp)
            # 构建请求URL
            url = self.webhook_url
            if sign:
                url += f"&timestamp={timestamp}&sign={sign}"
            # 发送请求
            response = requests.post(
                url,
                json=message,
                timeout=self.timeout,
                headers={'Content-Type': 'application/json'}
            )
            if response.status_code == 200:
                result = response.json()
                if result.get('errcode') == 0:
                    log.info("DingTalk notification sent successfully")
                    return True
                else:
                    log.error(f"DingTalk API error: {result.get('errmsg', 'Unknown error')}")
                    return False
            else:
                log.error(f"DingTalk HTTP error: {response.status_code}")
                return False
        except Exception as e:
            log.error(f"Failed to send DingTalk notification: {str(e)}")
            return False
    def send_text_message(self, content: str, at_mobiles: Optional[List[str]] = None,
                          at_all: bool = False) -> bool:
        """发送文本消息"""
        message = {
            "msgtype": "text",
            "text": {
                "content": content
            }
        }
        if at_mobiles or at_all:
            message["at"] = {
                "atMobiles": at_mobiles or [],
                "isAtAll": at_all
            }
        return self._send_message(message)
    def send_markdown_message(self, title: str, text: str,
                              at_mobiles: Optional[List[str]] = None,
                              at_all: bool = False) -> bool:
        """发送Markdown消息"""
        message = {
            "msgtype": "markdown",
            "markdown": {
                "title": title,
                "text": text
            }
        }
        if at_mobiles or at_all:
            message["at"] = {
                "atMobiles": at_mobiles or [],
                "isAtAll": at_all
            }
        return self._send_message(message)
    def send_crawler_error(self, crawler_name: str, error_msg: str,
                           date_str: str, is_retry: bool = False) -> bool:
        """发送爬虫错误通知"""
        current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
        retry_text = "重试失败" if is_retry else "首次失败"
        title = f"🚨 爬虫异常通知 - {crawler_name}"
        content = f"""
## {title}
**时间**: {current_time}\n
**爬虫**: {crawler_name}\n
**日期**: {date_str}\n
**状态**: {retry_text}\n
**错误信息**:
```
{error_msg}
```
请及时检查爬虫状态!
""".strip()
        # 异常时@所有人
        return self.send_markdown_message(title, content, at_all=True)
    def send_crawler_timeout(self, timeout_seconds: int, date_str: str) -> bool:
        """发送爬虫超时通知"""
        current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
        title = "⏰ 爬虫超时通知"
        content = f"""
## {title}
**时间**: {current_time}\n
**日期**: {date_str}\n
**超时时长**: {timeout_seconds} 秒\n
**状态**: 爬虫任务执行超时被强制终止
请检查爬虫性能或调整超时配置!
""".strip()
        # 超时异常时@所有人
        return self.send_markdown_message(title, content, at_all=True)
    def send_crawler_summary(self, success_count: int, total_count: int,
                             failed_crawlers: List[str], duration: float,
                             date_str: str) -> bool:
        """发送爬虫执行摘要通知"""
        # 全部成功且未启用正常通知时,不发送
        if success_count == total_count and not self.notify_success:
            return True
        current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
        # 构建失败爬虫列表
        failed_list = "\n".join([f"- {name}" for name in failed_crawlers]) if failed_crawlers else ""
        if failed_crawlers:
            title = f"🚨 爬虫执行摘要 - {date_str}"
        else:
            title = f"📊 爬虫执行摘要 - {date_str}"
        # 根据是否有失败构建不同的内容
        if failed_crawlers:
            content = f"""
## {title}
**时间**: {current_time}\n
**日期**: {date_str}\n
**执行时长**: {duration:.2f} 秒\n
**成功**: {success_count}/{total_count}\n
**失败**: {len(failed_crawlers)}
**失败的爬虫**:
{failed_list}
请关注失败的爬虫状态!
""".strip()
        else:
            content = f"""
## {title}
**时间**: {current_time}\n
**日期**: {date_str}\n
**执行时长**: {duration:.2f} 秒\n
**成功**: {success_count}/{total_count}\n
**失败**: {len(failed_crawlers)}
所有爬虫执行成功!
""".strip()
        # 有失败时@所有人,没失败时不@
        at_all = len(failed_crawlers) > 0
        return self.send_markdown_message(title, content, at_all=at_all)
    def send_analysis_error(self, error_msg: str, date_str: str) -> bool:
        """发送数据分析错误通知"""
        current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
        title = "🔍 数据分析异常通知"
        content = f"""
## {title}
**时间**: {current_time}\n
**日期**: {date_str}\n
**错误信息**:
```
{error_msg}
```
数据分析任务执行失败,请检查分析模块!
""".strip()
        # 分析异常时@所有人
        return self.send_markdown_message(title, content, at_all=True)
class NotificationManager:
"""通知管理器,支持多种通知方式"""
def __init__(self):
self.dingtalk = DingTalkNotifier()
# 可以在这里添加其他通知方式,如企业微信、邮件等
def is_enabled(self) -> bool:
"""检查通知是否启用"""
return self.dingtalk.enabled
@property
def webhook_url(self) -> str:
"""获取webhook URL"""
return self.dingtalk.webhook_url
def send_text(self, content: str, at_all: bool = False) -> bool:
"""发送文本消息"""
return self.dingtalk.send_text_message(content, at_all=at_all)
def send_markdown(self, title: str, text: str, at_all: bool = False) -> bool:
"""发送Markdown消息"""
return self.dingtalk.send_markdown_message(title, text, at_all=at_all)
def notify_crawler_error(self, crawler_name: str, error_msg: str,
date_str: str, is_retry: bool = False):
"""通知爬虫错误"""
self.dingtalk.send_crawler_error(crawler_name, error_msg, date_str, is_retry)
def notify_crawler_timeout(self, timeout_seconds: int, date_str: str):
"""通知爬虫超时"""
self.dingtalk.send_crawler_timeout(timeout_seconds, date_str)
def notify_crawler_summary(self, success_count: int, total_count: int,
failed_crawlers: List[str], duration: float,
date_str: str):
"""通知爬虫执行摘要"""
self.dingtalk.send_crawler_summary(success_count, total_count,
failed_crawlers, duration, date_str)
def notify_analysis_error(self, error_msg: str, date_str: str):
"""通知数据分析错误"""
self.dingtalk.send_analysis_error(error_msg, date_str)
# 全局通知管理器实例
notification_manager = NotificationManager()
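Finally, a minimal sketch of how a scheduler might drive the global notification_manager defined above; only the manager and its method signatures come from this module, the surrounding run-report logic is assumed:

from app.utils.notification import notification_manager

def report_run(date_str: str, results: dict, duration: float) -> None:
    """Hypothetical scheduler hook: results maps crawler name -> success flag."""
    failed = [name for name, ok in results.items() if not ok]
    if not notification_manager.is_enabled():
        return
    notification_manager.notify_crawler_summary(
        success_count=len(results) - len(failed),
        total_count=len(results),
        failed_crawlers=failed,
        duration=duration,
        date_str=date_str,
    )
    for name in failed:
        notification_manager.notify_crawler_error(
            crawler_name=name,
            error_msg="see crawler log for details",  # placeholder message
            date_str=date_str,
        )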