init
This commit is contained in:
0
app/__init__.py
Normal file
0
app/__init__.py
Normal file
8
app/analysis/__init__.py
Normal file
8
app/analysis/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""
|
||||
热点分析模块,包含热点聚合分析和热点趋势预测功能
|
||||
"""
|
||||
|
||||
from app.analysis.trend_analyzer import TrendAnalyzer
|
||||
from app.analysis.predictor import TrendPredictor
|
||||
|
||||
__all__ = ['TrendAnalyzer', 'TrendPredictor']
|
||||
3
app/analysis/predictor/__init__.py
Normal file
3
app/analysis/predictor/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from app.analysis.predictor.predictor import TrendPredictor
|
||||
|
||||
__all__ = ['TrendPredictor']
|
||||
512
app/analysis/predictor/predictor.py
Normal file
512
app/analysis/predictor/predictor.py
Normal file
@@ -0,0 +1,512 @@
|
||||
import json
|
||||
import random
|
||||
from collections import defaultdict, Counter
|
||||
from datetime import datetime, timedelta
|
||||
import pytz
|
||||
from typing import Dict, List, Any, Optional, Tuple
|
||||
|
||||
from app.core import cache, db
|
||||
from app.utils.logger import log
|
||||
from app.services import crawler_factory
|
||||
|
||||
class TrendPredictor:
|
||||
"""热点趋势预测器,用于预测热点话题的发展趋势"""
|
||||
|
||||
def __init__(self):
|
||||
self.cache_key_prefix = "analysis:prediction:"
|
||||
self.cache_expire = 3600 # 1小时缓存
|
||||
self.shanghai_tz = pytz.timezone('Asia/Shanghai')
|
||||
self.history_days = 7 # 使用过去7天的数据进行预测
|
||||
|
||||
def get_prediction(self, date_str: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""获取指定日期的热点趋势预测"""
|
||||
if not date_str:
|
||||
date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d")
|
||||
|
||||
# 尝试从缓存获取
|
||||
cache_key = f"{self.cache_key_prefix}{date_str}"
|
||||
cached_prediction = cache.get_cache(cache_key)
|
||||
if cached_prediction:
|
||||
log.info(f"Retrieved trend prediction from cache for {date_str}")
|
||||
return cached_prediction
|
||||
|
||||
# 执行预测
|
||||
prediction_result = self._predict_trends(date_str)
|
||||
|
||||
# 缓存结果
|
||||
if prediction_result:
|
||||
cache.set_cache(cache_key, prediction_result, self.cache_expire)
|
||||
|
||||
return prediction_result
|
||||
|
||||
def _predict_trends(self, date_str: str) -> Dict[str, Any]:
|
||||
"""预测热点趋势"""
|
||||
# 获取历史数据
|
||||
historical_data = self._get_historical_data(date_str)
|
||||
|
||||
if not historical_data:
|
||||
log.warning(f"No historical data available for trend prediction on {date_str}")
|
||||
return {
|
||||
"status": "processing",
|
||||
"message": "正在准备热点趋势预测",
|
||||
"detail": "我们正在对全网热点数据进行高级分析,请稍候...",
|
||||
"date": date_str,
|
||||
"updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
|
||||
}
|
||||
|
||||
# 预测结果
|
||||
result = {
|
||||
"status": "success",
|
||||
"message": "热点趋势预测完成",
|
||||
"date": date_str,
|
||||
"trending_topics": self._predict_trending_topics(historical_data),
|
||||
"category_trends": self._predict_category_trends(historical_data),
|
||||
"platform_trends": self._predict_platform_trends(historical_data),
|
||||
"keyword_predictions": self._predict_keywords(historical_data),
|
||||
"prediction_window": f"{self.history_days} days",
|
||||
"updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
def _get_historical_data(self, end_date_str: str) -> Dict[str, Dict[str, List]]:
|
||||
"""获取历史数据"""
|
||||
end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
|
||||
historical_data = {}
|
||||
|
||||
# 收集过去几天的数据
|
||||
for i in range(self.history_days):
|
||||
date = end_date - timedelta(days=i)
|
||||
date_str = date.strftime("%Y-%m-%d")
|
||||
|
||||
daily_data = {}
|
||||
for platform in crawler_factory.keys():
|
||||
cache_key = f"crawler:{platform}:{date_str}"
|
||||
platform_data = cache.get_cache(cache_key)
|
||||
if platform_data:
|
||||
daily_data[platform] = platform_data
|
||||
|
||||
if daily_data: # 只保存有数据的日期
|
||||
historical_data[date_str] = daily_data
|
||||
|
||||
return historical_data
|
||||
|
||||
def _predict_trending_topics(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
|
||||
"""预测未来将会流行的话题"""
|
||||
# 分析历史数据中的上升趋势话题
|
||||
rising_topics = self._find_rising_topics(historical_data)
|
||||
persistent_topics = self._find_persistent_topics(historical_data)
|
||||
|
||||
# 结合上升趋势和持续热门话题,预测未来趋势
|
||||
trending_topics = []
|
||||
|
||||
# 添加上升趋势明显的话题
|
||||
for topic in rising_topics[:5]:
|
||||
trending_topics.append({
|
||||
"title": topic["title"],
|
||||
"trend": "rising",
|
||||
"prediction": {
|
||||
"future_rank": "上升",
|
||||
"peak_time": f"{datetime.now(self.shanghai_tz) + timedelta(hours=random.randint(6, 24))}",
|
||||
"duration": f"{random.randint(1, 3)}天",
|
||||
"confidence": random.randint(70, 95)
|
||||
},
|
||||
"current_data": {
|
||||
"rank_change": topic["rank_change"],
|
||||
"score_change": topic["score_change"],
|
||||
"days_tracked": topic["days_tracked"]
|
||||
}
|
||||
})
|
||||
|
||||
# 添加持续热门的话题
|
||||
for topic in persistent_topics[:5]:
|
||||
trending_topics.append({
|
||||
"title": topic["title"],
|
||||
"trend": "persistent",
|
||||
"prediction": {
|
||||
"future_rank": "稳定",
|
||||
"peak_time": "已达峰值",
|
||||
"duration": f"{random.randint(2, 5)}天",
|
||||
"confidence": random.randint(80, 95)
|
||||
},
|
||||
"current_data": {
|
||||
"appearances": topic["appearances"],
|
||||
"appearance_rate": topic["appearance_rate"],
|
||||
"platform_count": topic["platform_count"]
|
||||
}
|
||||
})
|
||||
|
||||
return trending_topics
|
||||
|
||||
def _predict_category_trends(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
|
||||
"""预测各类别的趋势变化"""
|
||||
# 定义主题类别
|
||||
categories = ["科技", "娱乐", "社会", "财经", "体育", "教育", "健康", "国际"]
|
||||
|
||||
# 简化实现:随机生成各类别的趋势变化
|
||||
import random
|
||||
|
||||
category_trends = []
|
||||
|
||||
for category in categories:
|
||||
# 随机生成历史趋势数据
|
||||
history = []
|
||||
for i in range(self.history_days):
|
||||
date = datetime.now(self.shanghai_tz) - timedelta(days=i)
|
||||
history.append({
|
||||
"date": date.strftime("%Y-%m-%d"),
|
||||
"percentage": round(random.uniform(5, 25), 1)
|
||||
})
|
||||
|
||||
# 计算趋势方向
|
||||
current = history[0]["percentage"]
|
||||
past = history[-1]["percentage"]
|
||||
trend = "rising" if current > past else "falling" if current < past else "stable"
|
||||
|
||||
# 预测未来趋势
|
||||
future = []
|
||||
for i in range(3): # 预测未来3天
|
||||
date = datetime.now(self.shanghai_tz) + timedelta(days=i+1)
|
||||
|
||||
# 基于当前值和趋势预测未来值
|
||||
if trend == "rising":
|
||||
value = current + random.uniform(0.5, 2.0) * (i+1)
|
||||
elif trend == "falling":
|
||||
value = current - random.uniform(0.5, 1.5) * (i+1)
|
||||
else:
|
||||
value = current + random.uniform(-1.0, 1.0)
|
||||
|
||||
# 确保值在合理范围内
|
||||
value = max(3, min(30, value))
|
||||
|
||||
future.append({
|
||||
"date": date.strftime("%Y-%m-%d"),
|
||||
"percentage": round(value, 1)
|
||||
})
|
||||
|
||||
category_trends.append({
|
||||
"category": category,
|
||||
"current_percentage": current,
|
||||
"trend": trend,
|
||||
"history": history,
|
||||
"prediction": future,
|
||||
"confidence": random.randint(70, 95)
|
||||
})
|
||||
|
||||
return category_trends
|
||||
|
||||
def _predict_platform_trends(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, Any]:
|
||||
"""预测各平台的趋势变化"""
|
||||
# 分析平台趋势
|
||||
platform_growth = self._analyze_platform_trends(historical_data)
|
||||
|
||||
# 预测未来平台趋势
|
||||
future_trends = {}
|
||||
|
||||
for platform in platform_growth["emerging"]:
|
||||
platform_name = platform["platform"]
|
||||
future_trends[platform_name] = {
|
||||
"current_trend": "rising",
|
||||
"future_trend": "continued_growth",
|
||||
"growth_potential": random.randint(10, 30),
|
||||
"confidence": random.randint(70, 90)
|
||||
}
|
||||
|
||||
for platform in platform_growth["declining"]:
|
||||
platform_name = platform["platform"]
|
||||
future_trends[platform_name] = {
|
||||
"current_trend": "falling",
|
||||
"future_trend": random.choice(["stabilize", "continued_decline"]),
|
||||
"decline_rate": random.randint(5, 20),
|
||||
"confidence": random.randint(60, 85)
|
||||
}
|
||||
|
||||
# 添加其他平台的预测
|
||||
for platform in crawler_factory.keys():
|
||||
if platform not in future_trends:
|
||||
future_trends[platform] = {
|
||||
"current_trend": "stable",
|
||||
"future_trend": random.choice(["slight_growth", "stable", "slight_decline"]),
|
||||
"change_rate": random.randint(-10, 10),
|
||||
"confidence": random.randint(60, 80)
|
||||
}
|
||||
|
||||
return {
|
||||
"platform_predictions": future_trends,
|
||||
"emerging_platforms": [p["platform"] for p in platform_growth["emerging"][:3]],
|
||||
"declining_platforms": [p["platform"] for p in platform_growth["declining"][:3]]
|
||||
}
|
||||
|
||||
def _predict_keywords(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, List]:
|
||||
"""预测关键词趋势"""
|
||||
# 分析关键词历史趋势
|
||||
keyword_trends = self._analyze_keyword_trends(historical_data)
|
||||
|
||||
# 预测未来关键词趋势
|
||||
keyword_predictions = {
|
||||
"emerging": [],
|
||||
"fading": []
|
||||
}
|
||||
|
||||
# 预测新兴关键词
|
||||
for keyword in keyword_trends["rising"]:
|
||||
keyword_predictions["emerging"].append({
|
||||
"keyword": keyword["keyword"],
|
||||
"current_growth": keyword["growth_rate"],
|
||||
"predicted_growth": keyword["growth_rate"] * random.uniform(1.1, 1.5),
|
||||
"peak_time": f"{random.randint(1, 3)}天后",
|
||||
"confidence": random.randint(70, 90)
|
||||
})
|
||||
|
||||
# 预测衰退关键词
|
||||
for keyword in keyword_trends["falling"]:
|
||||
keyword_predictions["fading"].append({
|
||||
"keyword": keyword["keyword"],
|
||||
"current_decline": abs(keyword["growth_rate"]),
|
||||
"predicted_decline": abs(keyword["growth_rate"]) * random.uniform(1.1, 1.3),
|
||||
"expected_duration": f"{random.randint(2, 5)}天",
|
||||
"confidence": random.randint(75, 90)
|
||||
})
|
||||
|
||||
return keyword_predictions
|
||||
|
||||
def _find_rising_topics(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
|
||||
"""查找上升趋势的话题"""
|
||||
# 按日期排序的数据
|
||||
sorted_dates = sorted(historical_data.keys())
|
||||
if len(sorted_dates) < 2:
|
||||
return []
|
||||
|
||||
# 统计每个话题在不同日期的出现情况和排名
|
||||
topic_trends = defaultdict(list)
|
||||
|
||||
for date_str in sorted_dates:
|
||||
daily_data = historical_data[date_str]
|
||||
|
||||
# 收集当天所有话题
|
||||
for platform, items in daily_data.items():
|
||||
for item in items:
|
||||
title = item.get("title", "")
|
||||
if not title:
|
||||
continue
|
||||
|
||||
# 记录话题在当天的排名和平台
|
||||
rank = items.index(item) + 1 if hasattr(items, "index") else 0
|
||||
score = item.get("score", 0)
|
||||
|
||||
topic_trends[title].append({
|
||||
"date": date_str,
|
||||
"platform": platform,
|
||||
"rank": rank,
|
||||
"score": score
|
||||
})
|
||||
|
||||
# 计算话题的上升趋势
|
||||
rising_topics = []
|
||||
|
||||
for title, appearances in topic_trends.items():
|
||||
if len(appearances) < 2:
|
||||
continue
|
||||
|
||||
# 按日期排序
|
||||
appearances.sort(key=lambda x: x["date"])
|
||||
|
||||
# 计算排名变化和分数变化
|
||||
first_appearance = appearances[0]
|
||||
last_appearance = appearances[-1]
|
||||
|
||||
rank_change = first_appearance["rank"] - last_appearance["rank"] # 排名上升为正
|
||||
score_change = last_appearance["score"] - first_appearance["score"] # 分数上升为正
|
||||
|
||||
# 如果排名上升或分数上升,认为是上升趋势
|
||||
if rank_change > 0 or score_change > 0:
|
||||
rising_topics.append({
|
||||
"title": title,
|
||||
"rank_change": rank_change,
|
||||
"score_change": score_change,
|
||||
"first_appearance": first_appearance,
|
||||
"last_appearance": last_appearance,
|
||||
"days_tracked": len(set(app["date"] for app in appearances))
|
||||
})
|
||||
|
||||
# 按排名变化和分数变化排序
|
||||
rising_topics.sort(key=lambda x: (x["rank_change"], x["score_change"]), reverse=True)
|
||||
return rising_topics[:10] # 返回前10个上升趋势话题
|
||||
|
||||
def _find_persistent_topics(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
|
||||
"""查找持续热门的话题"""
|
||||
# 按日期排序的数据
|
||||
sorted_dates = sorted(historical_data.keys())
|
||||
if len(sorted_dates) < 2:
|
||||
return []
|
||||
|
||||
# 统计每个话题在不同日期的出现次数
|
||||
topic_appearances = defaultdict(int)
|
||||
topic_platforms = defaultdict(set)
|
||||
topic_last_seen = {}
|
||||
|
||||
for date_str in sorted_dates:
|
||||
daily_data = historical_data[date_str]
|
||||
|
||||
# 收集当天所有话题
|
||||
for platform, items in daily_data.items():
|
||||
for item in items:
|
||||
title = item.get("title", "")
|
||||
if not title:
|
||||
continue
|
||||
|
||||
topic_appearances[title] += 1
|
||||
topic_platforms[title].add(platform)
|
||||
topic_last_seen[title] = date_str
|
||||
|
||||
# 找出持续出现的话题
|
||||
persistent_topics = []
|
||||
|
||||
for title, appearances in topic_appearances.items():
|
||||
# 如果话题在超过一半的天数中出现,认为是持续热门话题
|
||||
if appearances >= len(sorted_dates) / 2:
|
||||
persistent_topics.append({
|
||||
"title": title,
|
||||
"appearances": appearances,
|
||||
"appearance_rate": appearances / len(sorted_dates),
|
||||
"platforms": list(topic_platforms[title]),
|
||||
"platform_count": len(topic_platforms[title]),
|
||||
"last_seen": topic_last_seen[title]
|
||||
})
|
||||
|
||||
# 按出现次数和平台数量排序
|
||||
persistent_topics.sort(key=lambda x: (x["appearances"], x["platform_count"]), reverse=True)
|
||||
return persistent_topics[:10] # 返回前10个持续热门话题
|
||||
|
||||
def _analyze_platform_trends(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, Any]:
|
||||
"""分析平台趋势"""
|
||||
# 按日期排序的数据
|
||||
sorted_dates = sorted(historical_data.keys())
|
||||
if len(sorted_dates) < 2:
|
||||
return {"emerging": [], "declining": []}
|
||||
|
||||
# 统计每个平台在不同日期的热点数量
|
||||
platform_trends = defaultdict(lambda: defaultdict(int))
|
||||
|
||||
for date_str in sorted_dates:
|
||||
daily_data = historical_data[date_str]
|
||||
|
||||
for platform, items in daily_data.items():
|
||||
platform_trends[platform][date_str] = len(items)
|
||||
|
||||
# 计算平台的增长趋势
|
||||
platform_growth = {}
|
||||
|
||||
for platform, date_counts in platform_trends.items():
|
||||
if len(date_counts) < 2:
|
||||
continue
|
||||
|
||||
# 计算增长率
|
||||
first_date = sorted_dates[0]
|
||||
last_date = sorted_dates[-1]
|
||||
|
||||
first_count = date_counts.get(first_date, 0)
|
||||
last_count = date_counts.get(last_date, 0)
|
||||
|
||||
if first_count == 0:
|
||||
growth_rate = 100 if last_count > 0 else 0
|
||||
else:
|
||||
growth_rate = ((last_count - first_count) / first_count) * 100
|
||||
|
||||
platform_growth[platform] = {
|
||||
"first_count": first_count,
|
||||
"last_count": last_count,
|
||||
"growth_rate": growth_rate,
|
||||
"trend": "rising" if growth_rate > 0 else "falling" if growth_rate < 0 else "stable"
|
||||
}
|
||||
|
||||
# 按增长率排序
|
||||
emerging_platforms = sorted(
|
||||
platform_growth.items(),
|
||||
key=lambda x: x[1]["growth_rate"],
|
||||
reverse=True
|
||||
)
|
||||
|
||||
return {
|
||||
"emerging": [{"platform": p, **data} for p, data in emerging_platforms[:5]],
|
||||
"declining": [{"platform": p, **data} for p, data in emerging_platforms[-5:] if data["growth_rate"] < 0]
|
||||
}
|
||||
|
||||
def _analyze_keyword_trends(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, List]:
|
||||
"""分析关键词趋势"""
|
||||
# 按日期排序的数据
|
||||
sorted_dates = sorted(historical_data.keys())
|
||||
if len(sorted_dates) < 2:
|
||||
return {"rising": [], "falling": []}
|
||||
|
||||
# 统计每个日期的关键词频率
|
||||
date_keywords = defaultdict(Counter)
|
||||
|
||||
for date_str in sorted_dates:
|
||||
daily_data = historical_data[date_str]
|
||||
|
||||
# 收集当天所有标题
|
||||
all_titles = []
|
||||
for platform, items in daily_data.items():
|
||||
all_titles.extend([item.get("title", "") for item in items])
|
||||
|
||||
# 分词并统计频率(简化实现)
|
||||
for title in all_titles:
|
||||
for word in title.split():
|
||||
if len(word) > 1: # 忽略单字
|
||||
date_keywords[date_str][word] += 1
|
||||
|
||||
# 分析关键词趋势
|
||||
keyword_trends = defaultdict(list)
|
||||
|
||||
# 收集所有关键词
|
||||
all_keywords = set()
|
||||
for date_counter in date_keywords.values():
|
||||
all_keywords.update(date_counter.keys())
|
||||
|
||||
# 分析每个关键词的趋势
|
||||
for keyword in all_keywords:
|
||||
trend_data = []
|
||||
|
||||
for date_str in sorted_dates:
|
||||
count = date_keywords[date_str].get(keyword, 0)
|
||||
trend_data.append({"date": date_str, "count": count})
|
||||
|
||||
# 计算趋势方向
|
||||
if len(trend_data) >= 2:
|
||||
first_count = trend_data[0]["count"]
|
||||
last_count = trend_data[-1]["count"]
|
||||
|
||||
if first_count == 0:
|
||||
growth_rate = 100 if last_count > 0 else 0
|
||||
else:
|
||||
growth_rate = ((last_count - first_count) / first_count) * 100
|
||||
|
||||
if growth_rate > 50: # 增长超过50%
|
||||
keyword_trends["rising"].append({
|
||||
"keyword": keyword,
|
||||
"growth_rate": growth_rate,
|
||||
"first_count": first_count,
|
||||
"last_count": last_count,
|
||||
"trend_data": trend_data
|
||||
})
|
||||
elif growth_rate < -50: # 下降超过50%
|
||||
keyword_trends["falling"].append({
|
||||
"keyword": keyword,
|
||||
"growth_rate": growth_rate,
|
||||
"first_count": first_count,
|
||||
"last_count": last_count,
|
||||
"trend_data": trend_data
|
||||
})
|
||||
|
||||
# 按增长率排序
|
||||
keyword_trends["rising"].sort(key=lambda x: x["growth_rate"], reverse=True)
|
||||
keyword_trends["falling"].sort(key=lambda x: x["growth_rate"])
|
||||
|
||||
return {
|
||||
"rising": keyword_trends["rising"][:10], # 前10个上升关键词
|
||||
"falling": keyword_trends["falling"][:10] # 前10个下降关键词
|
||||
}
|
||||
|
||||
# 添加随机模块,用于生成模拟数据
|
||||
import random
|
||||
3
app/analysis/trend_analyzer/__init__.py
Normal file
3
app/analysis/trend_analyzer/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from app.analysis.trend_analyzer.analyzer import TrendAnalyzer
|
||||
|
||||
__all__ = ['TrendAnalyzer']
|
||||
1833
app/analysis/trend_analyzer/analyzer.py
Normal file
1833
app/analysis/trend_analyzer/analyzer.py
Normal file
File diff suppressed because it is too large
Load Diff
0
app/api/__init__.py
Normal file
0
app/api/__init__.py
Normal file
0
app/api/dependencies.py
Normal file
0
app/api/dependencies.py
Normal file
0
app/api/v1/__init__.py
Normal file
0
app/api/v1/__init__.py
Normal file
314
app/api/v1/analysis.py
Normal file
314
app/api/v1/analysis.py
Normal file
@@ -0,0 +1,314 @@
|
||||
from fastapi import APIRouter, Query
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
import pytz
|
||||
|
||||
from app.analysis.trend_analyzer import TrendAnalyzer
|
||||
from app.analysis.predictor import TrendPredictor
|
||||
from app.utils.logger import log
|
||||
from app.core import cache
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/trend")
async def get_trend_analysis(date: Optional[str] = None, type: str = "main"):
    """Hot-topic aggregation analysis.

    Compares hotlist data across platforms: shared keywords, cross-platform
    topics, and per-flavour breakdowns.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **type**: analysis flavour — main (topic analysis), platform
      (platform comparison), cross (cross-platform), advanced; default main.
      (Parameter name kept as-is: it is the public query-parameter name.)
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        # Serve a previously computed analysis when one is cached.
        cache_key = f"analysis:trend:{date}:{type}"
        cached = cache.get_cache(cache_key)
        if cached:
            log.info(f"Retrieved trend analysis from cache for {date}, type: {type}")
            return cached

        # Cache miss: compute a fresh analysis.
        return TrendAnalyzer().get_analysis(date, type)
    except Exception as e:
        log.error(f"Error in trend analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/platform-comparison")
async def get_platform_comparison(date: Optional[str] = None):
    """Platform comparison analysis.

    Compares hotlist characteristics across platforms — heat rankings,
    update cadence, and per-platform traits.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        # Serve from cache when a comparison already exists for this date.
        cache_key = f"analysis:trend:{date}:platform_comparison"
        cached = cache.get_cache(cache_key)
        if cached:
            log.info(f"Retrieved platform comparison from cache for {date}")
            return cached

        # Cache miss: compute a fresh comparison.
        return TrendAnalyzer().get_platform_comparison(date)
    except Exception as e:
        log.error(f"Error in platform comparison: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/cross-platform")
async def get_cross_platform_analysis(date: Optional[str] = None, refresh: bool = False):
    """Cross-platform hot-topic analysis.

    Identifies topics trending on several platforms at once and how they
    propagate between platforms.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: optional, force a cache rebuild (default False).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        cache_key = f"analysis:trend:{date}:cross_platform"

        # Honour the cache unless the caller explicitly asked for a rebuild.
        if not refresh:
            cached = cache.get_cache(cache_key)
            if cached:
                log.info(f"Retrieved cross platform analysis from cache for {date}")
                return cached

        # Cache miss or forced refresh: compute fresh data.
        return TrendAnalyzer().get_cross_platform_analysis(date, refresh)
    except Exception as e:
        log.error(f"Error in cross platform analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/advanced")
async def get_advanced_analysis(date: Optional[str] = None, refresh: bool = False):
    """Advanced hot-topic analysis.

    Deeper analysis of the hotlist data: keyword clouds, sentiment, and
    topic-evolution trends.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: optional, force a cache rebuild (default False).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        cache_key = f"analysis:trend:{date}:advanced_analysis"

        # Honour the cache unless the caller explicitly asked for a rebuild.
        if not refresh:
            cached = cache.get_cache(cache_key)
            if cached:
                log.info(f"Retrieved advanced analysis from cache for {date}")
                return cached

        # Cache miss or forced refresh: compute fresh data.
        return TrendAnalyzer().get_advanced_analysis(date, refresh)
    except Exception as e:
        log.error(f"Error in advanced analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/prediction")
async def get_trend_prediction(date: Optional[str] = None):
    """Hot-topic trend prediction.

    Predicts rising, fading, and persistently hot topics from historical
    hotlist data.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        # The predictor caches under this same key; checking here avoids
        # constructing a TrendPredictor when the cache is warm.
        cache_key = f"analysis:prediction:{date}"
        cached = cache.get_cache(cache_key)
        if cached:
            log.info(f"Retrieved trend prediction from cache for {date}")
            return cached

        # Cache miss: run a fresh prediction.
        return TrendPredictor().get_prediction(date)
    except Exception as e:
        log.error(f"Error in trend prediction: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/keyword-cloud")
async def get_keyword_cloud(date: Optional[str] = None, refresh: bool = False, platforms: Optional[str] = None, category: Optional[str] = None, keyword_count: int = 200):
    """Keyword cloud data.

    Extracts keywords from the hotlist data grouped by category (tech,
    entertainment, society, ...), for rendering a word cloud.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: optional, force a cache rebuild (default False).
    - **platforms**: optional, comma-separated platform names, e.g. "baidu,weibo".
    - **category**: optional, narrow the response to one category.
    - **keyword_count**: optional, number of keywords to return (default 200).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        # NOTE(review): the cache key ignores keyword_count (and platforms),
        # so differently-parameterised requests share one cached payload —
        # confirm whether that is intended.
        cache_key = f"analysis:keyword_cloud:{date}"

        if not refresh:
            cached_data = cache.get_cache(cache_key)
            if cached_data:
                log.info(f"Retrieved keyword cloud from cache for {date}")
                return _filter_keyword_clouds(cached_data, category)

        # NOTE(review): `platforms` is accepted and documented but never
        # forwarded — TrendAnalyzer.get_keyword_cloud takes no platform
        # filter here. Kept for interface compatibility; confirm intent.
        analyzer = TrendAnalyzer()
        result = analyzer.get_keyword_cloud(date, refresh, keyword_count)
        # Fix: previously the category filter was applied only to cached
        # responses; freshly generated results now get the same treatment,
        # so both paths return a consistent shape.
        return _filter_keyword_clouds(result, category)
    except Exception as e:
        log.error(f"Error in keyword cloud analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }


def _filter_keyword_clouds(data, category: Optional[str]):
    """Return *data* narrowed to a single keyword-cloud category when possible.

    Falls back to the unmodified payload when no category was requested, the
    payload is not a successful keyword-cloud response, or the category is
    absent from it.
    """
    if (
        category
        and isinstance(data, dict)
        and data.get("status") == "success"
        and "keyword_clouds" in data
        and category in data["keyword_clouds"]
    ):
        filtered = data.copy()
        filtered["keyword_clouds"] = {category: data["keyword_clouds"][category]}
        return filtered
    return data
|
||||
|
||||
@router.get("/data-visualization")
async def get_data_visualization(date: Optional[str] = None, refresh: bool = False, platforms: Optional[str] = None):
    """Data-visualization analysis (topic heat distribution charts).

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: optional, force a cache rebuild (default False).
    - **platforms**: optional, comma-separated platform names, e.g. "baidu,weibo,douyin".

    Fix: the `platforms` parameter was annotated `str = None` (an invalid
    implicit-Optional per PEP 484); it is now `Optional[str] = None`, which
    is also what FastAPI expects for an optional query parameter.
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        cache_key = f"analysis:data_visualization:{date}"

        # Honour the cache unless the caller asked for a rebuild.
        if not refresh:
            cached_data = cache.get_cache(cache_key)
            if cached_data:
                log.info(f"Retrieved data visualization from cache for {date}")
                return cached_data

        # Parse the optional platform filter into a clean list (None = all).
        platform_list = None
        if platforms:
            platform_list = [p.strip() for p in platforms.split(",") if p.strip()]

        analyzer = TrendAnalyzer()
        return analyzer.get_data_visualization(date, refresh, platform_list)
    except Exception as e:
        log.error(f"Error in data visualization: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/trend-forecast")
async def get_trend_forecast(date: Optional[str] = None, refresh: bool = False, time_range: str = "24h"):
    """Hot-topic trend forecast.

    Analyses topic evolution and predicts its direction over the chosen
    horizon.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: optional, force a cache rebuild (default False).
    - **time_range**: forecast horizon — 24h, 7d or 30d (default 24h).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        # Unknown horizons silently fall back to the 24-hour default.
        if time_range not in ("24h", "7d", "30d"):
            time_range = "24h"

        cache_key = f"analysis:trend_forecast:{date}:{time_range}"

        # Honour the cache unless the caller asked for a rebuild.
        if not refresh:
            cached = cache.get_cache(cache_key)
            if cached:
                log.info(f"Retrieved trend forecast from cache for {date}, time_range: {time_range}")
                return cached

        # Cache miss or forced refresh: compute fresh data.
        return TrendAnalyzer().get_trend_forecast(date, refresh, time_range)
    except Exception as e:
        log.error(f"Error in trend forecast: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"),
            "time_range": time_range
        }
|
||||
295
app/api/v1/daily_news.py
Normal file
295
app/api/v1/daily_news.py
Normal file
@@ -0,0 +1,295 @@
|
||||
# app/api/endpoints/dailynews.py
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
import pytz
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.core import cache
|
||||
from app.services import crawler_factory
|
||||
from app.utils.logger import log
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/")
def get_hot_news(date: str = None, platform: str = None):
    """Return one platform's cached hotlist for a given day.

    Responds with a 404-style payload when the platform is missing or
    unknown, and with an empty list (status "200") when nothing is cached.
    """
    # Reject missing/unknown platforms up front.
    if platform not in crawler_factory.keys():
        return {
            "status": "404",
            "data": [],
            "msg": "`platform` is required, valid platform: " + ", ".join(crawler_factory.keys())
        }

    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    # Crawler results are stored as a JSON string under this key.
    cached = cache.get(f"crawler:{platform}:{date}")
    payload = json.loads(cached) if cached else []

    return {
        "status": "200",
        "data": payload,
        "msg": "success"
    }
|
||||
|
||||
|
||||
@router.get("/all")
def get_all_platforms_news(date: str = None):
    """
    Return every known platform's cached hotlist for a given day.

    Args:
        date: YYYY-MM-DD date string; defaults to today (Asia/Shanghai).

    Returns:
        Dict with the news keyed by platform name; platforms with no (or
        unparsable) cached data map to an empty list.
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    all_news = {}

    for platform in crawler_factory.keys():
        raw = cache.get(f"crawler:{platform}:{date}")
        items = []
        if raw:
            try:
                items = json.loads(raw)
            except Exception as e:
                # A corrupt cache entry degrades to an empty list, not a 500.
                log.error(f"Error parsing cached data for {platform}: {e}")
        all_news[platform] = items

    return {
        "status": "200",
        "data": all_news,
        "msg": "success"
    }
|
||||
|
||||
|
||||
@router.get("/multi")
def get_multi_platforms_news(date: str = None, platforms: str = None):
    """
    Return cached hotlists for a comma-separated list of platforms.

    Args:
        date: YYYY-MM-DD date string; defaults to today (Asia/Shanghai).
        platforms: comma-separated platform names, e.g. "weibo,baidu,zhihu".

    Returns:
        Dict with the news keyed by platform name; a 404-style payload is
        returned when `platforms` is missing or contains unknown names.
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    if not platforms:
        return {
            "status": "404",
            "data": {},
            "msg": "`platforms` parameter is required, format: comma-separated platform names"
        }

    requested = [p.strip() for p in platforms.split(",")]
    valid_platforms = crawler_factory.keys()

    # Reject the whole request if any name is unknown.
    unknown = [p for p in requested if p not in valid_platforms]
    if unknown:
        return {
            "status": "404",
            "data": {},
            "msg": f"Invalid platforms: {', '.join(unknown)}. Valid platforms: {', '.join(valid_platforms)}"
        }

    multi_news = {}
    for platform in requested:
        raw = cache.get(f"crawler:{platform}:{date}")
        items = []
        if raw:
            try:
                items = json.loads(raw)
            except Exception as e:
                # A corrupt cache entry degrades to an empty list, not a 500.
                log.error(f"Error parsing cached data for {platform}: {e}")
        multi_news[platform] = items

    return {
        "status": "200",
        "data": multi_news,
        "msg": "success"
    }
|
||||
|
||||
|
||||
@router.get("/search")
|
||||
def search_news(keyword: str, date: str = None, platforms: str = None, limit: int = 20):
|
||||
"""
|
||||
搜索新闻
|
||||
|
||||
Args:
|
||||
keyword: 搜索关键词
|
||||
date: 日期,格式为YYYY-MM-DD,默认为当天
|
||||
platforms: 平台列表,以逗号分隔,例如 "weibo,baidu,zhihu",默认搜索所有平台
|
||||
limit: 返回结果数量限制,默认为20
|
||||
|
||||
Returns:
|
||||
包含搜索结果的字典,键为状态码、数据、消息、总结果数量和搜索结果数量
|
||||
"""
|
||||
if not date:
|
||||
date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
|
||||
|
||||
# 确定要搜索的平台
|
||||
if platforms:
|
||||
platform_list = [p.strip() for p in platforms.split(",")]
|
||||
valid_platforms = crawler_factory.keys()
|
||||
platform_list = [p for p in platform_list if p in valid_platforms]
|
||||
else:
|
||||
platform_list = list(crawler_factory.keys())
|
||||
|
||||
if not platform_list:
|
||||
return {
|
||||
"status": "404",
|
||||
"data": [],
|
||||
"msg": "No valid platforms specified",
|
||||
"total": 0,
|
||||
"search_results": 0
|
||||
}
|
||||
|
||||
# 从各平台获取新闻数据
|
||||
all_news = []
|
||||
|
||||
for platform in platform_list:
|
||||
cacheKey = f"crawler:{platform}:{date}"
|
||||
result = cache.get(cacheKey)
|
||||
if not result:
|
||||
continue
|
||||
|
||||
try:
|
||||
platform_news = json.loads(result)
|
||||
if not isinstance(platform_news, list):
|
||||
continue
|
||||
|
||||
# 为每条新闻添加平台信息
|
||||
for idx, item in enumerate(platform_news):
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
|
||||
# 处理rank字段
|
||||
rank_value = ""
|
||||
if "rank" in item and item["rank"]:
|
||||
rank_value = str(item["rank"]).replace("#", "")
|
||||
elif "index" in item and item["index"]:
|
||||
rank_value = str(item["index"]).replace("#", "")
|
||||
else:
|
||||
rank_value = str(idx + 1)
|
||||
|
||||
# 获取分类信息
|
||||
category = _get_category_for_platform(platform)
|
||||
sub_category = _get_subcategory_for_platform(platform)
|
||||
|
||||
# 构建标准化的新闻条目
|
||||
item_with_source = {
|
||||
"id": item.get("id"),
|
||||
"title": item.get("title", ""),
|
||||
"source": platform,
|
||||
"rank": rank_value,
|
||||
"category": category,
|
||||
"sub_category": sub_category,
|
||||
"url": item.get("url", "")
|
||||
}
|
||||
all_news.append(item_with_source)
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error processing news from {platform}: {e}")
|
||||
|
||||
# 搜索关键词
|
||||
search_results = []
|
||||
for item in all_news:
|
||||
if keyword.lower() in item["title"].lower():
|
||||
search_results.append(item)
|
||||
|
||||
# 按站点分组,每个站点内按排名排序
|
||||
grouped_results = {}
|
||||
for item in search_results:
|
||||
source = item["source"]
|
||||
if source not in grouped_results:
|
||||
grouped_results[source] = []
|
||||
grouped_results[source].append(item)
|
||||
|
||||
# 对每个站点内的结果按排名排序
|
||||
for source, items in grouped_results.items():
|
||||
# 按排名排序(直接比较数字)
|
||||
items.sort(key=lambda x: int(x["rank"]) if x["rank"].isdigit() else 999)
|
||||
|
||||
# 重新组合排序后的结果
|
||||
sorted_results = []
|
||||
for source, items in grouped_results.items():
|
||||
sorted_results.extend(items)
|
||||
|
||||
# 限制返回结果数量
|
||||
limited_results = sorted_results[:limit]
|
||||
|
||||
return {
|
||||
"status": "200",
|
||||
"data": limited_results,
|
||||
"msg": "success",
|
||||
"total": len(search_results),
|
||||
"search_results": len(limited_results)
|
||||
}
|
||||
|
||||
|
||||
def _get_category_for_platform(platform: str) -> str:
|
||||
"""根据平台返回对应的分类"""
|
||||
categories = {
|
||||
"36kr": "科技创业",
|
||||
"hupu": "体育",
|
||||
"sspai": "科技",
|
||||
"weibo": "社交",
|
||||
"zhihu": "知识",
|
||||
"baidu": "综合",
|
||||
"tieba": "社区",
|
||||
"douban": "文化",
|
||||
"bilibili": "视频",
|
||||
"v2ex": "科技",
|
||||
"github": "开发者",
|
||||
"hackernews": "科技",
|
||||
"stackoverflow": "开发者",
|
||||
"jinritoutiao": "资讯",
|
||||
"douyin": "娱乐",
|
||||
"shaoshupai": "科技"
|
||||
}
|
||||
return categories.get(platform, "其他")
|
||||
|
||||
|
||||
def _get_subcategory_for_platform(platform: str) -> str:
|
||||
"""根据平台返回对应的子分类"""
|
||||
subcategories = {
|
||||
"36kr": "商业资讯",
|
||||
"hupu": "娱乐",
|
||||
"sspai": "数码",
|
||||
"weibo": "热门",
|
||||
"zhihu": "问答",
|
||||
"baidu": "热搜",
|
||||
"tieba": "讨论",
|
||||
"douban": "影视",
|
||||
"bilibili": "热门",
|
||||
"v2ex": "技术",
|
||||
"github": "开源",
|
||||
"hackernews": "国际",
|
||||
"stackoverflow": "问答",
|
||||
"jinritoutiao": "热点",
|
||||
"douyin": "娱乐",
|
||||
"shaoshupai": "数码"
|
||||
}
|
||||
return subcategories.get(platform, "其他")
|
||||
|
||||
138
app/api/v1/web_tools.py
Normal file
138
app/api/v1/web_tools.py
Normal file
@@ -0,0 +1,138 @@
|
||||
# app/api/endpoints/website_meta.py
|
||||
import json
|
||||
import time
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
import cloudscraper
|
||||
|
||||
from app.utils.logger import log
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.core import cache
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/")
|
||||
def get_meta(url: str = None):
|
||||
if not url:
|
||||
return {
|
||||
"status": "404",
|
||||
"data": [],
|
||||
"msg": "`url` is required"
|
||||
}
|
||||
|
||||
# get from cache
|
||||
cached_metadata = cache.get(url)
|
||||
if cached_metadata:
|
||||
return {
|
||||
"status": "200",
|
||||
"data": json.loads(cached_metadata),
|
||||
"msg": "success",
|
||||
"cache": True
|
||||
}
|
||||
|
||||
headers = {
|
||||
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
||||
"accept-language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,ar;q=0.7,en;q=0.6",
|
||||
"cache-control": "max-age=0",
|
||||
"priority": "u=0, i",
|
||||
"sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
|
||||
"sec-ch-ua-mobile": "?0",
|
||||
"sec-ch-ua-platform": '"Windows"',
|
||||
"sec-fetch-dest": "document",
|
||||
"sec-fetch-mode": "navigate",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-user": "?1",
|
||||
"upgrade-insecure-requests": "1",
|
||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
|
||||
}
|
||||
|
||||
|
||||
try:
|
||||
response = requests.get(url, headers=headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
page_content = response.content
|
||||
except requests.RequestException as e:
|
||||
scraper = cloudscraper.create_scraper(delay=100)
|
||||
response = scraper.get(url)
|
||||
page_content = response.content
|
||||
|
||||
if not page_content:
|
||||
return {
|
||||
"status": "404",
|
||||
"data": [],
|
||||
"msg": "No content"
|
||||
}
|
||||
|
||||
soup = BeautifulSoup(page_content, "html.parser")
|
||||
meta_info = {
|
||||
"title": soup.title.string if soup.title else "No title",
|
||||
"description": "",
|
||||
"keywords": "",
|
||||
"author": "",
|
||||
"og:title": "",
|
||||
"og:description": "",
|
||||
"og:image": "",
|
||||
"og:url": url,
|
||||
"twitter:card": "",
|
||||
"twitter:title": "",
|
||||
"twitter:description": "",
|
||||
"twitter:image": ""
|
||||
}
|
||||
|
||||
for meta_tag in soup.find_all("meta"):
|
||||
name_attr = meta_tag.get("name", "").lower()
|
||||
property_attr = meta_tag.get("property", "").lower()
|
||||
content = meta_tag.get("content", "")
|
||||
|
||||
if name_attr == "description":
|
||||
meta_info["description"] = content
|
||||
elif name_attr == "keywords":
|
||||
meta_info["keywords"] = content
|
||||
elif name_attr == "author":
|
||||
meta_info["author"] = content
|
||||
|
||||
elif property_attr == "og:title":
|
||||
meta_info["og:title"] = content
|
||||
elif property_attr == "og:description":
|
||||
meta_info["og:description"] = content
|
||||
elif property_attr == "og:image":
|
||||
meta_info["og:image"] = content
|
||||
elif property_attr == "og:url":
|
||||
meta_info["og:url"] = content
|
||||
|
||||
elif name_attr == "twitter:card":
|
||||
meta_info["twitter:card"] = content
|
||||
elif name_attr == "twitter:title":
|
||||
meta_info["twitter:title"] = content
|
||||
elif name_attr == "twitter:description":
|
||||
meta_info["twitter:description"] = content
|
||||
elif name_attr == "twitter:image":
|
||||
meta_info["twitter:image"] = content
|
||||
|
||||
parsed_url = urlparse(url)
|
||||
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||
favicon_url = urljoin(base_url, "favicon.ico") # 默认 favicon 路径
|
||||
|
||||
link_tag = soup.find("link", rel=["icon", "shortcut icon"])
|
||||
if link_tag:
|
||||
favicon_url = urljoin(base_url, link_tag.get("href", "favicon.ico"))
|
||||
|
||||
metadata = {
|
||||
"meta_info": meta_info,
|
||||
"favicon_url": favicon_url
|
||||
}
|
||||
|
||||
cache.set(url, json.dumps(metadata, ensure_ascii=False), ex=60)
|
||||
result = {
|
||||
"status": "200",
|
||||
"data": metadata,
|
||||
"msg": "Success",
|
||||
"cache": False
|
||||
}
|
||||
|
||||
return result
|
||||
0
app/core/__init__.py
Normal file
0
app/core/__init__.py
Normal file
150
app/core/cache.py
Normal file
150
app/core/cache.py
Normal file
@@ -0,0 +1,150 @@
|
||||
import redis
|
||||
from pydantic import BaseModel
|
||||
import json
|
||||
from typing import Any, Optional, Dict, List, Union
|
||||
import time
|
||||
|
||||
from app.db.redis import get_redis_client
|
||||
from app.utils.logger import log
|
||||
|
||||
# 默认缓存过期时间(1小时)
|
||||
DEFAULT_EXPIRE = 3600
|
||||
|
||||
def init_cache():
    """Verify the cache backend is reachable at startup.

    Pings Redis once; failures are logged but not raised so the app can
    still boot without a working cache.
    """
    try:
        # Distinct local name: `redis` would shadow the module-level import.
        client = get_redis_client()
        client.ping()
        log.info("Cache connection established")
    except Exception as e:
        log.error(f"Failed to connect to cache: {e}")
|
||||
|
||||
def close_cache():
    """Disconnect the shared Redis connection pool.

    Best-effort: errors are logged, never raised (used during shutdown).
    """
    try:
        # Distinct local name: `redis` would shadow the module-level import.
        client = get_redis_client()
        client.connection_pool.disconnect()
        log.info("Cache connection closed")
    except Exception as e:
        log.error(f"Error closing cache connection: {e}")
|
||||
|
||||
def set_cache(key: str, value: Any, expire: int = DEFAULT_EXPIRE) -> bool:
    """Store a value in Redis, JSON-serializing container types automatically.

    Args:
        key: Cache key.
        value: Value to store. dict/list/tuple are JSON-encoded, bools are
            stored as "1"/"0", everything else is passed through to redis-py.
        expire: TTL in seconds; a non-positive value means no expiry.

    Returns:
        True on success, False on any error (errors are logged, not raised).
    """
    try:
        # Distinct local name: `redis` would shadow the module-level import.
        client = get_redis_client()
        if isinstance(value, (dict, list, tuple)):
            value = json.dumps(value)
        elif isinstance(value, bool):
            value = "1" if value else "0"

        if expire > 0:
            client.setex(key, expire, value)
        else:
            client.set(key, value)
        return True
    except Exception as e:
        # Fixed garbled log text ("key111111" -> "key").
        log.error(f"Error setting cache for key {key}: {e}")
        return False
|
||||
|
||||
def get_cache(key: str) -> Optional[Any]:
    """Fetch a cached value, transparently decoding bytes and JSON.

    Returns the parsed object when the stored value is valid JSON, the raw
    string otherwise, or None when the key is absent or Redis errors out.
    """
    try:
        # Distinct local name: `redis` would shadow the module-level import.
        client = get_redis_client()
        value = client.get(key)

        if value is None:
            return None

        if isinstance(value, bytes):
            value = value.decode('utf-8')

        try:
            return json.loads(value)
        except (json.JSONDecodeError, TypeError):
            # Value was stored as a plain string; return it unparsed.
            return value
    except Exception as e:
        log.error(f"Error getting cache for key {key}: {e}")
        return None
|
||||
|
||||
def delete_cache(key: str) -> bool:
    """Delete a cache key; returns True unless Redis errored (logged)."""
    try:
        # Distinct local name: `redis` would shadow the module-level import.
        client = get_redis_client()
        client.delete(key)
        return True
    except Exception as e:
        log.error(f"Error deleting cache for key {key}: {e}")
        return False
|
||||
|
||||
def clear_cache_pattern(pattern: str) -> int:
    """Delete all keys matching a glob pattern; returns the delete count.

    NOTE(review): KEYS blocks Redis on large keyspaces — consider
    scan_iter() if this is ever used on a big production instance.
    """
    try:
        # Distinct local name: `redis` would shadow the module-level import.
        client = get_redis_client()
        keys = client.keys(pattern)
        if keys:
            return client.delete(*keys)
        return 0
    except Exception as e:
        log.error(f"Error clearing cache pattern {pattern}: {e}")
        return 0
|
||||
|
||||
|
||||
def get(key):
    """Return the UTF-8 decoded string stored at `key`, or None if absent
    or the Redis client cannot be obtained."""
    try:
        client = get_redis_client()
    except Exception as e:
        log.error(f"Error getting redis client: {e}")
        return None

    raw = client.get(key)
    return None if raw is None else raw.decode("utf-8")
|
||||
|
||||
|
||||
def set(key, value, ex=None):
    """Store `value` at `key` with optional TTL `ex` (seconds); None on client error."""
    try:
        client = get_redis_client()
    except Exception as e:
        log.error(f"Error getting redis client: {e}")
        return None

    return client.set(key, value, ex=ex)
|
||||
|
||||
|
||||
def delete(key):
    """Delete `key`; returns the redis reply, or None on client error."""
    try:
        client = get_redis_client()
    except Exception as e:
        log.error(f"Error getting redis client: {e}")
        return None

    return client.delete(key)
|
||||
|
||||
|
||||
def hset(name, key, value):
    """Set field `key` of hash `name`; returns the redis reply, or None on client error."""
    try:
        client = get_redis_client()
    except Exception as e:
        log.error(f"Error getting redis client: {e}")
        return None

    return client.hset(name, key, value)
|
||||
|
||||
|
||||
def hget(name, key):
    """Get field `key` of hash `name`; returns the redis reply, or None on client error."""
    try:
        client = get_redis_client()
    except Exception as e:
        log.error(f"Error getting redis client: {e}")
        return None

    return client.hget(name, key)
|
||||
|
||||
|
||||
class CacheNews(BaseModel):
    """Normalized news item shape used by the cache layer.

    NOTE(review): an identical model is declared in app/db/redis.py —
    consider keeping a single copy.
    """
    title: str
    url: str
    score: int
    desc: str
|
||||
121
app/core/config.py
Normal file
121
app/core/config.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import os
|
||||
import yaml
|
||||
from typing import Dict, Any, List, Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# 配置文件路径
|
||||
CONFIG_PATH = os.environ.get("CONFIG_PATH", "config/config.yaml")
|
||||
|
||||
class AppConfig(BaseModel):
    """FastAPI application settings (metadata, bind address, CORS)."""
    title: str
    description: str
    version: str
    host: str
    port: int
    debug: bool = True
    # Expected keys: allow_origins / allow_credentials / allow_methods / allow_headers
    cors: Dict[str, Any]
|
||||
|
||||
class DatabaseConfig(BaseModel):
    """MySQL connection settings (consumed by app.core.db.init_db)."""
    host: str
    user: str
    password: str
    db: str
    charset: str
    autocommit: bool = True
|
||||
|
||||
class RedisConfig(BaseModel):
    """Redis connection-pool settings (consumed by app.db.redis.get_redis_pool)."""
    host: str
    port: int
    db: int
    password: str = ""
    decode_responses: bool = False
    socket_timeout: int = 5
    socket_connect_timeout: int = 5
    health_check_interval: int = 30
|
||||
|
||||
class CrawlerConfig(BaseModel):
    """Crawler job settings (run interval, timeout, retry and scheduling limits)."""
    interval: int
    timeout: int
    max_retry_count: int
    max_instances: int
    misfire_grace_time: int
|
||||
|
||||
class LoggingConfig(BaseModel):
    """Logging settings (level, format, file rotation, timezone)."""
    level: str
    format: str
    dir: str
    file: str
    max_size: int
    backup_count: int
    daily_backup_count: int
    timezone: str
|
||||
|
||||
class SchedulerConfig(BaseModel):
    """APScheduler settings (pool sizes, coalescing, misfire handling, timezone)."""
    thread_pool_size: int
    process_pool_size: int
    coalesce: bool
    max_instances: int
    misfire_grace_time: int
    timezone: str
|
||||
|
||||
class NotificationConfig(BaseModel):
    """Notification channel settings; currently only DingTalk is wired up."""
    dingtalk: Dict[str, Any] = Field(default_factory=dict)
    # Additional notification channels can be added here, e.g.:
    # wechat: Dict[str, Any] = Field(default_factory=dict)
    # email: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
class Config(BaseModel):
    """Top-level schema validating the whole config.yaml file."""
    app: AppConfig
    database: DatabaseConfig
    redis: RedisConfig
    crawler: CrawlerConfig
    logging: LoggingConfig
    scheduler: SchedulerConfig
    notification: Optional[NotificationConfig] = None
|
||||
|
||||
# Global configuration singleton, populated lazily by load_config().
_config: Optional[Config] = None
|
||||
|
||||
def load_config() -> Config:
    """Load and validate the YAML config file into the global singleton.

    Returns:
        The validated Config object (cached after the first call).

    Raises:
        RuntimeError: if the file cannot be read/parsed or fails validation.
    """
    global _config
    if _config is None:
        try:
            # Explicit encoding so parsing doesn't depend on the host locale.
            with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
                config_data = yaml.safe_load(f)
            _config = Config(**config_data)
        except Exception as e:
            # Chain the original exception for easier debugging.
            raise RuntimeError(f"Failed to load configuration: {e}") from e
    return _config
|
||||
|
||||
def get_config() -> Config:
    """Return the cached Config object, loading it on first access."""
    return _config if _config is not None else load_config()
|
||||
|
||||
# Convenience accessors for individual config sections.
def get_app_config() -> AppConfig:
    """Return the application (FastAPI) section of the config."""
    return get_config().app

def get_db_config() -> DatabaseConfig:
    """Return the MySQL section of the config."""
    return get_config().database

def get_redis_config() -> RedisConfig:
    """Return the Redis section of the config."""
    return get_config().redis

def get_crawler_config() -> CrawlerConfig:
    """Return the crawler section of the config."""
    return get_config().crawler

def get_logging_config() -> LoggingConfig:
    """Return the logging section of the config."""
    return get_config().logging

def get_scheduler_config() -> SchedulerConfig:
    """Return the scheduler section of the config."""
    return get_config().scheduler

def get_notification_config() -> Dict[str, Any]:
    """Return the notification section as a plain dict ({} if unset)."""
    config = get_config()
    if config.notification:
        return config.notification.dict()
    return {}
|
||||
131
app/core/db.py
Normal file
131
app/core/db.py
Normal file
@@ -0,0 +1,131 @@
|
||||
import time
|
||||
from typing import List, Dict, Any, Optional
|
||||
from contextlib import contextmanager
|
||||
import traceback
|
||||
|
||||
import pymysql
|
||||
from pymysql.cursors import DictCursor
|
||||
|
||||
from app.utils.logger import log
|
||||
from app.core.config import get_db_config
|
||||
|
||||
# 连接池
|
||||
_connection = None
|
||||
|
||||
def init_db():
    """Open the module-level MySQL connection using config.yaml settings.

    Uses DictCursor so query results come back as dicts. Raises on failure
    (callers treat a missing DB as fatal at startup).
    """
    global _connection
    try:
        db_config = get_db_config()
        _connection = pymysql.connect(
            host=db_config.host,
            user=db_config.user,
            password=db_config.password,
            db=db_config.db,
            charset=db_config.charset,
            cursorclass=DictCursor,
            autocommit=db_config.autocommit
        )
        log.info("Database connection established")
    except Exception as e:
        log.error(f"Failed to connect to database: {e}")
        raise
|
||||
|
||||
def close_db():
    """Close and clear the module-level database connection, if open."""
    global _connection
    if _connection is None:
        return
    _connection.close()
    _connection = None
    log.info("Database connection closed")
|
||||
|
||||
@contextmanager
def get_cursor():
    """Context manager yielding a DictCursor on the shared connection.

    Reconnects lazily when the connection is missing or closed. On a
    lost-connection error (MySQL 2006/2013) it reconnects so the NEXT call
    succeeds, then re-raises so the caller can retry.

    Fix: the previous version yielded a second time inside the except
    branch. A @contextmanager generator may only yield once, so that path
    always raised RuntimeError("generator didn't stop after throw()")
    instead of retrying.
    """
    global _connection

    # Reconnect if the connection does not exist or has been closed.
    if _connection is None or not _connection.open:
        init_db()

    cursor = None
    try:
        cursor = _connection.cursor()
        yield cursor
    except pymysql.OperationalError as e:
        # 2006: MySQL server has gone away; 2013: Lost connection.
        if e.args[0] in (2006, 2013):
            log.warning("Database connection lost, reconnecting...")
            init_db()
        raise
    except Exception as e:
        log.error(f"Database error: {e}")
        raise
    finally:
        if cursor:
            cursor.close()
|
||||
|
||||
def insert_news(news_list: List[Dict[str, Any]]) -> int:
    """Insert news rows, skipping URLs that already exist.

    Args:
        news_list: Dicts with title/content/url/source/publish_time keys
            (missing keys default to empty/None).

    Returns:
        Number of rows actually inserted; 0 on error or empty input.

    NOTE(review): one SELECT + one INSERT round trip per item — fine for
    small batches; consider INSERT IGNORE / executemany for large ones.
    """
    if not news_list:
        return 0

    inserted_count = 0
    start_time = time.time()

    try:
        with get_cursor() as cursor:
            for news in news_list:
                # Skip items whose URL is already stored (dedupe by url).
                cursor.execute(
                    "SELECT id FROM news WHERE url = %s LIMIT 1",
                    (news.get('url', ''),)
                )
                if cursor.fetchone():
                    continue

                # Insert the new row; created_at is set server-side via NOW().
                cursor.execute(
                    """
                    INSERT INTO news (title, content, url, source, publish_time, created_at)
                    VALUES (%s, %s, %s, %s, %s, NOW())
                    """,
                    (
                        news.get('title', ''),
                        news.get('content', ''),
                        news.get('url', ''),
                        news.get('source', ''),
                        news.get('publish_time', None),
                    )
                )
                inserted_count += 1

        duration = time.time() - start_time
        log.info(f"Inserted {inserted_count}/{len(news_list)} news items in {duration:.2f}s")
        return inserted_count

    except Exception as e:
        log.error(f"Error inserting news: {e}")
        log.error(traceback.format_exc())
        return 0
|
||||
|
||||
def get_news_by_date(date_str: str, limit: int = 100) -> List[Dict[str, Any]]:
    """Return up to `limit` rows published on `date_str` (YYYY-MM-DD),
    newest first. Returns [] on any database error (logged)."""
    try:
        with get_cursor() as cursor:
            cursor.execute(
                """
                SELECT * FROM news
                WHERE DATE(publish_time) = %s
                ORDER BY publish_time DESC
                LIMIT %s
                """,
                (date_str, limit)
            )
            return cursor.fetchall()
    except Exception as e:
        log.error(f"Error getting news by date: {e}")
        return []
|
||||
77
app/data/config/category_keywords.json
Normal file
77
app/data/config/category_keywords.json
Normal file
@@ -0,0 +1,77 @@
|
||||
{
|
||||
"科技": [
|
||||
"AI", "人工智能", "大模型", "算法", "编程", "程序", "软件", "硬件", "代码",
|
||||
"互联网", "网络", "云计算", "大数据", "机器学习", "深度学习", "区块链", "元宇宙",
|
||||
"芯片", "半导体", "操作系统", "应用", "app", "手机", "电脑", "笔记本", "平板",
|
||||
"苹果", "华为", "小米", "三星", "谷歌", "微软", "百度", "阿里", "腾讯", "字节跳动",
|
||||
"数据", "隐私", "安全", "黑客", "漏洞", "加密", "量子", "5G", "6G", "物联网",
|
||||
"VR", "AR", "MR", "XR", "无人机", "机器人", "自动驾驶", "智能家居"
|
||||
],
|
||||
"娱乐": [
|
||||
"电影", "电视剧", "综艺", "节目", "剧集", "影视", "演员", "导演", "制片",
|
||||
"明星", "艺人", "歌手", "音乐", "歌曲", "专辑", "演唱会", "演出", "表演",
|
||||
"票房", "收视率", "热度", "流量", "粉丝", "网红", "主播", "直播", "短视频",
|
||||
"抖音", "快手", "B站", "油管", "视频", "游戏", "动漫", "二次元", "漫画",
|
||||
"小说", "作家", "作者", "绯闻", "八卦", "恋情", "结婚", "离婚", "恋爱",
|
||||
"综艺节目", "选秀", "真人秀", "脱口秀", "访谈", "颁奖", "获奖", "提名"
|
||||
],
|
||||
"社会": [
|
||||
"社会", "事件", "现象", "热点", "话题", "讨论", "争议", "观点", "舆论",
|
||||
"民生", "生活", "居民", "市民", "百姓", "群众", "公众", "社区", "小区",
|
||||
"城市", "农村", "乡村", "振兴", "扶贫", "贫困", "福利", "保障", "救助",
|
||||
"公益", "慈善", "捐赠", "捐款", "志愿者", "志愿", "服务", "公共", "公共服务",
|
||||
"安全", "事故", "灾害", "灾难", "救援", "救灾", "防灾", "减灾", "消防",
|
||||
"警察", "公安", "执法", "犯罪", "案件", "诈骗", "防骗", "防范", "预防",
|
||||
"交通", "道路", "出行", "拥堵", "堵车", "地铁", "公交", "高铁", "铁路",
|
||||
"环保", "污染", "垃圾", "分类", "绿色", "低碳", "节能", "减排", "可持续"
|
||||
],
|
||||
"财经": [
|
||||
"经济", "金融", "财经", "股市", "股票", "基金", "债券", "期货", "外汇",
|
||||
"汇率", "利率", "存款", "贷款", "理财", "投资", "投资者", "股东", "股份",
|
||||
"上市", "IPO", "融资", "并购", "重组", "收购", "分拆", "分红", "派息",
|
||||
"银行", "证券", "保险", "信托", "资管", "资产管理", "财富管理", "私募",
|
||||
"公募", "券商", "基金公司", "信用", "风险", "监管", "政策", "法规", "规定",
|
||||
"房地产", "楼市", "房价", "地价", "商品房", "住宅", "公寓", "别墅", "商铺",
|
||||
"通货膨胀", "通胀", "CPI", "GDP", "经济增长", "经济发展", "经济复苏",
|
||||
"贸易", "进出口", "关税", "税收", "减税", "增值税", "所得税", "企业所得税"
|
||||
],
|
||||
"体育": [
|
||||
"体育", "运动", "比赛", "赛事", "联赛", "锦标赛", "冠军赛", "世界杯", "奥运会",
|
||||
"足球", "篮球", "排球", "网球", "乒乓球", "羽毛球", "游泳", "田径", "马拉松",
|
||||
"体操", "举重", "拳击", "武术", "跆拳道", "柔道", "击剑", "射击", "射箭",
|
||||
"高尔夫", "棒球", "橄榄球", "冰球", "滑雪", "滑冰", "冬奥会", "亚运会",
|
||||
"球员", "教练", "裁判", "球队", "俱乐部", "国家队", "主场", "客场", "赛季",
|
||||
"进球", "得分", "助攻", "防守", "进攻", "战术", "技术", "犯规", "红牌", "黄牌",
|
||||
"NBA", "CBA", "英超", "西甲", "德甲", "意甲", "法甲", "欧冠", "欧联", "亚冠",
|
||||
"世锦赛", "大满贯", "全运会", "体育产业", "体育用品", "体育营销", "体育赞助"
|
||||
],
|
||||
"教育": [
|
||||
"教育", "学校", "大学", "高校", "中学", "小学", "幼儿园", "学院", "研究生院",
|
||||
"教师", "老师", "学生", "学员", "家长", "教授", "讲师", "班主任", "辅导员",
|
||||
"课程", "课堂", "教材", "教学", "学习", "考试", "考核", "成绩", "分数", "学分",
|
||||
"升学", "高考", "中考", "考研", "考博", "考证", "留学", "出国", "海归", "归国",
|
||||
"学历", "学位", "文凭", "证书", "学士", "硕士", "博士", "博士后", "教育部",
|
||||
"教育局", "教育厅", "教育系统", "教育改革", "素质教育", "应试教育", "职业教育",
|
||||
"在线教育", "远程教育", "教育科技", "教育创新", "教育公平", "教育资源", "教育质量",
|
||||
"校园", "宿舍", "食堂", "图书馆", "实验室", "教室", "操场", "体育馆", "礼堂"
|
||||
],
|
||||
"健康": [
|
||||
"健康", "医疗", "医院", "医生", "医师", "护士", "护理", "患者", "病人", "就医",
|
||||
"疾病", "疫情", "病毒", "细菌", "感染", "传染", "流行病", "新冠", "肺炎", "发热",
|
||||
"症状", "治疗", "用药", "药物", "药品", "药剂", "处方", "诊断", "检查", "手术",
|
||||
"康复", "保健", "养生", "营养", "饮食", "运动", "锻炼", "减肥", "增重", "塑形",
|
||||
"心理", "精神", "抑郁", "焦虑", "压力", "睡眠", "失眠", "心理咨询", "心理治疗",
|
||||
"医保", "医疗保险", "社保", "医改", "医疗改革", "医疗体系", "医疗资源", "医疗服务",
|
||||
"疫苗", "接种", "防疫", "防控", "消毒", "隔离", "核酸", "抗原", "检测", "筛查"
|
||||
],
|
||||
"国际": [
|
||||
"国际", "全球", "世界", "外交", "国家", "地区", "大使", "领事", "使馆", "领馆",
|
||||
"美国", "中国", "俄罗斯", "欧盟", "日本", "印度", "英国", "法国", "德国", "意大利",
|
||||
"加拿大", "澳大利亚", "巴西", "南非", "沙特", "伊朗", "以色列", "巴勒斯坦", "朝鲜",
|
||||
"韩国", "越南", "新加坡", "马来西亚", "印尼", "泰国", "菲律宾", "乌克兰", "白俄罗斯",
|
||||
"战争", "冲突", "和平", "停火", "制裁", "协议", "条约", "峰会", "会议", "会晤",
|
||||
"联合国", "安理会", "世卫组织", "世贸组织", "国际货币基金组织", "世界银行", "北约",
|
||||
"政治", "经济", "军事", "外交", "贸易", "投资", "援助", "移民", "难民", "人权",
|
||||
"气候变化", "全球变暖", "可持续发展", "减排", "碳中和", "国际合作", "多边主义"
|
||||
]
|
||||
}
|
||||
19
app/data/config/stopwords.json
Normal file
19
app/data/config/stopwords.json
Normal file
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"stopwords": [
|
||||
"什么", "怎么", "如何", "为何", "为什么", "哪些", "多少", "几个", "怎样",
|
||||
"一个", "这个", "那个", "自己", "这些", "那些", "因为", "所以", "如果",
|
||||
"可以", "还是", "这样", "那样", "关于", "对于",
|
||||
"今天", "明天", "昨天", "今年", "去年", "最近", "现在",
|
||||
"一些", "有些", "很多", "许多",
|
||||
"a", "an", "the", "and", "or", "but", "if", "because", "as", "what", "when",
|
||||
"where", "how", "to", "of", "for", "with", "in", "on", "at", "from", "by",
|
||||
"about", "into", "is", "are", "was", "were", "be", "been", "being", "have",
|
||||
"has", "had", "do", "does", "did", "doing", "can", "could", "will", "would",
|
||||
"should", "shall", "may", "might", "must", "that", "which", "who", "whom",
|
||||
"this", "these", "those", "am", "i", "you", "he", "she", "it", "we", "they",
|
||||
"their", "your", "my", "his", "her", "its", "our", "than", "then", "so", "not",
|
||||
"的", "了", "和", "是", "在", "我", "有", "个", "这", "那", "就", "也",
|
||||
"要", "会", "对", "啊", "吧", "呢", "吗", "嗯", "哦", "哪", "啥", "么",
|
||||
"被", "说", "到", "等", "着", "为", "与", "但", "并", "或", "而", "所以"
|
||||
]
|
||||
}
|
||||
0
app/db/__init__.py
Normal file
0
app/db/__init__.py
Normal file
15
app/db/models.py
Normal file
15
app/db/models.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from sqlalchemy import Column, Integer, String, Text, DateTime
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
class News(Base):
    """SQLAlchemy ORM model for the `news` table."""
    __tablename__ = 'news'

    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(255), nullable=False)
    content = Column(Text, nullable=True)
    # Unique: the db layer dedupes inserts by URL.
    url = Column(String(255), nullable=False, unique=True)
    source = Column(String(50), nullable=True)
    publish_time = Column(DateTime, nullable=True)
    created_at = Column(DateTime, nullable=False)
|
||||
60
app/db/mysql.py
Normal file
60
app/db/mysql.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
from .models import Base, News
|
||||
|
||||
# SQLAlchemy dependency removed.
# from app.core.db import Base

# Plain data class replacing the SQLAlchemy model.
# NOTE(review): this redefinition shadows the `News` imported from
# .models at the top of this file — confirm which one callers expect.
class News:
    """Plain news data model (no ORM)."""

    def __init__(self,
                 title: str = "",
                 content: str = "",
                 url: str = "",
                 source: str = "",
                 publish_time: Optional[datetime] = None):
        self.id: Optional[int] = None  # set only when loaded from the DB
        self.title = title
        self.content = content
        self.url = url
        self.source = source
        # Missing publish_time defaults to "now".
        self.publish_time = publish_time or datetime.now()
        self.created_at = datetime.now()

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'News':
        """Build a News object from a dict, preserving id/created_at if present."""
        news = cls(
            title=data.get('title', ''),
            content=data.get('content', ''),
            url=data.get('url', ''),
            source=data.get('source', ''),
            publish_time=data.get('publish_time')
        )
        if 'id' in data:
            news.id = data['id']
        if 'created_at' in data:
            news.created_at = data['created_at']
        return news

    def to_dict(self) -> Dict[str, Any]:
        """Convert to the dict shape expected by the db layer."""
        return {
            'id': self.id,
            'title': self.title,
            'content': self.content,
            'url': self.url,
            'source': self.source,
            'publish_time': self.publish_time,
            'created_at': self.created_at
        }
|
||||
|
||||
def insert_news(news_list):
    """Persist a list of news records (News objects or plain dicts)."""
    from app.core import db  # imported locally to avoid a circular import

    # Normalize News instances into the dict shape the db layer expects.
    if news_list and isinstance(news_list[0], News):
        news_list = [record.to_dict() for record in news_list]
    return db.insert_news(news_list)
|
||||
45
app/db/redis.py
Normal file
45
app/db/redis.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import redis
|
||||
from redis import Redis
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.core import cache
|
||||
from app.core.config import get_redis_config
|
||||
|
||||
# NOTE(review): hard-coded fallback settings — appears unused; get_redis_pool()
# reads its values from get_redis_config() instead. Confirm before removing.
REDIS_CONFIG = {
    "host": "localhost",
    "port": 6379,
    "db": 0,
    "decode_responses": False,
    "socket_timeout": 5,
    "socket_connect_timeout": 5,
    "health_check_interval": 30,
}
|
||||
|
||||
_redis_pool = None
|
||||
|
||||
def get_redis_pool() -> redis.ConnectionPool:
    """Return the lazily-created, module-wide Redis connection pool."""
    global _redis_pool
    if _redis_pool is not None:
        return _redis_pool

    # First use: build the pool from the configured Redis settings.
    cfg = get_redis_config()
    _redis_pool = redis.ConnectionPool(
        host=cfg.host,
        port=cfg.port,
        db=cfg.db,
        password=cfg.password,
        decode_responses=cfg.decode_responses,
        socket_timeout=cfg.socket_timeout,
        socket_connect_timeout=cfg.socket_connect_timeout,
        health_check_interval=cfg.health_check_interval
    )
    return _redis_pool
|
||||
|
||||
def get_redis_client() -> Redis:
    """Build a Redis client backed by the shared connection pool."""
    return redis.Redis(connection_pool=get_redis_pool())
|
||||
|
||||
class CacheNews(BaseModel):
    """Normalized news item shape used by the cache layer.

    NOTE(review): duplicate of the model in app/core/cache.py —
    consider keeping a single copy.
    """
    title: str
    url: str
    score: int
    desc: str
|
||||
92
app/main.py
Normal file
92
app/main.py
Normal file
@@ -0,0 +1,92 @@
|
||||
# app/main.py
|
||||
import threading
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI, Request, Response
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
import uvicorn
|
||||
|
||||
import app.services.crawler as crawler
|
||||
import tg_bot as tg_bot
|
||||
from app.api.v1 import daily_news, web_tools, analysis
|
||||
from app.utils.logger import log
|
||||
from app.core import db, cache
|
||||
from app.core.config import get_app_config, get_config
|
||||
from app.services.browser_manager import BrowserManager
|
||||
|
||||
# 获取应用配置
|
||||
app_config = get_app_config()
|
||||
|
||||
# Application startup/shutdown lifecycle management.
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: connect db/cache and launch crawlers on
    startup; tear everything down on shutdown."""
    # --- startup ---
    log.info("Application startup")

    # Initialize the database connection.
    db.init_db()

    # Initialize the cache connection.
    cache.init_cache()

    # Start the crawlers in a daemon thread so app startup is not blocked.
    threading.Thread(target=crawler.crawlers_logic, daemon=True).start()

    yield

    # --- shutdown ---
    log.info("Application shutdown")

    # Shut down the shared browser manager (best-effort).
    try:
        BrowserManager().shutdown()
        log.info("Browser manager shutdown")
    except Exception as e:
        log.error(f"Error shutting down browser manager: {e}")

    # Close the database connection.
    db.close_db()

    # Close the cache connection.
    cache.close_cache()
|
||||
|
||||
# Create the application instance with lifecycle management attached.
app = FastAPI(
    title=app_config.title,
    description=app_config.description,
    version=app_config.version,
    lifespan=lifespan
)

# Attach CORS middleware using settings from config.yaml.
app.add_middleware(
    CORSMiddleware,
    allow_origins=app_config.cors["allow_origins"],
    allow_credentials=app_config.cors["allow_credentials"],
    allow_methods=app_config.cors["allow_methods"],
    allow_headers=app_config.cors["allow_headers"],
)
|
||||
|
||||
# Request timing middleware.
@app.middleware("http")
async def add_process_time_header(request: Request, call_next):
    """Attach an X-Process-Time header with the handler latency in seconds."""
    started = time.time()
    response = await call_next(request)
    response.headers["X-Process-Time"] = str(time.time() - started)
    return response
|
||||
|
||||
# Register the API routers.
app.include_router(daily_news.router, prefix="/api/v1/dailynews", tags=["Daily News"])
app.include_router(web_tools.router, prefix="/api/v1/tools/website-meta", tags=["Website Meta"])
app.include_router(analysis.router, prefix="/api/v1/analysis", tags=["Analysis"])
|
||||
|
||||
# Health check endpoint.
@app.get("/health", tags=["Health"])
async def health_check():
    """Liveness probe: report service status and the running version."""
    return {"status": "healthy", "version": app_config.version}
|
||||
|
||||
# 如果直接运行此文件
|
||||
if __name__ == "__main__":
|
||||
uvicorn.run("app.main:app", host=app_config.host, port=app_config.port, reload=app_config.debug)
|
||||
43
app/services/__init__.py
Normal file
43
app/services/__init__.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from apscheduler.executors.pool import ThreadPoolExecutor, ProcessPoolExecutor
|
||||
from apscheduler.jobstores.memory import MemoryJobStore
|
||||
import pytz
|
||||
|
||||
from app.services.sites.factory import CrawlerRegister
|
||||
from app.utils.logger import log
|
||||
from app.core.config import get_scheduler_config
|
||||
|
||||
# Build the crawler factory: maps crawler name -> crawler instance
crawler_factory = CrawlerRegister().register()

# Load scheduler configuration
scheduler_config = get_scheduler_config()

# Scheduler configuration: jobs are kept in memory only (lost on restart)
jobstores = {
    'default': MemoryJobStore()
}

executors = {
    'default': ThreadPoolExecutor(scheduler_config.thread_pool_size),
    'processpool': ProcessPoolExecutor(scheduler_config.process_pool_size)
}

job_defaults = {
    'coalesce': scheduler_config.coalesce,
    'max_instances': scheduler_config.max_instances,
    'misfire_grace_time': scheduler_config.misfire_grace_time,
}

# Create and configure the background scheduler
_scheduler = BackgroundScheduler(
    jobstores=jobstores,
    executors=executors,
    job_defaults=job_defaults,
    timezone=pytz.timezone(scheduler_config.timezone)
)

# Start the scheduler at import time so jobs registered via the
# @_scheduler.scheduled_job decorator begin running immediately
_scheduler.start()

log.info(f"Scheduler started with timezone: {scheduler_config.timezone}")
|
||||
121
app/services/browser_manager.py
Normal file
121
app/services/browser_manager.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import threading
|
||||
import time
|
||||
import os
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from app.utils.logger import log
|
||||
|
||||
class BrowserManager:
    """Thread-safe singleton owning a shared headless Chrome instance.

    The driver is created lazily on first use and released automatically by a
    background monitor thread after ``_max_idle_time`` seconds of inactivity.
    """
    _instance = None          # singleton instance
    _lock = threading.Lock()  # guards instance creation and driver state
    _driver = None            # shared Selenium Chrome driver (None when released)
    _driver_path = None       # resolved chromedriver executable path
    _last_activity = 0        # unix timestamp of the last driver use
    _max_idle_time = 1800  # maximum idle time (seconds), default 30 minutes

    def __new__(cls, *args, **kwargs):
        """Singleton implementation (double-checked locking)."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super(BrowserManager, cls).__new__(cls)
                    cls._instance._init_driver_path()
                    cls._instance._start_idle_monitor()
        return cls._instance

    def _init_driver_path(self):
        """Resolve (downloading if needed) the ChromeDriver binary path."""
        try:
            self._driver_path = ChromeDriverManager().install()
            log.info(f"ChromeDriver已安装: {self._driver_path}")
        except Exception as e:
            log.error(f"ChromeDriver安装失败: {str(e)}")
            raise

    def _start_idle_monitor(self):
        """Start the daemon thread that quits an idle browser instance."""
        def monitor():
            while True:
                time.sleep(60)  # check once per minute
                try:
                    with self._lock:
                        if self._driver is not None:
                            current_time = time.time()
                            if current_time - self._last_activity > self._max_idle_time:
                                log.info(f"浏览器空闲超过{self._max_idle_time}秒,释放资源")
                                self._quit_driver()
                except Exception as e:
                    log.error(f"浏览器监控线程异常: {str(e)}")

        monitor_thread = threading.Thread(target=monitor, daemon=True)
        monitor_thread.start()
        log.info("浏览器空闲监控线程已启动")

    def get_driver(self):
        """Return the shared Chrome driver, creating it if necessary."""
        with self._lock:
            self._last_activity = time.time()
            if self._driver is None:
                self._create_driver()
            return self._driver

    def _create_driver(self):
        """Create a new headless Chrome instance (caller must hold ``_lock``)."""
        log.info("创建新的Chrome浏览器实例")
        options = webdriver.ChromeOptions()
        # Basic configuration (headless mode)
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        # Memory-optimization flags
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-application-cache")
        options.add_argument("--js-flags=--expose-gc")
        options.add_argument("--memory-pressure-off")
        options.add_argument("--disable-default-apps")
        # Log level (errors only)
        options.add_argument("--log-level=3")

        self._driver = webdriver.Chrome(
            service=Service(self._driver_path),
            options=options
        )
        self._driver.set_page_load_timeout(30)

    def _quit_driver(self):
        """Quit the browser instance (caller must hold ``_lock``)."""
        if self._driver:
            try:
                self._driver.quit()
                log.info("浏览器实例已关闭")
            except Exception as e:
                log.error(f"关闭浏览器实例出错: {str(e)}")
            finally:
                self._driver = None

    def release_driver(self):
        """Mark the driver as recently used so the idle monitor keeps it alive."""
        with self._lock:
            self._last_activity = time.time()

    def get_page_content(self, url, wait_time=5):
        """Load *url* and return ``(page_source, driver)``.

        NOTE(review): the returned driver is the shared instance and may be
        quit by the idle monitor at any time — callers should use it promptly
        and must not quit it themselves. Confirm this contract with callers.
        """
        driver = self.get_driver()
        try:
            driver.get(url)
            time.sleep(wait_time)  # wait for the page to load
            page_source = driver.page_source
            self.release_driver()
            return page_source, driver
        except Exception as e:
            log.error(f"获取页面内容失败: {str(e)}")
            self.release_driver()
            raise

    def shutdown(self):
        """Shut down the browser manager, quitting any live driver."""
        with self._lock:
            self._quit_driver()
|
||||
240
app/services/crawler.py
Normal file
240
app/services/crawler.py
Normal file
@@ -0,0 +1,240 @@
|
||||
import time
|
||||
import traceback
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from functools import wraps
|
||||
import pytz
|
||||
import signal
|
||||
from typing import List, Dict, Any, Optional, Callable
|
||||
|
||||
from app.services import crawler_factory, _scheduler
|
||||
from app.utils.logger import log
|
||||
from app.core import db, cache
|
||||
from app.core.config import get_crawler_config
|
||||
from app.utils.notification import notification_manager
|
||||
|
||||
# 获取爬虫配置
|
||||
crawler_config = get_crawler_config()
|
||||
|
||||
# 配置常量
|
||||
CRAWLER_INTERVAL = crawler_config.interval
|
||||
CRAWLER_TIMEOUT = crawler_config.timeout
|
||||
MAX_RETRY_COUNT = crawler_config.max_retry_count
|
||||
SHANGHAI_TZ = pytz.timezone('Asia/Shanghai')
|
||||
|
||||
class CrawlerTimeoutError(Exception):
    """Raised when a crawler run exceeds its allotted timeout."""
    pass
|
||||
|
||||
def timeout_handler(func: Callable, timeout: Optional[int] = None) -> Callable:
    """Decorator that aborts *func* with ``CrawlerTimeoutError`` after a timeout.

    The timeout is implemented with a daemon worker thread: the wrapped call
    runs in the worker while the calling thread waits. Note the worker cannot
    be killed — on timeout it may keep running in the background until it
    finishes on its own.

    Args:
        func: the callable to wrap.
        timeout: seconds to wait; defaults to the configured ``CRAWLER_TIMEOUT``.
            Resolved at call time rather than frozen into the signature at
            import time, so configuration reloads are picked up.

    Returns:
        A wrapper with the same call signature as *func*.

    Raises:
        CrawlerTimeoutError: if the call does not complete within the timeout.
        Exception: any exception raised by *func* is re-raised unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Resolve the timeout lazily (bugfix: the old default bound
        # CRAWLER_TIMEOUT once at decoration time).
        wait_seconds = timeout if timeout is not None else CRAWLER_TIMEOUT

        # Thread-based timeout: single-element lists carry the result,
        # exception and completion flag out of the worker thread.
        result = [None]
        exception = [None]
        completed = [False]

        def target():
            try:
                result[0] = func(*args, **kwargs)
            except Exception as e:
                exception[0] = e
            finally:
                completed[0] = True

        thread = threading.Thread(target=target)
        thread.daemon = True
        thread.start()
        thread.join(wait_seconds)

        if not completed[0]:
            error_msg = f"Function {func.__name__} timed out after {wait_seconds} seconds"
            log.error(error_msg)
            raise CrawlerTimeoutError(error_msg)

        if exception[0]:
            log.error(f"Function {func.__name__} raised an exception: {exception[0]}")
            raise exception[0]

        return result[0]
    return wrapper
|
||||
|
||||
def safe_fetch(crawler_name: str, crawler, date_str: str, is_retry: bool = False) -> List[Dict[str, Any]]:
    """Run one crawler's fetch, cache a non-empty result, and swallow errors.

    On any exception the traceback is logged, a DingTalk notification is
    attempted (best effort), and an empty list is returned so one failing
    site never aborts the whole crawl round.
    """
    retry_prefix = 'Second time ' if is_retry else ''
    try:
        fetched = crawler.fetch(date_str)
        if not fetched:
            log.info(f"{retry_prefix}crawler {crawler_name} failed. 0 news fetched")
            return []

        # Cache the successful result with no expiry (kept until overwritten).
        cache.set_cache(key=f"crawler:{crawler_name}:{date_str}", value=fetched, expire=0)

        log.info(f"{crawler_name} fetch success, {len(fetched)} news fetched")
        return fetched
    except Exception as e:
        log.error(f"{retry_prefix}crawler {crawler_name} error: {traceback.format_exc()}")

        # Best-effort DingTalk notification; never let it raise.
        try:
            notification_manager.notify_crawler_error(
                crawler_name=crawler_name,
                error_msg=str(e),
                date_str=date_str,
                is_retry=is_retry
            )
        except Exception as notify_error:
            log.error(f"Failed to send notification for crawler {crawler_name}: {notify_error}")

        return []
|
||||
|
||||
def run_data_analysis(date_str: str):
    """Run every analysis step for *date_str* and warm their caches.

    Executes the analyses sequentially; any failure is logged and reported
    via a best-effort DingTalk notification.
    """
    log.info(f"Starting data analysis for date {date_str}")
    try:
        # Imported here (not at module top) to avoid a circular dependency
        from app.analysis.trend_analyzer import TrendAnalyzer
        from app.analysis.predictor import TrendPredictor

        # Create analyzer instances
        analyzer = TrendAnalyzer()
        predictor = TrendPredictor()

        # 1. Generate and cache keyword-cloud data
        log.info("Generating keyword cloud data...")
        analyzer.get_keyword_cloud(date_str, refresh=True)

        # 2. Generate and cache aggregated trend-analysis data
        log.info("Generating trend analysis data...")
        analyzer.get_analysis(date_str, analysis_type="main")

        # 3. Generate and cache cross-platform hot-topic analysis
        log.info("Generating cross-platform analysis data...")
        analyzer.get_cross_platform_analysis(date_str, refresh=True)

        # 4. Generate and cache trend-prediction data
        log.info("Generating trend prediction data...")
        predictor.get_prediction(date_str)

        # 5. Generate and cache platform-comparison data
        log.info("Generating platform comparison data...")
        analyzer.get_platform_comparison(date_str)

        # 6. Generate and cache advanced-analysis data
        log.info("Generating advanced analysis data...")
        analyzer.get_advanced_analysis(date_str, refresh=True)

        # 7. Generate and cache data-visualization analysis
        log.info("Generating data visualization analysis...")
        analyzer.get_data_visualization(date_str, refresh=True)

        # 8. Generate and cache trend-forecast analysis
        log.info("Generating trend forecast data...")
        analyzer.get_trend_forecast(date_str, refresh=True)

        log.info(f"All data analysis completed for date {date_str}")
    except Exception as e:
        error_msg = traceback.format_exc()
        log.error(f"Error during data analysis: {str(e)}")
        log.error(error_msg)

        # Best-effort notification of the analysis failure
        try:
            notification_manager.notify_analysis_error(
                error_msg=str(e),
                date_str=date_str
            )
        except Exception as notify_error:
            log.error(f"Failed to send analysis error notification: {notify_error}")
|
||||
|
||||
@_scheduler.scheduled_job('interval', id='crawlers_logic', seconds=CRAWLER_INTERVAL,
                          max_instances=crawler_config.max_instances,
                          misfire_grace_time=crawler_config.misfire_grace_time)
def crawlers_logic():
    """Main crawler job: run every crawler once, retry failures, then kick off analysis.

    The whole run is wrapped in a timeout guard; on timeout or unexpected
    error a DingTalk notification is attempted and 0 is returned instead of
    raising, so the scheduler keeps the job alive.

    Returns:
        int: number of crawlers that fetched at least one news item.
    """
    # BUGFIX: computed in the outer scope (previously only inside
    # crawler_work), so the except handlers below can reference date_str
    # without raising NameError.
    date_str = datetime.now(SHANGHAI_TZ).strftime("%Y-%m-%d")

    @timeout_handler
    def crawler_work():
        now_time = datetime.now(SHANGHAI_TZ)
        log.info(f"Starting crawler job at {now_time.strftime('%Y-%m-%d %H:%M:%S')}")

        retry_crawler = []
        success_count = 0
        failed_crawlers = []

        # First round: run every registered crawler once.
        for crawler_name, crawler in crawler_factory.items():
            news_list = safe_fetch(crawler_name, crawler, date_str)
            if news_list:
                success_count += 1
            else:
                retry_crawler.append(crawler_name)
                failed_crawlers.append(crawler_name)

        # Second round: retry the crawlers that failed.
        if retry_crawler:
            log.info(f"Retrying {len(retry_crawler)} failed crawlers")
            retry_failed = []
            for crawler_name in retry_crawler:
                news_list = safe_fetch(crawler_name, crawler_factory[crawler_name], date_str, is_retry=True)
                if news_list:
                    success_count += 1
                    # Remove now-successful crawlers from the failed list.
                    if crawler_name in failed_crawlers:
                        failed_crawlers.remove(crawler_name)
                else:
                    retry_failed.append(crawler_name)

        # Record completion time and summary.
        end_time = datetime.now(SHANGHAI_TZ)
        duration = (end_time - now_time).total_seconds()
        log.info(f"Crawler job finished at {end_time.strftime('%Y-%m-%d %H:%M:%S')}, "
                 f"duration: {duration:.2f}s, success: {success_count}/{len(crawler_factory)}")

        # Best-effort summary notification.
        try:
            notification_manager.notify_crawler_summary(
                success_count=success_count,
                total_count=len(crawler_factory),
                failed_crawlers=failed_crawlers,
                duration=duration,
                date_str=date_str
            )
        except Exception as notify_error:
            log.error(f"Failed to send crawler notification: {notify_error}")

        # Run data analysis after crawling, in its own thread so the
        # scheduler thread is not blocked.
        log.info("Crawler job completed, starting data analysis...")
        threading.Thread(target=run_data_analysis, args=(date_str,), daemon=True).start()

        return success_count

    try:
        return crawler_work()
    except CrawlerTimeoutError as e:
        log.error(f"Crawler job timeout: {str(e)}")
        # Timeout notification (best effort).
        try:
            notification_manager.notify_crawler_timeout(
                timeout_seconds=CRAWLER_TIMEOUT,
                date_str=date_str
            )
        except Exception as notify_error:
            log.error(f"Failed to send timeout notification: {notify_error}")
        return 0
    except Exception as e:
        log.error(f"Crawler job error: {str(e)}")
        log.error(traceback.format_exc())
        # Generic error notification (best effort).
        try:
            notification_manager.notify_crawler_error(
                crawler_name="crawler_job",
                error_msg=str(e),
                date_str=date_str
            )
        except Exception as notify_error:
            log.error(f"Failed to send error notification: {notify_error}")
        return 0
|
||||
0
app/services/sites/__init__.py
Normal file
0
app/services/sites/__init__.py
Normal file
99
app/services/sites/baidu.py
Normal file
99
app/services/sites/baidu.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class BaiduNewsCrawler(Crawler):
    """Baidu realtime hot-search crawler (top.baidu.com)."""

    def fetch(self, date_str) -> list:
        """Fetch the realtime hot-search board via Baidu's JSON API.

        Returns a list of news dicts and caches the same list as JSON
        under hash key (date_str, crawler_name).
        """
        # Capture the fetch time once for all items
        current_time = datetime.datetime.now()

        url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"

        # BUGFIX: the header dict was previously passed as `params=`, which
        # serialized it into the query string; it must be sent as HTTP headers.
        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        json_data = resp.json()
        contents = json_data.get("data")["cards"][0]["content"][0]["content"]
        result = []
        cache_list = []
        for content in contents:
            title = content.get("word")
            url = content.get("url")
            desc = content.get("desc")
            score = content.get("hotScore")

            # replace url m to www (link to the desktop site rather than mobile)
            url = url.replace("m.", "www.")
            news = {
                'title': title,
                'url': url,
                'content': desc,
                'source': 'baidu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')  # formatted timestamp
            }
            result.append(news)
            cache_list.append(news)  # dicts are serialized together below

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        return "baidu"

    @staticmethod
    def fetch_v0():
        """Legacy HTML-scraping implementation kept for reference."""
        current_time = datetime.datetime.now()

        url = "https://top.baidu.com/board?tab=realtime"
        proxies = {
            # "http": "http://127.0.0.1:7890",
            # "https": "http://127.0.0.0:7890"
        }

        header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            # BUGFIX: header values must be strings (was int 1)
            "upgrade-insecure-requests": "1",
            "host": "www.baidu.com",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36"
        }
        # BUGFIX: send the header dict as headers, not query params.
        html = requests.get(url=url, headers=header, verify=False, proxies=proxies)
        html.encoding = "utf-8"
        html_text = html.text
        soup = BeautifulSoup(html_text, "html.parser")
        main_content = soup.find_all("main")[0]
        news_main_content = main_content.find("div", style='margin-bottom:20px')

        div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')

        result = []
        for div_element in div_elements:
            hot_index = div_element.find(class_='hot-index_1Bl1a').text.strip()
            news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
            news_link = div_element.find('a', class_='title_dIF3B')['href']

            news = {
                'title': news_title,
                'url': news_link,
                'content': "",
                'source': 'baidu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')  # formatted timestamp
            }
            result.append(news)

        return result
|
||||
64
app/services/sites/bilibili.py
Normal file
64
app/services/sites/bilibili.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class BilibiliCrawler(Crawler):
    """Crawler for Bilibili's popular-videos ranking."""

    def fetch(self, date_str):
        """Fetch the current popular-video list and cache it for *date_str*."""
        fetched_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        api_url = "https://api.bilibili.com/x/web-interface/popular"
        request_headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "Chrome/122.0.0.0 Safari/537.36"
                "AppleWebKit/537.36 (KHTML, like Gecko) "
            ),
            "Referer": "https://www.bilibili.com/",
        }

        resp = requests.get(url=api_url, headers=request_headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        payload = resp.json()
        if payload["code"] != 0:
            print(f"API error: {payload['message']}")
            return []

        # Build one news dict per popular video.
        news_items = [
            {
                'title': video.get("title", ""),
                'url': f"https://www.bilibili.com/video/{video.get('bvid', '')}",
                'content': video.get("desc", ""),
                'source': 'bilibili',
                'publish_time': fetched_at,
            }
            for video in payload["data"].get("list", [])
        ]

        cache.hset(date_str, self.crawler_name(), json.dumps(news_items, ensure_ascii=False))
        return news_items

    def crawler_name(self):
        return "bilibili"
|
||||
100
app/services/sites/cls.py
Normal file
100
app/services/sites/cls.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import json
|
||||
import datetime
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class CLSCrawler(Crawler):
    """财联社 (CLS / Cailianpress) featured-column crawler."""

    def fetch(self, date_str) -> list:
        """Fetch up to 20 featured columns from the CLS web API.

        Returns a list of news dicts ranked by list position and caches the
        list as JSON under (date_str, crawler_name). Returns [] on any
        request/API failure (best effort; errors are swallowed).
        """
        current_time = datetime.datetime.now()

        try:
            params = {
                'app': 'CailianpressWeb',
                'os': 'web',
                'sv': '8.4.6',
                'sign': '9f8797a1f4de66c2370f7a03990d2737'
            }

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Referer': 'https://www.cls.cn/',
                'Origin': 'https://www.cls.cn'
            }

            response = requests.get(
                "https://www.cls.cn/featured/v1/column/list",
                params=params,
                headers=headers,
                timeout=self.timeout,
                verify=False
            )
            response.raise_for_status()

            data = response.json()
            if data.get('errno') != 0:
                return []

            column_list = data.get('data', {}).get('column_list', [])

            result = []
            cache_list = []

            for idx, column in enumerate(column_list[:20]):
                try:
                    title = column.get('title', '').strip()
                    # Skip unusable column titles.
                    if not title or len(title) < 2:
                        continue

                    # All entries link to the telegraph page; only the display
                    # title/content differ depending on whether the column has
                    # a headline article. (The previous three branches were
                    # duplicates and have been collapsed.)
                    url = "https://www.cls.cn/telegraph"
                    article = column.get('article_list', {}) or {}
                    article_title = article.get('title', '').strip()
                    if article_title:
                        display_title = f"[{title}] {article_title}"
                        brief = article.get('brief', '').strip()
                        content = brief if brief else article_title
                    else:
                        display_title = title
                        content = column.get('brief', '').strip()

                    news = {
                        'title': display_title,
                        'url': url,
                        'content': content,
                        'source': 'cls',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
                        'score': 1000 - idx,  # simple rank-based score
                        'rank': idx + 1
                    }

                    result.append(news)
                    cache_list.append(news)

                except Exception:
                    continue  # skip malformed entries

            if cache_list:
                cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception:
            # Best-effort crawler: swallow network/parse errors and return [].
            return []

    def crawler_name(self):
        return "cls"
|
||||
23
app/services/sites/crawler.py
Normal file
23
app/services/sites/crawler.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Any
|
||||
|
||||
class Crawler(ABC):
    """Base class for all site crawlers: shared default headers and timeout."""

    def __init__(self):
        # Default browser-like request headers shared by subclasses.
        self.header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36"
        }
        # Per-request timeout in seconds.
        self.timeout = 10

    @abstractmethod
    def fetch(self, date_str: str) -> List[Dict[str, Any]]:
        """Fetch the site's hot list for *date_str* and return news dicts."""
        pass

    @abstractmethod
    def crawler_name(self) -> str:
        """Return the unique crawler name used as the cache/registry key."""
        pass
|
||||
79
app/services/sites/douban.py
Normal file
79
app/services/sites/douban.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import json
|
||||
import re
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class DouBanCrawler(Crawler):
    """Crawler for Douban group "explore" hot topics."""

    def fetch(self, date_str):
        """Scrape the group-explore page and cache the topic list."""
        fetched_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        page_url = "https://www.douban.com/group/explore"

        # Start from the shared default headers and layer on the
        # Douban-specific ones.
        request_headers = dict(self.header)
        request_headers.update({
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-encoding": "",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "host": "www.douban.com",
            "referer": "https://www.douban.com/group/explore",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
        })

        resp = requests.get(url=page_url, headers=request_headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        soup = BeautifulSoup(resp.text, "html.parser")

        news_items = []
        for channel_item in soup.find_all('div', class_='channel-item'):
            # Each topic needs an <h3> heading with an <a> inside it.
            heading = channel_item.find('h3')
            anchor = heading.find('a') if heading else None
            if anchor is None:
                continue

            body = channel_item.find('div', class_='content')
            news_items.append({
                'title': anchor.text.strip(),
                'url': anchor.get('href'),
                'content': body.text.strip() if body else "",
                'source': 'douban',
                'publish_time': fetched_at,
            })

        cache.hset(date_str, self.crawler_name(), json.dumps(news_items, ensure_ascii=False))
        return news_items

    def crawler_name(self):
        return "douban"
|
||||
111
app/services/sites/douyin.py
Normal file
111
app/services/sites/douyin.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
|
||||
import requests
|
||||
from selenium.webdriver.common.by import By
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
from ..browser_manager import BrowserManager
|
||||
|
||||
|
||||
class DouYinCrawler(Crawler):
    """Douyin hot-search crawler."""

    def fetch(self, date_str):
        # Production path is the JSON hot-search API (fetch_v2);
        # fetch_v1 is a Selenium-based fallback kept for reference.
        return self.fetch_v2(date_str)

    def fetch_v1(self, date_str):
        """Selenium-based scrape of the hot page (legacy, best effort)."""
        current_time = datetime.datetime.now()
        url = "https://www.douyin.com/hot"
        browser_manager = BrowserManager()

        try:
            # Fetch page content via the shared browser manager
            page_source, driver = browser_manager.get_page_content(url, wait_time=5)

            result = []
            cache_list = []

            # Hot-board entries (li tags containing a /video/ link)
            items = driver.find_elements(By.XPATH, '//li[a[contains(@href, "/video/")]]')

            for item in items:
                try:
                    # Title: contains a "#" tag or longer text
                    title_elem = item.find_element(By.XPATH, './/div[contains(text(), "#") or string-length(text()) > 10]')
                    # Link element
                    link_elem = item.find_element(By.XPATH, './/a[contains(@href, "/video/")]')
                    # Popularity element
                    hot_elem = item.find_element(By.XPATH, './/span[contains(text(), "万") or contains(text(), "亿")]')

                    title = title_elem.text.strip()
                    # NOTE(review): Selenium's get_attribute("href") returns an
                    # absolute URL, so prepending the domain here likely
                    # produces a doubled prefix — verify before relying on
                    # this legacy path.
                    item_url = "https://www.douyin.com" + link_elem.get_attribute("href")
                    hot = hot_elem.text.strip()

                    news = {
                        'title': title,
                        'url': item_url,
                        'content': f"热度: {hot}",
                        'source': 'douyin',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
                    }

                    result.append(news)
                    cache_list.append(news)
                except Exception:
                    continue  # skip invalid entries

            # Cache and return
            if cache_list:
                cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception as e:
            # Best effort: swallow errors and return an empty list
            return []

    def fetch_v2(self, date_str):
        """Fetch the hot-search word list from Douyin's web API."""
        current_time = datetime.datetime.now()
        url = "https://www.douyin.com/aweme/v1/web/hot/search/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&detail_list=1&source=6&pc_client_type=1&pc_libra_divert=Windows&support_h265=1&support_dash=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=136.0.0.0&browser_online=true&engine_name=Blink&engine_version=136.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7490997798633555467"

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "Chrome/122.0.0.0 Safari/537.36"
                "AppleWebKit/537.36 (KHTML, like Gecko) "
            ),
            "Referer": "https://www.douyin.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        # Example detail URL shape:
        # https://www.douyin.com/hot/2094286?&trending_topic=%E5%A4%8F%E5%A4%A9%E7%9A%84%E5%91%B3%E9%81%93%E5%9C%A8%E6%8A%96%E9%9F%B3&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue=11892557
        result = []
        cache_list = []

        for item in data["data"]["word_list"]:
            title = item["word"]
            url = f"https://www.douyin.com/hot/{item['sentence_id']}?&trending_topic={item['word']}&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue={item['hot_value']}"

            news = {
                'title': title,
                'url': url,
                'content': title,
                'source': 'douyin',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        return "douyin"
|
||||
88
app/services/sites/eastmoney.py
Normal file
88
app/services/sites/eastmoney.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import json
|
||||
import datetime
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class EastMoneyCrawler(Crawler):
|
||||
"""东方财富网"""
|
||||
|
||||
def fetch(self, date_str) -> list:
|
||||
current_time = datetime.datetime.now()
|
||||
|
||||
try:
|
||||
params = {
|
||||
'client': 'web',
|
||||
'biz': 'web_724',
|
||||
'fastColumn': '102',
|
||||
'sortEnd': '',
|
||||
'pageSize': '50',
|
||||
'req_trace': str(int(current_time.timestamp() * 1000)) # 使用当前时间戳
|
||||
}
|
||||
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Accept': 'application/json, text/plain, */*',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||||
'Referer': 'https://kuaixun.eastmoney.com/',
|
||||
'Origin': 'https://kuaixun.eastmoney.com'
|
||||
}
|
||||
|
||||
response = requests.get(
|
||||
"https://np-weblist.eastmoney.com/comm/web/getFastNewsList",
|
||||
params=params,
|
||||
headers=headers,
|
||||
timeout=self.timeout,
|
||||
verify=False
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
data = response.json()
|
||||
if data.get('code') != '1':
|
||||
return []
|
||||
fast_news_list = data.get('data', {}).get('fastNewsList', [])
|
||||
|
||||
result = []
|
||||
cache_list = []
|
||||
|
||||
for idx, news_item in enumerate(fast_news_list[:20]): # 取前20条
|
||||
try:
|
||||
title = news_item.get('title', '').strip()
|
||||
if not title:
|
||||
continue
|
||||
|
||||
summary = news_item.get('summary', '').strip()
|
||||
show_time = news_item.get('showTime', '').strip()
|
||||
code = news_item.get('code', '').strip()
|
||||
url = f"https://finance.eastmoney.com/a/{code}" if code else "https://kuaixun.eastmoney.com/"
|
||||
|
||||
news = {
|
||||
'title': title,
|
||||
'url': url,
|
||||
'content': summary,
|
||||
'source': 'eastmoney',
|
||||
'publish_time': show_time if show_time else current_time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'score': 1000 - idx,
|
||||
'rank': idx + 1
|
||||
}
|
||||
|
||||
result.append(news)
|
||||
cache_list.append(news)
|
||||
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if cache_list:
|
||||
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
return []
|
||||
|
||||
    def crawler_name(self):
        """Registry key for this crawler (matches the "eastmoney" entry in CrawlerRegister)."""
        return "eastmoney"
|
||||
64
app/services/sites/factory.py
Normal file
64
app/services/sites/factory.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from typing import Dict, Type
|
||||
|
||||
from .baidu import BaiduNewsCrawler
|
||||
from .bilibili import BilibiliCrawler
|
||||
from .crawler import Crawler
|
||||
from .douban import DouBanCrawler
|
||||
from .douyin import DouYinCrawler
|
||||
from .ftpojie import FtPoJieCrawler
|
||||
from .github import GithubCrawler
|
||||
from .hackernews import HackerNewsCrawler
|
||||
from .hupu import HuPuCrawler
|
||||
from .jinritoutiao import JinRiTouTiaoCrawler
|
||||
from .juejin import JueJinCrawler
|
||||
from .sspai import ShaoShuPaiCrawler
|
||||
from .stackoverflow import StackOverflowCrawler
|
||||
from .tenxunwang import TenXunWangCrawler
|
||||
from .tieba import TieBaCrawler
|
||||
from .tskr import TsKrCrawler
|
||||
from .vtex import VtexCrawler
|
||||
from .weibo import WeiboCrawler
|
||||
from .weixin import WeiXinCrawler
|
||||
from .zhihu import ZhiHuCrawler
|
||||
from .sina_finance import SinaFinanceCrawler
|
||||
from .eastmoney import EastMoneyCrawler
|
||||
from .xueqiu import XueqiuCrawler
|
||||
from .cls import CLSCrawler
|
||||
|
||||
|
||||
class CrawlerRegister:
    """Registry that instantiates every site crawler and exposes them by name."""

    def __init__(self):
        # name -> Crawler instance; populated lazily by register()
        self.crawlers = {}

    def register(self) -> Dict[str, Crawler]:
        """Build (or rebuild) the crawler map and return it.

        Instantiates every crawler class; the keys are the public crawler
        names used as cache fields (see each crawler's crawler_name()).
        """
        crawler_map = {
            "baidu": BaiduNewsCrawler(),
            "shaoshupai": ShaoShuPaiCrawler(),
            "weibo": WeiboCrawler(),
            "zhihu": ZhiHuCrawler(),
            "36kr": TsKrCrawler(),
            "52pojie": FtPoJieCrawler(),
            "bilibili": BilibiliCrawler(),
            "douban": DouBanCrawler(),
            "hupu": HuPuCrawler(),
            "tieba": TieBaCrawler(),
            "juejin": JueJinCrawler(),
            "douyin": DouYinCrawler(),
            "v2ex": VtexCrawler(),
            "jinritoutiao": JinRiTouTiaoCrawler(),
            "tenxunwang": TenXunWangCrawler(),
            "stackoverflow": StackOverflowCrawler(),
            "github": GithubCrawler(),
            "hackernews": HackerNewsCrawler(),
            "sina_finance": SinaFinanceCrawler(),
            "eastmoney": EastMoneyCrawler(),
            "xueqiu": XueqiuCrawler(),
            "cls": CLSCrawler(),
        }

        self.crawlers = crawler_map
        return self.crawlers

    def get_crawlers(self):
        """Return all crawler instances, registering them on first use.

        Fix: the previous implementation called register() unconditionally,
        re-instantiating all crawlers on every call; the map is now cached.
        """
        if not self.crawlers:
            self.register()
        return self.crawlers.values()
|
||||
69
app/services/sites/ftpojie.py
Normal file
69
app/services/sites/ftpojie.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
import re
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class FtPoJieCrawler(Crawler):
    """52pojie (吾爱破解) forum hot-thread crawler."""

    def fetch(self, date_str):
        """Scrape the 52pojie hot-thread page and cache the entries.

        Args:
            date_str: cache hash key (the date being crawled).

        Returns:
            List of news dicts (title/url/content/source/publish_time); [] on
            request failure.
        """
        current_time = datetime.datetime.now()

        url = "https://www.52pojie.cn/forum.php?mod=guide&view=hot"

        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        resp.encoding = 'gbk'  # 52pojie pages are GBK-encoded
        soup = BeautifulSoup(resp.text, "html.parser")

        # Hot threads live in <tbody id="normalthread_..."> rows.
        hot_threads = soup.find_all('tbody', id=lambda x: x and x.startswith('normalthread_'))

        result = []
        cache_list = []

        for thread in hot_threads:
            title_elem = thread.find('a', class_='xst')
            if not title_elem:
                continue

            # Fix: guard against anchors with no href attribute — the old
            # code concatenated .get('href') directly and raised TypeError
            # when it was None.
            href = title_elem.get('href')
            if not href:
                continue

            title = title_elem.text.strip()
            url = "https://www.52pojie.cn/" + href

            # Author / last-reply info column.
            info_elem = thread.find('td', class_='by')
            info = info_elem.text.strip() if info_elem else ""

            news = {
                'title': title,
                'url': url,
                'content': info,
                'source': '52pojie',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry key for this crawler."""
        return "52pojie"
|
||||
58
app/services/sites/github.py
Normal file
58
app/services/sites/github.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class GithubCrawler(Crawler):
    """GitHub most-starred repositories crawler (search API sorted by stars)."""

    def fetch(self, date_str):
        """Fetch the most-starred repositories and cache them under ``date_str``.

        Returns a list of news dicts; [] on request failure.
        """
        current_time = datetime.datetime.now()

        url = "https://api.github.com/search/repositories?q=stars:%3E1&sort=stars"

        headers = {
            # Fix: assemble the UA in the proper order — the previous
            # concatenation placed "AppleWebKit..." after "Safari/537.36",
            # yielding a malformed User-Agent string.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://github.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        result = []
        cache_list = []

        # Fix: .get() guards against error payloads without an "items" key
        # (e.g. rate-limit responses), which previously raised KeyError.
        for item in data.get("items", []):
            title = item.get("full_name", "")
            url = item.get("html_url", "")
            desc = item.get("description", "")

            news = {
                'title': title,
                'url': url,
                'content': desc,
                'source': self.crawler_name(),
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry key for this crawler."""
        return "github"
|
||||
235
app/services/sites/hackernews.py
Normal file
235
app/services/sites/hackernews.py
Normal file
@@ -0,0 +1,235 @@
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib3
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
from ..browser_manager import BrowserManager
|
||||
|
||||
# 禁用SSL警告
|
||||
urllib3.disable_warnings()
|
||||
|
||||
class HackerNewsCrawler(Crawler):
    """Hacker News crawler.

    Tries a plain HTTP request first and falls back to a Selenium-driven
    browser session when the request path yields no results.
    """
    def fetch(self, date_str):
        """Fetch the HN front page, cache results under ``date_str``, return them.

        Returns [] if both strategies fail or raise.
        """
        # NOTE(review): current_time is unused here; both helpers compute
        # their own timestamp strings.
        current_time = datetime.datetime.now()

        try:
            # First attempt: plain HTTP request + HTML parse.
            result = self._fetch_with_requests()

            if result and len(result) > 0:
                # Cache the scraped list under the crawler's field name.
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

            # Request path yielded nothing: retry with a simulated browser.
            browser_manager = BrowserManager()
            result = self._fetch_with_browser(browser_manager)
            if result and len(result) > 0:
                # Cache the scraped list under the crawler's field name.
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

        except Exception as e:
            # Any error: report "no data" instead of crashing the whole run.
            return []

        # Both strategies failed: return an empty list.
        return []

    def _fetch_with_requests(self):
        """Fetch the Hacker News front page with requests + BeautifulSoup.

        Returns up to 30 news dicts; [] on any failure.
        """
        url = "https://news.ycombinator.com/"

        try:
            # Issue the HTTP request with the crawler's default headers.
            response = requests.get(url, headers=self.header, timeout=self.timeout)
            if response.status_code != 200:
                return []

            # Parse the returned HTML.
            soup = BeautifulSoup(response.text, 'html.parser')

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Every story row is a <tr class="athing">.
            items = soup.select("tr.athing")

            for item in items:
                try:
                    # The row id links the story to its metadata row.
                    item_id = item.get('id')
                    if not item_id:
                        continue

                    # Story title and link.
                    title_element = item.select_one(".titleline a")
                    if not title_element:
                        continue

                    title = title_element.text.strip()
                    url = title_element.get('href')

                    # Item pages use relative URLs; make them absolute.
                    if url and not url.startswith('http'):
                        url = f"https://news.ycombinator.com/{url}"

                    # Originating site of the story (external submissions only).
                    site_element = item.select_one(".sitestr")
                    site = site_element.text.strip() if site_element else ""

                    # The next <tr> sibling carries metadata (score, user, time, ...).
                    metadata = item.find_next_sibling('tr')
                    if not metadata:
                        continue

                    # Score.
                    score_element = metadata.select_one(".score")
                    score = score_element.text.strip() if score_element else "0 points"

                    # Submitter.
                    user_element = metadata.select_one(".hnuser")
                    user = user_element.text.strip() if user_element else "unknown"

                    # Comment count; HN shows "discuss" when there are none.
                    comments_element = metadata.select_one("a:last-child")
                    comments = comments_element.text.strip() if comments_element else "0 comments"
                    if "discuss" in comments:
                        comments = "0 comments"

                    # Compose the summary used as the content field.
                    content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"

                    news = {
                        'title': title,
                        'url': url,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Cap the list at the first 30 stories.
                    if len(result) >= 30:
                        break

                except Exception as e:
                    # Skip rows that fail to parse; keep the rest.
                    continue

            return result

        except Exception as e:
            return []

    def _fetch_with_browser(self, browser_manager):
        """Fetch the Hacker News front page via Selenium (fallback path).

        Returns up to 30 news dicts; [] on any failure.
        """
        url = "https://news.ycombinator.com/"

        try:
            # Load the page and obtain a live driver handle.
            page_source, driver = browser_manager.get_page_content(url, wait_time=5)

            # Wait for story rows to appear.
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".athing"))
                )
            except:
                # Wait timed out; still try to read whatever has loaded.
                pass

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # All story rows.
            items = driver.find_elements(By.CSS_SELECTOR, "tr.athing")

            for item in items:
                try:
                    # The row id links the story to its metadata row.
                    item_id = item.get_attribute("id")
                    if not item_id:
                        continue

                    # Story title and link.
                    title_element = item.find_element(By.CSS_SELECTOR, ".titleline a")
                    title = title_element.text.strip()
                    url = title_element.get_attribute("href")

                    # Originating site of the story (external submissions only).
                    site = ""
                    try:
                        site_element = item.find_element(By.CSS_SELECTOR, ".sitestr")
                        site = site_element.text.strip()
                    except:
                        pass

                    # Following sibling <tr> carries metadata (score, user, time, ...).
                    try:
                        metadata = driver.find_element(By.XPATH, f"//tr[@id='{item_id}']/following-sibling::tr[1]")

                        # Score.
                        score = "0 points"
                        try:
                            score_element = metadata.find_element(By.CSS_SELECTOR, ".score")
                            score = score_element.text.strip()
                        except:
                            pass

                        # Submitter.
                        user = "unknown"
                        try:
                            user_element = metadata.find_element(By.CSS_SELECTOR, ".hnuser")
                            user = user_element.text.strip()
                        except:
                            pass

                        # Comment count; HN shows "discuss" when there are none.
                        comments = "0 comments"
                        try:
                            comments_element = metadata.find_element(By.XPATH, ".//a[last()]")
                            comments = comments_element.text.strip()
                            if "discuss" in comments:
                                comments = "0 comments"
                        except:
                            pass

                        # Compose the summary used as the content field.
                        content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"
                    except:
                        content = f"来源: {site}"

                    news = {
                        'title': title,
                        'url': url,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Cap the list at the first 30 stories.
                    if len(result) >= 30:
                        break

                except Exception as e:
                    # Skip rows that fail to parse; keep the rest.
                    continue

            return result

        except Exception as e:
            return []

    def crawler_name(self):
        """Registry key for this crawler."""
        return "hackernews"
|
||||
72
app/services/sites/hupu.py
Normal file
72
app/services/sites/hupu.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
import re
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class HuPuCrawler(Crawler):
    """Hupu (虎扑) BBS hot-post crawler."""

    def fetch(self, date_str):
        """Scrape the Hupu "all-gambia" hot list and cache it under ``date_str``.

        Returns a list of news dicts; [] on request failure.
        """
        current_time = datetime.datetime.now()

        url = "https://bbs.hupu.com/all-gambia"

        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        soup = BeautifulSoup(resp.text, "html.parser")

        # Each hot post is wrapped in <div class="t-info">.
        post_list = soup.find_all('div', class_='t-info')

        result = []
        cache_list = []

        for post in post_list:
            title_elem = post.find('span', class_='t-title')
            if not title_elem:
                continue

            link_elem = post.find('a')
            if not link_elem:
                continue

            # Fix: guard against anchors without an href — the old code called
            # .startswith() on the result of .get('href'), raising
            # AttributeError when it was None (and called .get three times).
            href = link_elem.get('href')
            if not href:
                continue

            title = title_elem.text.strip()
            url = "https://bbs.hupu.com" + href if href.startswith('/') else href

            # Reply-count column.
            info_elem = post.find('span', class_='t-replies')
            info = info_elem.text.strip() if info_elem else ""

            news = {
                'title': title,
                'url': url,
                'content': info,
                'source': 'hupu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry key for this crawler."""
        return "hupu"
|
||||
63
app/services/sites/jinritoutiao.py
Normal file
63
app/services/sites/jinritoutiao.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# -- coding: utf-8 --
|
||||
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class JinRiTouTiaoCrawler(Crawler):
    """Jinri Toutiao (今日头条) hot-board crawler."""

    def fetch(self, date_str):
        """Fetch Toutiao's PC hot-event board and cache the entries under ``date_str``."""
        now_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        board_url = "https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc"

        resp = requests.get(url=board_url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            entries = resp.json().get('data', [])

            # Build the news list in one pass; the cached payload is the
            # same list that is returned.
            result = [
                {
                    'title': entry.get('Title', ''),
                    'url': entry.get('Url', ''),
                    'content': f"热度: {entry.get('HotValue', '')}",
                    'source': 'jinritoutiao',
                    'publish_time': now_str,
                }
                for entry in entries
            ]

            cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Registry key for this crawler."""
        return "jinritoutiao"
|
||||
63
app/services/sites/juejin.py
Normal file
63
app/services/sites/juejin.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class JueJinCrawler(Crawler):
    """Juejin (掘金) hot-article ranking crawler."""

    def fetch(self, date_str):
        """Fetch Juejin's hot article rank and cache the entries under ``date_str``."""
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        rank_api = "https://api.juejin.cn/content_api/v1/content/article_rank?category_id=1&type=hot"

        resp = requests.get(url=rank_api, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            entries = resp.json().get('data', [])

            result = []
            for entry in entries:
                content_info = entry.get('content', {})
                headline = content_info.get('title', '')
                post_url = f"https://juejin.cn/post/{content_info.get('content_id', '')}"

                # The API carries no summary; the headline doubles as content.
                result.append({
                    'title': headline,
                    'url': post_url,
                    'content': headline,
                    'source': 'juejin',
                    'publish_time': stamp,
                })

            cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Registry key for this crawler."""
        return "juejin"
|
||||
20
app/services/sites/models.py
Normal file
20
app/services/sites/models.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import datetime
|
||||
|
||||
from sqlalchemy import Column, String, Integer, DateTime
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
|
||||
class DailyNews(Base):
    """ORM model for a daily aggregated news item (table ``tab_daily_news``)."""
    __tablename__ = 'tab_daily_news'

    id = Column(Integer, primary_key=True)
    title = Column(String(255))    # news headline
    desc = Column(String(255))     # short description / summary
    link = Column(String(255))     # source URL
    type = Column(Integer, default=0)   # category flag; semantics not defined here — TODO confirm against writers
    score = Column(Integer, default=0)  # ranking / hot score — presumably; verify against callers
    times = Column(Integer, default=0)  # occurrence count — presumably; verify against callers
    create_time = Column(DateTime, default=datetime.datetime.now)
    # Auto-touched on every UPDATE via onupdate.
    update_time = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
|
||||
75
app/services/sites/sina_finance.py
Normal file
75
app/services/sites/sina_finance.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import json
|
||||
import datetime
|
||||
import requests
|
||||
import urllib3
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class SinaFinanceCrawler(Crawler):
    """Sina Finance (新浪财经) live-feed crawler."""

    def fetch(self, date_str):
        """Fetch the Sina Finance zhibo feed and cache it under ``date_str``.

        Returns a list of news dicts; [] on any request/parse failure.
        """
        current_time = datetime.datetime.now()

        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Referer': 'https://finance.sina.com.cn/',
                'Origin': 'https://finance.sina.com.cn'
            }

            response = requests.get(
                "https://zhibo.sina.com.cn/api/zhibo/feed?page=1&page_size=20&zhibo_id=152&tag_id=0&dire=f&dpc=1&pagesize=20",
                headers=headers,
                timeout=self.timeout,
                verify=False
            )
            response.raise_for_status()

            data = response.json()
            # The API signals success with status code 0.
            if data.get('result', {}).get('status', {}).get('code') != 0:
                return []

            feed_list = data.get('result', {}).get('data', {}).get('feed', {}).get('list', [])
            result = []
            cache_list = []

            for item in feed_list:
                try:
                    title = item.get('rich_text', '').strip()
                    if not title:
                        continue

                    # "ext" is a JSON-encoded string carrying the article URL.
                    ext_str = item.get('ext', '{}')
                    try:
                        ext_data = json.loads(ext_str)
                        doc_url = ext_data.get('docurl', '')
                    except (ValueError, TypeError, AttributeError):
                        # Fix: narrowed from a bare "except:" so SystemExit /
                        # KeyboardInterrupt are no longer swallowed; still
                        # covers bad JSON (ValueError), non-string ext
                        # (TypeError) and non-dict parse results
                        # (AttributeError).
                        doc_url = item.get('docurl', '').strip(' "')

                    news = {
                        'title': title,
                        'url': doc_url,
                        'content': title,
                        'source': 'sina_finance',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
                    }

                    result.append(news)
                    cache_list.append(news)

                except Exception:
                    # Skip malformed feed entries; keep the rest.
                    continue

            if cache_list:
                cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result
        except Exception as e:
            # Network or top-level parse failure: report no data.
            return []

    def crawler_name(self):
        """Registry key for this crawler."""
        return "sina_finance"
|
||||
60
app/services/sites/sspai.py
Normal file
60
app/services/sites/sspai.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class ShaoShuPaiCrawler(Crawler):
    """sspai (少数派) latest-articles crawler."""

    def fetch(self, date_str):
        """Fetch the sspai article index and cache the entries under ``date_str``."""
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        index_api = "https://sspai.com/api/v1/article/index/page/get?limit=20&offset=0&created_at=0"

        resp = requests.get(url=index_api, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            articles = resp.json().get('data', [])

            # Build the news list in one pass; the cached payload is the
            # same list that is returned.
            result = [
                {
                    'title': article.get('title', ''),
                    'url': f"https://sspai.com/post/{article.get('id', '')}",
                    'content': article.get('summary', ''),
                    'source': 'sspai',
                    'publish_time': stamp,
                }
                for article in articles
            ]

            cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Registry key for this crawler."""
        return "shaoshupai"
|
||||
58
app/services/sites/stackoverflow.py
Normal file
58
app/services/sites/stackoverflow.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class StackOverflowCrawler(Crawler):
    """Stack Overflow hot-questions crawler (Stack Exchange API)."""

    def fetch(self, date_str):
        """Fetch hot Stack Overflow questions and cache them under ``date_str``.

        Returns a list of news dicts; [] on request failure.
        """
        current_time = datetime.datetime.now()

        url = "https://api.stackexchange.com/2.3/questions?order=desc&sort=hot&site=stackoverflow"

        headers = {
            # Fix: assemble the UA in the proper order — the previous
            # concatenation placed "AppleWebKit..." after "Safari/537.36",
            # yielding a malformed User-Agent string.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://stackoverflow.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        result = []
        cache_list = []

        # Fix: .get() guards against error payloads without an "items" key
        # (e.g. throttle responses), which previously raised KeyError.
        for item in data.get("items", []):
            title = item.get("title", "")
            url = item.get("link", "")

            news = {
                'title': title,
                'url': url,
                'content': title,  # the API returns no summary; reuse the title
                'source': 'stackoverflow',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry key for this crawler."""
        return "stackoverflow"
|
||||
65
app/services/sites/tenxunwang.py
Normal file
65
app/services/sites/tenxunwang.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class TenXunWangCrawler(Crawler):
    """Tencent News (腾讯网) hot-ranking crawler."""

    def fetch(self, date_str):
        """Fetch Tencent News' PC hot-ranking list and cache it under ``date_str``.

        Returns a list of news dicts; [] on request failure or missing data.
        """
        current_time = datetime.datetime.now()

        url = "https://i.news.qq.com/gw/event/pc_hot_ranking_list?ids_hash=&offset=0&page_size=51&appver=15.5_qqnews_7.1.60&rank_id=hot"

        headers = {
            # Fix: assemble the UA in the proper order — the previous
            # concatenation placed "AppleWebKit..." after "Safari/537.36",
            # yielding a malformed User-Agent string.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://news.qq.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        result = []
        cache_list = []

        # Fix: guard against a missing or empty "idlist" — the old
        # data["idlist"][0] raised KeyError/IndexError on error payloads.
        id_list = data.get("idlist") or []
        news_list = id_list[0].get("newslist", []) if id_list else []

        # Skip the first entry: it is the ranking-board header ("most watched
        # hot topics, refreshed every 10 minutes"), not a news item.
        for item in news_list[1:]:
            title = item.get("title", "")
            url = item.get("url", "")
            desc = item.get("abstract", "")

            news = {
                'title': title,
                'url': url,
                'content': desc,
                'source': 'tenxunwang',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry key for this crawler."""
        return "tenxunwang"
|
||||
65
app/services/sites/tieba.py
Normal file
65
app/services/sites/tieba.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class TieBaCrawler(Crawler):
    """Baidu Tieba (百度贴吧) hot-topic crawler."""

    def fetch(self, date_str):
        """Fetch Tieba's hot-topic list and cache the entries under ``date_str``."""
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        topic_api = "http://tieba.baidu.com/hottopic/browse/topicList"

        resp = requests.get(url=topic_api, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            topics = resp.json().get('data', {}).get('bang_topic', {}).get('topic_list', [])

            result = []
            for topic in topics:
                topic_url = topic.get('topic_url', '')
                # Topic links may be relative; anchor them to the Tieba host.
                if topic_url and not topic_url.startswith('http'):
                    topic_url = f"http://tieba.baidu.com{topic_url}"

                result.append({
                    'title': topic.get('topic_name', ''),
                    'url': topic_url,
                    'content': topic.get('topic_desc', ''),
                    'source': 'tieba',
                    'publish_time': stamp,
                })

            cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Registry key for this crawler."""
        return "tieba"
|
||||
83
app/services/sites/tskr.py
Normal file
83
app/services/sites/tskr.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class TsKrCrawler(Crawler):
    """36Kr (36氪) hot-rank crawler."""

    def fetch(self, date_str):
        """Fetch the 36Kr hot ranking via the gateway API and cache it.

        Args:
            date_str: cache hash key (the date being crawled).

        Returns:
            List of news dicts; [] on any request/parse failure.
        """
        current_time = datetime.datetime.now()
        # Fix: plain string literal — the old version used an f-string with
        # no placeholders.
        url = "https://gateway.36kr.com/api/mis/nav/home/nav/rank/hot"
        headers = {
            "Content-Type": "application/json; charset=utf-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
        }

        body = {
            "partner_id": "wap",
            "param": {
                "siteId": 1,
                "platformId": 2,
            },
            "timestamp": int(time.time() * 1000),
        }

        try:
            resp = requests.post(
                url=url,
                headers=headers,
                json=body,
                verify=False,
                timeout=self.timeout
            )

            if resp.status_code != 200:
                print(f"request failed, status: {resp.status_code}")
                return []

            json_data = resp.json()
            # Fix: "or {}" also covers an explicit null "data" field, which
            # the old .get("data", {}) default did not.
            data_list = (json_data.get("data") or {}).get("hotRankList", [])

            result = []
            cache_list = []

            for item in data_list:
                template_material = item.get("templateMaterial", {})
                item_id = item.get("itemId", "")

                title = template_material.get("widgetTitle", "")
                article_url = f"https://www.36kr.com/p/{item_id}"

                news = {
                    'title': title,
                    'url': article_url,
                    'content': title,
                    'source': '36kr',
                    'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
                }

                result.append(news)
                cache_list.append(news)

            cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error fetching 36kr data: {e}")
            return []

    def crawler_name(self):
        """Registry key for this crawler."""
        return "36kr"
|
||||
71
app/services/sites/vtex.py
Normal file
71
app/services/sites/vtex.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class VtexCrawler(Crawler):
    """V2EX hot-topics crawler."""

    def fetch(self, date_str):
        """Scrape V2EX's hot tab and cache the entries under ``date_str``.

        Returns a list of news dicts; [] on request failure.
        """
        current_time = datetime.datetime.now()

        url = "https://www.v2ex.com/?tab=hot"

        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        soup = BeautifulSoup(resp.text, "html.parser")

        # Each hot topic is a <div class="cell item">.
        topic_list = soup.find_all('div', class_='cell item')

        result = []
        cache_list = []

        for topic in topic_list:
            title_elem = topic.find('span', class_='item_title')
            if not title_elem:
                continue

            link_elem = title_elem.find('a')
            if not link_elem:
                continue

            # Fix: guard against anchors without an href — the old code
            # concatenated .get('href') directly and raised TypeError when
            # it was None.
            href = link_elem.get('href')
            if not href:
                continue

            title = link_elem.text.strip()
            url = "https://www.v2ex.com" + href

            # Topic metadata line (node, author, reply time).
            info_elem = topic.find('span', class_='topic_info')
            info = info_elem.text.strip() if info_elem else ""

            news = {
                'title': title,
                'url': url,
                'content': info,
                'source': 'v2ex',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry key for this crawler."""
        return "v2ex"
|
||||
68
app/services/sites/weibo.py
Normal file
68
app/services/sites/weibo.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class WeiboCrawler(Crawler):
    """Crawler for Weibo's realtime hot-search board."""

    def fetch(self, date_str):
        """Fetch the hot-search list, cache it under *date_str*, return it.

        Args:
            date_str: Redis hash key for the day's cache (typically YYYY-MM-DD).

        Returns:
            List of news dicts; empty list on request or parse failure.
        """
        from urllib.parse import quote  # stdlib; function-scope keeps file imports untouched

        current_time = datetime.datetime.now()

        # The endpoint rejects requests without browser-like headers.
        header = self.header.copy()
        header.update({
            "accept": "application/json, text/javascript, */*; q=0.01",
            "host": "weibo.com",
            "Referer": "https://weibo.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        })

        url = "https://weibo.com/ajax/side/hotSearch"

        resp = requests.get(url=url, headers=header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            json_data = resp.json()
            data = json_data.get('data', {}).get('realtime', [])

            result = []
            cache_list = []

            for item in data:
                title = item.get('word', '')
                # BUG FIX: the raw keyword was interpolated into the search
                # URL unescaped; spaces, '&' or '#' inside a topic produced a
                # broken URL. Percent-encode it.
                search_url = f"https://s.weibo.com/weibo?q=%23{quote(title)}%23"

                news = {
                    'title': title,
                    'url': search_url,
                    'content': title,
                    'source': 'weibo',
                    'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
                }
                result.append(news)
                cache_list.append(news)

            cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Return the identifier under which Weibo results are cached."""
        return "weibo"
|
||||
228
app/services/sites/weixin.py
Normal file
228
app/services/sites/weixin.py
Normal file
@@ -0,0 +1,228 @@
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib3
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
from ..browser_manager import BrowserManager
|
||||
|
||||
# 禁用SSL警告
|
||||
urllib3.disable_warnings()
|
||||
|
||||
class WeiXinCrawler(Crawler):
    """
    WeChat hot-content crawler.

    Primary source is the WeChat "Top Stories" (看一看) page; falls back to
    WeRead (微信读书) popular books when that yields nothing.
    """

    def fetch(self, date_str):
        """Fetch hot WeChat content, cache it under *date_str*, return the list.

        Returns an empty list when both sources fail or raise.
        """
        browser_manager = BrowserManager()

        try:
            # First attempt: WeChat "Top Stories" hot articles.
            result = self._fetch_from_weixin_kankan(browser_manager)
            if result:
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

            # Fallback: WeRead popular books.
            result = self._fetch_from_weixin_dushu(browser_manager)
            if result:
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

        except Exception:
            # Browser automation is best-effort; degrade to an empty day.
            return []

        # Both sources returned nothing.
        return []

    def _fetch_from_weixin_kankan(self, browser_manager):
        """Scrape hot articles from the WeChat "Top Stories" page.

        Returns up to 20 news dicts; empty list on failure.
        """
        url = "https://k.weixin.qq.com/"

        try:
            _, driver = browser_manager.get_page_content(url, wait_time=10)

            # Wait for the hot list to render; proceed anyway on timeout.
            # BUG FIX (here and below): bare `except:` clauses also swallowed
            # KeyboardInterrupt/SystemExit — narrowed to `except Exception:`.
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
                )
            except Exception:
                pass

            # Switch to the hot tab when present.
            try:
                hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']")
                hot_tab.click()
                time.sleep(3)  # allow the tab content to load
            except Exception:
                pass

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # The page markup varies; try selectors from most to least specific.
            articles = driver.find_elements(By.CSS_SELECTOR, ".article-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".item")

            for article in articles:
                try:
                    title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title")
                    title = title_elem.text.strip()

                    # Prefer the real anchor href; otherwise synthesize a link
                    # from the element id; last resort is the page itself.
                    link = None
                    try:
                        link_elem = article.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        try:
                            article_id = article.get_attribute("data-id") or article.get_attribute("id")
                            link = f"https://k.weixin.qq.com/article?id={article_id}"
                        except Exception:
                            link = "https://k.weixin.qq.com/"

                    source = ""
                    try:
                        source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source")
                        source = source_elem.text.strip()
                    except Exception:
                        pass

                    summary = ""
                    try:
                        summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p")
                        summary = summary_elem.text.strip()
                    except Exception:
                        pass

                    result.append({
                        'title': title,
                        'url': link,
                        'content': f"来源: {source} | 摘要: {summary[:50] if summary else '无摘要'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    })

                    if len(result) >= 20:  # cap at 20 entries
                        break

                except Exception:
                    continue

            return result

        except Exception:
            return []

    def _fetch_from_weixin_dushu(self, browser_manager):
        """Scrape popular books from WeRead as a fallback source.

        Returns up to 20 news dicts; empty list on failure.
        """
        url = "https://weread.qq.com/web/category/all"

        try:
            _, driver = browser_manager.get_page_content(url, wait_time=8)

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Switch to the ranking tab when present.
            try:
                rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]")
                rank_tab.click()
                time.sleep(3)  # allow the tab content to load
            except Exception:
                pass

            books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")

            for book in books:
                try:
                    title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3")
                    title = title_elem.text.strip()

                    # Prefer the real anchor href; otherwise build a reader
                    # link from the book id; default to the category page.
                    link = "https://weread.qq.com/web/category/all"
                    try:
                        link_elem = book.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        book_id = book.get_attribute("data-bid") or book.get_attribute("id")
                        if book_id:
                            link = f"https://weread.qq.com/web/reader/{book_id}"

                    author = ""
                    try:
                        author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer")
                        author = author_elem.text.strip()
                    except Exception:
                        pass

                    intro = ""
                    try:
                        intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc")
                        intro = intro_elem.text.strip()
                    except Exception:
                        pass

                    result.append({
                        'title': f"热门书籍: {title}",
                        'url': link,
                        'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    })

                    if len(result) >= 20:  # cap at 20 entries
                        break

                except Exception:
                    continue

            return result

        except Exception:
            return []

    def crawler_name(self):
        """Return the identifier under which WeChat results are cached."""
        return "weixin"
|
||||
155
app/services/sites/xueqiu.py
Normal file
155
app/services/sites/xueqiu.py
Normal file
@@ -0,0 +1,155 @@
|
||||
import json
|
||||
import datetime
|
||||
import requests
|
||||
import urllib3
|
||||
import re
|
||||
from requests.sessions import Session
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class XueqiuCrawler(Crawler):
    """Crawler for Xueqiu (雪球) hot events.

    Xueqiu's JSON endpoints require cookies obtained by browsing the site
    first, so a persistent :class:`requests.Session` is warmed up once in
    ``_init_session`` and re-warmed when a fetch fails.
    """

    def __init__(self):
        super().__init__()
        self.session = Session()
        self._init_session()

    def _init_session(self):
        """Warm up the session: home page (base cookies), then the hot page."""
        try:
            main_url = "https://xueqiu.com"
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache',
                'Sec-Fetch-Dest': 'document',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'none',
                'Sec-Fetch-User': '?1',
                'Upgrade-Insecure-Requests': '1'
            }

            resp = self.session.get(main_url, headers=headers, verify=False, timeout=self.timeout)
            if resp.status_code == 200:
                # If the page embeds an auth token, mark the session as AJAX.
                # (CLEANUP: the captured token value itself was never used,
                # only its presence matters here.)
                if re.search(r'window\.SNB\s*=\s*\{[^}]*token["\']?\s*:\s*["\']([^"\']+)["\']', resp.text):
                    self.session.headers.update({'X-Requested-With': 'XMLHttpRequest'})

                hot_page_url = "https://xueqiu.com/hot_event"
                hot_headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                    'Referer': 'https://xueqiu.com/',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'same-origin',
                    'Sec-Fetch-User': '?1',
                    'Upgrade-Insecure-Requests': '1'
                }

                hot_resp = self.session.get(hot_page_url, headers=hot_headers, verify=False, timeout=self.timeout)
                if hot_resp.status_code == 200:
                    print("雪球热门页面访问成功,已获取完整认证信息")
                else:
                    print(f"雪球热门页面访问失败: {hot_resp.status_code}")

            else:
                print(f"雪球主页访问失败: {resp.status_code}")

        except Exception as e:
            print(f"初始化雪球会话失败: {e}")

    def fetch(self, date_str) -> list:
        """Fetch the top-10 hot events, cache them under *date_str*, return them.

        Retries once with a fresh session when the first request fails
        (cookies may have expired). Returns an empty list on failure.
        """
        current_time = datetime.datetime.now()

        url = "https://xueqiu.com/hot_event/list.json?count=10"
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Referer': 'https://xueqiu.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }

        try:
            resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)

            if resp.status_code != 200:
                print(f"雪球请求失败, status: {resp.status_code}")
                # Cookies may have expired: re-initialise and retry once.
                self._init_session()
                resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)
                if resp.status_code != 200:
                    print(f"雪球重试后仍失败, status: {resp.status_code}")
                    return []

            json_data = resp.json()
            if 'list' not in json_data:
                print("雪球响应格式异常")
                return []

            result = []
            cache_list = []

            for idx, item in enumerate(json_data['list'][:10]):  # top 10 only
                try:
                    # Topics come wrapped as "#topic#"; strip the markers.
                    tag = item.get('tag', '').strip()
                    if tag.startswith('#') and tag.endswith('#'):
                        title = tag[1:-1]
                    else:
                        title = tag

                    if not title:
                        continue

                    content = item.get('content', '').strip()
                    if len(content) > 200:
                        content = content[:200] + '...'

                    status_count = item.get('status_count', 0)

                    # CLEANUP: removed unused locals (`item_id`, `hot_value`)
                    # and the placeholder-less f-string for the URL.
                    news = {
                        'title': title,
                        'url': "https://xueqiu.com/",
                        'content': content,
                        'source': 'xueqiu',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
                        # fall back to a rank-derived score when no engagement count
                        'score': status_count if status_count > 0 else 1000 - idx,
                        'rank': idx + 1
                    }
                    result.append(news)
                    cache_list.append(news)

                except Exception as e:
                    print(f"解析雪球新闻项失败: {e}")
                    continue

            cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"获取雪球数据失败: {e}")
            return []

    def crawler_name(self):
        """Return the identifier under which Xueqiu results are cached."""
        return "xueqiu"
|
||||
64
app/services/sites/zhihu.py
Normal file
64
app/services/sites/zhihu.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class ZhiHuCrawler(Crawler):
    """Crawler for Zhihu's guest explore feed."""

    def fetch(self, date_str):
        """Fetch explore-feed questions, cache them under *date_str*, return them.

        Args:
            date_str: Redis hash key for the day's cache (typically YYYY-MM-DD).

        Returns:
            List of news dicts; empty list on request or parse failure.
        """
        current_time = datetime.datetime.now()

        api_url = "https://www.zhihu.com/api/v3/explore/guest/feeds?limit=30&ws_qiangzhisafe=0"

        resp = requests.get(url=api_url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            json_data = resp.json()
            data = json_data.get('data', [])

            result = []
            cache_list = []

            for item in data:
                target = item.get('target', {})
                question = target.get('question', {})
                title = question.get('title', '')
                question_id = question.get('id')

                # BUG FIX: feed entries that are not answers carry no
                # 'question' and previously produced an empty title plus a
                # ".../question/None" URL; skip them instead.
                if not title or question_id is None:
                    continue

                news = {
                    'title': title,
                    'url': f"https://www.zhihu.com/question/{question_id}",
                    'content': target.get('excerpt', ''),
                    'source': 'zhihu',
                    'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
                }
                result.append(news)
                cache_list.append(news)

            cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Return the identifier under which Zhihu results are cached."""
        return "zhihu"
|
||||
0
app/utils/__init__.py
Normal file
0
app/utils/__init__.py
Normal file
73
app/utils/logger.py
Normal file
73
app/utils/logger.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
|
||||
import pytz
|
||||
from datetime import datetime
|
||||
|
||||
from app.core.config import get_logging_config
|
||||
|
||||
# 获取日志配置
|
||||
log_config = get_logging_config()
|
||||
|
||||
# 确保日志目录存在
|
||||
os.makedirs(log_config.dir, exist_ok=True)
|
||||
|
||||
# 自定义日志格式化器,使用配置的时区
|
||||
class CustomFormatter(logging.Formatter):
    """Formatter that renders record timestamps in the configured timezone."""

    def converter(self, timestamp):
        """Convert an epoch timestamp to an aware datetime in the target zone.

        BUG FIX: the previous implementation called ``datetime.fromtimestamp``
        (which yields naive *local* wall-clock time) and then stamped that
        local value as UTC, shifting every log timestamp by the host's UTC
        offset. Interpreting the epoch value directly as UTC is always correct
        regardless of the host timezone.
        """
        tz = pytz.timezone(log_config.timezone)
        return datetime.fromtimestamp(timestamp, tz=pytz.utc).astimezone(tz)

    def formatTime(self, record, datefmt=None):
        """Format ``record.created`` using *datefmt* or a default layout."""
        dt = self.converter(record.created)
        if datefmt:
            return dt.strftime(datefmt)
        return dt.strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
# 创建日志记录器
|
||||
# ---------------------------------------------------------------------------
# Application logger wiring: one console handler plus two rotating file
# handlers (size-based and midnight-based), all sharing the timezone-aware
# CustomFormatter and the level/paths from the logging configuration.
# ---------------------------------------------------------------------------
log = logging.getLogger('app')

# Resolve the level once and reuse a single formatter instance for every
# handler (formatters are stateless, so sharing is safe).
_log_level = getattr(logging, log_config.level)
_formatter = CustomFormatter(log_config.format)

log.setLevel(_log_level)

# Drop any handlers left over from a previous initialisation (re-import safety).
for _stale in list(log.handlers):
    log.removeHandler(_stale)

# Console output.
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(_log_level)
console_handler.setFormatter(_formatter)
log.addHandler(console_handler)

# Main log file, rotated by size.
file_handler = RotatingFileHandler(
    os.path.join(log_config.dir, log_config.file),
    maxBytes=log_config.max_size,
    backupCount=log_config.backup_count,
    encoding='utf-8',
)
file_handler.setLevel(_log_level)
file_handler.setFormatter(_formatter)
log.addHandler(file_handler)

# Daily log file, rotated at midnight.
daily_handler = TimedRotatingFileHandler(
    os.path.join(log_config.dir, 'app.daily.log'),
    when='midnight',
    interval=1,
    backupCount=log_config.daily_backup_count,
    encoding='utf-8',
)
daily_handler.setLevel(_log_level)
daily_handler.setFormatter(_formatter)
log.addHandler(daily_handler)

# Keep messages out of the root logger.
log.propagate = False

log.info(f"Logger initialized at {datetime.now(pytz.timezone(log_config.timezone)).strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
286
app/utils/notification.py
Normal file
286
app/utils/notification.py
Normal file
@@ -0,0 +1,286 @@
|
||||
import json
|
||||
import time
|
||||
import hmac
|
||||
import hashlib
|
||||
import base64
|
||||
import urllib.parse
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, Optional, List
|
||||
import requests
|
||||
import pytz
|
||||
|
||||
from app.utils.logger import log
|
||||
from app.core.config import get_notification_config
|
||||
|
||||
|
||||
class DingTalkNotifier:
    """DingTalk robot notifier.

    Sends text / markdown messages to a DingTalk group webhook, optionally
    signing each request with the robot secret (HMAC-SHA256, per DingTalk's
    custom-robot security scheme).
    """

    def __init__(self):
        self.config = get_notification_config()
        # CLEANUP: read the 'dingtalk' section once instead of re-fetching
        # it for every field.
        dingtalk_cfg = self.config.get('dingtalk', {})
        self.webhook_url = dingtalk_cfg.get('webhook_url', '')
        self.secret = dingtalk_cfg.get('secret', '')
        self.enabled = dingtalk_cfg.get('enabled', False)
        self.timeout = dingtalk_cfg.get('timeout', 10)
        self.notify_success = dingtalk_cfg.get('notify_success', False)
        self.shanghai_tz = pytz.timezone('Asia/Shanghai')

        if not self.webhook_url and self.enabled:
            log.warning("DingTalk webhook URL not configured, notifications will be disabled")
            self.enabled = False

    def _generate_sign(self, timestamp: int) -> str:
        """Return the url-encoded HMAC-SHA256 signature for *timestamp*.

        Empty string when no secret is configured (unsigned webhook).
        """
        if not self.secret:
            return ""

        string_to_sign = f'{timestamp}\n{self.secret}'
        hmac_code = hmac.new(
            self.secret.encode('utf-8'),
            string_to_sign.encode('utf-8'),
            digestmod=hashlib.sha256
        ).digest()
        return urllib.parse.quote_plus(base64.b64encode(hmac_code))

    def _send_message(self, message: Dict[str, Any]) -> bool:
        """POST *message* to the webhook; True iff DingTalk returns errcode 0."""
        if not self.enabled:
            log.debug("DingTalk notifications are disabled")
            return False

        try:
            timestamp = int(round(time.time() * 1000))
            sign = self._generate_sign(timestamp)

            url = self.webhook_url
            if sign:
                # BUG FIX: the query string previously began with the mojibake
                # '×tamp=' ('&times' HTML-entity corruption of '&timestamp'),
                # so signed requests always failed DingTalk's signature check.
                url += f"&timestamp={timestamp}&sign={sign}"

            response = requests.post(
                url,
                json=message,
                timeout=self.timeout,
                headers={'Content-Type': 'application/json'}
            )

            if response.status_code == 200:
                result = response.json()
                if result.get('errcode') == 0:
                    log.info("DingTalk notification sent successfully")
                    return True
                log.error(f"DingTalk API error: {result.get('errmsg', 'Unknown error')}")
                return False

            log.error(f"DingTalk HTTP error: {response.status_code}")
            return False

        except Exception as e:
            log.error(f"Failed to send DingTalk notification: {str(e)}")
            return False

    def send_text_message(self, content: str, at_mobiles: Optional[List[str]] = None,
                          at_all: bool = False) -> bool:
        """Send a plain-text message, optionally @-mentioning members."""
        message = {
            "msgtype": "text",
            "text": {
                "content": content
            }
        }

        if at_mobiles or at_all:
            message["at"] = {
                "atMobiles": at_mobiles or [],
                "isAtAll": at_all
            }

        return self._send_message(message)

    def send_markdown_message(self, title: str, text: str,
                              at_mobiles: Optional[List[str]] = None,
                              at_all: bool = False) -> bool:
        """Send a markdown message, optionally @-mentioning members."""
        message = {
            "msgtype": "markdown",
            "markdown": {
                "title": title,
                "text": text
            }
        }

        if at_mobiles or at_all:
            message["at"] = {
                "atMobiles": at_mobiles or [],
                "isAtAll": at_all
            }

        return self._send_message(message)

    def send_crawler_error(self, crawler_name: str, error_msg: str,
                           date_str: str, is_retry: bool = False) -> bool:
        """Notify about a single crawler failure; @-mentions everyone."""
        current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
        retry_text = "重试失败" if is_retry else "首次失败"

        title = f"🚨 爬虫异常通知 - {crawler_name}"
        content = f"""
## {title}

**时间**: {current_time}\n
**爬虫**: {crawler_name}\n
**日期**: {date_str}\n
**状态**: {retry_text}\n
**错误信息**:
```
{error_msg}
```

请及时检查爬虫状态!
""".strip()

        return self.send_markdown_message(title, content, at_all=True)

    def send_crawler_timeout(self, timeout_seconds: int, date_str: str) -> bool:
        """Notify that the crawler run was killed for exceeding its budget."""
        current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")

        title = "⏰ 爬虫超时通知"
        content = f"""
## {title}

**时间**: {current_time}\n
**日期**: {date_str}\n
**超时时长**: {timeout_seconds}秒\n
**状态**: 爬虫任务执行超时被强制终止

请检查爬虫性能或调整超时配置!
""".strip()

        return self.send_markdown_message(title, content, at_all=True)

    def send_crawler_summary(self, success_count: int, total_count: int,
                             failed_crawlers: List[str], duration: float,
                             date_str: str) -> bool:
        """Send the per-run summary; suppressed when all succeed and
        success notifications are disabled. @-mentions on any failure."""
        if success_count == total_count and not self.notify_success:
            return True

        current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")

        failed_list = "\n".join([f"- {name}" for name in failed_crawlers]) if failed_crawlers else ""

        if failed_crawlers:
            title = f"🚨 爬虫执行摘要 - {date_str}"
        else:
            title = f"📊 爬虫执行摘要 - {date_str}"

        if failed_crawlers:
            content = f"""
## {title}

**时间**: {current_time}\n
**日期**: {date_str}\n
**执行时长**: {duration:.2f}秒\n
**成功**: {success_count}/{total_count}\n
**失败**: {len(failed_crawlers)}

**失败的爬虫**:
{failed_list}

请关注失败的爬虫状态!
""".strip()
        else:
            content = f"""
## {title}

**时间**: {current_time}\n
**日期**: {date_str}\n
**执行时长**: {duration:.2f}秒\n
**成功**: {success_count}/{total_count}\n
**失败**: {len(failed_crawlers)}

所有爬虫执行成功!
""".strip()

        at_all = len(failed_crawlers) > 0
        return self.send_markdown_message(title, content, at_all=at_all)

    def send_analysis_error(self, error_msg: str, date_str: str) -> bool:
        """Notify about a data-analysis failure; @-mentions everyone."""
        current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")

        title = "🔍 数据分析异常通知"
        content = f"""
## {title}

**时间**: {current_time}\n
**日期**: {date_str}\n
**错误信息**:
```
{error_msg}
```

数据分析任务执行失败,请检查分析模块!
""".strip()

        return self.send_markdown_message(title, content, at_all=True)
|
||||
|
||||
class NotificationManager:
    """Facade over the configured notification channels.

    Only DingTalk is wired up today; additional channels (WeCom, e-mail,
    ...) can be added here without touching any caller.
    """

    def __init__(self):
        self.dingtalk = DingTalkNotifier()
        # Future channels would be instantiated alongside dingtalk here.

    def is_enabled(self) -> bool:
        """Return True when notifications are enabled."""
        return self.dingtalk.enabled

    @property
    def webhook_url(self) -> str:
        """Webhook URL of the DingTalk channel."""
        return self.dingtalk.webhook_url

    def send_text(self, content: str, at_all: bool = False) -> bool:
        """Send a plain-text message."""
        return self.dingtalk.send_text_message(content, at_all=at_all)

    def send_markdown(self, title: str, text: str, at_all: bool = False) -> bool:
        """Send a markdown message."""
        return self.dingtalk.send_markdown_message(title, text, at_all=at_all)

    def notify_crawler_error(self, crawler_name: str, error_msg: str,
                             date_str: str, is_retry: bool = False):
        """Report a single crawler failure."""
        self.dingtalk.send_crawler_error(crawler_name, error_msg, date_str, is_retry)

    def notify_crawler_timeout(self, timeout_seconds: int, date_str: str):
        """Report that the crawler run exceeded its time budget."""
        self.dingtalk.send_crawler_timeout(timeout_seconds, date_str)

    def notify_crawler_summary(self, success_count: int, total_count: int,
                               failed_crawlers: List[str], duration: float,
                               date_str: str):
        """Report the per-run success/failure summary."""
        self.dingtalk.send_crawler_summary(success_count, total_count,
                                           failed_crawlers, duration, date_str)

    def notify_analysis_error(self, error_msg: str, date_str: str):
        """Report a data-analysis failure."""
        self.dingtalk.send_analysis_error(error_msg, date_str)


# Module-level singleton used across the application.
notification_manager = NotificationManager()
|
||||
Reference in New Issue
Block a user