This commit is contained in:
2026-03-26 15:04:59 +08:00
commit e0af97ac7f
65 changed files with 7366 additions and 0 deletions

0
app/api/v1/__init__.py Normal file
View File

314
app/api/v1/analysis.py Normal file
View File

@@ -0,0 +1,314 @@
from fastapi import APIRouter, Query
from typing import Optional
from datetime import datetime
import pytz
from app.analysis.trend_analyzer import TrendAnalyzer
from app.analysis.predictor import TrendPredictor
from app.utils.logger import log
from app.core import cache
router = APIRouter()
@router.get("/trend")
async def get_trend_analysis(date: Optional[str] = None, type: str = "main"):
    """
    Aggregated hot-topic analysis.

    Analyzes commonalities and differences across each platform's trending
    data: shared keywords, cross-platform hot topics, and so on.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai)
    - **type**: one of main (topic analysis), platform (platform comparison),
      cross (cross-platform topics), advanced (advanced analysis); default main
    """
    try:
        # Default to "today" in the service's reference timezone.
        date = date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        cache_key = f"analysis:trend:{date}:{type}"
        # Serve a previously computed analysis when one is cached.
        hit = cache.get_cache(cache_key)
        if hit:
            log.info(f"Retrieved trend analysis from cache for {date}, type: {type}")
            return hit
        # Cache miss: compute a fresh analysis.
        return TrendAnalyzer().get_analysis(date, type)
    except Exception as e:
        log.error(f"Error in trend analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"),
        }
@router.get("/platform-comparison")
async def get_platform_comparison(date: Optional[str] = None):
    """
    Platform comparison analysis.

    Compares the characteristics of each platform's trending data — heat
    rankings, update frequency, etc. — and highlights their differences.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai)
    """
    try:
        # Default to "today" in the service's reference timezone.
        date = date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        cache_key = f"analysis:trend:{date}:platform_comparison"
        hit = cache.get_cache(cache_key)
        if hit:
            log.info(f"Retrieved platform comparison from cache for {date}")
            return hit
        # Cache miss: build the comparison from scratch.
        return TrendAnalyzer().get_platform_comparison(date)
    except Exception as e:
        log.error(f"Error in platform comparison: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"),
        }
@router.get("/cross-platform")
async def get_cross_platform_analysis(date: Optional[str] = None, refresh: bool = False):
    """
    Cross-platform hot-topic analysis.

    Identifies topics trending on several platforms at once and how they
    propagate between platforms.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai)
    - **refresh**: optional, force-refresh the cache; defaults to False
    """
    try:
        # Default to "today" in the service's reference timezone.
        date = date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        cache_key = f"analysis:trend:{date}:cross_platform"
        # Skip the cache lookup entirely when a refresh was requested.
        hit = None if refresh else cache.get_cache(cache_key)
        if hit:
            log.info(f"Retrieved cross platform analysis from cache for {date}")
            return hit
        # Cache miss (or forced refresh): compute a fresh analysis.
        return TrendAnalyzer().get_cross_platform_analysis(date, refresh)
    except Exception as e:
        log.error(f"Error in cross platform analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"),
        }
@router.get("/advanced")
async def get_advanced_analysis(date: Optional[str] = None, refresh: bool = False):
    """
    Advanced analysis.

    Deeper analysis of the day's hot topics: keyword clouds, sentiment
    analysis, topic-evolution trends, etc.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai)
    - **refresh**: optional, force-refresh the cache; defaults to False
    """
    try:
        # Default to "today" in the service's reference timezone.
        date = date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        cache_key = f"analysis:trend:{date}:advanced_analysis"
        # Skip the cache lookup entirely when a refresh was requested.
        hit = None if refresh else cache.get_cache(cache_key)
        if hit:
            log.info(f"Retrieved advanced analysis from cache for {date}")
            return hit
        # Cache miss (or forced refresh): compute a fresh analysis.
        return TrendAnalyzer().get_advanced_analysis(date, refresh)
    except Exception as e:
        log.error(f"Error in advanced analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"),
        }
@router.get("/prediction")
async def get_trend_prediction(date: Optional[str] = None):
    """
    Hot-topic trend prediction.

    Predicts how topics will develop based on historical data: rising
    topics, declining topics, persistently hot topics, etc.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai)
    """
    try:
        # Default to "today" in the service's reference timezone.
        date = date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        cache_key = f"analysis:prediction:{date}"
        hit = cache.get_cache(cache_key)
        if hit:
            log.info(f"Retrieved trend prediction from cache for {date}")
            return hit
        # Cache miss: run the predictor from scratch.
        return TrendPredictor().get_prediction(date)
    except Exception as e:
        log.error(f"Error in trend prediction: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"),
        }
@router.get("/keyword-cloud")
async def get_keyword_cloud(date: Optional[str] = None, refresh: bool = False, platforms: Optional[str] = None, category: Optional[str] = None, keyword_count: int = 200):
    """
    Keyword-cloud data for the day's trending topics.

    Extracts keywords from the hot-news data and buckets them by category
    (tech, entertainment, society, ...) for word-cloud rendering.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai)
    - **refresh**: optional, force-refresh the cache; defaults to False
    - **platforms**: optional comma-separated platform list, e.g. "baidu,weibo"
    - **category**: optional category filter, e.g. "科技" or "娱乐"
    - **keyword_count**: optional number of keywords to return; defaults to 200
    """
    def _apply_category_filter(data):
        # Narrow a successful result down to the single requested category;
        # anything else (no category, error payload, unknown category) passes through.
        if category and data.get("status") == "success" and "keyword_clouds" in data:
            if category in data["keyword_clouds"]:
                filtered = data.copy()
                filtered["keyword_clouds"] = {category: data["keyword_clouds"][category]}
                return filtered
        return data

    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        cache_key = f"analysis:keyword_cloud:{date}"
        # Unless a refresh was requested, try the cache first.
        if not refresh:
            cached_data = cache.get_cache(cache_key)
            if cached_data:
                log.info(f"Retrieved keyword cloud from cache for {date}")
                return _apply_category_filter(cached_data)
        # NOTE(review): `platforms` is accepted but never forwarded to the
        # analyzer — confirm whether TrendAnalyzer.get_keyword_cloud can honor it.
        analyzer = TrendAnalyzer()
        result = analyzer.get_keyword_cloud(date, refresh, keyword_count)
        # Bug fix: the category filter previously applied only to cache hits,
        # so a cache miss returned the unfiltered cloud. Filter fresh results too.
        return _apply_category_filter(result)
    except Exception as e:
        log.error(f"Error in keyword cloud analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
@router.get("/data-visualization")
async def get_data_visualization(date: Optional[str] = None, refresh: bool = False, platforms: Optional[str] = None):
    """
    Data-visualization analysis.

    Produces visualization-ready analysis of the hot-news data, including a
    topic heat-distribution chart.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai)
    - **refresh**: optional, force-refresh the cache; defaults to False
    - **platforms**: optional comma-separated platform list, e.g. baidu,weibo,douyin
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        cache_key = f"analysis:data_visualization:{date}"
        # Unless a refresh was requested, try the cache first.
        if not refresh:
            cached_data = cache.get_cache(cache_key)
            if cached_data:
                log.info(f"Retrieved data visualization from cache for {date}")
                return cached_data
        # Parse the platform filter; empty tokens ("a,,b") are dropped.
        platform_list = None
        if platforms:
            platform_list = [p.strip() for p in platforms.split(",") if p.strip()]
        # Cache miss (or forced refresh): build the visualization data.
        analyzer = TrendAnalyzer()
        result = analyzer.get_data_visualization(date, refresh, platform_list)
        return result
    except Exception as e:
        log.error(f"Error in data visualization: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
@router.get("/trend-forecast")
async def get_trend_forecast(date: Optional[str] = None, refresh: bool = False, time_range: str = "24h"):
    """
    Hot-topic trend forecast.

    Analyzes how topics are evolving and predicts the direction they will
    take over the chosen horizon.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai)
    - **refresh**: optional, force-refresh the cache; defaults to False
    - **time_range**: optional forecast window: 24h, 7d or 30d; defaults to 24h
    """
    try:
        # Default to "today" in the service's reference timezone.
        date = date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        # Unknown windows silently fall back to the 24-hour default.
        if time_range not in ("24h", "7d", "30d"):
            time_range = "24h"
        cache_key = f"analysis:trend_forecast:{date}:{time_range}"
        # Skip the cache lookup entirely when a refresh was requested.
        hit = None if refresh else cache.get_cache(cache_key)
        if hit:
            log.info(f"Retrieved trend forecast from cache for {date}, time_range: {time_range}")
            return hit
        # Cache miss (or forced refresh): compute a fresh forecast.
        return TrendAnalyzer().get_trend_forecast(date, refresh, time_range)
    except Exception as e:
        log.error(f"Error in trend forecast: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"),
            "time_range": time_range,
        }

295
app/api/v1/daily_news.py Normal file
View File

@@ -0,0 +1,295 @@
# app/api/endpoints/dailynews.py
import json
from datetime import datetime
from typing import List, Dict, Any, Optional
import pytz
from fastapi import APIRouter
from app.core import cache
from app.services import crawler_factory
from app.utils.logger import log
router = APIRouter()
@router.get("/")
def get_hot_news(date: Optional[str] = None, platform: Optional[str] = None):
    """
    Get the cached hot-news list for a single platform.

    Args:
        date: date in YYYY-MM-DD format, defaults to today (Asia/Shanghai)
        platform: platform name; must be one of the registered crawlers

    Returns:
        Dict with status code, the cached news list (empty when nothing is
        cached for that platform/date) and a message.
    """
    # Reject unknown platforms up front; this also catches a missing platform.
    if platform not in crawler_factory.keys():
        return {
            "status": "404",
            "data": [],
            "msg": "`platform` is required, valid platform: " + ", ".join(crawler_factory.keys())
        }
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
    cache_key = f"crawler:{platform}:{date}"
    result = cache.get(cache_key)
    if result:
        return {
            "status": "200",
            "data": json.loads(result),
            "msg": "success"
        }
    # No cached data for this platform/date combination.
    return {
        "status": "200",
        "data": [],
        "msg": "success"
    }
@router.get("/all")
def get_all_platforms_news(date: Optional[str] = None):
    """
    Get the cached hot news of every registered platform.

    Args:
        date: date in YYYY-MM-DD format, defaults to today (Asia/Shanghai)

    Returns:
        Dict with status code and a mapping of platform name to its news
        list; platforms with no (or unparseable) cached data map to [].
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
    all_news = {}
    for platform in crawler_factory.keys():
        cache_key = f"crawler:{platform}:{date}"
        result = cache.get(cache_key)
        if result:
            try:
                all_news[platform] = json.loads(result)
            except Exception as e:
                # Corrupt cache entries degrade to an empty list instead of failing the request.
                log.error(f"Error parsing cached data for {platform}: {e}")
                all_news[platform] = []
        else:
            all_news[platform] = []
    return {
        "status": "200",
        "data": all_news,
        "msg": "success"
    }
@router.get("/multi")
def get_multi_platforms_news(date: Optional[str] = None, platforms: Optional[str] = None):
    """
    Get the cached hot news of several platforms at once.

    Args:
        date: date in YYYY-MM-DD format, defaults to today (Asia/Shanghai)
        platforms: comma-separated platform names, e.g. "weibo,baidu,zhihu"

    Returns:
        Dict with status code and a mapping of platform name to its news
        list; platforms with no (or unparseable) cached data map to [].
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
    if not platforms:
        return {
            "status": "404",
            "data": {},
            "msg": "`platforms` parameter is required, format: comma-separated platform names"
        }
    # Bug fix: drop empty tokens so inputs like "weibo,,baidu" or a trailing
    # comma are not rejected as invalid platforms (matches analysis.py behavior).
    platform_list = [p.strip() for p in platforms.split(",") if p.strip()]
    if not platform_list:
        return {
            "status": "404",
            "data": {},
            "msg": "`platforms` parameter is required, format: comma-separated platform names"
        }
    valid_platforms = crawler_factory.keys()
    # Reject the request if any requested platform is unknown.
    invalid_platforms = [p for p in platform_list if p not in valid_platforms]
    if invalid_platforms:
        return {
            "status": "404",
            "data": {},
            "msg": f"Invalid platforms: {', '.join(invalid_platforms)}. Valid platforms: {', '.join(valid_platforms)}"
        }
    multi_news = {}
    for platform in platform_list:
        cache_key = f"crawler:{platform}:{date}"
        result = cache.get(cache_key)
        if result:
            try:
                multi_news[platform] = json.loads(result)
            except Exception as e:
                # Corrupt cache entries degrade to an empty list instead of failing the request.
                log.error(f"Error parsing cached data for {platform}: {e}")
                multi_news[platform] = []
        else:
            multi_news[platform] = []
    return {
        "status": "200",
        "data": multi_news,
        "msg": "success"
    }
@router.get("/search")
def search_news(keyword: str, date: Optional[str] = None, platforms: Optional[str] = None, limit: int = 20):
    """
    Search cached hot news by keyword.

    Args:
        keyword: search keyword (case-insensitive substring match on titles)
        date: date in YYYY-MM-DD format, defaults to today (Asia/Shanghai)
        platforms: comma-separated platform names, e.g. "weibo,baidu,zhihu";
            defaults to all registered platforms
        limit: maximum number of results to return, defaults to 20

    Returns:
        Dict with status code, matched items (grouped by source and sorted
        by rank within each source), message, total match count and the
        number of returned results.
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
    # Resolve the set of platforms to search; unknown names are dropped silently.
    if platforms:
        valid_platforms = crawler_factory.keys()
        platform_list = [p.strip() for p in platforms.split(",")]
        platform_list = [p for p in platform_list if p in valid_platforms]
    else:
        platform_list = list(crawler_factory.keys())
    if not platform_list:
        return {
            "status": "404",
            "data": [],
            "msg": "No valid platforms specified",
            "total": 0,
            "search_results": 0
        }
    # Collect normalized news items from each platform's cache.
    all_news = []
    for platform in platform_list:
        cache_key = f"crawler:{platform}:{date}"
        result = cache.get(cache_key)
        if not result:
            continue
        try:
            platform_news = json.loads(result)
            if not isinstance(platform_news, list):
                continue
            # Category lookups are constant per platform — hoisted out of the
            # per-item loop (previously recomputed for every news entry).
            category = _get_category_for_platform(platform)
            sub_category = _get_subcategory_for_platform(platform)
            for idx, item in enumerate(platform_news):
                if not isinstance(item, dict):
                    continue
                # Prefer an explicit rank/index field; fall back to list position.
                if item.get("rank"):
                    rank_value = str(item["rank"]).replace("#", "")
                elif item.get("index"):
                    rank_value = str(item["index"]).replace("#", "")
                else:
                    rank_value = str(idx + 1)
                # Standardized news entry carrying its source platform.
                all_news.append({
                    "id": item.get("id"),
                    "title": item.get("title", ""),
                    "source": platform,
                    "rank": rank_value,
                    "category": category,
                    "sub_category": sub_category,
                    "url": item.get("url", "")
                })
        except Exception as e:
            log.error(f"Error processing news from {platform}: {e}")
    # Case-insensitive substring match on the title (keyword lowered once).
    keyword_lower = keyword.lower()
    search_results = [item for item in all_news if keyword_lower in item["title"].lower()]
    # Group matches by source platform, then sort each group by numeric rank
    # (non-numeric ranks sink to the end of their group).
    grouped_results = {}
    for item in search_results:
        grouped_results.setdefault(item["source"], []).append(item)
    sorted_results = []
    for items in grouped_results.values():
        items.sort(key=lambda x: int(x["rank"]) if x["rank"].isdigit() else 999)
        sorted_results.extend(items)
    # Cap the payload size.
    limited_results = sorted_results[:limit]
    return {
        "status": "200",
        "data": limited_results,
        "msg": "success",
        "total": len(search_results),
        "search_results": len(limited_results)
    }
def _get_category_for_platform(platform: str) -> str:
"""根据平台返回对应的分类"""
categories = {
"36kr": "科技创业",
"hupu": "体育",
"sspai": "科技",
"weibo": "社交",
"zhihu": "知识",
"baidu": "综合",
"tieba": "社区",
"douban": "文化",
"bilibili": "视频",
"v2ex": "科技",
"github": "开发者",
"hackernews": "科技",
"stackoverflow": "开发者",
"jinritoutiao": "资讯",
"douyin": "娱乐",
"shaoshupai": "科技"
}
return categories.get(platform, "其他")
def _get_subcategory_for_platform(platform: str) -> str:
"""根据平台返回对应的子分类"""
subcategories = {
"36kr": "商业资讯",
"hupu": "娱乐",
"sspai": "数码",
"weibo": "热门",
"zhihu": "问答",
"baidu": "热搜",
"tieba": "讨论",
"douban": "影视",
"bilibili": "热门",
"v2ex": "技术",
"github": "开源",
"hackernews": "国际",
"stackoverflow": "问答",
"jinritoutiao": "热点",
"douyin": "娱乐",
"shaoshupai": "数码"
}
return subcategories.get(platform, "其他")

138
app/api/v1/web_tools.py Normal file
View File

@@ -0,0 +1,138 @@
# app/api/endpoints/website_meta.py
import json
import time
from urllib.parse import urlparse, urljoin
import cloudscraper
from app.utils.logger import log
import requests
from bs4 import BeautifulSoup
from fastapi import APIRouter
from app.core import cache
router = APIRouter()
@router.get("/")
def get_meta(url: str = None):
    """
    Fetch a web page and extract its metadata (title, description, Open
    Graph / Twitter card tags) plus a favicon URL.

    Results are cached by URL for 60 seconds.

    Args:
        url: the page URL to inspect (required)

    Returns:
        Dict with status code, the extracted metadata, a message, and a
        `cache` flag indicating whether the response came from cache.
    """
    if not url:
        return {
            "status": "404",
            "data": [],
            "msg": "`url` is required"
        }
    # Serve a recently extracted result straight from the cache.
    cached_metadata = cache.get(url)
    if cached_metadata:
        return {
            "status": "200",
            "data": json.loads(cached_metadata),
            "msg": "success",
            "cache": True
        }
    # Browser-like headers to reduce the chance of being blocked.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,ar;q=0.7,en;q=0.6",
        "cache-control": "max-age=0",
        "priority": "u=0, i",
        "sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }
    page_content = None
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        page_content = response.content
    except requests.RequestException:
        # Fall back to cloudscraper for sites behind anti-bot protection.
        # Bug fix: this fallback was previously unguarded, so a second
        # failure escaped the handler as an unhandled 500.
        try:
            scraper = cloudscraper.create_scraper(delay=100)
            response = scraper.get(url)
            page_content = response.content
        except Exception as e:
            log.error(f"Error fetching {url}: {e}")
            page_content = None
    if not page_content:
        return {
            "status": "404",
            "data": [],
            "msg": "No content"
        }
    soup = BeautifulSoup(page_content, "html.parser")
    # Bug fix: an empty <title> element yields soup.title.string == None;
    # coalesce that to the documented "No title" fallback.
    meta_info = {
        "title": (soup.title.string if soup.title else None) or "No title",
        "description": "",
        "keywords": "",
        "author": "",
        "og:title": "",
        "og:description": "",
        "og:image": "",
        "og:url": url,
        "twitter:card": "",
        "twitter:title": "",
        "twitter:description": "",
        "twitter:image": ""
    }
    # Walk every <meta> tag once, picking out the fields we report.
    for meta_tag in soup.find_all("meta"):
        name_attr = meta_tag.get("name", "").lower()
        property_attr = meta_tag.get("property", "").lower()
        content = meta_tag.get("content", "")
        if name_attr == "description":
            meta_info["description"] = content
        elif name_attr == "keywords":
            meta_info["keywords"] = content
        elif name_attr == "author":
            meta_info["author"] = content
        elif property_attr == "og:title":
            meta_info["og:title"] = content
        elif property_attr == "og:description":
            meta_info["og:description"] = content
        elif property_attr == "og:image":
            meta_info["og:image"] = content
        elif property_attr == "og:url":
            meta_info["og:url"] = content
        elif name_attr == "twitter:card":
            meta_info["twitter:card"] = content
        elif name_attr == "twitter:title":
            meta_info["twitter:title"] = content
        elif name_attr == "twitter:description":
            meta_info["twitter:description"] = content
        elif name_attr == "twitter:image":
            meta_info["twitter:image"] = content
    # Favicon: prefer an explicit <link rel="icon">, otherwise the
    # conventional /favicon.ico at the site root.
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    favicon_url = urljoin(base_url, "favicon.ico")
    link_tag = soup.find("link", rel=["icon", "shortcut icon"])
    if link_tag:
        favicon_url = urljoin(base_url, link_tag.get("href", "favicon.ico"))
    metadata = {
        "meta_info": meta_info,
        "favicon_url": favicon_url
    }
    # Cache the extracted metadata for 60 seconds.
    cache.set(url, json.dumps(metadata, ensure_ascii=False), ex=60)
    result = {
        "status": "200",
        "data": metadata,
        "msg": "Success",
        "cache": False
    }
    return result