init
This commit is contained in:
0
app/api/__init__.py
Normal file
0
app/api/__init__.py
Normal file
0
app/api/dependencies.py
Normal file
0
app/api/dependencies.py
Normal file
0
app/api/v1/__init__.py
Normal file
0
app/api/v1/__init__.py
Normal file
314
app/api/v1/analysis.py
Normal file
314
app/api/v1/analysis.py
Normal file
@@ -0,0 +1,314 @@
|
||||
from fastapi import APIRouter, Query
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
import pytz
|
||||
|
||||
from app.analysis.trend_analyzer import TrendAnalyzer
|
||||
from app.analysis.predictor import TrendPredictor
|
||||
from app.utils.logger import log
|
||||
from app.core import cache
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/trend")
async def get_trend_analysis(date: Optional[str] = None, type: str = "main"):
    """Aggregated hot-topic analysis across platforms.

    Extracts shared keywords and cross-platform topics from the day's
    hot lists.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **type**: analysis flavour — main (topics), platform (comparison),
      cross (cross-platform) or advanced; defaults to main. The name
      shadows the ``type`` builtin but is kept as the public query param.
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        # Serve a previously computed report straight from the cache.
        key = f"analysis:trend:{date}:{type}"
        hit = cache.get_cache(key)
        if hit:
            log.info(f"Retrieved trend analysis from cache for {date}, type: {type}")
            return hit

        # Cache miss: compute a fresh analysis.
        return TrendAnalyzer().get_analysis(date, type)
    except Exception as e:
        log.error(f"Error in trend analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/platform-comparison")
async def get_platform_comparison(date: Optional[str] = None):
    """Platform-vs-platform comparison of hot-list data.

    Compares heat rankings, update cadence and other characteristics of
    the crawled platforms for one day.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        # Reuse a previously generated report when one is cached.
        key = f"analysis:trend:{date}:platform_comparison"
        hit = cache.get_cache(key)
        if hit:
            log.info(f"Retrieved platform comparison from cache for {date}")
            return hit

        # Cache miss: build the comparison from scratch.
        return TrendAnalyzer().get_platform_comparison(date)
    except Exception as e:
        log.error(f"Error in platform comparison: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/cross-platform")
async def get_cross_platform_analysis(date: Optional[str] = None, refresh: bool = False):
    """Cross-platform hot-topic analysis.

    Identifies topics trending on several platforms at once and how they
    propagate between platforms.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: force regeneration, bypassing the cache (default False).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        key = f"analysis:trend:{date}:cross_platform"

        # Honour the cache unless the caller explicitly asked for a rebuild.
        if not refresh:
            hit = cache.get_cache(key)
            if hit:
                log.info(f"Retrieved cross platform analysis from cache for {date}")
                return hit

        # Cache miss (or forced refresh): compute fresh data.
        return TrendAnalyzer().get_cross_platform_analysis(date, refresh)
    except Exception as e:
        log.error(f"Error in cross platform analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/advanced")
async def get_advanced_analysis(date: Optional[str] = None, refresh: bool = False):
    """In-depth hot-list analysis: keyword clouds, sentiment, topic evolution.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: force regeneration, bypassing the cache (default False).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        key = f"analysis:trend:{date}:advanced_analysis"

        # Honour the cache unless a rebuild was requested.
        if not refresh:
            hit = cache.get_cache(key)
            if hit:
                log.info(f"Retrieved advanced analysis from cache for {date}")
                return hit

        # Cache miss (or forced refresh): compute fresh data.
        return TrendAnalyzer().get_advanced_analysis(date, refresh)
    except Exception as e:
        log.error(f"Error in advanced analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/prediction")
async def get_trend_prediction(date: Optional[str] = None):
    """Forecast how hot topics develop (rising, falling, persistently hot).

    Predictions are derived from historical hot-list data.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        # Serve the cached prediction when available.
        key = f"analysis:prediction:{date}"
        hit = cache.get_cache(key)
        if hit:
            log.info(f"Retrieved trend prediction from cache for {date}")
            return hit

        # Cache miss: run the predictor.
        return TrendPredictor().get_prediction(date)
    except Exception as e:
        log.error(f"Error in trend prediction: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/keyword-cloud")
async def get_keyword_cloud(date: Optional[str] = None, refresh: bool = False, platforms: Optional[str] = None, category: Optional[str] = None, keyword_count: int = 200):
    """Keyword-cloud data extracted from the day's hot lists.

    Keywords are grouped by category (tech, entertainment, society, ...)
    for rendering a word cloud.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: force regeneration, bypassing the cache (default False).
    - **platforms**: optional comma-separated platform filter, e.g. "baidu,weibo".
    - **category**: optional; return only this category's cloud.
    - **keyword_count**: number of keywords to return (default 200).
    """

    def _apply_category_filter(data):
        # Narrow a successful payload to the requested category; return the
        # payload unchanged when no category was requested, the payload is
        # not a success dict, or the category is absent.
        if category and isinstance(data, dict) and data.get("status") == "success" and "keyword_clouds" in data:
            if category in data["keyword_clouds"]:
                filtered = data.copy()
                filtered["keyword_clouds"] = {category: data["keyword_clouds"][category]}
                return filtered
        return data

    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        cache_key = f"analysis:keyword_cloud:{date}"

        # Honour the cache unless a rebuild was requested.
        if not refresh:
            cached_data = cache.get_cache(cache_key)
            if cached_data:
                log.info(f"Retrieved keyword cloud from cache for {date}")
                return _apply_category_filter(cached_data)

        # NOTE(review): `platforms` is accepted and documented but the
        # analyzer call takes no platform argument — TODO wire it through
        # (or drop the parameter) so the filter actually takes effect.
        analyzer = TrendAnalyzer()
        result = analyzer.get_keyword_cloud(date, refresh, keyword_count)
        # Bug fix: the category filter used to apply only on cache hits;
        # freshly generated data now receives the same filtering.
        return _apply_category_filter(result)
    except Exception as e:
        log.error(f"Error in keyword cloud analysis: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/data-visualization")
async def get_data_visualization(date: Optional[str] = None, refresh: bool = False, platforms: Optional[str] = None):
    """Data-visualization analysis (topic heat distribution charts).

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: force regeneration, bypassing the cache (default False).
    - **platforms**: optional comma-separated platform filter,
      e.g. "baidu,weibo,douyin".
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        cache_key = f"analysis:data_visualization:{date}"

        # The cache key carries no platform component, so a cached report is
        # always the unfiltered one. Bug fix: only serve the cached copy when
        # no platform filter was requested (previously a filtered request
        # could be answered with the unfiltered cached report).
        if not refresh and not platforms:
            cached_data = cache.get_cache(cache_key)
            if cached_data:
                log.info(f"Retrieved data visualization from cache for {date}")
                return cached_data

        # Parse the optional platform filter, dropping empty entries.
        platform_list = None
        if platforms:
            platform_list = [p.strip() for p in platforms.split(",") if p.strip()]

        # Generate fresh visualization data.
        analyzer = TrendAnalyzer()
        result = analyzer.get_data_visualization(date, refresh, platform_list)
        return result
    except Exception as e:
        log.error(f"Error in data visualization: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")
        }
|
||||
|
||||
@router.get("/trend-forecast")
async def get_trend_forecast(date: Optional[str] = None, refresh: bool = False, time_range: str = "24h"):
    """Forecast hot-topic evolution over a time window.

    - **date**: optional, YYYY-MM-DD; defaults to today (Asia/Shanghai).
    - **refresh**: force regeneration, bypassing the cache (default False).
    - **time_range**: one of 24h, 7d, 30d; unknown values fall back to 24h.
    """
    try:
        if not date:
            date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

        # Unknown windows silently fall back to the 24-hour default.
        if time_range not in ("24h", "7d", "30d"):
            time_range = "24h"

        key = f"analysis:trend_forecast:{date}:{time_range}"

        # Honour the cache unless a rebuild was requested.
        if not refresh:
            hit = cache.get_cache(key)
            if hit:
                log.info(f"Retrieved trend forecast from cache for {date}, time_range: {time_range}")
                return hit

        # Cache miss (or forced refresh): compute fresh forecast data.
        return TrendAnalyzer().get_trend_forecast(date, refresh, time_range)
    except Exception as e:
        log.error(f"Error in trend forecast: {e}")
        return {
            "status": "error",
            "message": str(e),
            "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"),
            "time_range": time_range
        }
|
||||
295
app/api/v1/daily_news.py
Normal file
295
app/api/v1/daily_news.py
Normal file
@@ -0,0 +1,295 @@
|
||||
# app/api/endpoints/dailynews.py
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
import pytz
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.core import cache
|
||||
from app.services import crawler_factory
|
||||
from app.utils.logger import log
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/")
def get_hot_news(date: str = None, platform: str = None):
    """Return the cached hot list for a single platform on a given day.

    An unknown or missing ``platform`` yields a 404-style payload listing
    the valid platform names; a cache miss yields an empty list.
    """
    valid = crawler_factory.keys()
    if platform not in valid:
        return {
            "status": "404",
            "data": [],
            "msg": "`platform` is required, valid platform: " + ", ".join(valid)
        }

    day = date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    # Hot lists are pre-crawled and stored as JSON under a per-day key.
    raw = cache.get(f"crawler:{platform}:{day}")
    payload = json.loads(raw) if raw else []

    return {
        "status": "200",
        "data": payload,
        "msg": "success"
    }
|
||||
|
||||
|
||||
@router.get("/all")
def get_all_platforms_news(date: str = None):
    """Return cached hot news for every known platform.

    Args:
        date: YYYY-MM-DD, defaults to today (Asia/Shanghai).

    Returns:
        Dict keyed by platform name; each value is that platform's news
        list (empty on cache miss or parse failure).
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    def _load(platform):
        # Decode one platform's cached JSON; degrade to [] on any problem.
        raw = cache.get(f"crawler:{platform}:{date}")
        if not raw:
            return []
        try:
            return json.loads(raw)
        except Exception as e:
            log.error(f"Error parsing cached data for {platform}: {e}")
            return []

    return {
        "status": "200",
        "data": {platform: _load(platform) for platform in crawler_factory.keys()},
        "msg": "success"
    }
|
||||
|
||||
|
||||
@router.get("/multi")
def get_multi_platforms_news(date: str = None, platforms: str = None):
    """Fetch cached hot news for a chosen set of platforms.

    Args:
        date: YYYY-MM-DD, defaults to today (Asia/Shanghai).
        platforms: comma-separated platform names, e.g. "weibo,baidu,zhihu".

    Returns:
        Dict keyed by platform name; each value is that platform's news
        list (empty on cache miss or parse failure).
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    if not platforms:
        return {
            "status": "404",
            "data": {},
            "msg": "`platforms` parameter is required, format: comma-separated platform names"
        }

    # Bug fix: drop empty entries (trailing comma, "a,,b") instead of
    # reporting them as invalid platform names — matches the filtering
    # already done by the analysis endpoints.
    platform_list = [p.strip() for p in platforms.split(",") if p.strip()]
    valid_platforms = crawler_factory.keys()

    # Input consisted only of commas/whitespace — treat as missing.
    if not platform_list:
        return {
            "status": "404",
            "data": {},
            "msg": "`platforms` parameter is required, format: comma-separated platform names"
        }

    # Reject unknown platform names up front with the full valid list.
    invalid_platforms = [p for p in platform_list if p not in valid_platforms]
    if invalid_platforms:
        return {
            "status": "404",
            "data": {},
            "msg": f"Invalid platforms: {', '.join(invalid_platforms)}. Valid platforms: {', '.join(valid_platforms)}"
        }

    multi_news = {}

    for platform in platform_list:
        cacheKey = f"crawler:{platform}:{date}"
        result = cache.get(cacheKey)
        if result:
            try:
                multi_news[platform] = json.loads(result)
            except Exception as e:
                log.error(f"Error parsing cached data for {platform}: {e}")
                multi_news[platform] = []
        else:
            multi_news[platform] = []

    return {
        "status": "200",
        "data": multi_news,
        "msg": "success"
    }
|
||||
|
||||
|
||||
@router.get("/search")
def search_news(keyword: str, date: str = None, platforms: str = None, limit: int = 20):
    """Search cached hot-news titles for a keyword.

    Args:
        keyword: substring to match against titles (case-insensitive).
        date: YYYY-MM-DD, defaults to today (Asia/Shanghai).
        platforms: comma-separated platform names, e.g. "weibo,baidu,zhihu";
            all platforms when omitted.
        limit: maximum number of results to return (default 20).

    Returns:
        Dict with status, the matched items grouped by platform and sorted
        by rank within each platform, plus total/returned counts.
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    # Resolve which platforms to search; silently drop unknown names.
    if platforms:
        requested = [p.strip() for p in platforms.split(",")]
        valid_platforms = crawler_factory.keys()
        targets = [p for p in requested if p in valid_platforms]
    else:
        targets = list(crawler_factory.keys())

    if not targets:
        return {
            "status": "404",
            "data": [],
            "msg": "No valid platforms specified",
            "total": 0,
            "search_results": 0
        }

    def _rank_of(entry, position):
        # Prefer an explicit rank, fall back to an index field, then to the
        # 1-based list position; "#" prefixes are stripped.
        if "rank" in entry and entry["rank"]:
            return str(entry["rank"]).replace("#", "")
        if "index" in entry and entry["index"]:
            return str(entry["index"]).replace("#", "")
        return str(position + 1)

    # Gather normalized items from each target platform's cache.
    all_news = []
    for platform in targets:
        raw = cache.get(f"crawler:{platform}:{date}")
        if not raw:
            continue

        try:
            entries = json.loads(raw)
            if not isinstance(entries, list):
                continue

            for position, entry in enumerate(entries):
                if not isinstance(entry, dict):
                    continue

                # Standardized news item carrying its originating platform.
                all_news.append({
                    "id": entry.get("id"),
                    "title": entry.get("title", ""),
                    "source": platform,
                    "rank": _rank_of(entry, position),
                    "category": _get_category_for_platform(platform),
                    "sub_category": _get_subcategory_for_platform(platform),
                    "url": entry.get("url", "")
                })
        except Exception as e:
            log.error(f"Error processing news from {platform}: {e}")

    # Case-insensitive keyword match against the title.
    needle = keyword.lower()
    matches = [item for item in all_news if needle in item["title"].lower()]

    # Group by platform; within each group order by numeric rank
    # (non-numeric ranks sink to the end).
    by_source = {}
    for item in matches:
        by_source.setdefault(item["source"], []).append(item)

    ordered = []
    for bucket in by_source.values():
        bucket.sort(key=lambda x: int(x["rank"]) if x["rank"].isdigit() else 999)
        ordered.extend(bucket)

    # Apply the caller's result cap.
    page = ordered[:limit]

    return {
        "status": "200",
        "data": page,
        "msg": "success",
        "total": len(matches),
        "search_results": len(page)
    }
|
||||
|
||||
|
||||
def _get_category_for_platform(platform: str) -> str:
|
||||
"""根据平台返回对应的分类"""
|
||||
categories = {
|
||||
"36kr": "科技创业",
|
||||
"hupu": "体育",
|
||||
"sspai": "科技",
|
||||
"weibo": "社交",
|
||||
"zhihu": "知识",
|
||||
"baidu": "综合",
|
||||
"tieba": "社区",
|
||||
"douban": "文化",
|
||||
"bilibili": "视频",
|
||||
"v2ex": "科技",
|
||||
"github": "开发者",
|
||||
"hackernews": "科技",
|
||||
"stackoverflow": "开发者",
|
||||
"jinritoutiao": "资讯",
|
||||
"douyin": "娱乐",
|
||||
"shaoshupai": "科技"
|
||||
}
|
||||
return categories.get(platform, "其他")
|
||||
|
||||
|
||||
def _get_subcategory_for_platform(platform: str) -> str:
|
||||
"""根据平台返回对应的子分类"""
|
||||
subcategories = {
|
||||
"36kr": "商业资讯",
|
||||
"hupu": "娱乐",
|
||||
"sspai": "数码",
|
||||
"weibo": "热门",
|
||||
"zhihu": "问答",
|
||||
"baidu": "热搜",
|
||||
"tieba": "讨论",
|
||||
"douban": "影视",
|
||||
"bilibili": "热门",
|
||||
"v2ex": "技术",
|
||||
"github": "开源",
|
||||
"hackernews": "国际",
|
||||
"stackoverflow": "问答",
|
||||
"jinritoutiao": "热点",
|
||||
"douyin": "娱乐",
|
||||
"shaoshupai": "数码"
|
||||
}
|
||||
return subcategories.get(platform, "其他")
|
||||
|
||||
138
app/api/v1/web_tools.py
Normal file
138
app/api/v1/web_tools.py
Normal file
@@ -0,0 +1,138 @@
|
||||
# app/api/endpoints/website_meta.py
|
||||
import json
|
||||
import time
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
import cloudscraper
|
||||
|
||||
from app.utils.logger import log
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.core import cache
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/")
def get_meta(url: str = None):
    """Fetch website metadata (title, meta/OG/Twitter tags, favicon) for a URL.

    Strategy: plain HTTP GET with browser-like headers first; on any
    requests failure, fall back to cloudscraper (which handles Cloudflare
    challenges). Successful results are cached for 60 seconds keyed by
    the raw URL string.

    - **url**: required; the page to inspect.
    """
    if not url:
        return {
            "status": "404",
            "data": [],
            "msg": "`url` is required"
        }

    # Serve a recently scraped copy straight from the cache.
    cached_metadata = cache.get(url)
    if cached_metadata:
        return {
            "status": "200",
            "data": json.loads(cached_metadata),
            "msg": "success",
            "cache": True
        }

    # Browser-like (Chrome 134 on Windows) headers to reduce bot blocking.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,ar;q=0.7,en;q=0.6",
        "cache-control": "max-age=0",
        "priority": "u=0, i",
        "sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }


    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        page_content = response.content
    except requests.RequestException as e:
        # Fallback for sites that reject plain requests (e.g. Cloudflare).
        # NOTE(review): the original exception `e` is discarded, the fallback
        # response status is never checked, and a cloudscraper failure here
        # propagates as an unhandled exception — confirm this is intended.
        scraper = cloudscraper.create_scraper(delay=100)
        response = scraper.get(url)
        page_content = response.content

    if not page_content:
        return {
            "status": "404",
            "data": [],
            "msg": "No content"
        }

    soup = BeautifulSoup(page_content, "html.parser")
    # Pre-populate every field so the response shape is stable even when
    # the page omits a tag.
    meta_info = {
        "title": soup.title.string if soup.title else "No title",
        "description": "",
        "keywords": "",
        "author": "",
        "og:title": "",
        "og:description": "",
        "og:image": "",
        "og:url": url,
        "twitter:card": "",
        "twitter:title": "",
        "twitter:description": "",
        "twitter:image": ""
    }

    # Single pass over all <meta> tags, routed by name= / property=.
    for meta_tag in soup.find_all("meta"):
        name_attr = meta_tag.get("name", "").lower()
        property_attr = meta_tag.get("property", "").lower()
        content = meta_tag.get("content", "")

        # Standard meta tags (name=...).
        if name_attr == "description":
            meta_info["description"] = content
        elif name_attr == "keywords":
            meta_info["keywords"] = content
        elif name_attr == "author":
            meta_info["author"] = content

        # Open Graph tags (property=...).
        elif property_attr == "og:title":
            meta_info["og:title"] = content
        elif property_attr == "og:description":
            meta_info["og:description"] = content
        elif property_attr == "og:image":
            meta_info["og:image"] = content
        elif property_attr == "og:url":
            meta_info["og:url"] = content

        # Twitter card tags (name=...).
        elif name_attr == "twitter:card":
            meta_info["twitter:card"] = content
        elif name_attr == "twitter:title":
            meta_info["twitter:title"] = content
        elif name_attr == "twitter:description":
            meta_info["twitter:description"] = content
        elif name_attr == "twitter:image":
            meta_info["twitter:image"] = content

    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    favicon_url = urljoin(base_url, "favicon.ico")  # default favicon path

    # Prefer an explicit <link rel="icon"/"shortcut icon"> when present.
    link_tag = soup.find("link", rel=["icon", "shortcut icon"])
    if link_tag:
        favicon_url = urljoin(base_url, link_tag.get("href", "favicon.ico"))

    metadata = {
        "meta_info": meta_info,
        "favicon_url": favicon_url
    }

    # Cache the serialized result for 60 seconds under the raw URL.
    cache.set(url, json.dumps(metadata, ensure_ascii=False), ex=60)
    result = {
        "status": "200",
        "data": metadata,
        "msg": "Success",
        "cache": False
    }

    return result
|
||||
Reference in New Issue
Block a user