415 lines
12 KiB
Python
415 lines
12 KiB
Python
# app/api/endpoints/dailynews.py
|
||
import json
|
||
from datetime import datetime
|
||
from typing import List, Dict, Any, Optional
|
||
|
||
import pytz
|
||
from fastapi import APIRouter, Query, Path
|
||
from fastapi.responses import JSONResponse
|
||
|
||
from app.core import cache
|
||
from app.services import crawler_factory
|
||
from app.utils.logger import log
|
||
|
||
router = APIRouter()
|
||
|
||
|
||
@router.get(
    "/",
    summary="获取单个平台的热门新闻",
    description="从指定平台获取特定日期的热门新闻数据",
    response_description="返回包含新闻列表的 JSON 对象",
    responses={
        200: {"description": "成功获取新闻数据"},
        404: {"description": "平台不存在"}
    }
)
def get_hot_news(
    date: str = Query(
        default=None,
        description="日期,格式为 YYYY-MM-DD,默认为当天(北京时间)",
        example="2024-01-15"
    ),
    platform: str = Query(
        default=None,
        description=f"平台代码,可选值:{', '.join(crawler_factory.keys())}",
        example="weibo"
    )
):
    """Return the cached hot-news list of a single platform.

    Args:
        date: Day to query, formatted ``YYYY-MM-DD``. When omitted, today's
            date in the ``Asia/Shanghai`` timezone is used.
        platform: Platform code; must be one of ``crawler_factory.keys()``.

    Returns:
        A dict with the keys ``status``, ``data`` and ``msg``.

        * Unknown or missing platform: ``status`` is ``"404"`` and ``data``
          is an empty list.
        * Otherwise ``status`` is ``"200"`` and ``data`` is the decoded cache
          entry, or an empty list when nothing is cached or the cached
          payload cannot be decoded.

    Note:
        ``status`` is a field of the JSON body; this function does not set
        the HTTP status code itself.
    """
    if platform not in crawler_factory.keys():
        return {
            "status": "404",
            "data": [],
            "msg": "`platform` is required, valid platform: " + ", ".join(crawler_factory.keys())
        }

    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    cache_key = f"crawler:{platform}:{date}"
    result = cache.get(cache_key)

    data = []
    if result:
        try:
            data = json.loads(result)
        except Exception as e:
            # A corrupt cache entry must not turn into an unhandled 500;
            # degrade to an empty list exactly like the /all and /multi
            # endpoints do for the same situation.
            log.error(f"Error parsing cached data for {platform}: {e}")
            data = []

    return {
        "status": "200",
        "data": data,
        "msg": "success"
    }
|
||
|
||
|
||
@router.get(
    "/all",
    summary="获取所有平台的热门新闻",
    description="一次性获取所有支持平台的热门新闻数据",
    response_description="返回包含所有平台新闻的 JSON 对象",
    responses={
        200: {"description": "成功获取所有平台新闻数据"}
    }
)
def get_all_platforms_news(
    date: str = Query(
        default=None,
        description="日期,格式为 YYYY-MM-DD,默认为当天",
        example="2024-01-15"
    )
):
    """Return the cached hot news of every registered platform in one call.

    Args:
        date: Day to query, formatted ``YYYY-MM-DD``. When omitted, today's
            date in the ``Asia/Shanghai`` timezone is used.

    Returns:
        A dict with ``status`` (``"200"``), ``msg`` and ``data``. ``data``
        maps each key of ``crawler_factory`` to its decoded cache entry; a
        platform with no cache entry, or with an entry that fails to decode,
        maps to an empty list.
    """
    query_date = date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    news_by_platform = {}
    for name in crawler_factory.keys():
        raw = cache.get(f"crawler:{name}:{query_date}")

        # Start from the "nothing available" answer and only replace it when
        # the cached payload decodes cleanly.
        entries = []
        if raw:
            try:
                entries = json.loads(raw)
            except Exception as exc:
                log.error(f"Error parsing cached data for {name}: {exc}")
                entries = []

        news_by_platform[name] = entries

    return {
        "status": "200",
        "data": news_by_platform,
        "msg": "success"
    }
|
||
|
||
|
||
@router.get(
    "/multi",
    summary="获取多个平台的热门新闻",
    description="批量获取指定平台的热门新闻数据",
    response_description="返回包含指定平台新闻的 JSON 对象",
    responses={
        200: {"description": "成功获取多平台新闻数据"},
        404: {"description": "平台参数无效"}
    }
)
def get_multi_platforms_news(
    date: str = Query(
        default=None,
        description="日期,格式为 YYYY-MM-DD,默认为当天",
        example="2024-01-15"
    ),
    platforms: str = Query(
        default=None,
        description="平台列表,逗号分隔,例如:weibo,baidu,zhihu",
        example="weibo,baidu,zhihu"
    )
):
    """Return the cached hot news for a caller-chosen set of platforms.

    Example request: ``/multi?platforms=weibo,baidu,zhihu&date=2024-01-15``

    Args:
        date: Day to query, formatted ``YYYY-MM-DD``. When omitted, today's
            date in the ``Asia/Shanghai`` timezone is used.
        platforms: Comma-separated platform codes. Surrounding whitespace,
            empty entries (for example from a trailing comma) and repeated
            codes are ignored.

    Returns:
        A dict with ``status``, ``data`` and ``msg``.

        * No usable platform code supplied, or at least one code is not a
          key of ``crawler_factory``: ``status`` is ``"404"`` and ``data``
          is an empty dict.
        * Otherwise ``status`` is ``"200"`` and ``data`` maps each requested
          platform to its decoded cache entry, or to an empty list when
          nothing is cached or the payload cannot be decoded.
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    # Trim every token, drop blanks produced by stray commas, and de-duplicate
    # while keeping the caller's order (dict preserves insertion order).
    tokens = (part.strip() for part in (platforms or "").split(","))
    platform_list = list(dict.fromkeys(token for token in tokens if token))

    if not platform_list:
        return {
            "status": "404",
            "data": {},
            "msg": "`platforms` parameter is required, format: comma-separated platform names"
        }

    valid_platforms = crawler_factory.keys()

    # Reject the whole request if any requested platform is unknown.
    invalid_platforms = [p for p in platform_list if p not in valid_platforms]
    if invalid_platforms:
        return {
            "status": "404",
            "data": {},
            "msg": f"Invalid platforms: {', '.join(invalid_platforms)}. Valid platforms: {', '.join(valid_platforms)}"
        }

    multi_news = {}
    for platform in platform_list:
        result = cache.get(f"crawler:{platform}:{date}")
        if result:
            try:
                multi_news[platform] = json.loads(result)
            except Exception as e:
                # Best effort: one corrupt entry must not fail the batch.
                log.error(f"Error parsing cached data for {platform}: {e}")
                multi_news[platform] = []
        else:
            multi_news[platform] = []

    return {
        "status": "200",
        "data": multi_news,
        "msg": "success"
    }
|
||
|
||
|
||
@router.get(
    "/search",
    summary="搜索新闻",
    description="按关键词搜索跨平台的热门新闻",
    response_description="返回包含搜索结果的 JSON 对象",
    responses={
        200: {"description": "成功获取搜索结果"},
        404: {"description": "无有效平台"}
    }
)
def search_news(
    keyword: str = Query(
        default=...,
        description="搜索关键词",
        example="AI"
    ),
    date: str = Query(
        default=None,
        description="日期,格式为 YYYY-MM-DD,默认为当天",
        example="2024-01-15"
    ),
    platforms: str = Query(
        default=None,
        description="平台列表,逗号分隔,默认搜索所有平台",
        example="weibo,baidu,zhihu"
    ),
    limit: int = Query(
        default=20,
        ge=1,
        le=100,
        description="返回结果数量限制,范围 1-100",
        example=20
    )
):
    """Search cached hot news by keyword across platforms.

    Processing steps:
        1. Load the cached news list of every selected platform.
        2. Keep the items whose title contains ``keyword`` (case-insensitive
           substring match).
        3. Group the matches by platform and sort each group by rank.
        4. Return at most ``limit`` items.

    Args:
        keyword: Text to look for in news titles (required).
        date: Day to query, formatted ``YYYY-MM-DD``. When omitted, today's
            date in the ``Asia/Shanghai`` timezone is used.
        platforms: Optional comma-separated platform codes. Codes that are
            not keys of ``crawler_factory`` are silently dropped and repeated
            codes are searched once. When omitted, all platforms are used.
        limit: Maximum number of items in ``data`` (1-100, default 20).

    Returns:
        A dict with ``status``, ``data``, ``msg``, ``total`` (number of
        matches before truncation) and ``search_results`` (number of items
        actually returned). Each item in ``data`` has the keys ``id``,
        ``title``, ``source``, ``rank``, ``category``, ``sub_category`` and
        ``url``. If no valid platform remains, ``status`` is ``"404"`` and
        ``data`` is empty.
    """
    if not date:
        date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d")

    # Decide which platforms to search.
    if platforms:
        valid_platforms = crawler_factory.keys()
        requested = (part.strip() for part in platforms.split(","))
        # dict.fromkeys de-duplicates while keeping the caller's order, so a
        # repeated platform no longer yields duplicated results.
        platform_list = list(dict.fromkeys(p for p in requested if p in valid_platforms))
    else:
        platform_list = list(crawler_factory.keys())

    if not platform_list:
        return {
            "status": "404",
            "data": [],
            "msg": "No valid platforms specified",
            "total": 0,
            "search_results": 0
        }

    # Collect a normalized entry for every cached news item.
    all_news = []
    for platform in platform_list:
        result = cache.get(f"crawler:{platform}:{date}")
        if not result:
            continue

        try:
            platform_news = json.loads(result)
            if not isinstance(platform_news, list):
                continue

            # Category labels depend only on the platform; resolve them once.
            category = _get_category_for_platform(platform)
            sub_category = _get_subcategory_for_platform(platform)

            for idx, item in enumerate(platform_news):
                if not isinstance(item, dict):
                    continue

                # Prefer an explicit "rank", then "index", and finally fall
                # back to the item's 1-based position in the list.
                raw_rank = item.get("rank") or item.get("index")
                if raw_rank:
                    rank_value = str(raw_rank).replace("#", "")
                else:
                    rank_value = str(idx + 1)

                # Cached data may carry a null or non-string title; coerce it
                # so the keyword match below cannot raise.
                title = item.get("title")
                if title is None:
                    title = ""
                elif not isinstance(title, str):
                    title = str(title)

                all_news.append({
                    "id": item.get("id"),
                    "title": title,
                    "source": platform,
                    "rank": rank_value,
                    "category": category,
                    "sub_category": sub_category,
                    "url": item.get("url", "")
                })

        except Exception as e:
            # Best effort: a broken platform payload is logged and skipped.
            log.error(f"Error processing news from {platform}: {e}")

    # Keyword match on the title (case-insensitive).
    needle = keyword.lower()
    search_results = [item for item in all_news if needle in item["title"].lower()]

    # Group by source platform; insertion order follows platform_list.
    grouped_results = {}
    for item in search_results:
        grouped_results.setdefault(item["source"], []).append(item)

    # Sort inside each platform by numeric rank, then flatten the groups.
    sorted_results = []
    for items in grouped_results.values():
        items.sort(key=_rank_sort_key)
        sorted_results.extend(items)

    limited_results = sorted_results[:limit]

    return {
        "status": "200",
        "data": limited_results,
        "msg": "success",
        "total": len(search_results),
        "search_results": len(limited_results)
    }


def _rank_sort_key(item: Dict[str, Any]) -> int:
    """Return the numeric sort key for a normalized news item's ``rank``.

    ``str.isdecimal`` is used rather than ``str.isdigit`` because the latter
    also accepts characters such as superscript digits, which ``int()``
    rejects with ``ValueError``. Ranks that are not plain decimal strings
    sort last (999).
    """
    rank = item["rank"]
    return int(rank) if rank.isdecimal() else 999
|
||
|
||
|
||
def _get_category_for_platform(platform: str) -> str:
|
||
"""根据平台返回对应的分类"""
|
||
categories = {
|
||
"36kr": "科技创业",
|
||
"hupu": "体育",
|
||
"sspai": "科技",
|
||
"weibo": "社交",
|
||
"zhihu": "知识",
|
||
"baidu": "综合",
|
||
"tieba": "社区",
|
||
"douban": "文化",
|
||
"bilibili": "视频",
|
||
"v2ex": "科技",
|
||
"github": "开发者",
|
||
"hackernews": "科技",
|
||
"stackoverflow": "开发者",
|
||
"jinritoutiao": "资讯",
|
||
"douyin": "娱乐",
|
||
"shaoshupai": "科技"
|
||
}
|
||
return categories.get(platform, "其他")
|
||
|
||
|
||
def _get_subcategory_for_platform(platform: str) -> str:
|
||
"""根据平台返回对应的子分类"""
|
||
subcategories = {
|
||
"36kr": "商业资讯",
|
||
"hupu": "娱乐",
|
||
"sspai": "数码",
|
||
"weibo": "热门",
|
||
"zhihu": "问答",
|
||
"baidu": "热搜",
|
||
"tieba": "讨论",
|
||
"douban": "影视",
|
||
"bilibili": "热门",
|
||
"v2ex": "技术",
|
||
"github": "开源",
|
||
"hackernews": "国际",
|
||
"stackoverflow": "问答",
|
||
"jinritoutiao": "热点",
|
||
"douyin": "娱乐",
|
||
"shaoshupai": "数码"
|
||
}
|
||
return subcategories.get(platform, "其他")
|