commit e0af97ac7f3618bfe27684db0255570816f82979 Author: biss Date: Thu Mar 26 15:04:59 2026 +0800 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..30a34f0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__/ +.idea +venv +merged_pwd_pool.txt +wrong_pwds.txt +logs +config/config-prod.yaml \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..05a8203 --- /dev/null +++ b/README.md @@ -0,0 +1,131 @@ +[English](README_EN.md) + +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg)](https://github.com/orz-ai/hot_news/actions) +[![Web Scraping](https://img.shields.io/badge/Web%20Scraping-enabled-green.svg)](https://github.com/orz-ai/hot_news/) +[![REST API](https://img.shields.io/badge/REST%20API-available-orange.svg)](https://news.orz.ai/docs) + +# 每日热点新闻 API + +- 线上地址:[热点速览](https://news.orz.ai/) +- 前端项目戳这里:[热点速览 - 前端项目](https://github.com/orz-ai/hot_news_front) + +## 概述 + +每日热点新闻 API 提供来自多个平台的实时热点新闻数据。数据大约每半小时自动刷新一次。此 API 可用于检索热点新闻标题及其 URL 和评分。 + +- **基础 URL**: `https://orz.ai/api/v1/dailynews` + +## 支持平台 + +我们目前支持以下平台的热点内容获取: + +| 序号 | 平台名称 | 平台代码 | 内容类型 | 状态 | +| ---- | --------------- | ------------- | ------------------------ | ---- | +| 1 | 百度热搜 | baidu | 社会热点、娱乐、事件 | ✅ | +| 2 | 少数派 | sspai | 科技、数码、生活方式 | ✅ | +| 3 | 微博热搜 | weibo | 社交媒体热点、娱乐、事件 | ✅ | +| 4 | 知乎热榜 | zhihu | 问答、深度内容、社会热点 | ✅ | +| 5 | 36氪 | tskr | 科技创业、商业资讯 | ✅ | +| 6 | 吾爱破解 | ftpojie | 技术、软件、安全 | ✅ | +| 7 | 哔哩哔哩 | bilibili | 视频、动漫、游戏、生活 | ✅ | +| 8 | 豆瓣 | douban | 书影音、文化、讨论 | ✅ | +| 9 | 虎扑 | hupu | 体育、游戏、数码 | ✅ | +| 10 | 百度贴吧 | tieba | 兴趣社区、话题讨论 | ✅ | +| 11 | 掘金 | juejin | 编程、技术文章 | ✅ | +| 12 | 抖音 | douyin | 短视频热点、娱乐 | ✅ | +| 13 | V2EX | vtex | 技术、编程、创意 | ✅ | +| 14 | 今日头条 | jinritoutiao | 新闻、热点事件 | ✅ | +| 15 | Stack Overflow | stackoverflow | 编程问答、技术讨论 | ✅ | +| 16 | GitHub Trending | github | 
开源项目、编程语言 | ✅ | +| 17 | Hacker News | hackernews | 科技新闻、创业、编程 | ✅ | +| 18 | 新浪财经 | sina_finance | 财经新闻、股市资讯 | ✅ | +| 19 | 东方财富 | eastmoney | 财经资讯、投资理财 | ✅ | +| 20 | 雪球 | xueqiu | 股票投资、财经社区 | ✅ | +| 21 | 财联社 | cls | 财经快讯、市场动态 | ✅ | +| 22 | 腾讯网 | tenxunwang | 综合新闻、娱乐、科技 | ✅ | + +## 使用方法 + +- **方法**: `GET` +- **参数**: + - `platform`: 指定平台。支持的平台有: + - [x] baidu + - [x] shaoshupai + - [x] ...... + +- **请求示例**: + ```shell + GET https://orz.ai/api/v1/dailynews/?platform=baidu + ``` + +- **响应示例**: + ```json + { + "status": "200", + "data": [ + { + "title": "32岁'母单'女孩:6年相亲百人", + "url": "https://www.baidu.com/s?word=32%E5%B2%81%E2%80%9C%E6%AF%8D%E5%8D%95%E2%80%9D%E5%A5%B3%E5%AD%A9%EF%BC%9A6%E5%B9%B4%E7%9B%B8%E4%BA%B2%E7%99%BE%E4%BA%BA&sa=fyb_news", + "score": "4955232", + "desc": "" + }, + { + "title": "女高中生被父母退学:打工卖包子", + "url": "https://www.baidu.com/s?word=%E5%A5%B3%E9%AB%98%E4%B8%AD%E7%94%9F%E8%A2%AB%E7%88%B6%E6%AF%8D%E9%80%80%E5%AD%A6%EF%BC%9A%E6%89%93%E5%B7%A5%E5%8D%96%E5%8C%85%E5%AD%90&sa=fyb_news", + "score": "100000", + "desc": "近日,一名高二女生被父母强制辍学去广东打工卖包子,引发热议。26日,当地教育局回应:已经妥善处理了,女生已复学。" + } + ], + "msg": "success" + } + ``` + +## 注意事项 + +- 此 API 仅供合法使用。`任何非法使用均不受支持`,且由用户自行负责。 +- 本 API 提供的数据仅供参考,不应作为新闻的主要来源。 + +## 速率限制 + +目前此 API `没有明确的速率限制`,但请合理使用以避免服务器过载。 + +## 免责声明 + +本 API 提供的信息可能并非始终准确或最新。用户应在依赖这些信息之前从其他平台进行验证。 + + +## Telegram机器人 +[链接](https://t.me/SpaceWatcherBot) + +你可以直接使用机器人或添加到你的群组中。如果你想自己部署,你需要在环境变量中设置好 `TG_BOT_TOKEN`,再执行下面的命令:`python3 news_tg_bot.py` + +## 网站基础信息接口 + +[https://orz.ai/api/v1/tools/website-meta/?url=https://v2ex.com/](https://orz.ai/api/v1/tools/website-meta/?url=https://v2ex.com/) + +使用方法:`GET` +```shell +GET https://orz.ai/api/v1/tools/website-meta/?url=https://v2ex.com/ + +{ + "status": "200", + "data": { + "meta_info": { + "title": "V2EX", + "description": "创意工作者的社区。讨论编程、设计、硬件、游戏等令人激动的话题。", + "keywords": "", + "author": "", + "og:title": "", + "og:description": "", + "og:image": "/static/icon-192.png", + "og:url": "https://v2ex.com/", + 
"twitter:card": "", + "twitter:title": "", + "twitter:description": "", + "twitter:image": "/static/icon-192.png" + }, + "favicon_url": "https://v2ex.com/static/icon-192.png" + }, + "msg": "Success" +} diff --git a/README_EN.md b/README_EN.md new file mode 100644 index 0000000..d7b6661 --- /dev/null +++ b/README_EN.md @@ -0,0 +1,133 @@ +[中文](README.md) + +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg)](https://github.com/orz-ai/hot_news/actions) +[![Web Scraping](https://img.shields.io/badge/Web%20Scraping-enabled-green.svg)](https://github.com/orz-ai/hot_news/) +[![REST API](https://img.shields.io/badge/REST%20API-available-orange.svg)](https://news.orz.ai/docs) + +# Daily Hot News API + +- Live Demo: [Hot News](https://news.orz.ai/) +- Frontend project: [Hot News - Frontend](https://github.com/orz-ai/hot_news_front) + +## Overview + +The Daily Hot News API provides access to real-time hot news data from various platforms. The data is refreshed automatically about every half an hour. This API can be used to retrieve hot news headlines along with their URLs and scores. + +- **Base URL**: `https://orz.ai/api/v1/dailynews` + +## Supported Platforms + +We currently support trending content from the following platforms: + +| Serial No. 
| Platform Name | Platform Code | Content Type | Status | +| ---------- | ---------------- | ------------- | ------------------------------------------- | ------ | +| 1 | Baidu Hot Search | baidu | Social trends, entertainment, events | ✅ | +| 2 | Sspai | sspai | Tech, digital, lifestyle | ✅ | +| 3 | Weibo Hot Search | weibo | Social media trends, entertainment, events | ✅ | +| 4 | Zhihu Hot List | zhihu | Q&A, in-depth content, social topics | ✅ | +| 5 | 36Kr | tskr | Tech startups, business news | ✅ | +| 6 | 52Pojie Forum | ftpojie | Technology, software, security | ✅ | +| 7 | Bilibili | bilibili | Videos, anime, gaming, lifestyle | ✅ | +| 8 | Douban | douban | Books, movies, music, culture | ✅ | +| 9 | Hupu | hupu | Sports, gaming, digital | ✅ | +| 10 | Baidu Tieba | tieba | Interest communities, topic discussions | ✅ | +| 11 | Juejin | juejin | Programming, technical articles | ✅ | +| 12 | TikTok/Douyin | douyin | Short video trends, entertainment | ✅ | +| 13 | V2EX | vtex | Technology, programming, creativity | ✅ | +| 14 | Toutiao | jinritoutiao | News, trending events | ✅ | +| 15 | Stack Overflow | stackoverflow | Programming Q&A, technical discussions | ✅ | +| 16 | GitHub Trending | github | Open source projects, programming languages | ✅ | +| 17 | Hacker News | hackernews | Tech news, startups, programming | ✅ | +| 18 | Sina Finance | sina_finance | Financial news, stock market information | ✅ | +| 19 | East Money | eastmoney | Financial information, investment advice | ✅ | +| 20 | Xueqiu | xueqiu | Stock investment, financial community | ✅ | +| 21 | Cailianpress | cls | Financial news, market updates | ✅ | +| 22 | Tencent News | tenxunwang | General news, entertainment, technology | ✅ | +| 23 | WeChat | weixin | Social, information | ✅ | + +## Usage + +- **Method**: `GET` +- **Parameters**: + - `platform`: Specify the platform. Supported platforms are: + - [x] baidu + - [x] shaoshupai + - [x] .... 
+ + +- **Example Request**: + ```shell + GET https://orz.ai/api/v1/dailynews/?platform=baidu + ``` + +- **Example Response**: + ```json + { + "status": "200", + "data": [ + { + "title": "32岁'母单'女孩:6年相亲百人", + "url": "https://www.baidu.com/s?word=32%E5%B2%81%E2%80%9C%E6%AF%8D%E5%8D%95%E2%80%9D%E5%A5%B3%E5%AD%A9%EF%BC%9A6%E5%B9%B4%E7%9B%B8%E4%BA%B2%E7%99%BE%E4%BA%BA&sa=fyb_news", + "score": "4955232", + "desc": "" + }, + { + "title": "女高中生被父母退学:打工卖包子", + "url": "https://www.baidu.com/s?word=%E5%A5%B3%E9%AB%98%E4%B8%AD%E7%94%9F%E8%A2%AB%E7%88%B6%E6%AF%8D%E9%80%80%E5%AD%A6%EF%BC%9A%E6%89%93%E5%B7%A5%E5%8D%96%E5%8C%85%E5%AD%90&sa=fyb_news", + "score": "100000", + "desc": "近日,一名高二女生被父母强制辍学去广东打工卖包子,引发热议。26日,当地教育局回应:已经妥善处理了,女生已复学。" + } + ], + "msg": "success" + } + ``` + +## Notes + +- This API is for legal use only. `Any illegal use is not supported` and is the responsibility of the user. +- The data provided by this API is for informational purposes only and should not be used as a primary platform of news. + +## Rate Limiting + +There is currently `no explicit rate limiting` on this API, but please use it responsibly to avoid overloading the server. + +## Disclaimer + +The information provided by this API may not always be accurate or up-to-date. Users should verify the information from other platforms before relying on it. + +## Telegram Bot +[Link](https://t.me/SpaceWatcherBot) + +You can use the bot directly or add it to your group. 
+ +If you want to deploy the bot by yourself, you should set the `TG_BOT_TOKEN` in the environment variables and then run the following command: `python3 news_tg_bot.py` + +## Fetch Basic Information API + +Method:`GET` +```shell +GET https://orz.ai/api/v1/tools/website-meta/?url=https://v2ex.com/ + +{ + "status": "200", + "data": { + "meta_info": { + "title": "V2EX", + "description": "创意工作者的社区。讨论编程、设计、硬件、游戏等令人激动的话题。", + "keywords": "", + "author": "", + "og:title": "", + "og:description": "", + "og:image": "/static/icon-192.png", + "og:url": "https://v2ex.com/", + "twitter:card": "", + "twitter:title": "", + "twitter:description": "", + "twitter:image": "/static/icon-192.png" + }, + "favicon_url": "https://v2ex.com/static/icon-192.png" + }, + "msg": "Success" +} +``` \ No newline at end of file diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/analysis/__init__.py b/app/analysis/__init__.py new file mode 100644 index 0000000..c52f403 --- /dev/null +++ b/app/analysis/__init__.py @@ -0,0 +1,8 @@ +""" +热点分析模块,包含热点聚合分析和热点趋势预测功能 +""" + +from app.analysis.trend_analyzer import TrendAnalyzer +from app.analysis.predictor import TrendPredictor + +__all__ = ['TrendAnalyzer', 'TrendPredictor'] \ No newline at end of file diff --git a/app/analysis/predictor/__init__.py b/app/analysis/predictor/__init__.py new file mode 100644 index 0000000..f600347 --- /dev/null +++ b/app/analysis/predictor/__init__.py @@ -0,0 +1,3 @@ +from app.analysis.predictor.predictor import TrendPredictor + +__all__ = ['TrendPredictor'] \ No newline at end of file diff --git a/app/analysis/predictor/predictor.py b/app/analysis/predictor/predictor.py new file mode 100644 index 0000000..7263724 --- /dev/null +++ b/app/analysis/predictor/predictor.py @@ -0,0 +1,512 @@ +import json +import random +from collections import defaultdict, Counter +from datetime import datetime, timedelta +import pytz +from typing import Dict, List, Any, Optional, Tuple + 
class TrendPredictor:
    """热点趋势预测器,用于预测热点话题的发展趋势。

    The only public entry point is :meth:`get_prediction`.  It reads the
    per-platform crawler caches for the last :attr:`history_days` days and
    derives rising/persistent topics plus category, platform and keyword
    trends.

    NOTE(review): several output fields (confidence, peak time, duration,
    category percentages) are *simulated* with ``random`` — placeholders,
    not real model output.
    """

    def __init__(self):
        # Redis key namespace and TTL for cached prediction payloads.
        self.cache_key_prefix = "analysis:prediction:"
        self.cache_expire = 3600  # 1小时缓存
        # All date arithmetic in this module is done in Asia/Shanghai time.
        self.shanghai_tz = pytz.timezone('Asia/Shanghai')
        self.history_days = 7  # 使用过去7天的数据进行预测

    def get_prediction(self, date_str: Optional[str] = None) -> Dict[str, Any]:
        """获取指定日期的热点趋势预测。

        Args:
            date_str: ``YYYY-MM-DD``; defaults to "today" in Asia/Shanghai.

        Returns:
            The prediction payload, or a ``status='processing'`` placeholder
            when no historical data is cached yet.
        """
        if not date_str:
            date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d")

        # Serve from cache when a recent prediction already exists.
        cache_key = f"{self.cache_key_prefix}{date_str}"
        cached_prediction = cache.get_cache(cache_key)
        if cached_prediction:
            log.info(f"Retrieved trend prediction from cache for {date_str}")
            return cached_prediction

        prediction_result = self._predict_trends(date_str)
        if prediction_result:
            cache.set_cache(cache_key, prediction_result, self.cache_expire)
        return prediction_result

    def _predict_trends(self, date_str: str) -> Dict[str, Any]:
        """Assemble the full prediction payload for one day."""
        historical_data = self._get_historical_data(date_str)
        now_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")

        if not historical_data:
            log.warning(f"No historical data available for trend prediction on {date_str}")
            # 'processing' (not 'error') so the frontend keeps polling.
            return {
                "status": "processing",
                "message": "正在准备热点趋势预测",
                "detail": "我们正在对全网热点数据进行高级分析,请稍候...",
                "date": date_str,
                "updated_at": now_str,
            }

        return {
            "status": "success",
            "message": "热点趋势预测完成",
            "date": date_str,
            "trending_topics": self._predict_trending_topics(historical_data),
            "category_trends": self._predict_category_trends(historical_data),
            "platform_trends": self._predict_platform_trends(historical_data),
            "keyword_predictions": self._predict_keywords(historical_data),
            "prediction_window": f"{self.history_days} days",
            "updated_at": now_str,
        }

    def _get_historical_data(self, end_date_str: str) -> Dict[str, Dict[str, List]]:
        """获取历史数据: cached crawler data for the window ending at ``end_date_str``.

        Returns ``{date: {platform: items}}``; days with no cached data at
        all are omitted entirely.
        """
        end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
        historical_data: Dict[str, Dict[str, List]] = {}

        for offset in range(self.history_days):
            day_str = (end_date - timedelta(days=offset)).strftime("%Y-%m-%d")

            daily_data = {}
            for platform in crawler_factory.keys():
                platform_data = cache.get_cache(f"crawler:{platform}:{day_str}")
                if platform_data:
                    daily_data[platform] = platform_data

            if daily_data:  # 只保存有数据的日期
                historical_data[day_str] = daily_data

        return historical_data

    def _predict_trending_topics(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
        """预测未来将会流行的话题 — merge rising and persistent topics."""
        rising_topics = self._find_rising_topics(historical_data)
        persistent_topics = self._find_persistent_topics(historical_data)
        trending_topics: List[Dict[str, Any]] = []

        for topic in rising_topics[:5]:
            # Simulated peak 6-24h out, formatted consistently with every
            # other timestamp in this module (was a raw datetime repr).
            peak = datetime.now(self.shanghai_tz) + timedelta(hours=random.randint(6, 24))
            trending_topics.append({
                "title": topic["title"],
                "trend": "rising",
                "prediction": {
                    "future_rank": "上升",
                    "peak_time": peak.strftime("%Y-%m-%d %H:%M:%S"),
                    "duration": f"{random.randint(1, 3)}天",
                    "confidence": random.randint(70, 95),
                },
                "current_data": {
                    "rank_change": topic["rank_change"],
                    "score_change": topic["score_change"],
                    "days_tracked": topic["days_tracked"],
                },
            })

        for topic in persistent_topics[:5]:
            trending_topics.append({
                "title": topic["title"],
                "trend": "persistent",
                "prediction": {
                    "future_rank": "稳定",
                    "peak_time": "已达峰值",
                    "duration": f"{random.randint(2, 5)}天",
                    "confidence": random.randint(80, 95),
                },
                "current_data": {
                    "appearances": topic["appearances"],
                    "appearance_rate": topic["appearance_rate"],
                    "platform_count": topic["platform_count"],
                },
            })

        return trending_topics

    def _predict_category_trends(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
        """预测各类别的趋势变化。

        NOTE(review): simplified implementation — history and forecast are
        random numbers, not derived from ``historical_data``.  (The previous
        version also re-imported ``random`` here, shadowing the module-level
        import; removed.)
        """
        categories = ["科技", "娱乐", "社会", "财经", "体育", "教育", "健康", "国际"]
        category_trends = []

        for category in categories:
            # Fabricated per-day history; index 0 = today, going backwards.
            history = []
            for day_offset in range(self.history_days):
                day = datetime.now(self.shanghai_tz) - timedelta(days=day_offset)
                history.append({
                    "date": day.strftime("%Y-%m-%d"),
                    "percentage": round(random.uniform(5, 25), 1),
                })

            # Trend direction from today's value vs. the oldest value.
            current = history[0]["percentage"]
            past = history[-1]["percentage"]
            if current > past:
                trend = "rising"
            elif current < past:
                trend = "falling"
            else:
                trend = "stable"

            # Extrapolate 3 days ahead along the fabricated trend.
            future = []
            for day_offset in range(3):
                day = datetime.now(self.shanghai_tz) + timedelta(days=day_offset + 1)
                if trend == "rising":
                    value = current + random.uniform(0.5, 2.0) * (day_offset + 1)
                elif trend == "falling":
                    value = current - random.uniform(0.5, 1.5) * (day_offset + 1)
                else:
                    value = current + random.uniform(-1.0, 1.0)
                value = max(3, min(30, value))  # clamp to a plausible share
                future.append({
                    "date": day.strftime("%Y-%m-%d"),
                    "percentage": round(value, 1),
                })

            category_trends.append({
                "category": category,
                "current_percentage": current,
                "trend": trend,
                "history": history,
                "prediction": future,
                "confidence": random.randint(70, 95),
            })

        return category_trends

    def _predict_platform_trends(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, Any]:
        """预测各平台的趋势变化。"""
        platform_growth = self._analyze_platform_trends(historical_data)
        future_trends: Dict[str, Any] = {}

        for entry in platform_growth["emerging"]:
            future_trends[entry["platform"]] = {
                "current_trend": "rising",
                "future_trend": "continued_growth",
                "growth_potential": random.randint(10, 30),
                "confidence": random.randint(70, 90),
            }

        for entry in platform_growth["declining"]:
            future_trends[entry["platform"]] = {
                "current_trend": "falling",
                "future_trend": random.choice(["stabilize", "continued_decline"]),
                "decline_rate": random.randint(5, 20),
                "confidence": random.randint(60, 85),
            }

        # Platforms with no clear movement get a stable-ish placeholder.
        for platform in crawler_factory.keys():
            if platform not in future_trends:
                future_trends[platform] = {
                    "current_trend": "stable",
                    "future_trend": random.choice(["slight_growth", "stable", "slight_decline"]),
                    "change_rate": random.randint(-10, 10),
                    "confidence": random.randint(60, 80),
                }

        return {
            "platform_predictions": future_trends,
            "emerging_platforms": [p["platform"] for p in platform_growth["emerging"][:3]],
            "declining_platforms": [p["platform"] for p in platform_growth["declining"][:3]],
        }

    def _predict_keywords(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, List]:
        """预测关键词趋势 — emerging vs fading keywords."""
        keyword_trends = self._analyze_keyword_trends(historical_data)
        predictions: Dict[str, List] = {"emerging": [], "fading": []}

        for kw in keyword_trends["rising"]:
            predictions["emerging"].append({
                "keyword": kw["keyword"],
                "current_growth": kw["growth_rate"],
                # Simulated acceleration of the observed growth.
                "predicted_growth": kw["growth_rate"] * random.uniform(1.1, 1.5),
                "peak_time": f"{random.randint(1, 3)}天后",
                "confidence": random.randint(70, 90),
            })

        for kw in keyword_trends["falling"]:
            predictions["fading"].append({
                "keyword": kw["keyword"],
                "current_decline": abs(kw["growth_rate"]),
                "predicted_decline": abs(kw["growth_rate"]) * random.uniform(1.1, 1.3),
                "expected_duration": f"{random.randint(2, 5)}天",
                "confidence": random.randint(75, 90),
            })

        return predictions

    def _find_rising_topics(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
        """查找上升趋势的话题 (rank improved or score grew across the window)."""
        sorted_dates = sorted(historical_data.keys())
        if len(sorted_dates) < 2:
            return []  # need at least two days to observe a trend

        # title -> chronological sightings of (date, platform, rank, score)
        topic_trends: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

        for date_str in sorted_dates:
            for platform, items in historical_data[date_str].items():
                # enumerate gives the true position; the previous
                # ``items.index(item) + 1`` lookup was O(n) per item and
                # returned the wrong rank for duplicate entries.
                for position, item in enumerate(items, start=1):
                    title = item.get("title", "")
                    if not title:
                        continue
                    topic_trends[title].append({
                        "date": date_str,
                        "platform": platform,
                        "rank": position,
                        # NOTE(review): assumes 'score' is numeric; some
                        # crawlers emit string scores — confirm upstream.
                        "score": item.get("score", 0),
                    })

        rising_topics = []
        for title, sightings in topic_trends.items():
            if len(sightings) < 2:
                continue

            # Stable sort: sightings within the same day keep insertion order.
            sightings.sort(key=lambda s: s["date"])
            first, last = sightings[0], sightings[-1]

            rank_change = first["rank"] - last["rank"]     # positive = moved up the list
            score_change = last["score"] - first["score"]  # positive = score grew

            if rank_change > 0 or score_change > 0:
                rising_topics.append({
                    "title": title,
                    "rank_change": rank_change,
                    "score_change": score_change,
                    "first_appearance": first,
                    "last_appearance": last,
                    "days_tracked": len({s["date"] for s in sightings}),
                })

        rising_topics.sort(key=lambda t: (t["rank_change"], t["score_change"]), reverse=True)
        return rising_topics[:10]  # 返回前10个上升趋势话题

    def _find_persistent_topics(self, historical_data: Dict[str, Dict[str, List]]) -> List[Dict[str, Any]]:
        """查找持续热门的话题 (sighted at least len(days)/2 times)."""
        sorted_dates = sorted(historical_data.keys())
        if len(sorted_dates) < 2:
            return []

        topic_appearances: Dict[str, int] = defaultdict(int)
        topic_platforms: Dict[str, set] = defaultdict(set)
        topic_last_seen: Dict[str, str] = {}

        for date_str in sorted_dates:
            for platform, items in historical_data[date_str].items():
                for item in items:
                    title = item.get("title", "")
                    if not title:
                        continue
                    # Counts every sighting, so one topic on several
                    # platforms in a single day accumulates multiple hits.
                    topic_appearances[title] += 1
                    topic_platforms[title].add(platform)
                    topic_last_seen[title] = date_str

        threshold = len(sorted_dates) / 2  # "more than half the days" heuristic
        persistent_topics = [
            {
                "title": title,
                "appearances": count,
                "appearance_rate": count / len(sorted_dates),
                "platforms": list(topic_platforms[title]),
                "platform_count": len(topic_platforms[title]),
                "last_seen": topic_last_seen[title],
            }
            for title, count in topic_appearances.items()
            if count >= threshold
        ]

        persistent_topics.sort(key=lambda t: (t["appearances"], t["platform_count"]), reverse=True)
        return persistent_topics[:10]  # 返回前10个持续热门话题

    def _analyze_platform_trends(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, Any]:
        """分析平台趋势: item-count growth between the first and last day."""
        sorted_dates = sorted(historical_data.keys())
        if len(sorted_dates) < 2:
            return {"emerging": [], "declining": []}

        # platform -> {date: number of hot items that day}
        platform_trends: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for date_str in sorted_dates:
            for platform, items in historical_data[date_str].items():
                platform_trends[platform][date_str] = len(items)

        first_date, last_date = sorted_dates[0], sorted_dates[-1]
        platform_growth: Dict[str, Dict[str, Any]] = {}

        for platform, date_counts in platform_trends.items():
            if len(date_counts) < 2:
                continue  # one data point cannot show growth

            first_count = date_counts.get(first_date, 0)
            last_count = date_counts.get(last_date, 0)

            if first_count == 0:
                # Avoid division by zero: appearing from nothing counts as +100%.
                growth_rate = 100 if last_count > 0 else 0
            else:
                growth_rate = ((last_count - first_count) / first_count) * 100

            if growth_rate > 0:
                trend = "rising"
            elif growth_rate < 0:
                trend = "falling"
            else:
                trend = "stable"

            platform_growth[platform] = {
                "first_count": first_count,
                "last_count": last_count,
                "growth_rate": growth_rate,
                "trend": trend,
            }

        ranked = sorted(platform_growth.items(), key=lambda kv: kv[1]["growth_rate"], reverse=True)

        return {
            # NOTE(review): 'emerging' is simply the 5 fastest growers and can
            # include negative growth when every platform shrank — kept as-is.
            "emerging": [{"platform": p, **data} for p, data in ranked[:5]],
            "declining": [{"platform": p, **data} for p, data in ranked[-5:] if data["growth_rate"] < 0],
        }

    def _analyze_keyword_trends(self, historical_data: Dict[str, Dict[str, List]]) -> Dict[str, List]:
        """分析关键词趋势: whitespace-token frequency growth across the window.

        NOTE(review): tokenisation is ``title.split()``, which effectively
        never splits Chinese titles — a real segmenter (e.g. jieba) would be
        needed for meaningful Chinese keywords.
        """
        sorted_dates = sorted(historical_data.keys())
        if len(sorted_dates) < 2:
            return {"rising": [], "falling": []}

        # date -> Counter of token frequencies over that day's titles
        date_keywords: Dict[str, Counter] = defaultdict(Counter)
        for date_str in sorted_dates:
            for platform, items in historical_data[date_str].items():
                for item in items:
                    for word in item.get("title", "").split():
                        if len(word) > 1:  # 忽略单字
                            date_keywords[date_str][word] += 1

        # Union of every keyword seen anywhere in the window.
        all_keywords = set()
        for counter in date_keywords.values():
            all_keywords.update(counter.keys())

        keyword_trends: Dict[str, List] = defaultdict(list)

        for keyword in all_keywords:
            trend_data = [
                {"date": date_str, "count": date_keywords[date_str].get(keyword, 0)}
                for date_str in sorted_dates
            ]

            first_count = trend_data[0]["count"]
            last_count = trend_data[-1]["count"]
            if first_count == 0:
                growth_rate = 100 if last_count > 0 else 0
            else:
                growth_rate = ((last_count - first_count) / first_count) * 100

            record = {
                "keyword": keyword,
                "growth_rate": growth_rate,
                "first_count": first_count,
                "last_count": last_count,
                "trend_data": trend_data,
            }
            if growth_rate > 50:     # 增长超过50%
                keyword_trends["rising"].append(record)
            elif growth_rate < -50:  # 下降超过50%
                keyword_trends["falling"].append(record)

        keyword_trends["rising"].sort(key=lambda k: k["growth_rate"], reverse=True)
        keyword_trends["falling"].sort(key=lambda k: k["growth_rate"])

        return {
            "rising": keyword_trends["rising"][:10],   # 前10个上升关键词
            "falling": keyword_trends["falling"][:10], # 前10个下降关键词
        }
"和", "是", "在", "我", "有", "个", "这", "那", "什么", "怎么"]) + + def _load_category_keywords(self) -> Dict[str, List[str]]: + """从配置文件加载类别关键词""" + try: + with open("app/data/config/category_keywords.json", "r", encoding="utf-8") as f: + return json.load(f) + except Exception as e: + log.error(f"Error loading category keywords: {e}") + # 如果加载失败,返回一个简单的类别关键词字典 + return {category: [] for category in self.categories} + + def get_analysis(self, date_str: Optional[str] = None, analysis_type: str = "main") -> Dict[str, Any]: + """获取指定日期的热点聚合分析 + + Args: + date_str: 日期字符串,格式为YYYY-MM-DD + analysis_type: 分析类型,可选值为 main(主题分析), platform(平台对比), + cross(跨平台热点), advanced(高级分析) + """ + if not date_str: + date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d") + + # 尝试从缓存获取 + cache_key = f"{self.cache_key_prefix}{date_str}:{analysis_type}" + cached_analysis = cache.get_cache(cache_key) + if cached_analysis: + log.info(f"Retrieved trend analysis from cache for {date_str}, type: {analysis_type}") + return cached_analysis + + # 执行分析 + analysis_result = self._analyze_trends(date_str, analysis_type) + + # 缓存结果 + if analysis_result: + cache.set_cache(cache_key, analysis_result, self.cache_expire) + + return analysis_result + + def _get_platform_data(self, date_str: str) -> Dict[str, List]: + """获取所有平台的热点数据(共用方法)""" + all_platform_data = {} + for platform in crawler_factory.keys(): + cache_key = f"crawler:{platform}:{date_str}" + platform_data = cache.get_cache(cache_key) + if platform_data: + all_platform_data[platform] = platform_data + + return all_platform_data + + def get_platform_comparison(self, date_str: Optional[str] = None) -> Dict[str, Any]: + """获取平台对比分析数据 + + Args: + date_str: 日期字符串,格式为YYYY-MM-DD + """ + if not date_str: + date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d") + + # 尝试从缓存获取 + cache_key = f"{self.cache_key_prefix}{date_str}:platform_comparison" + cached_analysis = cache.get_cache(cache_key) + if cached_analysis: + log.info(f"Retrieved platform 
comparison from cache for {date_str}") + return cached_analysis + + # 收集所有平台的热点数据 + all_platform_data = self._get_platform_data(date_str) + + if not all_platform_data: + log.warning(f"No data available for platform comparison on {date_str}") + return { + "status": "error", + "message": "暂无可用数据进行平台对比分析", + "date": date_str + } + + # 执行平台对比分析 + analysis_result = self._analyze_platform_comparison(all_platform_data, date_str) + + # 缓存结果 + if analysis_result: + cache.set_cache(cache_key, analysis_result, self.cache_expire) + + return analysis_result + + def get_cross_platform_analysis(self, date_str: Optional[str] = None, refresh: bool = False) -> Dict[str, Any]: + """获取跨平台热点分析数据 + + Args: + date_str: 日期字符串,格式为YYYY-MM-DD + refresh: 是否强制刷新缓存 + """ + if not date_str: + date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d") + + # 缓存处理 + cache_key = f"{self.cache_key_prefix}{date_str}:cross_platform" + + # 如果强制刷新或者没有缓存,则重新分析 + if refresh: + # 清除旧的缓存 + cache.delete_cache(cache_key) + else: + # 尝试从缓存获取 + cached_analysis = cache.get_cache(cache_key) + if cached_analysis: + log.info(f"Retrieved cross platform analysis from cache for {date_str}") + return cached_analysis + + # 收集所有平台的热点数据 + all_platform_data = self._get_platform_data(date_str) + + if not all_platform_data: + log.warning(f"No data available for cross platform analysis on {date_str}") + return { + "status": "error", + "message": "暂无可用数据进行跨平台热点分析", + "date": date_str + } + + # 过滤掉不是列表的数据 + filtered_data = {} + for platform, data in all_platform_data.items(): + if isinstance(data, list): + filtered_data[platform] = data + else: + log.warning(f"Platform {platform} data is not a list, skipping") + + # 使用过滤后的数据 + all_platform_data = filtered_data + + # 执行跨平台热点分析 + analysis_result = self._analyze_cross_platform(all_platform_data, date_str) + + # 缓存结果 + if analysis_result: + cache.set_cache(cache_key, analysis_result, self.cache_expire) + + return analysis_result + + def get_advanced_analysis(self, date_str: 
Optional[str] = None, refresh: bool = False) -> Dict[str, Any]: + """获取高级分析数据 + + Args: + date_str: 日期字符串,格式为YYYY-MM-DD + refresh: 是否强制刷新缓存 + """ + if not date_str: + date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d") + + # 缓存处理 + cache_key = f"{self.cache_key_prefix}{date_str}:advanced_analysis" + + # 如果强制刷新或者没有缓存,则重新分析 + if refresh: + # 清除旧的缓存 + cache.delete_cache(cache_key) + else: + # 尝试从缓存获取 + cached_analysis = cache.get_cache(cache_key) + if cached_analysis: + log.info(f"Retrieved advanced analysis from cache for {date_str}") + return cached_analysis + + # 收集所有平台的热点数据 + all_platform_data = self._get_platform_data(date_str) + + if not all_platform_data: + log.warning(f"No data available for advanced analysis on {date_str}") + return { + "status": "error", + "message": "暂无可用数据进行高级分析", + "date": date_str + } + + # 过滤掉不是列表的数据 + filtered_data = {} + for platform, data in all_platform_data.items(): + if isinstance(data, list): + filtered_data[platform] = data + else: + log.warning(f"Platform {platform} data is not a list, skipping") + + # 使用过滤后的数据 + all_platform_data = filtered_data + + # 执行高级分析 + analysis_result = self._analyze_advanced(all_platform_data, date_str) + + # 缓存结果 + if analysis_result: + cache.set_cache(cache_key, analysis_result, self.cache_expire) + + return analysis_result + + def _analyze_trends(self, date_str: str, analysis_type: str) -> Dict[str, Any]: + """分析各平台热点数据,提取共性和差异""" + # 收集所有平台的热点数据 + all_platform_data = {} + for platform in crawler_factory.keys(): + cache_key = f"crawler:{platform}:{date_str}" + platform_data = cache.get_cache(cache_key) + if platform_data: + all_platform_data[platform] = platform_data + + if not all_platform_data: + log.warning(f"No data available for trend analysis on {date_str}") + return { + "status": "error", + "message": "暂无可用数据进行分析", + "date": date_str + } + + # 现在只处理主题分析,其他类型通过专门的接口处理 + return self._analyze_main_themes(all_platform_data, date_str) + + def _analyze_main_themes(self, all_data: 
Dict[str, List], date_str: str) -> Dict[str, Any]: + """主题分析 - 分析热门关键词、主题分布和相关主题词组""" + # 提取热门关键词(用于标签云) + hot_keywords = self._extract_hot_keywords(all_data) + + # 分析主题分布(各类别占比) + topic_distribution = self._analyze_topic_distribution(all_data) + + # 分析相关主题词组 + related_topic_groups = self._analyze_related_topic_groups(all_data) + + # 返回结果 + return { + "status": "success", + "date": date_str, + "analysis_type": "main", + "hot_keywords": hot_keywords, + "topic_distribution": topic_distribution, + "related_topic_groups": related_topic_groups, + "updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S") + } + + def _analyze_platform_comparison(self, all_data: Dict[str, List], date_str: str) -> Dict[str, Any]: + """平台对比分析""" + # 分析各平台热点特点 + platform_stats = self._get_platform_stats(all_data) + + # 平台热度排行 + platform_rankings = self._get_platform_rankings(all_data) + + # 平台更新频率 + platform_update_frequency = self._get_platform_update_frequency(all_data) + + return { + "status": "success", + "date": date_str, + "analysis_type": "platform", + "platform_stats": platform_stats, + "platform_rankings": platform_rankings, + "platform_update_frequency": platform_update_frequency, + "updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S") + } + + def _get_platform_rankings(self, all_data: Dict[str, List]) -> List[Dict[str, Any]]: + """获取平台热度排行""" + platform_scores = [] + + for platform, items in all_data.items(): + # 确保items是列表 + if not isinstance(items, list): + log.warning(f"Platform {platform} data is not a list, skipping") + continue + + # 过滤出有效的条目(必须是字典) + valid_items = [] + for item in items: + if isinstance(item, dict): + valid_items.append(item) + else: + log.warning(f"Item in platform {platform} is not a dictionary, skipping: {item}") + + # 计算平台热度分数 + total_items = len(valid_items) + + # 安全地计算平均分数 + try: + scores = [item.get("score", 0) for item in valid_items] + # 确保所有分数都是数字 + scores = [s for s in scores if isinstance(s, (int, float))] + 
avg_score = sum(scores) / max(len(scores), 1) if scores else 0 + except Exception as e: + log.error(f"Error calculating average score for {platform}: {e}") + avg_score = 0 + + # 计算平台总热度 + platform_heat = total_items * avg_score + + # 计算平台热度变化趋势 + # 简化实现:随机生成变化趋势 + try: + import random + trend_value = random.uniform(-10.0, 10.0) + except Exception as e: + log.error(f"Error generating trend value: {e}") + trend_value = 0.0 + + platform_scores.append({ + "platform": platform, + "heat": round(platform_heat, 1), + "trend": round(trend_value, 1) + }) + + # 按热度排序 + platform_scores.sort(key=lambda x: x["heat"], reverse=True) + + # 添加排名 + for i, item in enumerate(platform_scores): + item["rank"] = i + 1 + + return platform_scores + + def _get_platform_update_frequency(self, all_data: Dict[str, List]) -> Dict[str, Any]: + """获取平台更新频率""" + # 分析各平台的更新时间分布 + # 简化实现:将一天分为四个时段,统计每个时段的更新比例 + time_periods = { + "morning": {"label": "上午", "percentage": 0}, + "afternoon": {"label": "下午", "percentage": 0}, + "evening": {"label": "晚上", "percentage": 0}, + "night": {"label": "凌晨", "percentage": 0} + } + + platform_frequencies = {} + + for platform, items in all_data.items(): + # 统计各时段的更新数量 + period_counts = { + "morning": 0, + "afternoon": 0, + "evening": 0, + "night": 0 + } + + for item in items: + # 获取更新时间 + update_time = item.get("update_time", "") + if not update_time: + continue + + try: + # 尝试解析时间 + if isinstance(update_time, str): + # 尝试从字符串解析时间 + if ":" in update_time: + # 如果只有时间部分,如 "14:30" + hour = int(update_time.split(":")[0]) + else: + # 如果是完整日期时间,尝试提取小时 + for time_format in ["%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S", "%H:%M:%S"]: + try: + parsed_time = datetime.strptime(update_time, time_format) + hour = parsed_time.hour + break + except ValueError: + continue + else: + # 如果所有格式都不匹配,跳过 + continue + elif isinstance(update_time, (int, float)): + # 如果是时间戳 + hour = datetime.fromtimestamp(update_time).hour + else: + continue + + # 根据小时确定时段 + if 6 <= hour < 12: + 
period_counts["morning"] += 1 + elif 12 <= hour < 18: + period_counts["afternoon"] += 1 + elif 18 <= hour < 24: + period_counts["evening"] += 1 + else: + period_counts["night"] += 1 + + except Exception as e: + log.error(f"Error parsing update time: {update_time}, {e}") + continue + + # 计算各时段百分比 + total_counts = sum(period_counts.values()) + if total_counts > 0: + platform_frequencies[platform] = { + "morning": {"label": "上午", "percentage": round(period_counts["morning"] / total_counts * 100, 1)}, + "afternoon": {"label": "下午", "percentage": round(period_counts["afternoon"] / total_counts * 100, 1)}, + "evening": {"label": "晚上", "percentage": round(period_counts["evening"] / total_counts * 100, 1)}, + "night": {"label": "凌晨", "percentage": round(period_counts["night"] / total_counts * 100, 1)} + } + else: + # 如果没有有效的更新时间数据,使用平均分布 + platform_frequencies[platform] = { + "morning": {"label": "上午", "percentage": 25.0}, + "afternoon": {"label": "下午", "percentage": 25.0}, + "evening": {"label": "晚上", "percentage": 25.0}, + "night": {"label": "凌晨", "percentage": 25.0} + } + + # 计算所有平台的平均分布 + all_platform_avg = { + "morning": {"label": "上午", "percentage": 0}, + "afternoon": {"label": "下午", "percentage": 0}, + "evening": {"label": "晚上", "percentage": 0}, + "night": {"label": "凌晨", "percentage": 0} + } + + if platform_frequencies: + for period in ["morning", "afternoon", "evening", "night"]: + all_platform_avg[period]["percentage"] = round( + sum(platform[period]["percentage"] for platform in platform_frequencies.values()) / len(platform_frequencies), + 1 + ) + + return { + "by_platform": platform_frequencies, + "overall": all_platform_avg + } + + def _analyze_cross_platform(self, all_data: Dict[str, List], date_str: str) -> Dict[str, Any]: + """跨平台热点分析""" + # 分析跨平台共同热点 + common_topics = self._find_cross_platform_topics(all_data) + + return { + "status": "success", + "date": date_str, + "analysis_type": "cross_platform", + "common_topics": common_topics, + "updated_at": 
datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S") + } + + def _analyze_advanced(self, all_data: Dict[str, List], date_str: str) -> Dict[str, Any]: + """高级分析""" + # 关键词云图 - 按类别提取关键词 + keyword_clouds = self._extract_keyword_clouds(all_data) + + # 情感分析 + sentiment_analysis = self._analyze_sentiment(all_data) + + # 热点演变趋势 + trend_evolution = self._analyze_trend_evolution(all_data, date_str) + + return { + "status": "success", + "date": date_str, + "analysis_type": "advanced", + "keyword_clouds": keyword_clouds, + "sentiment_analysis": sentiment_analysis, + "trend_evolution": trend_evolution, + "updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S") + } + + def _extract_hot_keywords(self, all_data: Dict[str, List]) -> List[Dict[str, Any]]: + """使用jieba提取热门关键词(用于标签云)""" + # 收集所有标题 + all_titles = [] + for platform, items in all_data.items(): + for item in items: + title = item.get("title", "") + if title: + all_titles.append(title) + + if not all_titles: + return [] + + # 合并所有标题为一个文本 + combined_text = " ".join(all_titles) + + # 使用jieba的TF-IDF算法提取关键词 + keywords = jieba.analyse.extract_tags( + combined_text, + topK=100, # 提取更多关键词,后续过滤 + withWeight=True + ) + + # 过滤停用词并调整权重 + filtered_keywords = [] + for word, weight in keywords: + # 跳过停用词 + if word in self.stopwords: + continue + + # 跳过单个汉字 + if len(word) == 1 and re.search(r'[\u4e00-\u9fff]', word): + continue + + # 跳过纯数字 + if re.match(r'^\d+$', word): + continue + + # 检查词在标题中出现的频率 + title_count = sum(1 for title in all_titles if word in title) + if title_count > 1: # 至少在两个标题中出现 + weight *= (1 + min(title_count / 10, 2.0)) # 最多提升3倍 + + filtered_keywords.append((word, weight)) + + # 排序并格式化结果 + filtered_keywords.sort(key=lambda x: x[1], reverse=True) + hot_keywords = [ + {"text": word, "weight": round(weight * 10, 1)} # 放大权重以便于可视化 + for word, weight in filtered_keywords[:30] # 取前30个 + ] + + return hot_keywords + + def _analyze_topic_distribution(self, all_data: Dict[str, List]) -> List[Dict[str, 
Any]]: + """分析主题分布(各类别占比)""" + # 收集所有标题和内容 + all_texts = [] + for platform, items in all_data.items(): + for item in items: + title = item.get("title", "") + content = item.get("content", "") + if title: + all_texts.append(title) + if content: + all_texts.append(content) + + if not all_texts: + return self._generate_random_distribution() # 无数据时返回随机分布 + + # 合并所有文本 + combined_text = " ".join(all_texts) + + # 使用jieba提取关键词 + keywords = jieba.analyse.extract_tags( + combined_text, + topK=200, # 提取足够多的关键词以便分类 + withWeight=True + ) + + # 初始化各类别得分 + category_scores = {category: 0.0 for category in self.categories} + + # 计算每个类别的得分 + for word, weight in keywords: + for category, category_keywords in self.category_keywords.items(): + # 如果关键词在该类别的特征词中,增加该类别得分 + if word in category_keywords: + category_scores[category] += weight + # 部分匹配(关键词是类别特征词的一部分或特征词是关键词的一部分) + else: + for category_word in category_keywords: + if (word in category_word or category_word in word) and len(word) > 1 and len(category_word) > 1: + category_scores[category] += weight * 0.5 + break + + # 如果所有类别得分都为0,返回随机分布 + if sum(category_scores.values()) == 0: + return self._generate_random_distribution() + + # 计算百分比 + total_score = sum(category_scores.values()) + distribution = [] + + for category, score in category_scores.items(): + if total_score > 0: + percentage = (score / total_score) * 100 + distribution.append({ + "category": category, + "percentage": round(percentage, 1) + }) + + # 按百分比降序排序 + distribution.sort(key=lambda x: x["percentage"], reverse=True) + + # 如果某些类别百分比太小,可以过滤掉 + filtered_distribution = [item for item in distribution if item["percentage"] >= 1.0] + + # 如果过滤后为空,返回原始分布 + if not filtered_distribution: + return distribution + + return filtered_distribution + + def _generate_random_distribution(self) -> List[Dict[str, Any]]: + """生成随机的主题分布(当无法基于内容分析时使用)""" + import random + + # 确保总和为100%的随机分布 + total = 100 + categories = self.categories.copy() + distribution = [] + + for i in 
range(len(categories) - 1): + if total <= 0: + break + + value = round(random.uniform(10, 25), 1) + value = min(value, total) + total -= value + + distribution.append({ + "category": categories[i], + "percentage": value + }) + + # 最后一个类别分配剩余百分比 + if total > 0 and categories: + distribution.append({ + "category": categories[-1], + "percentage": round(total, 1) + }) + + # 按百分比降序排序 + distribution.sort(key=lambda x: x["percentage"], reverse=True) + return distribution + + def _analyze_related_topic_groups(self, all_data: Dict[str, List]) -> List[Dict[str, Any]]: + """分析相关主题词组""" + # 收集所有标题 + all_titles = [] + for platform, items in all_data.items(): + for item in items: + title = item.get("title", "") + if title: + all_titles.append(title) + + if not all_titles: + return [] + + # 使用jieba提取关键词 + keywords_by_title = [] + for title in all_titles: + # 使用jieba提取每个标题的关键词 + keywords = jieba.analyse.extract_tags(title, topK=5) + # 过滤停用词 + valid_keywords = [k for k in keywords if k not in self.stopwords and len(k) > 1] + if valid_keywords: + keywords_by_title.append(valid_keywords) + + # 分析词组共现情况 + word_pairs = [] + word_counter = Counter() + + # 首先统计所有关键词的频率 + for keywords in keywords_by_title: + for keyword in keywords: + word_counter[keyword] += 1 + + # 只考虑出现频率较高的关键词(至少出现2次) + common_words = [word for word, count in word_counter.most_common(50) if count >= 2] + + # 创建共现矩阵 + co_occurrence_matrix = defaultdict(lambda: defaultdict(int)) + + # 分析共现 + for keywords in keywords_by_title: + # 只考虑标题中有效的关键词 + valid_words = [w for w in keywords if w in common_words] + # 分析两两共现 + for i, word1 in enumerate(valid_words): + for word2 in valid_words[i+1:]: + if word1 != word2: # 确保不是同一个词 + # 按字母顺序排序,确保相同的词对只记录一次 + key_pair = tuple(sorted([word1, word2])) + co_occurrence_matrix[key_pair[0]][key_pair[1]] += 1 + + # 转换为词对列表 + for word1, co_words in co_occurrence_matrix.items(): + for word2, count in co_words.items(): + # 只考虑共现次数达到阈值的词对(至少共现3次) + if count >= 3: + # 检查词对是否有意义 + if 
self._is_meaningful_word_pair(word1, word2): + word_pairs.append({ + "words": [word1, word2], + "co_occurrence": count + }) + + # 合并相似词组 + merged_pairs = self._merge_similar_pairs(word_pairs) + + # 按共现次数排序 + merged_pairs.sort(key=lambda x: x["co_occurrence"], reverse=True) + + # 返回前10个共现词组 + return merged_pairs[:10] + + def _is_meaningful_word_pair(self, word1: str, word2: str) -> bool: + """判断词对是否有意义""" + # 如果两个词都是单字,可能意义不大 + if len(word1) == 1 and len(word2) == 1: + return False + + # 如果两个词都是数字,可能意义不大 + if word1.isdigit() and word2.isdigit(): + return False + + # 如果两个词是同一类别的关键词,可能更有意义 + for category, keywords in self.category_keywords.items(): + if word1 in keywords and word2 in keywords: + return True + + # 检查是否是常见的无意义组合 + meaningless_combinations = [ + ("什么", "怎么"), ("为何", "如何"), ("这个", "那个"), + ("一个", "几个"), ("多少", "一些"), ("很多", "许多") + ] + if (word1, word2) in meaningless_combinations or (word2, word1) in meaningless_combinations: + return False + + # 默认认为有意义 + return True + + def _merge_similar_pairs(self, word_pairs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """合并相似的词对""" + if not word_pairs: + return [] + + # 按共现次数排序 + sorted_pairs = sorted(word_pairs, key=lambda x: x["co_occurrence"], reverse=True) + + # 创建词对集合,用于快速查找 + pair_set = {tuple(sorted(p["words"])) for p in word_pairs} + + # 创建词对组,存储相关联的词对 + pair_groups = [] + processed_pairs = set() + + for pair in sorted_pairs: + pair_key = tuple(sorted(pair["words"])) + + # 如果已处理,跳过 + if pair_key in processed_pairs: + continue + + # 创建新组 + current_group = {pair_key} + processed_pairs.add(pair_key) + + # 查找相关词对 + for other_pair in sorted_pairs: + other_key = tuple(sorted(other_pair["words"])) + + # 如果已处理,跳过 + if other_key in processed_pairs: + continue + + # 如果有共同词,加入当前组 + if pair_key[0] in other_key or pair_key[1] in other_key: + current_group.add(other_key) + processed_pairs.add(other_key) + + # 添加到组列表 + if current_group: + pair_groups.append(current_group) + + # 合并结果 + merged_results = [] + + for 
group in pair_groups: + # 如果组中只有一个词对,直接添加 + if len(group) == 1: + pair_key = list(group)[0] + for pair in sorted_pairs: + if tuple(sorted(pair["words"])) == pair_key: + merged_results.append(pair) + break + else: + # 合并组中的词对 + all_words = set() + total_co_occurrence = 0 + max_co_occurrence = 0 + + for pair_key in group: + all_words.update(pair_key) + for pair in sorted_pairs: + if tuple(sorted(pair["words"])) == pair_key: + total_co_occurrence += pair["co_occurrence"] + max_co_occurrence = max(max_co_occurrence, pair["co_occurrence"]) + break + + # 如果合并后的词太多,只保留最重要的词 + if len(all_words) > 3: + # 找出组中出现最频繁的词对 + most_frequent_pair = None + for pair in sorted_pairs: + pair_key = tuple(sorted(pair["words"])) + if pair_key in group: + if most_frequent_pair is None or pair["co_occurrence"] > most_frequent_pair["co_occurrence"]: + most_frequent_pair = pair + + if most_frequent_pair: + merged_results.append({ + "words": most_frequent_pair["words"], + "co_occurrence": max_co_occurrence + }) + else: + # 添加合并结果 + merged_results.append({ + "words": list(all_words), + "co_occurrence": max_co_occurrence + }) + + # 再次排序 + merged_results.sort(key=lambda x: x["co_occurrence"], reverse=True) + return merged_results + + def _analyze_platform_overlap(self, all_data: Dict[str, List]) -> Dict[str, Any]: + """分析平台间热点重叠度""" + platforms = list(all_data.keys()) + overlap_matrix = {} + + for i, platform1 in enumerate(platforms): + overlap_matrix[platform1] = {} + titles1 = {item.get("title", "") for item in all_data[platform1] if item.get("title")} + + for platform2 in platforms: + if platform1 == platform2: + overlap_matrix[platform1][platform2] = 100 # 自身重叠度为100% + continue + + titles2 = {item.get("title", "") for item in all_data[platform2] if item.get("title")} + + # 计算重叠度 + if not titles1 or not titles2: + overlap = 0 + else: + # 使用Jaccard相似度 + intersection = len(titles1.intersection(titles2)) + union = len(titles1.union(titles2)) + overlap = round((intersection / union) * 100, 1) + + 
overlap_matrix[platform1][platform2] = overlap + + return overlap_matrix + + def _analyze_propagation_paths(self, all_data: Dict[str, List]) -> List[Dict[str, Any]]: + """分析热点传播路径(简化实现)""" + # 实际应基于时间戳分析热点在不同平台上的传播顺序 + # 这里简化为随机生成一些传播路径 + import random + + cross_platform_topics = self._find_cross_platform_topics(all_data) + propagation_paths = [] + + for topic in cross_platform_topics[:5]: # 取前5个跨平台话题 + platforms = list({item["platform"] for item in topic["items"]}) + if len(platforms) < 2: + continue + + # 随机排序平台,模拟传播顺序 + random.shuffle(platforms) + + propagation_paths.append({ + "topic": topic["main_title"], + "path": platforms, + "time_span": f"{random.randint(1, 24)}小时" + }) + + return propagation_paths + + def _analyze_sentiment(self, all_data: Dict[str, List]) -> Dict[str, Any]: + """情感分析(简化实现)""" + # 实际应使用NLP模型进行情感分析 + import random + + sentiments = ["正面", "中性", "负面"] + sentiment_distribution = {} + + for platform, items in all_data.items(): + positive = round(random.uniform(20, 60), 1) + negative = round(random.uniform(10, 40), 1) + neutral = round(100 - positive - negative, 1) + + sentiment_distribution[platform] = { + "positive": positive, + "neutral": neutral, + "negative": negative + } + + # 计算总体情感分布 + all_positive = sum(data["positive"] for data in sentiment_distribution.values()) / len(sentiment_distribution) + all_negative = sum(data["negative"] for data in sentiment_distribution.values()) / len(sentiment_distribution) + all_neutral = sum(data["neutral"] for data in sentiment_distribution.values()) / len(sentiment_distribution) + + return { + "overall": { + "positive": round(all_positive, 1), + "neutral": round(all_neutral, 1), + "negative": round(all_negative, 1) + }, + "by_platform": sentiment_distribution + } + + def _analyze_trend_evolution(self, all_data: Dict[str, List], current_date: str, time_range: str = "24h") -> List[Dict[str, Any]]: + """分析热点演变趋势 + + Args: + all_data: 所有平台的热点数据 + current_date: 当前日期 + time_range: 预测时间范围,可选值为 24h(24小时), 
7d(7天), 30d(30天) + """ + # 实际应基于历史数据分析热点的演变 + import random + from datetime import datetime, timedelta + from app.services import crawler_factory + + # 获取当前日期 + current = datetime.strptime(current_date, "%Y-%m-%d") + + # 根据时间范围确定历史数据天数和预测天数 + if time_range == "24h": + history_days = 1 + forecast_days = 1 + time_unit = "小时" + elif time_range == "7d": + history_days = 7 + forecast_days = 3 + time_unit = "天" + elif time_range == "30d": + history_days = 30 + forecast_days = 7 + time_unit = "天" + else: + history_days = 1 + forecast_days = 1 + time_unit = "小时" + + # 提取热门话题 + # 合并所有平台的数据,按热度排序 + all_topics = [] + for platform, items in all_data.items(): + for item in items: + title = item.get("title", "") + score = item.get("score", 0) + if title and score > 0: + all_topics.append({ + "title": title, + "score": score, + "platform": platform + }) + + # 按热度排序 + all_topics.sort(key=lambda x: x["score"], reverse=True) + + # 选取前10个热门话题 + top_topics = all_topics[:10] + + # 生成预测数据 + forecast_results = [] + + # 可能的分类列表 + categories = ["科技", "财经", "社会", "娱乐", "体育", "教育", "健康", "国际"] + + # 获取所有平台列表 + all_platforms = list(crawler_factory.keys()) + + for topic in top_topics: + title = topic["title"] + current_score = topic["score"] + platform = topic["platform"] + + # 生成历史趋势数据 + history_data = [] + + # 当前热度 + history_data.append({ + "date": current_date, + "heat": current_score + }) + + # 历史热度(模拟数据) + for i in range(1, history_days + 1): + date = current - timedelta(days=i) + date_str = date.strftime("%Y-%m-%d") + + # 模拟历史热度,通常比当前热度低 + history_heat = max(0, current_score * (0.7 + 0.3 * random.random())) + + history_data.append({ + "date": date_str, + "heat": round(history_heat, 1) + }) + + # 预测未来趋势 + forecast_data = [] + + # 计算热度变化率 + recent_trend = 0 + if len(history_data) >= 2: + recent_trend = (history_data[0]["heat"] - history_data[1]["heat"]) / max(1, history_data[1]["heat"]) + + # 根据最近趋势预测未来热度 + for i in range(1, forecast_days + 1): + date = current + timedelta(days=i) + 
date_str = date.strftime("%Y-%m-%d") + + # 预测热度,根据最近趋势和随机因素 + trend_factor = recent_trend * (0.8 + 0.4 * random.random()) + forecast_heat = max(0, current_score * (1 + trend_factor)) + + forecast_data.append({ + "date": date_str, + "heat": round(forecast_heat, 1) + }) + + # 计算趋势类型和可能性 + if recent_trend > 0.1: + trend_type = "趋势上升" + probability = min(95, 50 + int(recent_trend * 100)) + elif recent_trend < -0.1: + trend_type = "趋势下降" + probability = min(95, 50 + int(abs(recent_trend) * 100)) + else: + trend_type = "趋势稳定" + probability = 70 + + # 确定可信度文本 + confidence_text = "" + if probability >= 90: + confidence_text = "可信度很高" + elif probability >= 70: + confidence_text = "可信度较高" + elif probability >= 50: + confidence_text = "可信度中等" + else: + confidence_text = "可信度较低" + + # 随机选择一个分类 + category = random.choice(categories) + + # 从标题中提取关键词 + keywords = [] + try: + # 使用jieba提取关键词 + import jieba.analyse + extracted_keywords = jieba.analyse.extract_tags(title, topK=5) + keywords = [kw for kw in extracted_keywords if len(kw) > 1 and kw not in self.stopwords][:3] + except Exception as e: + log.error(f"Error extracting keywords: {e}") + # 如果提取失败,使用标题中的前几个字作为关键词 + if len(title) > 3: + keywords = [title[:3]] + + # 生成可能出现该话题的其他平台 + other_platforms = [] + for p in all_platforms: + if p != platform: + other_platforms.append(p) + + # 随机选择2-3个其他平台 + if other_platforms: + random.shuffle(other_platforms) + out_platforms = other_platforms[:min(3, len(other_platforms))] + else: + out_platforms = [] + + # 添加到结果 + forecast_results.append({ + "topic": title, + "category": category, + "keywords": keywords, + "current_heat": round(current_score, 1), + "history": sorted(history_data, key=lambda x: x["date"]), + "forecast": forecast_data, + "trend_type": trend_type, + "probability": probability, + "probability_text": f"{probability}%", + "confidence": confidence_text, + "platforms": [platform], + "out_platforms": out_platforms + }) + + return forecast_results + + def _get_platform_stats(self, 
all_data: Dict[str, List]) -> Dict[str, Any]: + """获取各平台统计数据""" + stats = {} + for platform, items in all_data.items(): + # 确保items是列表 + if not isinstance(items, list): + log.warning(f"Platform {platform} data is not a list, skipping") + continue + + valid_items = [] + for item in items: + # 确保每个item是字典 + if not isinstance(item, dict): + log.warning(f"Item in platform {platform} is not a dictionary, skipping: {item}") + continue + valid_items.append(item) + + if not valid_items: + stats[platform] = { + "total_items": 0, + "avg_title_length": 0, + "has_description": 0, + "has_url": 0 + } + continue + + # 安全地获取标题长度 + title_lengths = [] + for item in valid_items: + title = item.get("title", "") + if isinstance(title, str): + title_lengths.append(len(title)) + + # 计算统计数据 + stats[platform] = { + "total_items": len(valid_items), + "avg_title_length": sum(title_lengths) / max(len(title_lengths), 1) if title_lengths else 0, + "has_description": sum(1 for item in valid_items if isinstance(item.get("desc"), str)), + "has_url": sum(1 for item in valid_items if isinstance(item.get("url"), str)) + } + return stats + + def _find_cross_platform_topics(self, all_data: Dict[str, List]) -> List[Dict[str, Any]]: + """查找跨平台热点话题""" + # 简化实现:查找标题相似的内容 + platform_titles = defaultdict(list) + + # 收集各平台标题 + for platform, items in all_data.items(): + for item in items: + title = item.get("title", "") + if title: + platform_titles[platform].append({ + "title": title, + "url": item.get("url", ""), + "score": item.get("score", 0) + }) + + # 查找相似标题(简化实现) + cross_platform_topics = [] + processed_titles = set() + + for platform1, titles1 in platform_titles.items(): + for item1 in titles1: + title1 = item1["title"] + + # 跳过已处理的标题 + if title1 in processed_titles: + continue + + related_items = [] + platforms_found = set() + matched_titles = [] # 记录匹配的标题,用于调试 + + # 查找其他平台中的相似标题 + for platform2, titles2 in platform_titles.items(): + if platform1 == platform2: + related_items.append({ + "platform": 
platform1, + "title": title1, + "url": item1["url"], + "score": item1["score"] + }) + platforms_found.add(platform1) + continue + + # 该平台是否找到匹配 + platform_matched = False + + for item2 in titles2: + title2 = item2["title"] + # 使用jieba计算相似度,提高相似度阈值 + similarity = self._calculate_title_similarity(title1, title2) + + # 提高相似度阈值,减少误判 + if similarity > 0.25: # 提高相似度阈值,确保更准确的匹配 + related_items.append({ + "platform": platform2, + "title": title2, + "url": item2["url"], + "score": item2["score"], + "similarity": round(similarity, 2) # 记录相似度 + }) + platforms_found.add(platform2) + processed_titles.add(title2) + platform_matched = True + matched_titles.append(f"{platform2}: {title2} (相似度: {round(similarity, 2)})") + break # 每个平台只取最相似的一个标题 + + # 如果该平台没有找到匹配,记录一下 + if not platform_matched and len(platforms_found) > 1: + matched_titles.append(f"{platform2}: 未找到匹配") + + # 如果在多个平台上找到,则认为是跨平台话题 + if len(platforms_found) > 1: # 至少在2个平台上出现 + # 计算总热度值 + total_heat = sum(item.get("score", 0) for item in related_items) + + # 记录匹配情况 + if len(platforms_found) >= 3: # 对于3个及以上平台的匹配,记录详细信息 + log.info(f"跨平台热点: {title1}") + for match in matched_titles: + log.info(f" - {match}") + + cross_platform_topics.append({ + "title": title1, + "platforms_count": len(platforms_found), + "platforms": list(platforms_found), + "heat": round(total_heat, 1), + "related_items": related_items # 保存相关项目,便于前端展示 + }) + processed_titles.add(title1) + + # 按出现平台数量排序,然后按热度排序 + cross_platform_topics.sort(key=lambda x: (x["platforms_count"], x["heat"]), reverse=True) + return cross_platform_topics[:20] # 返回前20个跨平台话题 + + def _calculate_title_similarity(self, title1: str, title2: str) -> float: + """计算两个标题的相似度""" + # 如果标题完全相同,直接返回1.0 + if title1 == title2: + return 1.0 + + # 标题长度差异过大,可能不是同一话题 + len1, len2 = len(title1), len(title2) + if max(len1, len2) > 3 * min(len1, len2): + return 0.0 + + # 如果一个标题是另一个的子串,给予较高相似度,但要求子串长度至少为5个字符 + if len(title1) >= 5 and title1 in title2: + return 0.8 + if len(title2) >= 5 and title2 in 
title1: + return 0.8 + + # 使用jieba分词 + words1 = set(jieba.cut(title1)) + words2 = set(jieba.cut(title2)) + + # 计算Jaccard相似度 + if not words1 or not words2: + return 0 + + intersection = len(words1.intersection(words2)) + union = len(words1.union(words2)) + + # 如果交集太小,可能不是同一话题 + if intersection <= 1: + return 0.0 + + jaccard_sim = intersection / union + + # 提取关键词,并计算关键词相似度 + keywords1 = set(jieba.analyse.extract_tags(title1, topK=5)) + keywords2 = set(jieba.analyse.extract_tags(title2, topK=5)) + + if keywords1 and keywords2: + keyword_intersection = len(keywords1.intersection(keywords2)) + keyword_union = len(keywords1.union(keywords2)) + + # 如果关键词没有交集,可能不是同一话题 + if keyword_intersection == 0: + return max(0.0, jaccard_sim - 0.1) # 降低相似度 + + keyword_sim = keyword_intersection / keyword_union if keyword_union > 0 else 0 + + # 综合考虑Jaccard相似度和关键词相似度 + return max(jaccard_sim, keyword_sim) + + return jaccard_sim + + def _find_platform_unique_topics(self, all_data: Dict[str, List]) -> Dict[str, List]: + """查找各平台特有的热点话题""" + # 简化实现:找出只在单一平台出现的热门话题 + platform_unique = {} + + for platform, items in all_data.items(): + # 取每个平台分数最高的3个话题 + top_items = sorted(items, key=lambda x: x.get("score", 0), reverse=True)[:3] + + platform_unique[platform] = [ + { + "title": item.get("title", ""), + "url": item.get("url", ""), + "score": item.get("score", 0) + } + for item in top_items + ] + + return platform_unique + + def _extract_keyword_clouds(self, all_data: Dict[str, List], keyword_count: int = 200) -> Dict[str, List[Dict[str, Any]]]: + """提取关键词云图数据 + + 返回按类别分组的关键词云数据,包括全部类别和各个单独类别 + + Args: + all_data: 所有平台的热点数据 + keyword_count: 返回的关键词数量,默认为200 + """ + # 收集所有标题和内容 + all_titles = [] + category_titles = {category: [] for category in self.categories} + platform_titles = {platform: [] for platform in all_data.keys()} + + for platform, items in all_data.items(): + for item in items: + title = item.get("title", "") + content = item.get("content", "") + + if not title: + continue + + # 
def _extract_category_keywords(self, texts: List[str], top_k: int = 30) -> List[Dict[str, Any]]:
    """Extract weighted keywords from texts, combining TF-IDF and TextRank.

    Args:
        texts: source strings.
        top_k: number of keywords to return.

    Returns:
        [{"text": word, "weight": scaled weight}] sorted by weight descending.
    """
    if not texts:
        return []

    combined_text = " ".join(texts)

    # Run both extractors with a 2x budget; filtering below trims the list.
    keywords_tfidf = jieba.analyse.extract_tags(combined_text, topK=top_k * 2, withWeight=True)
    keywords_textrank = jieba.analyse.textrank(combined_text, topK=top_k * 2, withWeight=True)

    # Merge the two algorithms; words found by both get the mean weight.
    keywords_dict: Dict[str, float] = {}
    for word, weight in keywords_tfidf:
        if word not in self.stopwords and len(word) > 1:
            keywords_dict[word] = weight
    for word, weight in keywords_textrank:
        if word not in self.stopwords and len(word) > 1:
            if word in keywords_dict:
                keywords_dict[word] = (keywords_dict[word] + weight) / 2
            else:
                keywords_dict[word] = weight

    filtered_keywords = []
    for word, weight in keywords_dict.items():
        if len(word) == 1 and re.search(r'[\u4e00-\u9fff]', word):
            continue  # single CJK character: too generic
        if re.match(r'^\d+$', word):
            continue  # bare numbers
        # Boost words that appear in several source texts (capped at 3x).
        text_count = sum(1 for text in texts if word in text)
        if text_count > 1:
            weight *= (1 + min(text_count / 10, 2.0))
        filtered_keywords.append((word, weight))

    filtered_keywords.sort(key=lambda pair: pair[1], reverse=True)
    # Weight x100 for easier front-end visualisation.
    return [{"text": word, "weight": round(weight * 100, 1)} for word, weight in filtered_keywords[:top_k]]

def get_keyword_cloud(self, date_str: Optional[str] = None, refresh: bool = False, keyword_count: int = 200) -> Dict[str, Any]:
    """Return keyword-cloud data for one day.

    Args:
        date_str: "YYYY-MM-DD"; defaults to today in Asia/Shanghai.
        refresh: force recomputation, dropping any cached entry.
        keyword_count: number of keywords in the "all" cloud.

    Fix: keyword_count is now part of the cache key — previously a cached
    result for one count was served for every other count.
    """
    if not date_str:
        date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d")

    cache_key = f"analysis:keyword_cloud:{date_str}:{keyword_count}"

    if refresh:
        cache.delete_cache(cache_key)
    else:
        cached_data = cache.get_cache(cache_key)
        if cached_data:
            log.info(f"Retrieved keyword cloud from cache for {date_str}")
            return cached_data

    all_platform_data = self._get_platform_data(date_str)
    if not all_platform_data:
        log.warning(f"No data available for keyword cloud on {date_str}")
        return {
            "status": "error",
            "message": "暂无可用数据生成关键词云",
            "date": date_str,
        }

    keyword_clouds = self._extract_keyword_clouds(all_platform_data, keyword_count)

    result = {
        "status": "success",
        "message": "关键词云数据生成成功",
        "date": date_str,
        "keyword_clouds": keyword_clouds,
        "updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S"),
    }
    cache.set_cache(cache_key, result, self.cache_expire)
    return result

def get_data_visualization(self, date_str: Optional[str] = None, refresh: bool = False, platforms: Optional[List[str]] = None) -> Dict[str, Any]:
    """Return data-visualisation analysis (topic heat distribution).

    Args:
        date_str: "YYYY-MM-DD"; defaults to today in Asia/Shanghai.
        refresh: force recomputation, dropping any cached entry.
        platforms: optional subset of platforms to analyse.

    Fix: the platform filter is now part of the cache key — previously a
    result computed for one subset was returned for any other subset.
    """
    if not date_str:
        date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d")

    platform_suffix = ":" + ",".join(sorted(platforms)) if platforms else ""
    cache_key = f"analysis:data_visualization:{date_str}{platform_suffix}"

    if refresh:
        cache.delete_cache(cache_key)
    else:
        cached_data = cache.get_cache(cache_key)
        if cached_data:
            log.info(f"Retrieved data visualization from cache for {date_str}")
            return cached_data

    all_platform_data = self._get_platform_data(date_str)
    if platforms:
        all_platform_data = {k: v for k, v in all_platform_data.items() if k in platforms}

    if not all_platform_data:
        log.warning(f"No data available for data visualization on {date_str}")
        return {
            "status": "error",
            "message": "暂无可用数据进行可视化分析",
            "date": date_str,
        }

    topic_distribution = self._analyze_topic_heat_distribution(all_platform_data)

    result = {
        "status": "success",
        "message": "数据可视化分析完成",
        "date": date_str,
        "topic_heat_distribution": topic_distribution,
        "platforms": list(all_platform_data.keys()),
        "updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S"),
    }
    cache.set_cache(cache_key, result, self.cache_expire)
    return result

def get_trend_forecast(self, date_str: Optional[str] = None, refresh: bool = False, time_range: str = "24h") -> Dict[str, Any]:
    """Return the hot-topic trend forecast.

    Args:
        date_str: "YYYY-MM-DD"; defaults to today in Asia/Shanghai.
        refresh: force recomputation, dropping any cached entry.
        time_range: "24h", "7d" or "30d"; invalid values fall back to "24h".
    """
    if not date_str:
        date_str = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d")

    if time_range not in ("24h", "7d", "30d"):
        time_range = "24h"  # default window

    # time_range is part of the key: each window caches independently.
    cache_key = f"analysis:trend_forecast:{date_str}:{time_range}"

    if refresh:
        cache.delete_cache(cache_key)
    else:
        cached_data = cache.get_cache(cache_key)
        if cached_data:
            log.info(f"Retrieved trend forecast from cache for {date_str}, time_range: {time_range}")
            return cached_data

    all_platform_data = self._get_platform_data(date_str)
    if not all_platform_data:
        log.warning(f"No data available for trend forecast on {date_str}")
        return {
            "status": "error",
            "message": "暂无可用数据进行趋势预测",
            "date": date_str,
        }

    trend_evolution = self._analyze_trend_evolution(all_platform_data, date_str, time_range)

    result = {
        "status": "success",
        "message": "热点趋势预测完成",
        "date": date_str,
        "time_range": time_range,
        "trend_evolution": trend_evolution,
        "updated_at": datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S"),
    }
    cache.set_cache(cache_key, result, self.cache_expire)
    return result
selected_platforms.append(platform) + if len(selected_platforms) >= 8: # 最多选择8个平台 + break + + # 如果优先平台不够,添加其他平台 + if len(selected_platforms) < 8: + for platform in available_platforms: + if platform not in selected_platforms: + selected_platforms.append(platform) + if len(selected_platforms) >= 8: + break + + platform_data = {platform: all_data[platform] for platform in selected_platforms} + else: + # 如果平台数量合适,使用所有平台 + platform_data = all_data + + # 提取每个平台的热门关键词 + platform_keywords = {} + all_keywords = set() + + for platform, items in platform_data.items(): + # 提取该平台的标题 + titles = [item.get("title", "") for item in items if item.get("title")] + if not titles: + continue + + # 使用jieba提取关键词 + keywords = self._extract_platform_keywords(titles, 10) + if keywords: + platform_keywords[platform] = keywords + all_keywords.update(kw["text"] for kw in keywords) + + # 选取出现频率最高的关键词 + top_keywords = self._select_top_keywords(platform_keywords, 10) + + # 构建主题热度分布数据 + distribution_data = { + "keywords": top_keywords, + "platforms": [], + "data": [] + } + + # 平台名称映射 + platform_names = { + "hupu": "虎扑", + "weibo": "微博", + "jinritoutiao": "今日头条", + "douyin": "抖音", + "baidu": "百度热搜", + "v2ex": "V2EX", + "github": "GitHub", + "stackoverflow": "Stack Overflow", + "tieba": "贴吧", + "zhihu": "知乎", + "36kr": "36氪", + "bilibili": "哔哩哔哩", + "douban": "豆瓣", + "hackernews": "Hacker News", + "shaoshupai": "少数派" + } + + # 添加平台列表(使用中文名称) + for platform in platform_keywords.keys(): + display_name = platform_names.get(platform, platform) + distribution_data["platforms"].append(display_name) + + # 为每个关键词构建热度数据 + for keyword in top_keywords: + keyword_data = {"keyword": keyword, "values": []} + + for platform_display in distribution_data["platforms"]: + # 找到平台的原始键名 + platform_key = next((k for k, v in platform_names.items() if v == platform_display), platform_display) + + # 查找该平台中该关键词的热度 + heat = 0 + if platform_key in platform_keywords: + for kw in platform_keywords[platform_key]: + if kw["text"] 
== keyword: + heat = kw["weight"] + break + + keyword_data["values"].append(heat) + + distribution_data["data"].append(keyword_data) + + return distribution_data + + def _extract_platform_keywords(self, titles: List[str], top_k: int = 10) -> List[Dict[str, Any]]: + """从平台标题中提取关键词 + + Args: + titles: 标题列表 + top_k: 返回的关键词数量 + """ + if not titles: + return [] + + # 合并标题 + combined_text = " ".join(titles) + + # 使用jieba提取关键词 + keywords = jieba.analyse.extract_tags( + combined_text, + topK=top_k * 2, + withWeight=True + ) + + # 过滤停用词和单字词 + filtered_keywords = [] + for word, weight in keywords: + # 跳过停用词 + if word in self.stopwords: + continue + + # 跳过单个汉字 + if len(word) == 1 and re.search(r'[\u4e00-\u9fff]', word): + continue + + # 跳过纯数字 + if re.match(r'^\d+$', word): + continue + + # 检查词在标题中出现的频率 + title_count = sum(1 for title in titles if word in title) + if title_count > 1: # 至少在两个标题中出现 + weight *= (1 + min(title_count / 10, 2.0)) # 最多提升3倍 + + filtered_keywords.append({"text": word, "weight": round(weight * 100, 1)}) + + # 排序并返回前top_k个 + filtered_keywords.sort(key=lambda x: x["weight"], reverse=True) + return filtered_keywords[:top_k] + + def _select_top_keywords(self, platform_keywords: Dict[str, List[Dict[str, Any]]], top_k: int = 10) -> List[str]: + """从各平台关键词中选择最重要的关键词 + + Args: + platform_keywords: 平台关键词字典,格式为 {platform: [{"text": keyword, "weight": weight}, ...]} + top_k: 返回的关键词数量 + """ + # 统计每个关键词在不同平台的出现次数和总权重 + keyword_stats = defaultdict(lambda: {"count": 0, "total_weight": 0}) + + for platform, keywords in platform_keywords.items(): + for kw in keywords: + keyword = kw["text"] + weight = kw["weight"] + + keyword_stats[keyword]["count"] += 1 + keyword_stats[keyword]["total_weight"] += weight + + # 计算综合得分(平台出现次数 * 总权重) + for keyword, stats in keyword_stats.items(): + stats["score"] = stats["count"] * stats["total_weight"] + + # 按得分排序 + sorted_keywords = sorted(keyword_stats.items(), key=lambda x: x[1]["score"], reverse=True) + + # 返回前top_k个关键词 + return [kw 
for kw, _ in sorted_keywords[:top_k]] \ No newline at end of file diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/dependencies.py b/app/api/dependencies.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/v1/__init__.py b/app/api/v1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/v1/analysis.py b/app/api/v1/analysis.py new file mode 100644 index 0000000..4522e8e --- /dev/null +++ b/app/api/v1/analysis.py @@ -0,0 +1,314 @@ +from fastapi import APIRouter, Query +from typing import Optional +from datetime import datetime + +import pytz + +from app.analysis.trend_analyzer import TrendAnalyzer +from app.analysis.predictor import TrendPredictor +from app.utils.logger import log +from app.core import cache + +router = APIRouter() + +@router.get("/trend") +async def get_trend_analysis(date: Optional[str] = None, type: str = "main"): + """ + 获取热点聚合分析 + + 分析各平台热点数据的共性和差异,提取共同关键词、跨平台热点话题等 + + - **date**: 可选,指定日期,格式为YYYY-MM-DD,默认为当天 + - **type**: 分析类型,可选值为 main(主题分析), platform(平台对比), cross(跨平台热点), advanced(高级分析),默认为main + """ + try: + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + # 从缓存中获取数据 + cache_key = f"analysis:trend:{date}:{type}" + cached_data = cache.get_cache(cache_key) + + if cached_data: + log.info(f"Retrieved trend analysis from cache for {date}, type: {type}") + return cached_data + + # 如果缓存中没有,则生成新的分析数据 + analyzer = TrendAnalyzer() + result = analyzer.get_analysis(date, type) + return result + except Exception as e: + log.error(f"Error in trend analysis: {e}") + return { + "status": "error", + "message": str(e), + "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + } + +@router.get("/platform-comparison") +async def get_platform_comparison(date: Optional[str] = None): + """ + 获取平台对比分析 + + 分析各平台热点数据的特点、热度排行、更新频率等,比较不同平台间的异同 + + - **date**: 可选,指定日期,格式为YYYY-MM-DD,默认为当天 + """ 
+ try: + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + # 从缓存中获取数据 + cache_key = f"analysis:trend:{date}:platform_comparison" + cached_data = cache.get_cache(cache_key) + + if cached_data: + log.info(f"Retrieved platform comparison from cache for {date}") + return cached_data + + # 如果缓存中没有,则生成新的分析数据 + analyzer = TrendAnalyzer() + result = analyzer.get_platform_comparison(date) + return result + except Exception as e: + log.error(f"Error in platform comparison: {e}") + return { + "status": "error", + "message": str(e), + "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + } + +@router.get("/cross-platform") +async def get_cross_platform_analysis(date: Optional[str] = None, refresh: bool = False): + """ + 获取跨平台热点分析 + + 分析在多个平台上出现的热点话题,以及热点的传播路径 + + - **date**: 可选,指定日期,格式为YYYY-MM-DD,默认为当天 + - **refresh**: 可选,是否强制刷新缓存,默认为False + """ + try: + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + # 从缓存中获取数据 + cache_key = f"analysis:trend:{date}:cross_platform" + + # 如果不是强制刷新,尝试从缓存获取 + if not refresh: + cached_data = cache.get_cache(cache_key) + if cached_data: + log.info(f"Retrieved cross platform analysis from cache for {date}") + return cached_data + + # 如果缓存中没有或需要刷新,则生成新的分析数据 + analyzer = TrendAnalyzer() + result = analyzer.get_cross_platform_analysis(date, refresh) + return result + except Exception as e: + log.error(f"Error in cross platform analysis: {e}") + return { + "status": "error", + "message": str(e), + "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + } + +@router.get("/advanced") +async def get_advanced_analysis(date: Optional[str] = None, refresh: bool = False): + """ + 获取高级分析 + + 提供更深入的热点分析,包括关键词云图、情感分析、热点演变趋势等 + + - **date**: 可选,指定日期,格式为YYYY-MM-DD,默认为当天 + - **refresh**: 可选,是否强制刷新缓存,默认为False + """ + try: + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + # 从缓存中获取数据 
+ cache_key = f"analysis:trend:{date}:advanced_analysis" + + # 如果不是强制刷新,尝试从缓存获取 + if not refresh: + cached_data = cache.get_cache(cache_key) + if cached_data: + log.info(f"Retrieved advanced analysis from cache for {date}") + return cached_data + + # 如果缓存中没有或需要刷新,则生成新的分析数据 + analyzer = TrendAnalyzer() + result = analyzer.get_advanced_analysis(date, refresh) + return result + except Exception as e: + log.error(f"Error in advanced analysis: {e}") + return { + "status": "error", + "message": str(e), + "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + } + +@router.get("/prediction") +async def get_trend_prediction(date: Optional[str] = None): + """ + 获取热点趋势预测 + + 基于历史数据预测热点话题的发展趋势,包括上升趋势、下降趋势、持续热门话题等 + + - **date**: 可选,指定日期,格式为YYYY-MM-DD,默认为当天 + """ + try: + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + # 从缓存中获取数据 + cache_key = f"analysis:prediction:{date}" + cached_data = cache.get_cache(cache_key) + + if cached_data: + log.info(f"Retrieved trend prediction from cache for {date}") + return cached_data + + # 如果缓存中没有,则生成新的预测数据 + predictor = TrendPredictor() + result = predictor.get_prediction(date) + return result + except Exception as e: + log.error(f"Error in trend prediction: {e}") + return { + "status": "error", + "message": str(e), + "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + } + +@router.get("/keyword-cloud") +async def get_keyword_cloud(date: Optional[str] = None, refresh: bool = False, platforms: Optional[str] = None, category: Optional[str] = None, keyword_count: int = 200): + """ + 获取关键词云图数据 + + 提取热点数据中的关键词,按不同类别(科技、娱乐、社会等)进行分类,用于生成词云 + + - **date**: 可选,指定日期,格式为YYYY-MM-DD,默认为当天 + - **refresh**: 可选,是否强制刷新缓存,默认为False + - **platforms**: 可选,指定平台,多个平台用逗号分隔,如"baidu,weibo" + - **category**: 可选,指定分类,如"科技"、"娱乐"等 + - **keyword_count**: 可选,返回的关键词数量,默认为200 + """ + try: + if not date: + date = 
datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + # 从缓存中获取数据 + cache_key = f"analysis:keyword_cloud:{date}" + + # 如果不是强制刷新,尝试从缓存获取 + if not refresh: + cached_data = cache.get_cache(cache_key) + if cached_data: + log.info(f"Retrieved keyword cloud from cache for {date}") + # 如果指定了分类,过滤结果 + if category and cached_data.get("status") == "success" and "keyword_clouds" in cached_data: + if category in cached_data["keyword_clouds"]: + filtered_data = cached_data.copy() + filtered_data["keyword_clouds"] = {category: cached_data["keyword_clouds"][category]} + return filtered_data + return cached_data + + # 如果缓存中没有或需要刷新,则生成新的关键词云数据 + analyzer = TrendAnalyzer() + result = analyzer.get_keyword_cloud(date, refresh, keyword_count) + return result + except Exception as e: + log.error(f"Error in keyword cloud analysis: {e}") + return { + "status": "error", + "message": str(e), + "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + } + +@router.get("/data-visualization") +async def get_data_visualization(date: Optional[str] = None, refresh: bool = False, platforms: str = None): + """ + 获取数据可视化分析 + + 提供热点数据的可视化分析,包括主题热度分布图 + + - **date**: 可选,指定日期,格式为YYYY-MM-DD,默认为当天 + - **refresh**: 可选,是否强制刷新缓存,默认为False + - **platforms**: 可选,指定要分析的平台,多个平台用逗号分隔,例如:baidu,weibo,douyin + """ + try: + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + # 从缓存中获取数据 + cache_key = f"analysis:data_visualization:{date}" + + # 如果不是强制刷新,尝试从缓存获取 + if not refresh: + cached_data = cache.get_cache(cache_key) + if cached_data: + log.info(f"Retrieved data visualization from cache for {date}") + return cached_data + + # 解析平台参数 + platform_list = None + if platforms: + platform_list = [p.strip() for p in platforms.split(",") if p.strip()] + + # 如果缓存中没有或需要刷新,则生成新的可视化数据 + analyzer = TrendAnalyzer() + result = analyzer.get_data_visualization(date, refresh, platform_list) + return result + except Exception as e: + log.error(f"Error in 
data visualization: {e}") + return { + "status": "error", + "message": str(e), + "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + } + +@router.get("/trend-forecast") +async def get_trend_forecast(date: Optional[str] = None, refresh: bool = False, time_range: str = "24h"): + """ + 获取热点趋势预测分析 + + 分析热点话题的演变趋势,预测热点的发展方向 + + - **date**: 可选,指定日期,格式为YYYY-MM-DD,默认为当天 + - **refresh**: 可选,是否强制刷新缓存,默认为False + - **time_range**: 可选,预测时间范围,可选值为 24h(24小时), 7d(7天), 30d(30天),默认为24h + """ + try: + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + # 验证时间范围参数 + valid_time_ranges = ["24h", "7d", "30d"] + if time_range not in valid_time_ranges: + time_range = "24h" # 默认使用24小时 + + # 从缓存中获取数据 + cache_key = f"analysis:trend_forecast:{date}:{time_range}" + + # 如果不是强制刷新,尝试从缓存获取 + if not refresh: + cached_data = cache.get_cache(cache_key) + if cached_data: + log.info(f"Retrieved trend forecast from cache for {date}, time_range: {time_range}") + return cached_data + + # 如果缓存中没有或需要刷新,则生成新的趋势预测数据 + analyzer = TrendAnalyzer() + result = analyzer.get_trend_forecast(date, refresh, time_range) + return result + except Exception as e: + log.error(f"Error in trend forecast: {e}") + return { + "status": "error", + "message": str(e), + "date": date or datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d"), + "time_range": time_range + } \ No newline at end of file diff --git a/app/api/v1/daily_news.py b/app/api/v1/daily_news.py new file mode 100644 index 0000000..730d91d --- /dev/null +++ b/app/api/v1/daily_news.py @@ -0,0 +1,295 @@ +# app/api/endpoints/dailynews.py +import json +from datetime import datetime +from typing import List, Dict, Any, Optional + +import pytz +from fastapi import APIRouter + +from app.core import cache +from app.services import crawler_factory +from app.utils.logger import log + +router = APIRouter() + + +@router.get("/") +def get_hot_news(date: str = None, platform: str = None): + if 
platform not in crawler_factory.keys(): + return { + "status": "404", + "data": [], + "msg": "`platform` is required, valid platform: " + ", ".join(crawler_factory.keys()) + } + + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + cacheKey = f"crawler:{platform}:{date}" + result = cache.get(cacheKey) + if result: + return { + "status": "200", + "data": json.loads(result), + "msg": "success" + } + + return { + "status": "200", + "data": [], + "msg": "success" + } + + +@router.get("/all") +def get_all_platforms_news(date: str = None): + """ + 获取所有平台的热门新闻 + + Args: + date: 日期,格式为YYYY-MM-DD,默认为当天 + + Returns: + 包含所有平台新闻的字典,键为平台名称,值为新闻列表 + """ + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + all_news = {} + + for platform in crawler_factory.keys(): + cacheKey = f"crawler:{platform}:{date}" + result = cache.get(cacheKey) + if result: + try: + all_news[platform] = json.loads(result) + except Exception as e: + log.error(f"Error parsing cached data for {platform}: {e}") + all_news[platform] = [] + else: + all_news[platform] = [] + + return { + "status": "200", + "data": all_news, + "msg": "success" + } + + +@router.get("/multi") +def get_multi_platforms_news(date: str = None, platforms: str = None): + """ + 获取多个平台的热门新闻 + + Args: + date: 日期,格式为YYYY-MM-DD,默认为当天 + platforms: 平台列表,以逗号分隔,例如 "weibo,baidu,zhihu" + + Returns: + 包含指定平台新闻的字典,键为平台名称,值为新闻列表 + """ + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + if not platforms: + return { + "status": "404", + "data": {}, + "msg": "`platforms` parameter is required, format: comma-separated platform names" + } + + platform_list = [p.strip() for p in platforms.split(",")] + valid_platforms = crawler_factory.keys() + + # 验证平台是否有效 + invalid_platforms = [p for p in platform_list if p not in valid_platforms] + if invalid_platforms: + return { + "status": "404", + "data": {}, + "msg": f"Invalid platforms: {', 
'.join(invalid_platforms)}. Valid platforms: {', '.join(valid_platforms)}" + } + + multi_news = {} + + for platform in platform_list: + cacheKey = f"crawler:{platform}:{date}" + result = cache.get(cacheKey) + if result: + try: + multi_news[platform] = json.loads(result) + except Exception as e: + log.error(f"Error parsing cached data for {platform}: {e}") + multi_news[platform] = [] + else: + multi_news[platform] = [] + + return { + "status": "200", + "data": multi_news, + "msg": "success" + } + + +@router.get("/search") +def search_news(keyword: str, date: str = None, platforms: str = None, limit: int = 20): + """ + 搜索新闻 + + Args: + keyword: 搜索关键词 + date: 日期,格式为YYYY-MM-DD,默认为当天 + platforms: 平台列表,以逗号分隔,例如 "weibo,baidu,zhihu",默认搜索所有平台 + limit: 返回结果数量限制,默认为20 + + Returns: + 包含搜索结果的字典,键为状态码、数据、消息、总结果数量和搜索结果数量 + """ + if not date: + date = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d") + + # 确定要搜索的平台 + if platforms: + platform_list = [p.strip() for p in platforms.split(",")] + valid_platforms = crawler_factory.keys() + platform_list = [p for p in platform_list if p in valid_platforms] + else: + platform_list = list(crawler_factory.keys()) + + if not platform_list: + return { + "status": "404", + "data": [], + "msg": "No valid platforms specified", + "total": 0, + "search_results": 0 + } + + # 从各平台获取新闻数据 + all_news = [] + + for platform in platform_list: + cacheKey = f"crawler:{platform}:{date}" + result = cache.get(cacheKey) + if not result: + continue + + try: + platform_news = json.loads(result) + if not isinstance(platform_news, list): + continue + + # 为每条新闻添加平台信息 + for idx, item in enumerate(platform_news): + if not isinstance(item, dict): + continue + + # 处理rank字段 + rank_value = "" + if "rank" in item and item["rank"]: + rank_value = str(item["rank"]).replace("#", "") + elif "index" in item and item["index"]: + rank_value = str(item["index"]).replace("#", "") + else: + rank_value = str(idx + 1) + + # 获取分类信息 + category = 
_get_category_for_platform(platform) + sub_category = _get_subcategory_for_platform(platform) + + # 构建标准化的新闻条目 + item_with_source = { + "id": item.get("id"), + "title": item.get("title", ""), + "source": platform, + "rank": rank_value, + "category": category, + "sub_category": sub_category, + "url": item.get("url", "") + } + all_news.append(item_with_source) + + except Exception as e: + log.error(f"Error processing news from {platform}: {e}") + + # 搜索关键词 + search_results = [] + for item in all_news: + if keyword.lower() in item["title"].lower(): + search_results.append(item) + + # 按站点分组,每个站点内按排名排序 + grouped_results = {} + for item in search_results: + source = item["source"] + if source not in grouped_results: + grouped_results[source] = [] + grouped_results[source].append(item) + + # 对每个站点内的结果按排名排序 + for source, items in grouped_results.items(): + # 按排名排序(直接比较数字) + items.sort(key=lambda x: int(x["rank"]) if x["rank"].isdigit() else 999) + + # 重新组合排序后的结果 + sorted_results = [] + for source, items in grouped_results.items(): + sorted_results.extend(items) + + # 限制返回结果数量 + limited_results = sorted_results[:limit] + + return { + "status": "200", + "data": limited_results, + "msg": "success", + "total": len(search_results), + "search_results": len(limited_results) + } + + +def _get_category_for_platform(platform: str) -> str: + """根据平台返回对应的分类""" + categories = { + "36kr": "科技创业", + "hupu": "体育", + "sspai": "科技", + "weibo": "社交", + "zhihu": "知识", + "baidu": "综合", + "tieba": "社区", + "douban": "文化", + "bilibili": "视频", + "v2ex": "科技", + "github": "开发者", + "hackernews": "科技", + "stackoverflow": "开发者", + "jinritoutiao": "资讯", + "douyin": "娱乐", + "shaoshupai": "科技" + } + return categories.get(platform, "其他") + + +def _get_subcategory_for_platform(platform: str) -> str: + """根据平台返回对应的子分类""" + subcategories = { + "36kr": "商业资讯", + "hupu": "娱乐", + "sspai": "数码", + "weibo": "热门", + "zhihu": "问答", + "baidu": "热搜", + "tieba": "讨论", + "douban": "影视", + "bilibili": "热门", + "v2ex": 
"技术", + "github": "开源", + "hackernews": "国际", + "stackoverflow": "问答", + "jinritoutiao": "热点", + "douyin": "娱乐", + "shaoshupai": "数码" + } + return subcategories.get(platform, "其他") + diff --git a/app/api/v1/web_tools.py b/app/api/v1/web_tools.py new file mode 100644 index 0000000..58b5787 --- /dev/null +++ b/app/api/v1/web_tools.py @@ -0,0 +1,138 @@ +# app/api/endpoints/website_meta.py +import json +import time +from urllib.parse import urlparse, urljoin + +import cloudscraper + +from app.utils.logger import log + +import requests +from bs4 import BeautifulSoup +from fastapi import APIRouter + +from app.core import cache + +router = APIRouter() + + +@router.get("/") +def get_meta(url: str = None): + if not url: + return { + "status": "404", + "data": [], + "msg": "`url` is required" + } + + # get from cache + cached_metadata = cache.get(url) + if cached_metadata: + return { + "status": "200", + "data": json.loads(cached_metadata), + "msg": "success", + "cache": True + } + + headers = { + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,ar;q=0.7,en;q=0.6", + "cache-control": "max-age=0", + "priority": "u=0, i", + "sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"Windows"', + "sec-fetch-dest": "document", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "same-origin", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36" + } + + + try: + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + page_content = response.content + except requests.RequestException as e: + scraper = cloudscraper.create_scraper(delay=100) + response = scraper.get(url) + page_content = 
response.content + + if not page_content: + return { + "status": "404", + "data": [], + "msg": "No content" + } + + soup = BeautifulSoup(page_content, "html.parser") + meta_info = { + "title": soup.title.string if soup.title else "No title", + "description": "", + "keywords": "", + "author": "", + "og:title": "", + "og:description": "", + "og:image": "", + "og:url": url, + "twitter:card": "", + "twitter:title": "", + "twitter:description": "", + "twitter:image": "" + } + + for meta_tag in soup.find_all("meta"): + name_attr = meta_tag.get("name", "").lower() + property_attr = meta_tag.get("property", "").lower() + content = meta_tag.get("content", "") + + if name_attr == "description": + meta_info["description"] = content + elif name_attr == "keywords": + meta_info["keywords"] = content + elif name_attr == "author": + meta_info["author"] = content + + elif property_attr == "og:title": + meta_info["og:title"] = content + elif property_attr == "og:description": + meta_info["og:description"] = content + elif property_attr == "og:image": + meta_info["og:image"] = content + elif property_attr == "og:url": + meta_info["og:url"] = content + + elif name_attr == "twitter:card": + meta_info["twitter:card"] = content + elif name_attr == "twitter:title": + meta_info["twitter:title"] = content + elif name_attr == "twitter:description": + meta_info["twitter:description"] = content + elif name_attr == "twitter:image": + meta_info["twitter:image"] = content + + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + favicon_url = urljoin(base_url, "favicon.ico") # 默认 favicon 路径 + + link_tag = soup.find("link", rel=["icon", "shortcut icon"]) + if link_tag: + favicon_url = urljoin(base_url, link_tag.get("href", "favicon.ico")) + + metadata = { + "meta_info": meta_info, + "favicon_url": favicon_url + } + + cache.set(url, json.dumps(metadata, ensure_ascii=False), ex=60) + result = { + "status": "200", + "data": metadata, + "msg": "Success", + "cache": 
False + } + + return result diff --git a/app/core/__init__.py b/app/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/core/cache.py b/app/core/cache.py new file mode 100644 index 0000000..6cc65c8 --- /dev/null +++ b/app/core/cache.py @@ -0,0 +1,150 @@ +import redis +from pydantic import BaseModel +import json +from typing import Any, Optional, Dict, List, Union +import time + +from app.db.redis import get_redis_client +from app.utils.logger import log + +# 默认缓存过期时间(1小时) +DEFAULT_EXPIRE = 3600 + +def init_cache(): + """初始化缓存连接""" + try: + redis = get_redis_client() + redis.ping() + log.info("Cache connection established") + except Exception as e: + log.error(f"Failed to connect to cache: {e}") + +def close_cache(): + """关闭缓存连接""" + try: + redis = get_redis_client() + redis.connection_pool.disconnect() + log.info("Cache connection closed") + except Exception as e: + log.error(f"Error closing cache connection: {e}") + +def set_cache(key: str, value: Any, expire: int = DEFAULT_EXPIRE) -> bool: + """设置缓存,支持自动序列化复杂对象""" + try: + redis = get_redis_client() + if isinstance(value, (dict, list, tuple)): + value = json.dumps(value) + elif isinstance(value, bool): + value = "1" if value else "0" + + if expire > 0: + redis.setex(key, expire, value) + else: + redis.set(key, value) + return True + except Exception as e: + log.error(f"Error setting cache for key111111 {key}: {e}") + return False + +def get_cache(key: str) -> Optional[Any]: + try: + redis = get_redis_client() + value = redis.get(key) + + if value is None: + return None + + if isinstance(value, bytes): + value = value.decode('utf-8') + + try: + return json.loads(value) + except (json.JSONDecodeError, TypeError): + return value + except Exception as e: + log.error(f"Error getting cache for key {key}: {e}") + return None + +def delete_cache(key: str) -> bool: + try: + redis = get_redis_client() + redis.delete(key) + return True + except Exception as e: + log.error(f"Error deleting cache for 
key {key}: {e}") + return False + +def clear_cache_pattern(pattern: str) -> int: + try: + redis = get_redis_client() + keys = redis.keys(pattern) + if keys: + return redis.delete(*keys) + return 0 + except Exception as e: + log.error(f"Error clearing cache pattern {pattern}: {e}") + return 0 + + +def get(key): + try: + redis_client = get_redis_client() + except Exception as e: + log.error(f"Error getting redis client: {e}") + return None + + value = redis_client.get(key) + if value is None: + return None + + return value.decode("utf-8") + + +def set(key, value, ex=None): + try: + redis_client = get_redis_client() + except Exception as e: + log.error(f"Error getting redis client: {e}") + return None + + return redis_client.set(key, value, ex=ex) + + +def delete(key): + + try: + redis_client = get_redis_client() + except Exception as e: + log.error(f"Error getting redis client: {e}") + return None + + return redis_client.delete(key) + + +def hset(name, key, value): + + try: + redis_client = get_redis_client() + except Exception as e: + log.error(f"Error getting redis client: {e}") + return None + + return redis_client.hset(name, key, value) + + +def hget(name, key): + + try: + redis_client = get_redis_client() + except Exception as e: + log.error(f"Error getting redis client: {e}") + return None + + return redis_client.hget(name, key) + + +class CacheNews(BaseModel): + title: str + url: str + score: int + desc: str diff --git a/app/core/config.py b/app/core/config.py new file mode 100644 index 0000000..9234f47 --- /dev/null +++ b/app/core/config.py @@ -0,0 +1,121 @@ +import os +import yaml +from typing import Dict, Any, List, Optional +from pydantic import BaseModel, Field + +# 配置文件路径 +CONFIG_PATH = os.environ.get("CONFIG_PATH", "config/config.yaml") + +class AppConfig(BaseModel): + title: str + description: str + version: str + host: str + port: int + debug: bool = True + cors: Dict[str, Any] + +class DatabaseConfig(BaseModel): + host: str + user: str + password: 
str + db: str + charset: str + autocommit: bool = True + +class RedisConfig(BaseModel): + host: str + port: int + db: int + password: str = "" + decode_responses: bool = False + socket_timeout: int = 5 + socket_connect_timeout: int = 5 + health_check_interval: int = 30 + +class CrawlerConfig(BaseModel): + interval: int + timeout: int + max_retry_count: int + max_instances: int + misfire_grace_time: int + +class LoggingConfig(BaseModel): + level: str + format: str + dir: str + file: str + max_size: int + backup_count: int + daily_backup_count: int + timezone: str + +class SchedulerConfig(BaseModel): + thread_pool_size: int + process_pool_size: int + coalesce: bool + max_instances: int + misfire_grace_time: int + timezone: str + +class NotificationConfig(BaseModel): + dingtalk: Dict[str, Any] = Field(default_factory=dict) + # 可以添加其他通知方式的配置 + # wechat: Dict[str, Any] = Field(default_factory=dict) + # email: Dict[str, Any] = Field(default_factory=dict) + +class Config(BaseModel): + app: AppConfig + database: DatabaseConfig + redis: RedisConfig + crawler: CrawlerConfig + logging: LoggingConfig + scheduler: SchedulerConfig + notification: Optional[NotificationConfig] = None + +# 全局配置对象 +_config: Optional[Config] = None + +def load_config() -> Config: + """加载配置文件""" + global _config + if _config is None: + try: + with open(CONFIG_PATH, 'r') as f: + config_data = yaml.safe_load(f) + _config = Config(**config_data) + except Exception as e: + raise RuntimeError(f"Failed to load configuration: {e}") + return _config + +def get_config() -> Config: + """获取配置对象""" + if _config is None: + return load_config() + return _config + +# 便捷访问函数 +def get_app_config() -> AppConfig: + return get_config().app + +def get_db_config() -> DatabaseConfig: + return get_config().database + +def get_redis_config() -> RedisConfig: + return get_config().redis + +def get_crawler_config() -> CrawlerConfig: + return get_config().crawler + +def get_logging_config() -> LoggingConfig: + return 
get_config().logging + +def get_scheduler_config() -> SchedulerConfig: + return get_config().scheduler + +def get_notification_config() -> Dict[str, Any]: + """获取通知配置""" + config = get_config() + if config.notification: + return config.notification.dict() + return {} diff --git a/app/core/db.py b/app/core/db.py new file mode 100644 index 0000000..4e71598 --- /dev/null +++ b/app/core/db.py @@ -0,0 +1,131 @@ +import time +from typing import List, Dict, Any, Optional +from contextlib import contextmanager +import traceback + +import pymysql +from pymysql.cursors import DictCursor + +from app.utils.logger import log +from app.core.config import get_db_config + +# 连接池 +_connection = None + +def init_db(): + """初始化数据库连接""" + global _connection + try: + db_config = get_db_config() + _connection = pymysql.connect( + host=db_config.host, + user=db_config.user, + password=db_config.password, + db=db_config.db, + charset=db_config.charset, + cursorclass=DictCursor, + autocommit=db_config.autocommit + ) + log.info("Database connection established") + except Exception as e: + log.error(f"Failed to connect to database: {e}") + raise + +def close_db(): + """关闭数据库连接""" + global _connection + if _connection: + _connection.close() + _connection = None + log.info("Database connection closed") + +@contextmanager +def get_cursor(): + """获取数据库游标的上下文管理器""" + global _connection + + # 如果连接不存在或已关闭,重新连接 + if _connection is None or not _connection.open: + init_db() + + cursor = None + try: + cursor = _connection.cursor() + yield cursor + except pymysql.OperationalError as e: + # 处理连接断开的情况 + if e.args[0] in (2006, 2013): # MySQL server has gone away, Lost connection + log.warning("Database connection lost, reconnecting...") + init_db() + cursor = _connection.cursor() + yield cursor + else: + raise + except Exception as e: + log.error(f"Database error: {e}") + raise + finally: + if cursor: + cursor.close() + +def insert_news(news_list: List[Dict[str, Any]]) -> int: + """插入新闻数据,返回成功插入的数量""" + if 
def insert_news(news_list: List[Dict[str, Any]]) -> int:
    """Insert news rows, skipping URLs already stored.

    Args:
        news_list: dicts with keys title/content/url/source/publish_time.

    Returns:
        Number of rows actually inserted; 0 on empty input or any error.
    """
    if not news_list:
        return 0

    inserted_count = 0
    start_time = time.time()

    try:
        with get_cursor() as cursor:
            for item in news_list:
                item_url = item.get('url', '')

                # Deduplicate on URL: skip rows we have already stored.
                cursor.execute(
                    "SELECT id FROM news WHERE url = %s LIMIT 1",
                    (item_url,)
                )
                if cursor.fetchone() is not None:
                    continue

                cursor.execute(
                    """
                    INSERT INTO news (title, content, url, source, publish_time, created_at)
                    VALUES (%s, %s, %s, %s, %s, NOW())
                    """,
                    (
                        item.get('title', ''),
                        item.get('content', ''),
                        item_url,
                        item.get('source', ''),
                        item.get('publish_time', None),
                    )
                )
                inserted_count += 1

        duration = time.time() - start_time
        log.info(f"Inserted {inserted_count}/{len(news_list)} news items in {duration:.2f}s")
        return inserted_count

    except Exception as e:
        log.error(f"Error inserting news: {e}")
        log.error(traceback.format_exc())
        return 0

def get_news_by_date(date_str: str, limit: int = 100) -> List[Dict[str, Any]]:
    """Fetch up to `limit` rows whose publish_time falls on `date_str` (YYYY-MM-DD),
    newest first. Returns [] on any database error."""
    try:
        with get_cursor() as cursor:
            cursor.execute(
                """
                SELECT * FROM news
                WHERE DATE(publish_time) = %s
                ORDER BY publish_time DESC
                LIMIT %s
                """,
                (date_str, limit)
            )
            return cursor.fetchall()
    except Exception as e:
        log.error(f"Error getting news by date: {e}")
        return []
"收视率", "热度", "流量", "粉丝", "网红", "主播", "直播", "短视频", + "抖音", "快手", "B站", "油管", "视频", "游戏", "动漫", "二次元", "漫画", + "小说", "作家", "作者", "绯闻", "八卦", "恋情", "结婚", "离婚", "恋爱", + "综艺节目", "选秀", "真人秀", "脱口秀", "访谈", "颁奖", "获奖", "提名" + ], + "社会": [ + "社会", "事件", "现象", "热点", "话题", "讨论", "争议", "观点", "舆论", + "民生", "生活", "居民", "市民", "百姓", "群众", "公众", "社区", "小区", + "城市", "农村", "乡村", "振兴", "扶贫", "贫困", "福利", "保障", "救助", + "公益", "慈善", "捐赠", "捐款", "志愿者", "志愿", "服务", "公共", "公共服务", + "安全", "事故", "灾害", "灾难", "救援", "救灾", "防灾", "减灾", "消防", + "警察", "公安", "执法", "犯罪", "案件", "诈骗", "防骗", "防范", "预防", + "交通", "道路", "出行", "拥堵", "堵车", "地铁", "公交", "高铁", "铁路", + "环保", "污染", "垃圾", "分类", "绿色", "低碳", "节能", "减排", "可持续" + ], + "财经": [ + "经济", "金融", "财经", "股市", "股票", "基金", "债券", "期货", "外汇", + "汇率", "利率", "存款", "贷款", "理财", "投资", "投资者", "股东", "股份", + "上市", "IPO", "融资", "并购", "重组", "收购", "分拆", "分红", "派息", + "银行", "证券", "保险", "信托", "资管", "资产管理", "财富管理", "私募", + "公募", "券商", "基金公司", "信用", "风险", "监管", "政策", "法规", "规定", + "房地产", "楼市", "房价", "地价", "商品房", "住宅", "公寓", "别墅", "商铺", + "通货膨胀", "通胀", "CPI", "GDP", "经济增长", "经济发展", "经济复苏", + "贸易", "进出口", "关税", "税收", "减税", "增值税", "所得税", "企业所得税" + ], + "体育": [ + "体育", "运动", "比赛", "赛事", "联赛", "锦标赛", "冠军赛", "世界杯", "奥运会", + "足球", "篮球", "排球", "网球", "乒乓球", "羽毛球", "游泳", "田径", "马拉松", + "体操", "举重", "拳击", "武术", "跆拳道", "柔道", "击剑", "射击", "射箭", + "高尔夫", "棒球", "橄榄球", "冰球", "滑雪", "滑冰", "冬奥会", "亚运会", + "球员", "教练", "裁判", "球队", "俱乐部", "国家队", "主场", "客场", "赛季", + "进球", "得分", "助攻", "防守", "进攻", "战术", "技术", "犯规", "红牌", "黄牌", + "NBA", "CBA", "英超", "西甲", "德甲", "意甲", "法甲", "欧冠", "欧联", "亚冠", + "世锦赛", "大满贯", "全运会", "体育产业", "体育用品", "体育营销", "体育赞助" + ], + "教育": [ + "教育", "学校", "大学", "高校", "中学", "小学", "幼儿园", "学院", "研究生院", + "教师", "老师", "学生", "学员", "家长", "教授", "讲师", "班主任", "辅导员", + "课程", "课堂", "教材", "教学", "学习", "考试", "考核", "成绩", "分数", "学分", + "升学", "高考", "中考", "考研", "考博", "考证", "留学", "出国", "海归", "归国", + "学历", "学位", "文凭", "证书", "学士", "硕士", "博士", "博士后", "教育部", + "教育局", "教育厅", "教育系统", "教育改革", "素质教育", "应试教育", "职业教育", 
+ "在线教育", "远程教育", "教育科技", "教育创新", "教育公平", "教育资源", "教育质量", + "校园", "宿舍", "食堂", "图书馆", "实验室", "教室", "操场", "体育馆", "礼堂" + ], + "健康": [ + "健康", "医疗", "医院", "医生", "医师", "护士", "护理", "患者", "病人", "就医", + "疾病", "疫情", "病毒", "细菌", "感染", "传染", "流行病", "新冠", "肺炎", "发热", + "症状", "治疗", "用药", "药物", "药品", "药剂", "处方", "诊断", "检查", "手术", + "康复", "保健", "养生", "营养", "饮食", "运动", "锻炼", "减肥", "增重", "塑形", + "心理", "精神", "抑郁", "焦虑", "压力", "睡眠", "失眠", "心理咨询", "心理治疗", + "医保", "医疗保险", "社保", "医改", "医疗改革", "医疗体系", "医疗资源", "医疗服务", + "疫苗", "接种", "防疫", "防控", "消毒", "隔离", "核酸", "抗原", "检测", "筛查" + ], + "国际": [ + "国际", "全球", "世界", "外交", "国家", "地区", "大使", "领事", "使馆", "领馆", + "美国", "中国", "俄罗斯", "欧盟", "日本", "印度", "英国", "法国", "德国", "意大利", + "加拿大", "澳大利亚", "巴西", "南非", "沙特", "伊朗", "以色列", "巴勒斯坦", "朝鲜", + "韩国", "越南", "新加坡", "马来西亚", "印尼", "泰国", "菲律宾", "乌克兰", "白俄罗斯", + "战争", "冲突", "和平", "停火", "制裁", "协议", "条约", "峰会", "会议", "会晤", + "联合国", "安理会", "世卫组织", "世贸组织", "国际货币基金组织", "世界银行", "北约", + "政治", "经济", "军事", "外交", "贸易", "投资", "援助", "移民", "难民", "人权", + "气候变化", "全球变暖", "可持续发展", "减排", "碳中和", "国际合作", "多边主义" + ] +} \ No newline at end of file diff --git a/app/data/config/stopwords.json b/app/data/config/stopwords.json new file mode 100644 index 0000000..bf46d2b --- /dev/null +++ b/app/data/config/stopwords.json @@ -0,0 +1,19 @@ +{ + "stopwords": [ + "什么", "怎么", "如何", "为何", "为什么", "哪些", "多少", "几个", "怎样", + "一个", "这个", "那个", "自己", "这些", "那些", "因为", "所以", "如果", + "可以", "还是", "这样", "那样", "关于", "对于", + "今天", "明天", "昨天", "今年", "去年", "最近", "现在", + "一些", "有些", "很多", "许多", + "a", "an", "the", "and", "or", "but", "if", "because", "as", "what", "when", + "where", "how", "to", "of", "for", "with", "in", "on", "at", "from", "by", + "about", "into", "is", "are", "was", "were", "be", "been", "being", "have", + "has", "had", "do", "does", "did", "doing", "can", "could", "will", "would", + "should", "shall", "may", "might", "must", "that", "which", "who", "whom", + "this", "these", "those", "am", "i", "you", "he", "she", "it", "we", "they", + 
class News:
    """Plain-Python news record, decoupled from any ORM layer."""

    def __init__(self,
                 title: str = "",
                 content: str = "",
                 url: str = "",
                 source: str = "",
                 publish_time: Optional[datetime] = None):
        # Database id: populated only for rows read back from storage.
        self.id: Optional[int] = None
        self.title = title
        self.content = content
        self.url = url
        self.source = source
        # Fall back to "now" so freshly scraped items always carry a timestamp.
        self.publish_time = publish_time or datetime.now()
        self.created_at = datetime.now()

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'News':
        """Build a News object from a plain dict (e.g. a DB row)."""
        obj = cls(
            title=data.get('title', ''),
            content=data.get('content', ''),
            url=data.get('url', ''),
            source=data.get('source', ''),
            publish_time=data.get('publish_time'),
        )
        # id / created_at come only from persisted rows; override when present.
        if 'id' in data:
            obj.id = data['id']
        if 'created_at' in data:
            obj.created_at = data['created_at']
        return obj

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to the dict shape expected by the DB layer."""
        field_names = ('id', 'title', 'content', 'url', 'source',
                       'publish_time', 'created_at')
        return {name: getattr(self, name) for name in field_names}

def insert_news(news_list):
    """Insert a list of News objects (or plain dicts) via the low-level DB helper."""
    from app.core import db  # local import avoids a circular dependency
    # Normalize: the DB layer consumes dicts only.
    if news_list and isinstance(news_list[0], News):
        news_list = [item.to_dict() for item in news_list]
    return db.insert_news(news_list)
# Application lifespan: setup before serving, teardown on exit.
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: initialize DB/cache, start the crawler thread,
    and on shutdown release the shared browser and close connections."""
    # --- startup ---
    log.info("Application startup")

    # Initialize the database connection.
    db.init_db()

    # Initialize the cache layer.
    cache.init_cache()

    # Run the crawler in a daemon thread so application startup is not blocked.
    threading.Thread(target=crawler.crawlers_logic, daemon=True).start()

    yield

    # --- shutdown ---
    log.info("Application shutdown")

    # Tear down the shared Selenium browser, if one was created.
    try:
        BrowserManager().shutdown()
        log.info("Browser manager shutdown")
    except Exception as e:
        log.error(f"Error shutting down browser manager: {e}")

    # Close the database connection.
    db.close_db()

    # Close cache connections.
    cache.close_cache()
# Liveness probe for load balancers / uptime monitors.
@app.get("/health", tags=["Health"])
async def health_check():
    """Report service liveness along with the running application version."""
    payload = {"status": "healthy", "version": app_config.version}
    return payload
class BrowserManager:
    """Process-wide singleton owning one shared headless Chrome driver.

    The driver is created lazily on the first `get_driver()` call and torn
    down by a watchdog thread once it has been idle for `_max_idle_time`
    seconds.
    """
    _instance = None
    _lock = threading.Lock()
    _driver = None
    _driver_path = None
    _last_activity = 0
    _max_idle_time = 1800  # max idle seconds before the driver is released (30 min)

    def __new__(cls, *args, **kwargs):
        """Double-checked-locking singleton constructor.

        BUGFIX: the instance is published to `cls._instance` only after
        `_init_driver_path()` succeeds. Previously the half-built object was
        cached first, so a failed ChromeDriver install poisoned the singleton
        for the lifetime of the process.
        """
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    instance = super(BrowserManager, cls).__new__(cls)
                    instance._init_driver_path()   # may raise: nothing is cached on failure
                    instance._start_idle_monitor()
                    cls._instance = instance
        return cls._instance

    def _init_driver_path(self):
        """Resolve (downloading if necessary) the ChromeDriver binary path."""
        try:
            self._driver_path = ChromeDriverManager().install()
            log.info(f"ChromeDriver已安装: {self._driver_path}")
        except Exception as e:
            log.error(f"ChromeDriver安装失败: {str(e)}")
            raise

    def _start_idle_monitor(self):
        """Start a daemon thread that quits the driver after prolonged idleness."""
        def monitor():
            while True:
                time.sleep(60)  # poll once per minute
                try:
                    with self._lock:
                        if self._driver is not None:
                            current_time = time.time()
                            if current_time - self._last_activity > self._max_idle_time:
                                log.info(f"浏览器空闲超过{self._max_idle_time}秒,释放资源")
                                self._quit_driver()
                except Exception as e:
                    log.error(f"浏览器监控线程异常: {str(e)}")

        monitor_thread = threading.Thread(target=monitor, daemon=True)
        monitor_thread.start()
        log.info("浏览器空闲监控线程已启动")

    def get_driver(self):
        """Return the shared Chrome driver, creating it on first use."""
        with self._lock:
            self._last_activity = time.time()
            if self._driver is None:
                self._create_driver()
            return self._driver

    def _create_driver(self):
        """Create a new headless Chrome instance with memory-friendly flags."""
        log.info("创建新的Chrome浏览器实例")
        options = webdriver.ChromeOptions()
        # Headless basics.
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        # Memory-footprint tuning.
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-application-cache")
        options.add_argument("--js-flags=--expose-gc")
        options.add_argument("--memory-pressure-off")
        options.add_argument("--disable-default-apps")
        # Quiet Chrome's own logging.
        options.add_argument("--log-level=3")

        self._driver = webdriver.Chrome(
            service=Service(self._driver_path),
            options=options
        )
        self._driver.set_page_load_timeout(30)

    def _quit_driver(self):
        """Quit and forget the current driver instance (caller holds the lock)."""
        if self._driver:
            try:
                self._driver.quit()
                log.info("浏览器实例已关闭")
            except Exception as e:
                log.error(f"关闭浏览器实例出错: {str(e)}")
            finally:
                self._driver = None

    def release_driver(self):
        """Mark the driver as recently used so the idle monitor keeps it alive."""
        with self._lock:
            self._last_activity = time.time()

    def get_page_content(self, url, wait_time=5):
        """Load `url` and return (page_source, driver).

        NOTE(review): the live shared driver is handed back to the caller for
        further element queries; callers must not quit it themselves.
        """
        driver = self.get_driver()
        try:
            driver.get(url)
            time.sleep(wait_time)  # crude wait for JS-rendered content
            page_source = driver.page_source
            self.release_driver()
            return page_source, driver
        except Exception as e:
            log.error(f"获取页面内容失败: {str(e)}")
            self.release_driver()
            raise

    def shutdown(self):
        """Release the browser; called at application shutdown."""
        with self._lock:
            self._quit_driver()
class CrawlerTimeoutError(Exception):
    """Raised when a crawler run exceeds its wall-clock budget."""
    pass

def timeout_handler(func: Callable, timeout: int = CRAWLER_TIMEOUT) -> Callable:
    """Decorator enforcing a wall-clock limit via a watchdog worker thread.

    The wrapped callable runs in a daemon thread; if it has not finished
    within `timeout` seconds a CrawlerTimeoutError is raised (the worker
    thread is not killed and may keep running in the background).
    Exceptions raised by the callable are logged and re-raised.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        state = {"result": None, "error": None, "done": False}

        def runner():
            try:
                state["result"] = func(*args, **kwargs)
            except Exception as exc:
                state["error"] = exc
            finally:
                state["done"] = True

        worker = threading.Thread(target=runner, daemon=True)
        worker.start()
        worker.join(timeout)

        if not state["done"]:
            error_msg = f"Function {func.__name__} timed out after {timeout} seconds"
            log.error(error_msg)
            raise CrawlerTimeoutError(error_msg)

        if state["error"] is not None:
            log.error(f"Function {func.__name__} raised an exception: {state['error']}")
            raise state["error"]

        return state["result"]
    return wrapper
def run_data_analysis(date_str: str):
    """Run the full post-crawl analysis pipeline for one date and cache results.

    Any failure aborts the remaining steps, is logged, and triggers a
    best-effort notification; nothing is raised to the caller.
    """
    log.info(f"Starting data analysis for date {date_str}")
    try:
        # Imported here to avoid a circular import at module load time.
        from app.analysis.trend_analyzer import TrendAnalyzer
        from app.analysis.predictor import TrendPredictor

        analyzer = TrendAnalyzer()
        predictor = TrendPredictor()

        # (log message, step callable) — executed in order; each step warms a cache.
        steps = (
            ("Generating keyword cloud data...",
             lambda: analyzer.get_keyword_cloud(date_str, refresh=True)),
            ("Generating trend analysis data...",
             lambda: analyzer.get_analysis(date_str, analysis_type="main")),
            ("Generating cross-platform analysis data...",
             lambda: analyzer.get_cross_platform_analysis(date_str, refresh=True)),
            ("Generating trend prediction data...",
             lambda: predictor.get_prediction(date_str)),
            ("Generating platform comparison data...",
             lambda: analyzer.get_platform_comparison(date_str)),
            ("Generating advanced analysis data...",
             lambda: analyzer.get_advanced_analysis(date_str, refresh=True)),
            ("Generating data visualization analysis...",
             lambda: analyzer.get_data_visualization(date_str, refresh=True)),
            ("Generating trend forecast data...",
             lambda: analyzer.get_trend_forecast(date_str, refresh=True)),
        )
        for message, step in steps:
            log.info(message)
            step()

        log.info(f"All data analysis completed for date {date_str}")
    except Exception as e:
        error_msg = traceback.format_exc()
        log.error(f"Error during data analysis: {str(e)}")
        log.error(error_msg)

        # Best-effort notification about the analysis failure.
        try:
            notification_manager.notify_analysis_error(
                error_msg=str(e),
                date_str=date_str
            )
        except Exception as notify_error:
            log.error(f"Failed to send analysis error notification: {notify_error}")
class BaiduNewsCrawler(Crawler):
    """Baidu realtime hot-search crawler (top.baidu.com)."""

    def fetch(self, date_str) -> list:
        """Fetch the realtime hot board via Baidu's JSON API.

        Caches the scraped list under hash key `date_str` and returns it;
        returns [] on a non-200 response.
        """
        current_time = datetime.datetime.now()

        url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"

        # BUGFIX: self.header was previously passed as `params=` (appended to
        # the query string); it must be sent as HTTP request headers.
        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        json_data = resp.json()
        contents = json_data.get("data")["cards"][0]["content"][0]["content"]
        result = []
        cache_list = []
        for content in contents:
            title = content.get("word")
            url = content.get("url")
            desc = content.get("desc")
            score = content.get("hotScore")  # currently unused; kept for future scoring

            # Rewrite mobile links to the desktop site.
            # NOTE(review): str.replace swaps every "m." occurrence in the URL,
            # not only the subdomain — confirm this is safe for all board links.
            url = url.replace("m.", "www.")
            news = {
                'title': title,
                'url': url,
                'content': desc,
                'source': 'baidu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')  # formatted timestamp
            }
            result.append(news)
            cache_list.append(news)  # plain dicts; serialized once below

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry key under which this crawler's results are cached."""
        return "baidu"

    @staticmethod
    def fetch_v0():
        """Legacy HTML-scraping fallback for the desktop hot board page."""
        current_time = datetime.datetime.now()

        url = "https://top.baidu.com/board?tab=realtime"
        proxies = {
            # "http": "http://127.0.0.1:7890",
            # "https": "http://127.0.0.1:7890"
        }

        header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            # BUGFIX: header values must be strings once sent as real headers.
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36"
        }
        # BUGFIX: the header dict was passed as `params=` here as well. The
        # bogus "host": "www.baidu.com" entry was dropped — sent as a real
        # Host header it would name the wrong vhost for top.baidu.com.
        html = requests.get(url=url, headers=header, verify=False, proxies=proxies)
        html.encoding = "utf-8"
        html_text = html.text
        soup = BeautifulSoup(html_text, "html.parser")
        main_content = soup.find_all("main")[0]
        news_main_content = main_content.find("div", style='margin-bottom:20px')

        div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')

        result = []
        for div_element in div_elements:
            hot_index = div_element.find(class_='hot-index_1Bl1a').text.strip()
            news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
            news_link = div_element.find('a', class_='title_dIF3B')['href']

            news = {
                'title': news_title,
                'url': news_link,
                'content': "",
                'source': 'baidu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')  # formatted timestamp
            }
            result.append(news)

        return result
crawler_name(self): + return "bilibili" diff --git a/app/services/sites/cls.py b/app/services/sites/cls.py new file mode 100644 index 0000000..c9970e7 --- /dev/null +++ b/app/services/sites/cls.py @@ -0,0 +1,100 @@ +import json +import datetime +import requests +import urllib3 + +from .crawler import Crawler +from ...core import cache + +urllib3.disable_warnings() + + +class CLSCrawler(Crawler): + """财联社""" + + def fetch(self, date_str) -> list: + current_time = datetime.datetime.now() + + try: + params = { + 'app': 'CailianpressWeb', + 'os': 'web', + 'sv': '8.4.6', + 'sign': '9f8797a1f4de66c2370f7a03990d2737' + } + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', + 'Referer': 'https://www.cls.cn/', + 'Origin': 'https://www.cls.cn' + } + + response = requests.get( + "https://www.cls.cn/featured/v1/column/list", + params=params, + headers=headers, + timeout=self.timeout, + verify=False + ) + response.raise_for_status() + + data = response.json() + if data.get('errno') != 0: + return [] + + column_list = data.get('data', {}).get('column_list', []) + + result = [] + cache_list = [] + + for idx, column in enumerate(column_list[:20]): + try: + title = column.get('title', '').strip() + if not title or len(title) < 2: + continue + + article_list = column.get('article_list', {}) + if article_list: + article_title = article_list.get('title', '').strip() + jump_url = article_list.get('jump_url', '').strip() + brief = article_list.get('brief', '').strip() + + if article_title: + display_title = f"[{title}] {article_title}" + content = brief if brief else article_title + url = "https://www.cls.cn/telegraph" + else: + display_title = title + content = column.get('brief', '').strip() + url = f"https://www.cls.cn/telegraph" + else: + display_title = title + content = column.get('brief', 
class Crawler(ABC):
    """Abstract base for all site crawlers.

    Supplies browser-like default request headers and a shared timeout;
    subclasses implement `fetch` and `crawler_name`.
    """

    def __init__(self):
        accept_value = (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/apng,*/*;q=0.8,"
            "application/signed-exchange;v=b3;q=0.9"
        )
        user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
        )
        # Defaults shared by every concrete crawler.
        self.header = {
            "accept": accept_value,
            "upgrade-insecure-requests": "1",
            "user-agent": user_agent,
        }
        self.timeout = 10

    @abstractmethod
    def fetch(self, date_str: str) -> List[Dict[str, Any]]:
        """Return the list of news dicts scraped for `date_str`."""

    @abstractmethod
    def crawler_name(self) -> str:
        """Return the unique registry/cache key for this crawler."""
header.update({ + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-encoding": "", + "accept-language": "zh-CN,zh;q=0.9,en;q=0.8", + "host": "www.douban.com", + "referer": "https://www.douban.com/group/explore", + "sec-fetch-dest": "document", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "same-origin", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + }) + + resp = requests.get(url=url, headers=header, verify=False, timeout=self.timeout) + if resp.status_code != 200: + print(f"request failed, status: {resp.status_code}") + return [] + + html_text = resp.text + soup = BeautifulSoup(html_text, "html.parser") + + topic_list = soup.find_all('div', class_='channel-item') + + result = [] + cache_list = [] + + for topic in topic_list: + title_elem = topic.find('h3') + if not title_elem: + continue + + link_elem = title_elem.find('a') + if not link_elem: + continue + + title = link_elem.text.strip() + url = link_elem.get('href') + + desc_elem = topic.find('div', class_='content') + desc = desc_elem.text.strip() if desc_elem else "" + + news = { + 'title': title, + 'url': url, + 'content': desc, + 'source': 'douban', + 'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S') + } + + result.append(news) + cache_list.append(news) + + cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False)) + return result + + def crawler_name(self): + return "douban" diff --git a/app/services/sites/douyin.py b/app/services/sites/douyin.py new file mode 100644 index 0000000..200a7e1 --- /dev/null +++ b/app/services/sites/douyin.py @@ -0,0 +1,111 @@ +import json +import datetime +import time + +import requests +from selenium.webdriver.common.by import By +from bs4 import BeautifulSoup + +from ...core import cache +from ...db.mysql import News +from .crawler import Crawler +from ..browser_manager import BrowserManager + + +class 
class DouYinCrawler(Crawler):
    """抖音 hot-search crawler.

    ``fetch`` delegates to the API-based ``fetch_v2``; the Selenium-based
    ``fetch_v1`` is kept as a legacy fallback.
    """

    def fetch(self, date_str):
        """Return Douyin hot-search entries for ``date_str`` (cached)."""
        return self.fetch_v2(date_str)

    def fetch_v1(self, date_str):
        """Legacy path: scrape https://www.douyin.com/hot with a browser."""
        current_time = datetime.datetime.now()
        url = "https://www.douyin.com/hot"
        browser_manager = BrowserManager()

        try:
            # 使用浏览器管理器获取页面内容
            page_source, driver = browser_manager.get_page_content(url, wait_time=5)

            result = []
            cache_list = []

            # 抖音热榜条目(li 标签里含 /video/ 链接)
            items = driver.find_elements(By.XPATH, '//li[a[contains(@href, "/video/")]]')

            for item in items:
                try:
                    # 提取标题(含 # 标签或较长文本)
                    title_elem = item.find_element(By.XPATH, './/div[contains(text(), "#") or string-length(text()) > 10]')
                    # 提取链接
                    link_elem = item.find_element(By.XPATH, './/a[contains(@href, "/video/")]')
                    # 提取热度
                    hot_elem = item.find_element(By.XPATH, './/span[contains(text(), "万") or contains(text(), "亿")]')

                    title = title_elem.text.strip()
                    # NOTE(review): Selenium's get_attribute("href") usually
                    # returns an absolute URL, so prefixing the domain may
                    # double it — confirm against a live page.
                    item_url = "https://www.douyin.com" + link_elem.get_attribute("href")
                    hot = hot_elem.text.strip()

                    news = {
                        'title': title,
                        'url': item_url,
                        'content': f"热度: {hot}",
                        'source': 'douyin',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
                    }

                    result.append(news)
                    cache_list.append(news)
                except Exception:
                    continue  # 跳过无效项

            # 缓存并返回
            if cache_list:
                cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception:
            return []

    def fetch_v2(self, date_str):
        """Preferred path: call the hot-search web API directly."""
        current_time = datetime.datetime.now()
        url = "https://www.douyin.com/aweme/v1/web/hot/search/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&detail_list=1&source=6&pc_client_type=1&pc_libra_divert=Windows&support_h265=1&support_dash=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=136.0.0.0&browser_online=true&engine_name=Blink&engine_version=136.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7490997798633555467"

        headers = {
            # Fixed: the UA fragments were concatenated out of order, which
            # produced a malformed User-Agent string.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://www.douyin.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        result = []
        cache_list = []

        for item in data["data"]["word_list"]:
            title = item["word"]
            # Deep link mirroring the site's own trending-topic URLs.
            url = f"https://www.douyin.com/hot/{item['sentence_id']}?&trending_topic={item['word']}&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue={item['hot_value']}"

            news = {
                'title': title,
                'url': url,
                'content': title,
                'source': 'douyin',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Platform code, used as the cache field name."""
        return "douyin"
class EastMoneyCrawler(Crawler):
    """东方财富网 fast-news crawler.

    Queries the EastMoney fast-news list API and converts the first 20
    items into news dicts, caching the batch under the given date key.
    """

    def fetch(self, date_str) -> list:
        """Fetch EastMoney fast news.

        Args:
            date_str: Cache hash key (a date string) passed to ``cache.hset``.

        Returns:
            A list of news dicts; empty on any request or parse failure.
        """
        current_time = datetime.datetime.now()

        try:
            params = {
                'client': 'web',
                'biz': 'web_724',
                'fastColumn': '102',
                'sortEnd': '',
                'pageSize': '50',
                'req_trace': str(int(current_time.timestamp() * 1000))  # 使用当前时间戳
            }

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Referer': 'https://kuaixun.eastmoney.com/',
                'Origin': 'https://kuaixun.eastmoney.com'
            }

            response = requests.get(
                "https://np-weblist.eastmoney.com/comm/web/getFastNewsList",
                params=params,
                headers=headers,
                timeout=self.timeout,
                verify=False  # cert verification intentionally skipped; warnings disabled above
            )
            response.raise_for_status()

            data = response.json()
            # This API signals success with the string '1'.
            if data.get('code') != '1':
                return []
            fast_news_list = data.get('data', {}).get('fastNewsList', [])

            result = []
            cache_list = []

            for idx, news_item in enumerate(fast_news_list[:20]):  # 取前20条
                try:
                    title = news_item.get('title', '').strip()
                    if not title:
                        continue

                    summary = news_item.get('summary', '').strip()
                    show_time = news_item.get('showTime', '').strip()
                    code = news_item.get('code', '').strip()
                    # Per-article page when a code is present, else the portal.
                    url = f"https://finance.eastmoney.com/a/{code}" if code else "https://kuaixun.eastmoney.com/"

                    news = {
                        'title': title,
                        'url': url,
                        'content': summary,
                        'source': 'eastmoney',
                        'publish_time': show_time if show_time else current_time.strftime('%Y-%m-%d %H:%M:%S'),
                        'score': 1000 - idx,  # simple rank-derived score
                        'rank': idx + 1
                    }

                    result.append(news)
                    cache_list.append(news)

                except Exception:
                    # Best effort: skip malformed entries, keep the rest.
                    continue

            if cache_list:
                cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception:
            # Network/JSON failures degrade to an empty result.
            return []

    def crawler_name(self):
        """Platform code, used as the cache field name."""
        return "eastmoney"
StackOverflowCrawler(), + "github": GithubCrawler(), + "hackernews": HackerNewsCrawler(), + "sina_finance": SinaFinanceCrawler(), + "eastmoney": EastMoneyCrawler(), + "xueqiu": XueqiuCrawler(), + "cls": CLSCrawler(), + } + + self.crawlers = crawler_map + return self.crawlers + + def get_crawlers(self): + return self.register().values() diff --git a/app/services/sites/ftpojie.py b/app/services/sites/ftpojie.py new file mode 100644 index 0000000..2185750 --- /dev/null +++ b/app/services/sites/ftpojie.py @@ -0,0 +1,69 @@ +import json +import datetime # 添加datetime导入 +import re + +import requests +import urllib3 +from bs4 import BeautifulSoup +# 移除 SQLAlchemy 导入 +# from sqlalchemy.sql.functions import now + +from ...core import cache +from ...db.mysql import News +from .crawler import Crawler + +urllib3.disable_warnings() + + +class FtPoJieCrawler(Crawler): + """吾爱破解""" + + def fetch(self, date_str): + # 获取当前时间 + current_time = datetime.datetime.now() + + url = "https://www.52pojie.cn/forum.php?mod=guide&view=hot" + + resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout) + if resp.status_code != 200: + print(f"request failed, status: {resp.status_code}") + return [] + + resp.encoding = 'gbk' # 52pojie使用GBK编码 + html_text = resp.text + soup = BeautifulSoup(html_text, "html.parser") + + # 找到热门帖子列表 + hot_threads = soup.find_all('tbody', id=lambda x: x and x.startswith('normalthread_')) + + result = [] + cache_list = [] + + for thread in hot_threads: + title_elem = thread.find('a', class_='xst') + if not title_elem: + continue + + title = title_elem.text.strip() + url = "https://www.52pojie.cn/" + title_elem.get('href') + + # 获取帖子信息 + info_elem = thread.find('td', class_='by') + info = info_elem.text.strip() if info_elem else "" + + news = { + 'title': title, + 'url': url, + 'content': info, + 'source': '52pojie', + 'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S') + } + + result.append(news) + cache_list.append(news) + + 
class GithubCrawler(Crawler):
    """GitHub most-starred repositories, via the search API."""

    def fetch(self, date_str):
        """Fetch top-starred repositories and cache them under ``date_str``.

        Returns:
            A list of news dicts; empty when the request fails.
        """
        current_time = datetime.datetime.now()

        url = "https://api.github.com/search/repositories?q=stars:%3E1&sort=stars"

        headers = {
            # Fixed: the UA fragments were concatenated out of order, which
            # produced a malformed User-Agent string.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://github.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        result = []
        cache_list = []

        for item in data["items"]:
            news = {
                'title': item.get("full_name", ""),
                'url': item.get("html_url", ""),
                # NOTE(review): the API may return null for 'description';
                # downstream consumers should tolerate a None content.
                'content': item.get("description", ""),
                'source': self.crawler_name(),
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Platform code, used as the cache field name."""
        return "github"
class HackerNewsCrawler(Crawler):
    """Hacker News front-page crawler.

    Tries a plain HTTP fetch first; if that yields nothing, falls back to a
    Selenium-driven browser session through ``BrowserManager``. At most 30
    stories are returned and cached under the given date key.
    """

    def fetch(self, date_str):
        """Return front-page stories, caching whichever strategy succeeds."""
        try:
            # Cheap path: plain requests + BeautifulSoup.
            result = self._fetch_with_requests()
            if result:
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

            # Fallback: headless browser, in case direct requests are blocked.
            browser_manager = BrowserManager()
            result = self._fetch_with_browser(browser_manager)
            if result:
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

        except Exception:
            # Degrade to "no news" on any unexpected failure.
            return []

        # Both strategies failed.
        return []

    def _fetch_with_requests(self):
        """Scrape https://news.ycombinator.com/ with requests + BeautifulSoup.

        Returns:
            Up to 30 news dicts; empty on failure.
        """
        url = "https://news.ycombinator.com/"

        try:
            response = requests.get(url, headers=self.header, timeout=self.timeout)
            if response.status_code != 200:
                return []

            soup = BeautifulSoup(response.text, 'html.parser')

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Each story row is a <tr class="athing">; the following <tr>
            # holds its score/author/comments metadata.
            items = soup.select("tr.athing")

            for item in items:
                try:
                    # Row id ties the story to its metadata row.
                    item_id = item.get('id')
                    if not item_id:
                        continue

                    title_element = item.select_one(".titleline a")
                    if not title_element:
                        continue

                    title = title_element.text.strip()
                    # Renamed from 'url' to avoid shadowing the page URL above.
                    story_url = title_element.get('href')

                    # Internal links (e.g. "item?id=...") are relative.
                    if story_url and not story_url.startswith('http'):
                        story_url = f"https://news.ycombinator.com/{story_url}"

                    site_element = item.select_one(".sitestr")
                    site = site_element.text.strip() if site_element else ""

                    # Metadata lives in the next sibling row.
                    metadata = item.find_next_sibling('tr')
                    if not metadata:
                        continue

                    score_element = metadata.select_one(".score")
                    score = score_element.text.strip() if score_element else "0 points"

                    user_element = metadata.select_one(".hnuser")
                    user = user_element.text.strip() if user_element else "unknown"

                    comments_element = metadata.select_one("a:last-child")
                    comments = comments_element.text.strip() if comments_element else "0 comments"
                    # HN shows "discuss" when there are no comments yet.
                    if "discuss" in comments:
                        comments = "0 comments"

                    content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"

                    news = {
                        'title': title,
                        'url': story_url,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Cap at the first 30 stories.
                    if len(result) >= 30:
                        break

                except Exception:
                    # Skip malformed rows, keep the rest.
                    continue

            return result

        except Exception:
            return []

    def _fetch_with_browser(self, browser_manager):
        """Scrape https://news.ycombinator.com/ with a Selenium browser.

        Args:
            browser_manager: ``BrowserManager`` providing the driver session.

        Returns:
            Up to 30 news dicts; empty on failure.
        """
        url = "https://news.ycombinator.com/"

        try:
            page_source, driver = browser_manager.get_page_content(url, wait_time=5)

            # Wait for story rows; proceed regardless on timeout.
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".athing"))
                )
            except Exception:
                pass

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            items = driver.find_elements(By.CSS_SELECTOR, "tr.athing")

            for item in items:
                try:
                    item_id = item.get_attribute("id")
                    if not item_id:
                        continue

                    title_element = item.find_element(By.CSS_SELECTOR, ".titleline a")
                    title = title_element.text.strip()
                    story_url = title_element.get_attribute("href")

                    site = ""
                    try:
                        site_element = item.find_element(By.CSS_SELECTOR, ".sitestr")
                        site = site_element.text.strip()
                    except Exception:
                        pass

                    # Metadata lives in the row directly after the story row.
                    try:
                        metadata = driver.find_element(By.XPATH, f"//tr[@id='{item_id}']/following-sibling::tr[1]")

                        score = "0 points"
                        try:
                            score_element = metadata.find_element(By.CSS_SELECTOR, ".score")
                            score = score_element.text.strip()
                        except Exception:
                            pass

                        user = "unknown"
                        try:
                            user_element = metadata.find_element(By.CSS_SELECTOR, ".hnuser")
                            user = user_element.text.strip()
                        except Exception:
                            pass

                        comments = "0 comments"
                        try:
                            comments_element = metadata.find_element(By.XPATH, ".//a[last()]")
                            comments = comments_element.text.strip()
                            # HN shows "discuss" when there are no comments yet.
                            if "discuss" in comments:
                                comments = "0 comments"
                        except Exception:
                            pass

                        content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"
                    except Exception:
                        content = f"来源: {site}"

                    news = {
                        'title': title,
                        'url': story_url,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Cap at the first 30 stories.
                    if len(result) >= 30:
                        break

                except Exception:
                    continue

            return result

        except Exception:
            return []

    def crawler_name(self):
        """Platform code, used as the cache field name."""
        return "hackernews"
class JinRiTouTiaoCrawler(Crawler):
    """今日头条 hot-board crawler.

    Reads the PC hot-board JSON feed and caches the converted entries
    under the given date key.
    """

    def fetch(self, date_str):
        """Return Toutiao hot-board entries; empty list on failure."""
        now_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        board_url = "https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc"

        resp = requests.get(url=board_url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            entries = resp.json().get('data', [])

            # Convert every board entry into the shared news-dict shape.
            collected = [
                {
                    'title': entry.get('Title', ''),
                    'url': entry.get('Url', ''),
                    'content': f"热度: {entry.get('HotValue', '')}",
                    'source': 'jinritoutiao',
                    'publish_time': now_str,
                }
                for entry in entries
            ]

            cache.hset(date_str, self.crawler_name(), json.dumps(collected, ensure_ascii=False))
            return collected

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Platform code, used as the cache field name."""
        return "jinritoutiao"
import datetime

from sqlalchemy import Column, String, Integer, DateTime
from sqlalchemy.ext.declarative import declarative_base

# Shared declarative base for this module's ORM models.
Base = declarative_base()


class DailyNews(Base):
    """ORM model for one hot-news entry stored in ``tab_daily_news``."""

    __tablename__ = 'tab_daily_news'

    id = Column(Integer, primary_key=True)
    # Headline text (max 255 chars).
    title = Column(String(255))
    # Short description/summary of the entry.
    desc = Column(String(255))
    # Link to the original article/page.
    link = Column(String(255))
    # NOTE(review): presumably a platform/category discriminator — confirm
    # against the writer of this table.
    type = Column(Integer, default=0)
    # Hotness score (defaults to 0).
    score = Column(Integer, default=0)
    # NOTE(review): presumably a view/occurrence counter — confirm usage.
    times = Column(Integer, default=0)
    # Row timestamps, managed in Python (not DB server defaults).
    create_time = Column(DateTime, default=datetime.datetime.now)
    update_time = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
class ShaoShuPaiCrawler(Crawler):
    """少数派 (sspai) latest-article crawler.

    Reads the public article-index API and caches the converted entries
    under the given date key.
    """

    def fetch(self, date_str):
        """Return the latest sspai articles; empty list on failure."""
        now_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        api_url = "https://sspai.com/api/v1/article/index/page/get?limit=20&offset=0&created_at=0"

        resp = requests.get(url=api_url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            articles = resp.json().get('data', [])

            # Build the article URL from its numeric id and reuse the API
            # summary as the content field.
            collected = [
                {
                    'title': article.get('title', ''),
                    'url': f"https://sspai.com/post/{article.get('id', '')}",
                    'content': article.get('summary', ''),
                    'source': 'sspai',
                    'publish_time': now_str,
                }
                for article in articles
            ]

            cache.hset(date_str, self.crawler_name(), json.dumps(collected, ensure_ascii=False))
            return collected

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Platform code, used as the cache field name."""
        return "shaoshupai"
class TenXunWangCrawler(Crawler):
    """腾讯网 hot-ranking crawler."""

    def fetch(self, date_str):
        """Fetch the Tencent News hot ranking and cache it under ``date_str``.

        Returns:
            A list of news dicts; empty when the request fails.
        """
        current_time = datetime.datetime.now()

        url = "https://i.news.qq.com/gw/event/pc_hot_ranking_list?ids_hash=&offset=0&page_size=51&appver=15.5_qqnews_7.1.60&rank_id=hot"

        headers = {
            # Fixed: the UA fragments were concatenated out of order, which
            # produced a malformed User-Agent string.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://news.qq.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        result = []
        cache_list = []

        # 第一条是“腾讯新闻用户最关注的热点,每10分钟更新一次”,跳过它。
        for item in data["idlist"][0].get("newslist", [])[1:]:
            news = {
                'title': item.get("title", ""),
                'url': item.get("url", ""),
                'content': item.get("abstract", ""),
                'source': 'tenxunwang',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Platform code, used as the cache field name."""
        return "tenxunwang"
now + +from .crawler import Crawler +from ...core import cache +from ...db.mysql import News + +urllib3.disable_warnings() + + +class TieBaCrawler(Crawler): + """百度贴吧""" + + def fetch(self, date_str): + # 获取当前时间 + current_time = datetime.datetime.now() + + url = "http://tieba.baidu.com/hottopic/browse/topicList" + + resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout) + if resp.status_code != 200: + print(f"request failed, status: {resp.status_code}") + return [] + + try: + json_data = resp.json() + data = json_data.get('data', {}).get('bang_topic', {}).get('topic_list', []) + + result = [] + cache_list = [] + + for item in data: + title = item.get('topic_name', '') + url = item.get('topic_url', '') + if url and not url.startswith('http'): + url = f"http://tieba.baidu.com{url}" + + desc = item.get('topic_desc', '') + + news = { + 'title': title, + 'url': url, + 'content': desc, + 'source': 'tieba', + 'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S') + } + + result.append(news) + cache_list.append(news) + + cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False)) + return result + + except Exception as e: + print(f"Error parsing JSON: {e}") + return [] + + def crawler_name(self): + return "tieba" diff --git a/app/services/sites/tskr.py b/app/services/sites/tskr.py new file mode 100644 index 0000000..8df76ca --- /dev/null +++ b/app/services/sites/tskr.py @@ -0,0 +1,83 @@ +import json +import datetime +import time + +import requests +import urllib3 + +from .crawler import Crawler +from ...core import cache + +urllib3.disable_warnings() + + +class TsKrCrawler(Crawler): + """36氪""" + + def fetch(self, date_str): + """ + 获取36氪热榜数据 + """ + current_time = datetime.datetime.now() + url = f"https://gateway.36kr.com/api/mis/nav/home/nav/rank/hot" + headers = { + "Content-Type": "application/json; charset=utf-8", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like 
class VtexCrawler(Crawler):
    """v2ex hot-topic crawler (scrapes the ``?tab=hot`` HTML page)."""

    def fetch(self, date_str):
        """Scrape https://www.v2ex.com/?tab=hot; returns [] on request failure."""
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        resp = requests.get(
            url="https://www.v2ex.com/?tab=hot",
            headers=self.header,
            verify=False,
            timeout=self.timeout,
        )
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        soup = BeautifulSoup(resp.text, "html.parser")

        result = []
        # Each hot topic is rendered as a <div class="cell item">.
        for cell in soup.find_all('div', class_='cell item'):
            span = cell.find('span', class_='item_title')
            anchor = span.find('a') if span else None
            if anchor is None:
                # Skip cells without a titled link (ads / layout rows).
                continue

            info_span = cell.find('span', class_='topic_info')

            result.append({
                'title': anchor.text.strip(),
                'url': "https://www.v2ex.com" + anchor.get('href'),
                'content': info_span.text.strip() if info_span else "",
                'source': 'v2ex',
                'publish_time': stamp,
            })

        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Redis hash field for this site."""
        return "v2ex"
print(f"request failed, status: {resp.status_code}") + return [] + + try: + json_data = resp.json() + data = json_data.get('data', {}).get('realtime', []) + + result = [] + cache_list = [] + + for item in data: + title = item.get('word', '') + url = f"https://s.weibo.com/weibo?q=%23{title}%23" + + news = { + 'title': title, + 'url': url, + 'content': title, + 'source': 'weibo', + 'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S') + } + + result.append(news) + cache_list.append(news) + + cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False)) + return result + + except Exception as e: + print(f"Error parsing JSON: {e}") + return [] + + def crawler_name(self): + return "weibo" diff --git a/app/services/sites/weixin.py b/app/services/sites/weixin.py new file mode 100644 index 0000000..b493000 --- /dev/null +++ b/app/services/sites/weixin.py @@ -0,0 +1,228 @@ +import json +import datetime +import time +import requests +from bs4 import BeautifulSoup +import urllib3 +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +from ...core import cache +from ...db.mysql import News +from .crawler import Crawler +from ..browser_manager import BrowserManager + +# 禁用SSL警告 +urllib3.disable_warnings() + +class WeiXinCrawler(Crawler): + """ + 微信热门内容爬虫 + 使用微信看一看热门页面获取数据 + """ + + def fetch(self, date_str): + """获取微信热门内容""" + current_time = datetime.datetime.now() + browser_manager = BrowserManager() + + try: + # 首先尝试从微信看一看获取热门内容 + result = self._fetch_from_weixin_kankan(browser_manager) + + if result and len(result) > 0: + # 缓存数据 + cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False)) + return result + + # 如果看一看失败,尝试从微信读书获取热门书评 + result = self._fetch_from_weixin_dushu(browser_manager) + if result and len(result) > 0: + # 缓存数据 + cache.hset(date_str, self.crawler_name(), json.dumps(result, 
ensure_ascii=False)) + return result + + except Exception as e: + # 如果遇到错误,返回空列表 + return [] + + # 所有方法都失败,返回空列表 + return [] + + def _fetch_from_weixin_kankan(self, browser_manager): + """从微信看一看页面获取热门内容""" + url = "https://k.weixin.qq.com/" + + try: + # 获取页面内容 + page_source, driver = browser_manager.get_page_content(url, wait_time=10) + + # 等待热门内容加载 + try: + WebDriverWait(driver, 15).until( + EC.presence_of_element_located((By.CSS_SELECTOR, ".hot")) + ) + except: + # 如果等待超时,仍然尝试获取内容 + pass + + # 点击"热点"标签切换到热门内容 + try: + hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']") + hot_tab.click() + time.sleep(3) # 等待内容加载 + except: + # 如果找不到热点标签,继续尝试获取当前页面内容 + pass + + result = [] + current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + # 获取文章列表 + articles = driver.find_elements(By.CSS_SELECTOR, ".article-item") + + if not articles: + # 尝试其他可能的选择器 + articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item") + + if not articles: + # 再尝试其他可能的选择器 + articles = driver.find_elements(By.CSS_SELECTOR, ".item") + + for article in articles: + try: + # 获取文章标题和链接 + title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title") + title = title_elem.text.strip() + + # 尝试获取链接 + link = None + try: + link_elem = article.find_element(By.TAG_NAME, "a") + link = link_elem.get_attribute("href") + except: + # 如果直接获取链接失败,则记录文章id,以后可以构建链接 + try: + article_id = article.get_attribute("data-id") or article.get_attribute("id") + link = f"https://k.weixin.qq.com/article?id={article_id}" + except: + link = "https://k.weixin.qq.com/" + + # 获取来源 + source = "" + try: + source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source") + source = source_elem.text.strip() + except: + pass + + # 获取摘要 + summary = "" + try: + summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p") + summary = summary_elem.text.strip() + except: + pass + + news = { + 'title': title, + 'url': link, + 'content': f"来源: {source} | 摘要: {summary[:50] 
if summary else '无摘要'}", + 'source': 'weixin', + 'publish_time': current_time + } + + result.append(news) + + # 限制获取前20条 + if len(result) >= 20: + break + + except Exception as e: + continue + + return result + + except Exception as e: + return [] + + def _fetch_from_weixin_dushu(self, browser_manager): + """从微信读书获取热门书评""" + url = "https://weread.qq.com/web/category/all" + + try: + # 获取页面内容 + page_source, driver = browser_manager.get_page_content(url, wait_time=8) + + result = [] + current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + # 尝试点击排行榜标签 + try: + rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]") + rank_tab.click() + time.sleep(3) # 等待内容加载 + except: + # 如果找不到排行榜标签,继续尝试获取当前页面内容 + pass + + # 获取热门书籍列表 + books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item") + + for book in books: + try: + # 获取书籍标题和链接 + title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3") + title = title_elem.text.strip() + + # 尝试获取链接 + link = "https://weread.qq.com/web/category/all" + try: + link_elem = book.find_element(By.TAG_NAME, "a") + link = link_elem.get_attribute("href") + except: + book_id = book.get_attribute("data-bid") or book.get_attribute("id") + if book_id: + link = f"https://weread.qq.com/web/reader/{book_id}" + + # 获取作者 + author = "" + try: + author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer") + author = author_elem.text.strip() + except: + pass + + # 获取摘要/简介 + intro = "" + try: + intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc") + intro = intro_elem.text.strip() + except: + pass + + news = { + 'title': f"热门书籍: {title}", + 'url': link, + 'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}", + 'source': 'weixin', + 'publish_time': current_time + } + + result.append(news) + + # 限制获取前20条 + if len(result) >= 20: + break + + except Exception as e: + continue + + return result + + except Exception as e: + return [] + + def crawler_name(self): + return "weixin" diff --git 
a/app/services/sites/xueqiu.py b/app/services/sites/xueqiu.py new file mode 100644 index 0000000..f5280aa --- /dev/null +++ b/app/services/sites/xueqiu.py @@ -0,0 +1,155 @@ +import json +import datetime +import requests +import urllib3 +import re +from requests.sessions import Session + +from .crawler import Crawler +from ...core import cache + +urllib3.disable_warnings() + + +class XueqiuCrawler(Crawler): + """雪球""" + def __init__(self): + super().__init__() + self.session = Session() + self._init_session() + + def _init_session(self): + try: + # 第一步:访问主页获取基础cookies + main_url = "https://xueqiu.com" + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', + 'Cache-Control': 'no-cache', + 'Pragma': 'no-cache', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-User': '?1', + 'Upgrade-Insecure-Requests': '1' + } + + resp = self.session.get(main_url, headers=headers, verify=False, timeout=self.timeout) + if resp.status_code == 200: + html_content = resp.text + + # 尝试提取token + token_match = re.search(r'window\.SNB\s*=\s*\{[^}]*token["\']?\s*:\s*["\']([^"\']+)["\']', html_content) + if token_match: + token = token_match.group(1) + self.session.headers.update({'X-Requested-With': 'XMLHttpRequest'}) + + hot_page_url = "https://xueqiu.com/hot_event" + hot_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', + 'Referer': 'https://xueqiu.com/', + 
    def fetch(self, date_str) -> list:
        """Fetch the Xueqiu (雪球) hot-event list.

        Uses the cookie session prepared by ``_init_session``; on a
        non-200 response the session is rebuilt and the request retried
        once.  Returns up to 10 news dicts, or [] on failure.
        """
        current_time = datetime.datetime.now()

        url = "https://xueqiu.com/hot_event/list.json?count=10"
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Referer': 'https://xueqiu.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }

        try:
            resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)

            if resp.status_code != 200:
                print(f"雪球请求失败, status: {resp.status_code}")
                # Session cookies may have expired — rebuild and retry once.
                self._init_session()
                resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)
                if resp.status_code != 200:
                    print(f"雪球重试后仍失败, status: {resp.status_code}")
                    return []

            json_data = resp.json()
            if 'list' not in json_data:
                print("雪球响应格式异常")
                return []

            result = []
            cache_list = []

            # Keep at most the top-10 entries.
            for idx, item in enumerate(json_data['list'][:10]):
                try:
                    # Topic tags look like "#话题#" — strip the surrounding hashes.
                    tag = item.get('tag', '').strip()
                    if tag.startswith('#') and tag.endswith('#'):
                        title = tag[1:-1]
                    else:
                        title = tag

                    if not title:
                        continue

                    item_id = item.get('id')
                    # NOTE(review): item_id is fetched but never used — every
                    # entry links to the homepage; confirm the per-event URL
                    # scheme and build a deep link here.
                    url_link = f"https://xueqiu.com/"

                    # Truncate long bodies for the feed view.
                    content = item.get('content', '').strip()
                    if len(content) > 200:
                        content = content[:200] + '...'

                    status_count = item.get('status_count', 0)
                    hot_value = item.get('hot', 0)  # NOTE(review): unused

                    news = {
                        'title': title,
                        'url': url_link,
                        'content': content,
                        'source': 'xueqiu',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
                        # Fall back to a rank-derived score when status_count is 0.
                        'score': status_count if status_count > 0 else 1000 - idx,
                        'rank': idx + 1
                    }
                    result.append(news)
                    cache_list.append(news)

                except Exception as e:
                    print(f"解析雪球新闻项失败: {e}")
                    continue

            cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"获取雪球数据失败: {e}")
            return []
# 自定义日志格式化器,使用配置的时区
class CustomFormatter(logging.Formatter):
    """logging.Formatter that renders record timestamps in the configured timezone.

    Bug fix: the previous implementation did ``datetime.fromtimestamp(ts)``
    (which yields naive *local* time) and then stamped it as UTC via
    ``replace(tzinfo=pytz.utc)`` before converting — shifting every log
    timestamp by the host's UTC offset on any non-UTC machine.  Converting
    directly from the epoch timestamp with a target tz is correct
    regardless of the host's local timezone.
    """

    def converter(self, timestamp):
        """Return an aware datetime for *timestamp* in the configured zone."""
        tz = pytz.timezone(log_config.timezone)
        # Epoch seconds are timezone-independent; let datetime do the
        # single, correct conversion.
        return datetime.fromtimestamp(timestamp, tz)

    def formatTime(self, record, datefmt=None):
        """Format ``record.created`` using *datefmt*, defaulting to '%Y-%m-%d %H:%M:%S'."""
        dt = self.converter(record.created)
        if datefmt:
            return dt.strftime(datefmt)
        return dt.strftime("%Y-%m-%d %H:%M:%S")
+file_handler.setLevel(getattr(logging, log_config.level)) +file_formatter = CustomFormatter(log_config.format) +file_handler.setFormatter(file_formatter) +log.addHandler(file_handler) + +# 创建文件处理器 - 按日期轮转 +daily_handler = TimedRotatingFileHandler( + os.path.join(log_config.dir, 'app.daily.log'), + when='midnight', + interval=1, + backupCount=log_config.daily_backup_count, + encoding='utf-8' +) +daily_handler.setLevel(getattr(logging, log_config.level)) +daily_handler.setFormatter(file_formatter) +log.addHandler(daily_handler) + +# 防止日志传播到父记录器 +log.propagate = False + +# 记录启动信息 +log.info(f"Logger initialized at {datetime.now(pytz.timezone(log_config.timezone)).strftime('%Y-%m-%d %H:%M:%S')}") \ No newline at end of file diff --git a/app/utils/notification.py b/app/utils/notification.py new file mode 100644 index 0000000..69fdae4 --- /dev/null +++ b/app/utils/notification.py @@ -0,0 +1,286 @@ +import json +import time +import hmac +import hashlib +import base64 +import urllib.parse +from datetime import datetime +from typing import Dict, Any, Optional, List +import requests +import pytz + +from app.utils.logger import log +from app.core.config import get_notification_config + + +class DingTalkNotifier: + """钉钉机器人通知器""" + + def __init__(self): + self.config = get_notification_config() + self.webhook_url = self.config.get('dingtalk', {}).get('webhook_url', '') + self.secret = self.config.get('dingtalk', {}).get('secret', '') + self.enabled = self.config.get('dingtalk', {}).get('enabled', False) + self.timeout = self.config.get('dingtalk', {}).get('timeout', 10) + self.notify_success = self.config.get('dingtalk', {}).get('notify_success', False) + self.shanghai_tz = pytz.timezone('Asia/Shanghai') + + if not self.webhook_url and self.enabled: + log.warning("DingTalk webhook URL not configured, notifications will be disabled") + self.enabled = False + + def _generate_sign(self, timestamp: int) -> str: + """生成钉钉机器人签名""" + if not self.secret: + return "" + + string_to_sign 
= f'{timestamp}\n{self.secret}' + hmac_code = hmac.new( + self.secret.encode('utf-8'), + string_to_sign.encode('utf-8'), + digestmod=hashlib.sha256 + ).digest() + sign = urllib.parse.quote_plus(base64.b64encode(hmac_code)) + return sign + + def _send_message(self, message: Dict[str, Any]) -> bool: + """发送消息到钉钉""" + if not self.enabled: + log.debug("DingTalk notifications are disabled") + return False + + try: + # 生成时间戳和签名 + timestamp = int(round(time.time() * 1000)) + sign = self._generate_sign(timestamp) + + # 构建请求URL + url = self.webhook_url + if sign: + url += f"×tamp={timestamp}&sign={sign}" + + # 发送请求 + response = requests.post( + url, + json=message, + timeout=self.timeout, + headers={'Content-Type': 'application/json'} + ) + + if response.status_code == 200: + result = response.json() + if result.get('errcode') == 0: + log.info("DingTalk notification sent successfully") + return True + else: + log.error(f"DingTalk API error: {result.get('errmsg', 'Unknown error')}") + return False + else: + log.error(f"DingTalk HTTP error: {response.status_code}") + return False + + except Exception as e: + log.error(f"Failed to send DingTalk notification: {str(e)}") + return False + + def send_text_message(self, content: str, at_mobiles: Optional[List[str]] = None, + at_all: bool = False) -> bool: + """发送文本消息""" + message = { + "msgtype": "text", + "text": { + "content": content + } + } + + if at_mobiles or at_all: + message["at"] = { + "atMobiles": at_mobiles or [], + "isAtAll": at_all + } + + return self._send_message(message) + + def send_markdown_message(self, title: str, text: str, + at_mobiles: Optional[List[str]] = None, + at_all: bool = False) -> bool: + """发送Markdown消息""" + message = { + "msgtype": "markdown", + "markdown": { + "title": title, + "text": text + } + } + + if at_mobiles or at_all: + message["at"] = { + "atMobiles": at_mobiles or [], + "isAtAll": at_all + } + + return self._send_message(message) + + def send_crawler_error(self, crawler_name: str, 
error_msg: str, + date_str: str, is_retry: bool = False) -> bool: + """发送爬虫错误通知""" + current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S") + retry_text = "重试失败" if is_retry else "首次失败" + + title = f"🚨 爬虫异常通知 - {crawler_name}" + content = f""" +## {title} + +**时间**: {current_time}\n +**爬虫**: {crawler_name}\n +**日期**: {date_str}\n +**状态**: {retry_text}\n +**错误信息**: +``` +{error_msg} +``` + +请及时检查爬虫状态! + """.strip() + + # 异常时@所有人 + return self.send_markdown_message(title, content, at_all=True) + + def send_crawler_timeout(self, timeout_seconds: int, date_str: str) -> bool: + """发送爬虫超时通知""" + current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S") + + title = "⏰ 爬虫超时通知" + content = f""" +## {title} + +**时间**: {current_time}\n +**日期**: {date_str}\n +**超时时长**: {timeout_seconds}秒\n +**状态**: 爬虫任务执行超时被强制终止 + +请检查爬虫性能或调整超时配置! + """.strip() + + # 超时异常时@所有人 + return self.send_markdown_message(title, content, at_all=True) + + def send_crawler_summary(self, success_count: int, total_count: int, + failed_crawlers: List[str], duration: float, + date_str: str) -> bool: + """发送爬虫执行摘要通知""" + # 全部成功且未启用正常通知时,不发送 + if success_count == total_count and not self.notify_success: + return True + + current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S") + + # 构建失败爬虫列表 + failed_list = "\n".join([f"- {name}" for name in failed_crawlers]) if failed_crawlers else "" + + if failed_crawlers: + title = f"🚨 爬虫执行摘要 - {date_str}" + else: + title = f"📊 爬虫执行摘要 - {date_str}" + + # 根据是否有失败构建不同的内容 + if failed_crawlers: + content = f""" +## {title} + +**时间**: {current_time}\n +**日期**: {date_str}\n +**执行时长**: {duration:.2f}秒\n +**成功**: {success_count}/{total_count}\n +**失败**: {len(failed_crawlers)} + +**失败的爬虫**: +{failed_list} + +请关注失败的爬虫状态! 
+ """.strip() + else: + content = f""" +## {title} + +**时间**: {current_time}\n +**日期**: {date_str}\n +**执行时长**: {duration:.2f}秒\n +**成功**: {success_count}/{total_count}\n +**失败**: {len(failed_crawlers)} + +所有爬虫执行成功! + """.strip() + + # 有失败时@所有人,没失败时不@ + at_all = len(failed_crawlers) > 0 + return self.send_markdown_message(title, content, at_all=at_all) + + def send_analysis_error(self, error_msg: str, date_str: str) -> bool: + """发送数据分析错误通知""" + current_time = datetime.now(self.shanghai_tz).strftime("%Y-%m-%d %H:%M:%S") + + title = "🔍 数据分析异常通知" + content = f""" +## {title} + +**时间**: {current_time}\n +**日期**: {date_str}\n +**错误信息**: +``` +{error_msg} +``` + +数据分析任务执行失败,请检查分析模块! + """.strip() + + # 分析异常时@所有人 + return self.send_markdown_message(title, content, at_all=True) + +class NotificationManager: + """通知管理器,支持多种通知方式""" + + def __init__(self): + self.dingtalk = DingTalkNotifier() + # 可以在这里添加其他通知方式,如企业微信、邮件等 + + def is_enabled(self) -> bool: + """检查通知是否启用""" + return self.dingtalk.enabled + + @property + def webhook_url(self) -> str: + """获取webhook URL""" + return self.dingtalk.webhook_url + + def send_text(self, content: str, at_all: bool = False) -> bool: + """发送文本消息""" + return self.dingtalk.send_text_message(content, at_all=at_all) + + def send_markdown(self, title: str, text: str, at_all: bool = False) -> bool: + """发送Markdown消息""" + return self.dingtalk.send_markdown_message(title, text, at_all=at_all) + + def notify_crawler_error(self, crawler_name: str, error_msg: str, + date_str: str, is_retry: bool = False): + """通知爬虫错误""" + self.dingtalk.send_crawler_error(crawler_name, error_msg, date_str, is_retry) + + def notify_crawler_timeout(self, timeout_seconds: int, date_str: str): + """通知爬虫超时""" + self.dingtalk.send_crawler_timeout(timeout_seconds, date_str) + + def notify_crawler_summary(self, success_count: int, total_count: int, + failed_crawlers: List[str], duration: float, + date_str: str): + """通知爬虫执行摘要""" + 
self.dingtalk.send_crawler_summary(success_count, total_count, + failed_crawlers, duration, date_str) + + def notify_analysis_error(self, error_msg: str, date_str: str): + """通知数据分析错误""" + self.dingtalk.send_analysis_error(error_msg, date_str) + + +# 全局通知管理器实例 +notification_manager = NotificationManager() \ No newline at end of file diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 0000000..409c28f --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,66 @@ + +app: + title: "News Crawler API" + description: "API for news crawling and management" + version: "1.0.0" + host: "0.0.0.0" + port: 18080 + debug: false + cors: + allow_origins: ["*"] + allow_credentials: true + allow_methods: ["*"] + allow_headers: ["*"] + +database: + host: "localhost" + user: "root" + password: "123456" + db: "news_crawler" + charset: "utf8mb4" + autocommit: true + + +redis: + host: "localhost" + port: 6379 + db: 0 + password: "" + decode_responses: false + socket_timeout: 5 + socket_connect_timeout: 5 + health_check_interval: 30 + +crawler: + interval: 1800 + timeout: 1700 + max_retry_count: 2 + max_instances: 2 + misfire_grace_time: 300 + +logging: + level: "INFO" + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + dir: "logs" + file: "app.log" + max_size: 10485760 # 10MB + backup_count: 5 + daily_backup_count: 30 + timezone: "Asia/Shanghai" + +notification: + dingtalk: + enabled: false + webhook_url: "" + secret: "" + timeout: 10 + notify_success: false + + +scheduler: + thread_pool_size: 20 + process_pool_size: 5 + coalesce: true + max_instances: 2 + misfire_grace_time: 300 + timezone: "Asia/Shanghai" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8e2c8fb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +requests==2.31.0 +beautifulsoup4==4.9.3 +SQLAlchemy==2.0.23 +pymysql==1.1.0 +apscheduler>=3.8.0 +fastapi>=0.100.0 +uvicorn>=0.23.0 +schedule>=1.1.0 +redis>=3.7.0 
+pytz>=2021.1 +python-telegram-bot==21.3 +urllib3~=2.0.7 +pydantic~=1.10.14 +PyYAML~=6.0.2 +cloudscraper~=1.2.71 +selenium~=4.29.0 +webdriver-manager~=4.0.2 +jieba>=0.42.1 +cryptography==41.0.3 \ No newline at end of file diff --git a/run.py b/run.py new file mode 100644 index 0000000..906f82e --- /dev/null +++ b/run.py @@ -0,0 +1,22 @@ +# run.py +import os +import sys + +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) + +from app.core.config import load_config +load_config() + +from app.main import app + +if __name__ == "__main__": + import uvicorn + from app.core.config import get_app_config + + app_config = get_app_config() + uvicorn.run( + "app.main:app", + host=app_config.host, + port=app_config.port, + reload=app_config.debug + ) diff --git a/test/crawler_test.py b/test/crawler_test.py new file mode 100644 index 0000000..d39a298 --- /dev/null +++ b/test/crawler_test.py @@ -0,0 +1,24 @@ +from datetime import datetime + +import pytz + +timezone = pytz.timezone('Asia/Shanghai') +now_time = datetime.now(timezone) +date_str = now_time.strftime("%Y-%m-%d") + + +class TestCrawler: + + def test_init(self): + pass + + def test_crawler(self): + from app.service.sites import BilibiliCrawler + + crawler = BilibiliCrawler() + crawler.fetch(date_str) + + +if __name__ == '__main__': + test = TestCrawler() + test.test_crawler() diff --git a/test/hackernews_test.py b/test/hackernews_test.py new file mode 100644 index 0000000..343a874 --- /dev/null +++ b/test/hackernews_test.py @@ -0,0 +1,58 @@ +import sys +import os +import json +from datetime import datetime + +# 添加项目根目录到系统路径 +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from app.services.sites.hackernews import HackerNewsCrawler + +def test_hackernews_crawler(): + """测试Hacker News爬虫""" + print("===== 测试 Hacker News 爬虫 =====") + crawler = HackerNewsCrawler() + date_str = datetime.now().strftime('%Y-%m-%d') + + print("1. 
使用requests方式测试:") + result = crawler._fetch_with_requests() + if result and len(result) > 0: + print(f" - 成功获取到 {len(result)} 条新闻") + print(" - 第一条新闻示例:") + print(f" 标题: {result[0]['title']}") + print(f" 链接: {result[0]['url']}") + print(f" 内容: {result[0]['content']}") + else: + print(" - 使用requests方式获取失败") + + print("\n2. 使用浏览器方式测试:") + from app.services.browser_manager import BrowserManager + browser_manager = BrowserManager() + + try: + result = crawler._fetch_with_browser(browser_manager) + if result and len(result) > 0: + print(f" - 成功获取到 {len(result)} 条新闻") + print(" - 第一条新闻示例:") + print(f" 标题: {result[0]['title']}") + print(f" 链接: {result[0]['url']}") + print(f" 内容: {result[0]['content']}") + else: + print(" - 使用浏览器方式获取失败") + except Exception as e: + print(f" - 浏览器测试异常: {str(e)}") + + print("\n3. 测试完整的fetch方法:") + result = crawler.fetch(date_str) + if result and len(result) > 0: + print(f" - 成功获取到 {len(result)} 条新闻") + print(" - 结果示例(前3条):") + for i, news in enumerate(result[:3]): + print(f" [{i+1}] {news['title']}") + else: + print(" - fetch方法获取失败") + + print("\n===== 测试完成 =====") + +if __name__ == "__main__": + test_hackernews_crawler() \ No newline at end of file diff --git a/tg_bot.py b/tg_bot.py new file mode 100644 index 0000000..4ab7280 --- /dev/null +++ b/tg_bot.py @@ -0,0 +1,127 @@ +import asyncio +import logging +import os + +import requests +import telegram.constants +from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup +from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler +from telegram.helpers import escape_markdown + +logging.basicConfig( + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + level=logging.INFO +) + +supported_sites = { + "百度": "baidu", + "少数派": "shaoshupai", + "微博": "weibo", + "知乎": "zhihu", + "36氪": "36kr", + "吾爱破解": "52pojie", + "哔哩哔哩": "bilibili", + "豆瓣": "douban", + "虎扑": "hupu", + "贴吧": "tieba", + "掘金": "juejin", + "抖音": "douyin", + "V2EX": "v2ex", + "今日头条": 
"jinritoutiao" +} + + +async def news(update: Update, context: ContextTypes.DEFAULT_TYPE): + """ + Send a message when the command /news is issued. + """ + + user = update.effective_user + response_message = f"哈喽! {user.first_name} {user.last_name if user.last_name else ''}, 我是一个新闻机器人。 请选择想要了解的站点热榜:\n\n" + + # Create the inline keyboard layout + reply_markup = build_reply_makeup() + + await context.bot.send_message( + chat_id=update.effective_chat.id, + text=response_message, + reply_markup=reply_markup + ) + + +async def selected_news(update: Update, context: ContextTypes.DEFAULT_TYPE): + """ + Return the selected site's news. + """ + query = update.callback_query + await query.answer() + + news_url = f"https://orz.ai/api/v1/dailynews/?platform={supported_sites[query.data]}" + + resp = requests.get(news_url) + + if resp.status_code != 200: + await context.bot.send_message( + chat_id=update.effective_chat.id, + text=f"Failed to get news from {query.data}, please try again later.", + ) + return + + response_message = f"下面是来自*{query.data}*的最新热榜:\n\n" + + news_datas = resp.json()['data'] + for i, news_data in enumerate(news_datas): + if i >= 10: + break + + response_message += f"{i + 1}\. [{escape_markdown(news_data['title'], version=2)}]({escape_markdown(news_data['url'], version=2)})\n\n" + + reply_markup = build_reply_makeup() + + await context.bot.send_message( + chat_id=update.effective_chat.id, + text=response_message, + parse_mode=telegram.constants.ParseMode.MARKDOWN_V2, + reply_markup=reply_markup + ) + + +def build_reply_makeup(): + """ + Build the inline keyboard markup. 
if __name__ == '__main__':
    # Bot token comes from the environment; refuse to start without it.
    token = os.getenv('TG_BOT_TOKEN')
    if not token:
        logging.error("Please set the environment variable 'TG_BOT_TOKEN'")
        exit(1)

    # if you want to use proxy
    # proxy_url = 'http://127.0.0.1:7890'
    # app = ApplicationBuilder().token(token).proxy(proxy_url).get_updates_proxy_url(proxy_url).build()

    app = ApplicationBuilder().token(token).build()

    # /start and /news both show the site picker; button taps are routed
    # to selected_news via the catch-all callback-query handler.
    app.add_handler(CommandHandler(['start', 'news'], news))
    app.add_handler(telegram.ext.CallbackQueryHandler(selected_news))

    # NOTE(review): python-telegram-bot v21 run_polling() manages its own
    # event loop — creating/setting one here looks redundant; confirm
    # before removing.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    # use polling mode
    app.run_polling()