commit e0af97ac7f
2026-03-26 15:04:59 +08:00
65 changed files with 7366 additions and 0 deletions

app/services/__init__.py

@@ -0,0 +1,43 @@
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.executors.pool import ThreadPoolExecutor, ProcessPoolExecutor
from apscheduler.jobstores.memory import MemoryJobStore
import pytz
from app.services.sites.factory import CrawlerRegister
from app.utils.logger import log
from app.core.config import get_scheduler_config
# Create the crawler factory
crawler_factory = CrawlerRegister().register()
# Load the scheduler configuration
scheduler_config = get_scheduler_config()
# Configure the scheduler
jobstores = {
'default': MemoryJobStore()
}
executors = {
'default': ThreadPoolExecutor(scheduler_config.thread_pool_size),
'processpool': ProcessPoolExecutor(scheduler_config.process_pool_size)
}
job_defaults = {
'coalesce': scheduler_config.coalesce,
'max_instances': scheduler_config.max_instances,
'misfire_grace_time': scheduler_config.misfire_grace_time,
}
# Create and configure the scheduler
_scheduler = BackgroundScheduler(
jobstores=jobstores,
executors=executors,
job_defaults=job_defaults,
timezone=pytz.timezone(scheduler_config.timezone)
)
# Start the scheduler
_scheduler.start()
log.info(f"Scheduler started with timezone: {scheduler_config.timezone}")

app/services/browser_manager.py

@@ -0,0 +1,121 @@
import threading
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from app.utils.logger import log
class BrowserManager:
"""浏览器管理器提供共享的Chrome浏览器实例"""
_instance = None
_lock = threading.Lock()
_driver = None
_driver_path = None
_last_activity = 0
    _max_idle_time = 1800  # Maximum idle time, default 30 minutes
def __new__(cls, *args, **kwargs):
"""单例模式实现"""
if cls._instance is None:
with cls._lock:
if cls._instance is None:
cls._instance = super(BrowserManager, cls).__new__(cls)
cls._instance._init_driver_path()
cls._instance._start_idle_monitor()
return cls._instance
    def _init_driver_path(self):
        """Initialize the ChromeDriver path."""
        try:
            self._driver_path = ChromeDriverManager().install()
            log.info(f"ChromeDriver installed: {self._driver_path}")
        except Exception as e:
            log.error(f"ChromeDriver installation failed: {str(e)}")
            raise
    def _start_idle_monitor(self):
        """Start the idle-monitor thread."""
        def monitor():
            while True:
                time.sleep(60)  # Check once per minute
                try:
                    with self._lock:
                        if self._driver is not None:
                            current_time = time.time()
                            if current_time - self._last_activity > self._max_idle_time:
                                log.info(f"Browser idle for more than {self._max_idle_time}s, releasing resources")
                                self._quit_driver()
                except Exception as e:
                    log.error(f"Browser monitor thread error: {str(e)}")
        monitor_thread = threading.Thread(target=monitor, daemon=True)
        monitor_thread.start()
        log.info("Browser idle monitor thread started")
def get_driver(self):
"""获取Chrome浏览器实例"""
with self._lock:
self._last_activity = time.time()
if self._driver is None:
self._create_driver()
return self._driver
    def _create_driver(self):
        """Create a new Chrome browser instance."""
        log.info("Creating a new Chrome browser instance")
        options = webdriver.ChromeOptions()
        # Basic configuration (headless mode)
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        # Memory optimization flags
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-application-cache")
        options.add_argument("--js-flags=--expose-gc")
        options.add_argument("--memory-pressure-off")
        options.add_argument("--disable-default-apps")
        # Log level
        options.add_argument("--log-level=3")
self._driver = webdriver.Chrome(
service=Service(self._driver_path),
options=options
)
self._driver.set_page_load_timeout(30)
    def _quit_driver(self):
        """Quit the browser instance."""
        if self._driver:
            try:
                self._driver.quit()
                log.info("Browser instance closed")
            except Exception as e:
                log.error(f"Error closing browser instance: {str(e)}")
            finally:
                self._driver = None
def release_driver(self):
"""使用完毕后标记为活动状态"""
with self._lock:
self._last_activity = time.time()
    def get_page_content(self, url, wait_time=5):
        """Fetch the page source for a URL, handling the browser automatically."""
        driver = self.get_driver()
        try:
            driver.get(url)
            time.sleep(wait_time)  # Wait for the page to load
            page_source = driver.page_source
            self.release_driver()
            return page_source, driver
        except Exception as e:
            log.error(f"Failed to fetch page content: {str(e)}")
            self.release_driver()
            raise
def shutdown(self):
"""关闭浏览器管理器"""
with self._lock:
self._quit_driver()
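A short usage sketch of the singleton above, assuming the module path app/services/browser_manager.py implied by the imports in app/services/sites/; the URL is a placeholder:

from app.services.browser_manager import BrowserManager

manager = BrowserManager()                  # A second call returns the same instance
page_source, driver = manager.get_page_content("https://example.com", wait_time=3)
print(len(page_source))                     # Rendered HTML of the page
manager.shutdown()                          # Explicitly release Chrome when finished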

app/services/crawler.py

@@ -0,0 +1,240 @@
import time
import traceback
import threading
from datetime import datetime
from functools import wraps
import pytz
from typing import List, Dict, Any, Callable
from app.services import crawler_factory, _scheduler
from app.utils.logger import log
from app.core import db, cache
from app.core.config import get_crawler_config
from app.utils.notification import notification_manager
# Load the crawler configuration
crawler_config = get_crawler_config()
# Configuration constants
CRAWLER_INTERVAL = crawler_config.interval
CRAWLER_TIMEOUT = crawler_config.timeout
MAX_RETRY_COUNT = crawler_config.max_retry_count
SHANGHAI_TZ = pytz.timezone('Asia/Shanghai')
class CrawlerTimeoutError(Exception):
"""爬虫超时异常"""
pass
def timeout_handler(func: Callable, timeout: int = CRAWLER_TIMEOUT) -> Callable:
"""超时处理装饰器支持Unix信号和线程两种实现"""
@wraps(func)
def wrapper(*args, **kwargs):
        # Thread-based timeout mechanism
result = [None]
exception = [None]
completed = [False]
def target():
try:
result[0] = func(*args, **kwargs)
except Exception as e:
exception[0] = e
finally:
completed[0] = True
thread = threading.Thread(target=target)
thread.daemon = True
thread.start()
thread.join(timeout)
if not completed[0]:
error_msg = f"Function {func.__name__} timed out after {timeout} seconds"
log.error(error_msg)
raise CrawlerTimeoutError(error_msg)
if exception[0]:
log.error(f"Function {func.__name__} raised an exception: {exception[0]}")
raise exception[0]
return result[0]
return wrapper
def safe_fetch(crawler_name: str, crawler, date_str: str, is_retry: bool = False) -> List[Dict[str, Any]]:
"""安全地执行爬虫抓取,处理异常并返回结果"""
try:
news_list = crawler.fetch(date_str)
        if news_list:
cache_key = f"crawler:{crawler_name}:{date_str}"
cache.set_cache(key=cache_key, value=news_list, expire=0)
log.info(f"{crawler_name} fetch success, {len(news_list)} news fetched")
return news_list
else:
log.info(f"{'Second time ' if is_retry else ''}crawler {crawler_name} failed. 0 news fetched")
return []
except Exception as e:
error_msg = traceback.format_exc()
log.error(f"{'Second time ' if is_retry else ''}crawler {crawler_name} error: {error_msg}")
        # Send a DingTalk notification
try:
notification_manager.notify_crawler_error(
crawler_name=crawler_name,
error_msg=str(e),
date_str=date_str,
is_retry=is_retry
)
except Exception as notify_error:
log.error(f"Failed to send notification for crawler {crawler_name}: {notify_error}")
return []
def run_data_analysis(date_str: str):
"""执行数据分析并缓存结果"""
log.info(f"Starting data analysis for date {date_str}")
try:
# 导入分析模块(在这里导入避免循环依赖)
from app.analysis.trend_analyzer import TrendAnalyzer
from app.analysis.predictor import TrendPredictor
# 创建分析器实例
analyzer = TrendAnalyzer()
predictor = TrendPredictor()
        # 1. Generate and cache keyword cloud data
        log.info("Generating keyword cloud data...")
        analyzer.get_keyword_cloud(date_str, refresh=True)
        # 2. Generate and cache hot-topic aggregation data
        log.info("Generating trend analysis data...")
        analyzer.get_analysis(date_str, analysis_type="main")
        # 3. Generate and cache cross-platform hot-topic data
        log.info("Generating cross-platform analysis data...")
        analyzer.get_cross_platform_analysis(date_str, refresh=True)
        # 4. Generate and cache trend prediction data
        log.info("Generating trend prediction data...")
        predictor.get_prediction(date_str)
        # 5. Generate and cache platform comparison data
        log.info("Generating platform comparison data...")
        analyzer.get_platform_comparison(date_str)
        # 6. Generate and cache advanced analysis data
        log.info("Generating advanced analysis data...")
        analyzer.get_advanced_analysis(date_str, refresh=True)
        # 7. Generate and cache data visualization data
        log.info("Generating data visualization analysis...")
        analyzer.get_data_visualization(date_str, refresh=True)
        # 8. Generate and cache trend forecast data
        log.info("Generating trend forecast data...")
        analyzer.get_trend_forecast(date_str, refresh=True)
log.info(f"All data analysis completed for date {date_str}")
except Exception as e:
error_msg = traceback.format_exc()
log.error(f"Error during data analysis: {str(e)}")
log.error(error_msg)
        # Send an analysis-error notification
try:
notification_manager.notify_analysis_error(
error_msg=str(e),
date_str=date_str
)
except Exception as notify_error:
log.error(f"Failed to send analysis error notification: {notify_error}")
@_scheduler.scheduled_job('interval', id='crawlers_logic', seconds=CRAWLER_INTERVAL,
max_instances=crawler_config.max_instances,
misfire_grace_time=crawler_config.misfire_grace_time)
def crawlers_logic():
    """Main crawler job with timeout protection and error handling."""
    now_time = datetime.now(SHANGHAI_TZ)
    date_str = now_time.strftime("%Y-%m-%d")  # Shared with the exception handlers below
    @timeout_handler
    def crawler_work():
        log.info(f"Starting crawler job at {now_time.strftime('%Y-%m-%d %H:%M:%S')}")
retry_crawler = []
success_count = 0
failed_crawlers = []
for crawler_name, crawler in crawler_factory.items():
news_list = safe_fetch(crawler_name, crawler, date_str)
if news_list:
success_count += 1
else:
retry_crawler.append(crawler_name)
failed_crawlers.append(crawler_name)
        # Second pass: retry the failed crawlers
if retry_crawler:
log.info(f"Retrying {len(retry_crawler)} failed crawlers")
retry_failed = []
for crawler_name in retry_crawler:
news_list = safe_fetch(crawler_name, crawler_factory[crawler_name], date_str, is_retry=True)
if news_list:
success_count += 1
                    # Remove recovered crawlers from the failed list
if crawler_name in failed_crawlers:
failed_crawlers.remove(crawler_name)
else:
retry_failed.append(crawler_name)
        # Record the completion time
end_time = datetime.now(SHANGHAI_TZ)
duration = (end_time - now_time).total_seconds()
log.info(f"Crawler job finished at {end_time.strftime('%Y-%m-%d %H:%M:%S')}, "
f"duration: {duration:.2f}s, success: {success_count}/{len(crawler_factory)}")
        # Send a summary notification
try:
notification_manager.notify_crawler_summary(
success_count=success_count,
total_count=len(crawler_factory),
failed_crawlers=failed_crawlers,
duration=duration,
date_str=date_str
)
except Exception as notify_error:
log.error(f"Failed to send crawler notification: {notify_error}")
        # Run data analysis once crawling completes
        log.info("Crawler job completed, starting data analysis...")
        # Run the analysis in a separate thread to avoid blocking the scheduler
threading.Thread(target=run_data_analysis, args=(date_str,), daemon=True).start()
return success_count
try:
return crawler_work()
except CrawlerTimeoutError as e:
log.error(f"Crawler job timeout: {str(e)}")
        # Send a timeout notification
try:
notification_manager.notify_crawler_timeout(
timeout_seconds=CRAWLER_TIMEOUT,
date_str=date_str
)
except Exception as notify_error:
log.error(f"Failed to send timeout notification: {notify_error}")
return 0
except Exception as e:
log.error(f"Crawler job error: {str(e)}")
log.error(traceback.format_exc())
        # Send a generic error notification
try:
notification_manager.notify_crawler_error(
crawler_name="crawler_job",
error_msg=str(e),
date_str=date_str
)
except Exception as notify_error:
log.error(f"Failed to send error notification: {notify_error}")
return 0
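A minimal sketch of the watchdog-thread timeout pattern that timeout_handler implements; slow_task and its one-second sleep are illustrative only:

@timeout_handler                            # Applies the default CRAWLER_TIMEOUT
def slow_task():
    time.sleep(1)                           # Stand-in for real crawler work
    return "done"

print(slow_task())                          # "done" if it finishes in time; otherwise the
                                            # wrapper raises CrawlerTimeoutError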


app/services/sites/baidu.py

@@ -0,0 +1,99 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class BaiduNewsCrawler(Crawler):
    # Returns the news list
    def fetch(self, date_str) -> list:
        # Current timestamp
current_time = datetime.datetime.now()
url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"
        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
json_data = resp.json()
contents = json_data.get("data")["cards"][0]["content"][0]["content"]
result = []
cache_list = []
for content in contents:
title = content.get("word")
url = content.get("url")
desc = content.get("desc")
score = content.get("hotScore")
            # Normalize mobile URLs to the desktop host
            url = url.replace("m.", "www.")
news = {
'title': title,
'url': url,
'content': desc,
'source': 'baidu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')  # Formatted time string
}
result.append(news)
            cache_list.append(news)  # Append the dict; json.dumps serializes the whole list below
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "baidu"
@staticmethod
def fetch_v0():
        # Current timestamp
current_time = datetime.datetime.now()
url = "https://top.baidu.com/board?tab=realtime"
        proxies = {
            # "http": "http://127.0.0.1:7890",
            # "https": "http://127.0.0.1:7890"
        }
        header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "upgrade-insecure-requests": "1",
            "host": "www.baidu.com",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36"
        }
        html = requests.get(url=url, headers=header, verify=False, proxies=proxies)
html.encoding = "utf-8"
html_text = html.text
soup = BeautifulSoup(html_text, "html.parser")
main_content = soup.find_all("main")[0]
news_main_content = main_content.find("div", style='margin-bottom:20px')
div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')
result = []
for div_element in div_elements:
hot_index = div_element.find(class_='hot-index_1Bl1a').text.strip()
news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
news_link = div_element.find('a', class_='title_dIF3B')['href']
news = {
'title': news_title,
'url': news_link,
'content': "",
'source': 'baidu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')  # Formatted time string
}
result.append(news)
return result
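For reference, a sketch of the shared per-day cache layout: each crawler writes one JSON array into a hash keyed by the date, with the crawler name as the field, so a reader round-trips it with json.loads. The sample entry is hypothetical:

import json

stored = json.dumps([{
    'title': 'Example headline',                  # Hypothetical entry
    'url': 'https://www.baidu.com/s?wd=example',
    'content': 'short description',
    'source': 'baidu',
    'publish_time': '2026-03-26 15:04:59',
}], ensure_ascii=False)
# cache.hset(date_str, "baidu", stored) persists this; reading it back:
news_list = json.loads(stored)
print(news_list[0]['source'])                     # -> baidu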

app/services/sites/bilibili.py

@@ -0,0 +1,64 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class BilibiliCrawler(Crawler):
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://api.bilibili.com/x/web-interface/popular"
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://www.bilibili.com/",
        }
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
if data["code"] != 0:
print(f"API error: {data['message']}")
return []
result = []
cache_list = []
for item in data["data"].get("list", []):
title = item.get("title", "")
bvid = item.get("bvid", "")
desc = item.get("desc", "")
video_url = f"https://www.bilibili.com/video/{bvid}"
news = {
'title': title,
'url': video_url,
'content': desc,
'source': 'bilibili',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "bilibili"

app/services/sites/cls.py

@@ -0,0 +1,100 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
urllib3.disable_warnings()
class CLSCrawler(Crawler):
"""财联社"""
def fetch(self, date_str) -> list:
current_time = datetime.datetime.now()
try:
params = {
'app': 'CailianpressWeb',
'os': 'web',
'sv': '8.4.6',
'sign': '9f8797a1f4de66c2370f7a03990d2737'
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'https://www.cls.cn/',
'Origin': 'https://www.cls.cn'
}
response = requests.get(
"https://www.cls.cn/featured/v1/column/list",
params=params,
headers=headers,
timeout=self.timeout,
verify=False
)
response.raise_for_status()
data = response.json()
if data.get('errno') != 0:
return []
column_list = data.get('data', {}).get('column_list', [])
result = []
cache_list = []
for idx, column in enumerate(column_list[:20]):
try:
title = column.get('title', '').strip()
if not title or len(title) < 2:
continue
article_list = column.get('article_list', {})
if article_list:
article_title = article_list.get('title', '').strip()
jump_url = article_list.get('jump_url', '').strip()
brief = article_list.get('brief', '').strip()
if article_title:
display_title = f"[{title}] {article_title}"
content = brief if brief else article_title
url = "https://www.cls.cn/telegraph"
else:
display_title = title
content = column.get('brief', '').strip()
url = f"https://www.cls.cn/telegraph"
else:
display_title = title
content = column.get('brief', '').strip()
url = f"https://www.cls.cn/telegraph"
news = {
'title': display_title,
'url': url,
'content': content,
'source': 'cls',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
'score': 1000 - idx,
'rank': idx + 1
}
result.append(news)
cache_list.append(news)
except Exception:
continue
if cache_list:
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
        except Exception:
            return []
def crawler_name(self):
return "cls"

app/services/sites/crawler.py

@@ -0,0 +1,23 @@
from abc import ABC, abstractmethod
from typing import List, Dict, Any
class Crawler(ABC):
def __init__(self):
self.header = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
"*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/86.0.4240.183 Safari/537.36"
}
self.timeout = 10
@abstractmethod
def fetch(self, date_str: str) -> List[Dict[str, Any]]:
"""获取新闻列表"""
pass
@abstractmethod
def crawler_name(self) -> str:
"""获取爬虫名称"""
pass
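A minimal sketch of implementing this interface; ExampleCrawler and its data are hypothetical:

import datetime
from typing import Any, Dict, List

class ExampleCrawler(Crawler):
    """Hypothetical crawler illustrating the required interface."""
    def fetch(self, date_str: str) -> List[Dict[str, Any]]:
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # A real implementation would request a site using self.header and
        # self.timeout, then build dicts in the shape the other crawlers use
        return [{
            'title': 'example', 'url': 'https://example.com',
            'content': '', 'source': self.crawler_name(),
            'publish_time': now,
        }]
    def crawler_name(self) -> str:
        return "example"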

app/services/sites/douban.py

@@ -0,0 +1,79 @@
import json
import re
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class DouBanCrawler(Crawler):
"""豆瓣网"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://www.douban.com/group/explore"
header = self.header.copy()
header.update({
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-encoding": "",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
"host": "www.douban.com",
"referer": "https://www.douban.com/group/explore",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
})
resp = requests.get(url=url, headers=header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
html_text = resp.text
soup = BeautifulSoup(html_text, "html.parser")
topic_list = soup.find_all('div', class_='channel-item')
result = []
cache_list = []
for topic in topic_list:
title_elem = topic.find('h3')
if not title_elem:
continue
link_elem = title_elem.find('a')
if not link_elem:
continue
title = link_elem.text.strip()
url = link_elem.get('href')
desc_elem = topic.find('div', class_='content')
desc = desc_elem.text.strip() if desc_elem else ""
news = {
'title': title,
'url': url,
'content': desc,
'source': 'douban',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "douban"

app/services/sites/douyin.py

@@ -0,0 +1,111 @@
import json
import datetime
import time
import requests
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
class DouYinCrawler(Crawler):
def fetch(self, date_str):
return self.fetch_v2(date_str)
def fetch_v1(self, date_str):
current_time = datetime.datetime.now()
url = "https://www.douyin.com/hot"
browser_manager = BrowserManager()
        try:
            # Fetch the page content via the browser manager
            page_source, driver = browser_manager.get_page_content(url, wait_time=5)
            result = []
            cache_list = []
            # Hot-list items (li tags containing /video/ links)
            items = driver.find_elements(By.XPATH, '//li[a[contains(@href, "/video/")]]')
            for item in items:
                try:
                    # Extract the title (contains a "#" tag or longer text)
                    title_elem = item.find_element(By.XPATH, './/div[contains(text(), "#") or string-length(text()) > 10]')
                    # Extract the link
                    link_elem = item.find_element(By.XPATH, './/a[contains(@href, "/video/")]')
                    # Extract the hot score (values ending in 万 or 亿)
                    hot_elem = item.find_element(By.XPATH, './/span[contains(text(), "万") or contains(text(), "亿")]')
                    title = title_elem.text.strip()
                    item_url = link_elem.get_attribute("href")  # Selenium already resolves this to an absolute URL
hot = hot_elem.text.strip()
news = {
'title': title,
'url': item_url,
'content': f"热度: {hot}",
'source': 'douyin',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
except Exception:
                    continue  # Skip invalid items
            # Cache and return
if cache_list:
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
return []
def fetch_v2(self, date_str):
current_time = datetime.datetime.now()
url = "https://www.douyin.com/aweme/v1/web/hot/search/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&detail_list=1&source=6&pc_client_type=1&pc_libra_divert=Windows&support_h265=1&support_dash=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=136.0.0.0&browser_online=true&engine_name=Blink&engine_version=136.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7490997798633555467"
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://www.douyin.com/",
        }
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
# https://www.douyin.com/hot/2094286?&trending_topic=%E5%A4%8F%E5%A4%A9%E7%9A%84%E5%91%B3%E9%81%93%E5%9C%A8%E6%8A%96%E9%9F%B3&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue=11892557
result = []
cache_list = []
for item in data["data"]["word_list"]:
title = item["word"]
url = f"https://www.douyin.com/hot/{item['sentence_id']}?&trending_topic={item['word']}&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue={item['hot_value']}"
news = {
'title': title,
'url': url,
'content': title,
'source': 'douyin',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "douyin"

app/services/sites/eastmoney.py

@@ -0,0 +1,88 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
urllib3.disable_warnings()
class EastMoneyCrawler(Crawler):
"""东方财富网"""
def fetch(self, date_str) -> list:
current_time = datetime.datetime.now()
try:
params = {
'client': 'web',
'biz': 'web_724',
'fastColumn': '102',
'sortEnd': '',
'pageSize': '50',
                'req_trace': str(int(current_time.timestamp() * 1000))  # Current timestamp in ms
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'https://kuaixun.eastmoney.com/',
'Origin': 'https://kuaixun.eastmoney.com'
}
response = requests.get(
"https://np-weblist.eastmoney.com/comm/web/getFastNewsList",
params=params,
headers=headers,
timeout=self.timeout,
verify=False
)
response.raise_for_status()
data = response.json()
if data.get('code') != '1':
return []
fast_news_list = data.get('data', {}).get('fastNewsList', [])
result = []
cache_list = []
            for idx, news_item in enumerate(fast_news_list[:20]):  # First 20 items
try:
title = news_item.get('title', '').strip()
if not title:
continue
summary = news_item.get('summary', '').strip()
show_time = news_item.get('showTime', '').strip()
code = news_item.get('code', '').strip()
url = f"https://finance.eastmoney.com/a/{code}" if code else "https://kuaixun.eastmoney.com/"
news = {
'title': title,
'url': url,
'content': summary,
'source': 'eastmoney',
'publish_time': show_time if show_time else current_time.strftime('%Y-%m-%d %H:%M:%S'),
'score': 1000 - idx,
'rank': idx + 1
}
result.append(news)
cache_list.append(news)
except Exception:
continue
if cache_list:
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
        except Exception:
            return []
def crawler_name(self):
return "eastmoney"

app/services/sites/factory.py

@@ -0,0 +1,64 @@
from typing import Dict, Type
from .baidu import BaiduNewsCrawler
from .bilibili import BilibiliCrawler
from .crawler import Crawler
from .douban import DouBanCrawler
from .douyin import DouYinCrawler
from .ftpojie import FtPoJieCrawler
from .github import GithubCrawler
from .hackernews import HackerNewsCrawler
from .hupu import HuPuCrawler
from .jinritoutiao import JinRiTouTiaoCrawler
from .juejin import JueJinCrawler
from .sspai import ShaoShuPaiCrawler
from .stackoverflow import StackOverflowCrawler
from .tenxunwang import TenXunWangCrawler
from .tieba import TieBaCrawler
from .tskr import TsKrCrawler
from .vtex import VtexCrawler
from .weibo import WeiboCrawler
from .weixin import WeiXinCrawler
from .zhihu import ZhiHuCrawler
from .sina_finance import SinaFinanceCrawler
from .eastmoney import EastMoneyCrawler
from .xueqiu import XueqiuCrawler
from .cls import CLSCrawler
class CrawlerRegister:
def __init__(self):
self.crawlers = {}
def register(self) -> Dict[str, Crawler]:
"""注册所有爬虫"""
crawler_map = {
"baidu": BaiduNewsCrawler(),
"shaoshupai": ShaoShuPaiCrawler(),
"weibo": WeiboCrawler(),
"zhihu": ZhiHuCrawler(),
"36kr": TsKrCrawler(),
"52pojie": FtPoJieCrawler(),
"bilibili": BilibiliCrawler(),
"douban": DouBanCrawler(),
"hupu": HuPuCrawler(),
"tieba": TieBaCrawler(),
"juejin": JueJinCrawler(),
"douyin": DouYinCrawler(),
"v2ex": VtexCrawler(),
"jinritoutiao": JinRiTouTiaoCrawler(),
"tenxunwang": TenXunWangCrawler(),
"stackoverflow": StackOverflowCrawler(),
"github": GithubCrawler(),
"hackernews": HackerNewsCrawler(),
"sina_finance": SinaFinanceCrawler(),
"eastmoney": EastMoneyCrawler(),
"xueqiu": XueqiuCrawler(),
"cls": CLSCrawler(),
}
self.crawlers = crawler_map
return self.crawlers
def get_crawlers(self):
return self.register().values()
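A short sketch of consuming the registry, mirroring how app/services/__init__.py builds crawler_factory and crawler.py iterates it:

from app.services.sites.factory import CrawlerRegister

crawlers = CrawlerRegister().register()           # name -> Crawler instance
for name, crawler in crawlers.items():
    print(name, crawler.crawler_name())           # Registry keys match crawler_name()
    # news = crawler.fetch("2026-03-26")          # Each fetch returns a list of news dicts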

app/services/sites/ftpojie.py

@@ -0,0 +1,69 @@
import json
import datetime
import re
import requests
import urllib3
from bs4 import BeautifulSoup
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
urllib3.disable_warnings()
class FtPoJieCrawler(Crawler):
"""吾爱破解"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://www.52pojie.cn/forum.php?mod=guide&view=hot"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
        resp.encoding = 'gbk'  # 52pojie uses GBK encoding
html_text = resp.text
soup = BeautifulSoup(html_text, "html.parser")
        # Locate the list of hot threads
hot_threads = soup.find_all('tbody', id=lambda x: x and x.startswith('normalthread_'))
result = []
cache_list = []
for thread in hot_threads:
title_elem = thread.find('a', class_='xst')
if not title_elem:
continue
title = title_elem.text.strip()
url = "https://www.52pojie.cn/" + title_elem.get('href')
            # Thread metadata
info_elem = thread.find('td', class_='by')
info = info_elem.text.strip() if info_elem else ""
news = {
'title': title,
'url': url,
'content': info,
'source': '52pojie',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "52pojie"

app/services/sites/github.py

@@ -0,0 +1,58 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class GithubCrawler(Crawler):
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://api.github.com/search/repositories?q=stars:%3E1&sort=stars"
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://github.com/",
        }
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
result = []
cache_list = []
for i, item in enumerate(data["items"]):
title = item.get("full_name", "")
url = item.get("html_url", "")
desc = item.get("description", "")
news = {
'title': title,
'url': url,
'content': desc,
'source': self.crawler_name(),
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "github"

app/services/sites/hackernews.py

@@ -0,0 +1,235 @@
import json
import datetime
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
# Disable SSL warnings
urllib3.disable_warnings()
class HackerNewsCrawler(Crawler):
"""hacker news"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
        try:
            # Try a direct HTTP request first
            result = self._fetch_with_requests()
            if result and len(result) > 0:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result
            # If the direct request fails, fall back to browser automation
            browser_manager = BrowserManager()
            result = self._fetch_with_browser(browser_manager)
            if result and len(result) > 0:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result
        except Exception:
            # On error, return an empty list
            return []
        # All methods failed; return an empty list
        return []
def _fetch_with_requests(self):
"""使用requests直接获取Hacker News内容"""
url = "https://news.ycombinator.com/"
        try:
            # Send the HTTP request
            response = requests.get(url, headers=self.header, timeout=self.timeout)
            if response.status_code != 200:
                return []
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # All story rows
            items = soup.select("tr.athing")
for item in items:
                try:
                    # Item ID, used to link to comments and metadata
                    item_id = item.get('id')
                    if not item_id:
                        continue
                    # Title and link
                    title_element = item.select_one(".titleline a")
                    if not title_element:
                        continue
                    title = title_element.text.strip()
                    url = title_element.get('href')
                    # Convert relative URLs to absolute ones
                    if url and not url.startswith('http'):
                        url = f"https://news.ycombinator.com/{url}"
                    # Source site
                    site_element = item.select_one(".sitestr")
                    site = site_element.text.strip() if site_element else ""
                    # The next tr holds the metadata (score, user, time, ...)
                    metadata = item.find_next_sibling('tr')
                    if not metadata:
                        continue
                    # Score
                    score_element = metadata.select_one(".score")
                    score = score_element.text.strip() if score_element else "0 points"
                    # Author
                    user_element = metadata.select_one(".hnuser")
                    user = user_element.text.strip() if user_element else "unknown"
                    # Comment count
                    comments_element = metadata.select_one("a:last-child")
                    comments = comments_element.text.strip() if comments_element else "0 comments"
                    if "discuss" in comments:
                        comments = "0 comments"
                    # Build the content summary
                    content = f"Source: {site} | Score: {score} | Author: {user} | Comments: {comments}"
news = {
'title': title,
'url': url,
'content': content,
'source': 'hackernews',
'publish_time': current_time
}
result.append(news)
                    # Limit to the first 30 items
if len(result) >= 30:
break
                except Exception:
                    continue
            return result
        except Exception:
            return []
def _fetch_with_browser(self, browser_manager):
"""使用浏览器模拟方式获取Hacker News内容"""
url = "https://news.ycombinator.com/"
        try:
            # Fetch the page content
            page_source, driver = browser_manager.get_page_content(url, wait_time=5)
            # Wait for the page elements to load
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".athing"))
                )
            except Exception:
                # If the wait times out, still try to read the content
                pass
            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # All story rows
            items = driver.find_elements(By.CSS_SELECTOR, "tr.athing")
for item in items:
                try:
                    # Item ID, used to link to comments and metadata
                    item_id = item.get_attribute("id")
                    if not item_id:
                        continue
                    # Title and link
                    title_element = item.find_element(By.CSS_SELECTOR, ".titleline a")
                    title = title_element.text.strip()
                    url = title_element.get_attribute("href")
                    # Source site
                    site = ""
                    try:
                        site_element = item.find_element(By.CSS_SELECTOR, ".sitestr")
                        site = site_element.text.strip()
                    except Exception:
                        pass
                    # The next tr holds the metadata (score, user, time, ...)
                    try:
                        metadata = driver.find_element(By.XPATH, f"//tr[@id='{item_id}']/following-sibling::tr[1]")
                        # Score
                        score = "0 points"
                        try:
                            score_element = metadata.find_element(By.CSS_SELECTOR, ".score")
                            score = score_element.text.strip()
                        except Exception:
                            pass
                        # Author
                        user = "unknown"
                        try:
                            user_element = metadata.find_element(By.CSS_SELECTOR, ".hnuser")
                            user = user_element.text.strip()
                        except Exception:
                            pass
                        # Comment count
                        comments = "0 comments"
                        try:
                            comments_element = metadata.find_element(By.XPATH, ".//a[last()]")
                            comments = comments_element.text.strip()
                            if "discuss" in comments:
                                comments = "0 comments"
                        except Exception:
                            pass
                        # Build the content summary
                        content = f"Source: {site} | Score: {score} | Author: {user} | Comments: {comments}"
                    except Exception:
                        content = f"Source: {site}"
news = {
'title': title,
'url': url,
'content': content,
'source': 'hackernews',
'publish_time': current_time
}
result.append(news)
                    # Limit to the first 30 items
if len(result) >= 30:
break
                except Exception:
                    continue
            return result
        except Exception:
            return []
def crawler_name(self):
return "hackernews"

app/services/sites/hupu.py

@@ -0,0 +1,72 @@
import json
import datetime
import re
import requests
import urllib3
from bs4 import BeautifulSoup
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
urllib3.disable_warnings()
class HuPuCrawler(Crawler):
"""虎扑"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://bbs.hupu.com/all-gambia"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
html_text = resp.text
soup = BeautifulSoup(html_text, "html.parser")
        # Locate the list of hot posts
post_list = soup.find_all('div', class_='t-info')
result = []
cache_list = []
for post in post_list:
title_elem = post.find('span', class_='t-title')
if not title_elem:
continue
link_elem = post.find('a')
if not link_elem:
continue
title = title_elem.text.strip()
url = "https://bbs.hupu.com" + link_elem.get('href') if link_elem.get('href').startswith('/') else link_elem.get('href')
            # Post metadata
info_elem = post.find('span', class_='t-replies')
info = info_elem.text.strip() if info_elem else ""
news = {
'title': title,
'url': url,
'content': info,
'source': 'hupu',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "hupu"

app/services/sites/jinritoutiao.py

@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class JinRiTouTiaoCrawler(Crawler):
""" 今日头条 """
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', [])
result = []
cache_list = []
for item in data:
title = item.get('Title', '')
url = item.get('Url', '')
hot_value = item.get('HotValue', '')
news = {
'title': title,
'url': url,
'content': f"热度: {hot_value}",
'source': 'jinritoutiao',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "jinritoutiao"

app/services/sites/juejin.py

@@ -0,0 +1,63 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class JueJinCrawler(Crawler):
"""掘金"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://api.juejin.cn/content_api/v1/content/article_rank?category_id=1&type=hot"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', [])
result = []
cache_list = []
for item in data:
article_info = item.get('content', {})
title = article_info.get('title', '')
article_id = article_info.get('content_id', '')
url = f"https://juejin.cn/post/{article_id}"
news = {
'title': title,
'url': url,
'content': title,
'source': 'juejin',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "juejin"


@@ -0,0 +1,20 @@
import datetime
from sqlalchemy import Column, String, Integer, DateTime
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class DailyNews(Base):
__tablename__ = 'tab_daily_news'
id = Column(Integer, primary_key=True)
title = Column(String(255))
desc = Column(String(255))
link = Column(String(255))
type = Column(Integer, default=0)
score = Column(Integer, default=0)
times = Column(Integer, default=0)
create_time = Column(DateTime, default=datetime.datetime.now)
update_time = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
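A hedged sketch of persisting a row with this model; the in-memory SQLite URL is a stand-in, since the real MySQL connection is configured elsewhere in the app:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine("sqlite:///:memory:")      # Stand-in for the real MySQL URL
Base.metadata.create_all(engine)                  # Creates tab_daily_news

Session = sessionmaker(bind=engine)
session = Session()
session.add(DailyNews(title="example", desc="demo row", link="https://example.com"))
session.commit()                                  # create_time/update_time default to now()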

app/services/sites/sina_finance.py

@@ -0,0 +1,75 @@
import json
import datetime
import requests
import urllib3
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class SinaFinanceCrawler(Crawler):
"""新浪财经"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'https://finance.sina.com.cn/',
'Origin': 'https://finance.sina.com.cn'
}
response = requests.get(
"https://zhibo.sina.com.cn/api/zhibo/feed?page=1&page_size=20&zhibo_id=152&tag_id=0&dire=f&dpc=1&pagesize=20",
headers=headers,
timeout=self.timeout,
verify=False
)
response.raise_for_status()
data = response.json()
if data.get('result', {}).get('status', {}).get('code') != 0:
return []
feed_list = data.get('result', {}).get('data', {}).get('feed', {}).get('list', [])
result = []
cache_list = []
for item in feed_list:
try:
title = item.get('rich_text', '').strip()
if not title:
continue
ext_str = item.get('ext', '{}')
try:
ext_data = json.loads(ext_str)
doc_url = ext_data.get('docurl', '')
                    except Exception:
doc_url = item.get('docurl', '').strip(' "')
news = {
'title': title,
'url': doc_url,
'content': title,
'source': 'sina_finance',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
except Exception:
continue
if cache_list:
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
return []
def crawler_name(self):
return "sina_finance"

app/services/sites/sspai.py

@@ -0,0 +1,60 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class ShaoShuPaiCrawler(Crawler):
"""少数派"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://sspai.com/api/v1/article/index/page/get?limit=20&offset=0&created_at=0"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', [])
result = []
cache_list = []
for item in data:
title = item.get('title', '')
article_id = item.get('id', '')
url = f"https://sspai.com/post/{article_id}"
summary = item.get('summary', '')
news = {
'title': title,
'url': url,
'content': summary,
'source': 'sspai',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "shaoshupai"

app/services/sites/stackoverflow.py

@@ -0,0 +1,58 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class StackOverflowCrawler(Crawler):
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://api.stackexchange.com/2.3/questions?order=desc&sort=hot&site=stackoverflow"
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://stackoverflow.com/",
        }
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
result = []
cache_list = []
for i, item in enumerate(data["items"]):
title = item.get("title", "")
url = item.get("link", "")
desc = item.get("title", "")
news = {
'title': title,
'url': url,
'content': desc,
'source': 'stackoverflow',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "stackoverflow"

app/services/sites/tenxunwang.py

@@ -0,0 +1,65 @@
import json
import datetime
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class TenXunWangCrawler(Crawler):
"""腾讯网"""
def fetch(self, date_str):
current_time = datetime.datetime.now()
url = "https://i.news.qq.com/gw/event/pc_hot_ranking_list?ids_hash=&offset=0&page_size=51&appver=15.5_qqnews_7.1.60&rank_id=hot"
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://news.qq.com/",
        }
resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
data = resp.json()
result = []
cache_list = []
for i, item in enumerate(data["idlist"][0].get("newslist", [])):
            if i == 0:
                # Skip the first item: Tencent's "users' most watched" entry, refreshed every 10 minutes
                continue
title = item.get("title", "")
url = item.get("url", "")
desc = item.get("abstract", "")
news = {
'title': title,
'url': url,
'content': desc,
'source': 'tenxunwang',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "tenxunwang"

app/services/sites/tieba.py

@@ -0,0 +1,65 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class TieBaCrawler(Crawler):
"""百度贴吧"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "http://tieba.baidu.com/hottopic/browse/topicList"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', {}).get('bang_topic', {}).get('topic_list', [])
result = []
cache_list = []
for item in data:
title = item.get('topic_name', '')
url = item.get('topic_url', '')
if url and not url.startswith('http'):
url = f"http://tieba.baidu.com{url}"
desc = item.get('topic_desc', '')
news = {
'title': title,
'url': url,
'content': desc,
'source': 'tieba',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "tieba"

app/services/sites/tskr.py

@@ -0,0 +1,83 @@
import json
import datetime
import time
import requests
import urllib3
from .crawler import Crawler
from ...core import cache
urllib3.disable_warnings()
class TsKrCrawler(Crawler):
"""36氪"""
def fetch(self, date_str):
"""
获取36氪热榜数据
"""
current_time = datetime.datetime.now()
url = f"https://gateway.36kr.com/api/mis/nav/home/nav/rank/hot"
headers = {
"Content-Type": "application/json; charset=utf-8",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
}
body = {
"partner_id": "wap",
"param": {
"siteId": 1,
"platformId": 2,
},
"timestamp": int(time.time() * 1000),
}
try:
resp = requests.post(
url=url,
headers=headers,
json=body,
verify=False,
timeout=self.timeout
)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
json_data = resp.json()
data_key = "hotRankList"
data_list = json_data.get("data", {}).get(data_key, [])
result = []
cache_list = []
for item in data_list:
template_material = item.get("templateMaterial", {})
item_id = item.get("itemId", "")
title = template_material.get("widgetTitle", "")
article_url = f"https://www.36kr.com/p/{item_id}"
news = {
'title': title,
'url': article_url,
'content': title,
'source': '36kr',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error fetching 36kr data: {e}")
return []
def crawler_name(self):
return "36kr"

app/services/sites/vtex.py

@@ -0,0 +1,71 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class VtexCrawler(Crawler):
"""v2ex"""
def fetch(self, date_str):
        # Current timestamp
current_time = datetime.datetime.now()
url = "https://www.v2ex.com/?tab=hot"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
html_text = resp.text
soup = BeautifulSoup(html_text, "html.parser")
        # Locate the list of hot topics
topic_list = soup.find_all('div', class_='cell item')
result = []
cache_list = []
for topic in topic_list:
title_elem = topic.find('span', class_='item_title')
if not title_elem:
continue
link_elem = title_elem.find('a')
if not link_elem:
continue
title = link_elem.text.strip()
url = "https://www.v2ex.com" + link_elem.get('href')
            # Topic metadata
info_elem = topic.find('span', class_='topic_info')
info = info_elem.text.strip() if info_elem else ""
news = {
'title': title,
'url': url,
'content': info,
'source': 'v2ex',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
def crawler_name(self):
return "v2ex"

app/services/sites/weibo.py

@@ -0,0 +1,68 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from ...core import cache
from .crawler import Crawler
urllib3.disable_warnings()
class WeiboCrawler(Crawler):
"""微博"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
header = self.header.copy()
header.update({
"accept": "application/json, text/javascript, */*; q=0.01",
"host": "weibo.com",
"Referer": "https://weibo.com",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
})
url = "https://weibo.com/ajax/side/hotSearch"
resp = requests.get(url=url, headers=header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', {}).get('realtime', [])
result = []
cache_list = []
for item in data:
title = item.get('word', '')
url = f"https://s.weibo.com/weibo?q=%23{title}%23"
news = {
'title': title,
'url': url,
'content': title,
'source': 'weibo',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "weibo"

app/services/sites/weixin.py

@@ -0,0 +1,228 @@
import json
import datetime
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager
# Disable SSL warnings
urllib3.disable_warnings()
class WeiXinCrawler(Crawler):
"""
微信热门内容爬虫
使用微信看一看热门页面获取数据
"""
def fetch(self, date_str):
"""获取微信热门内容"""
current_time = datetime.datetime.now()
browser_manager = BrowserManager()
        try:
            # First try the WeChat "Top Stories" page for hot content
            result = self._fetch_from_weixin_kankan(browser_manager)
            if result and len(result) > 0:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result
            # If Top Stories fails, fall back to hot book reviews from WeRead
            result = self._fetch_from_weixin_dushu(browser_manager)
            if result and len(result) > 0:
                # Cache the data
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result
        except Exception:
            # On error, return an empty list
            return []
        # All methods failed; return an empty list
        return []
def _fetch_from_weixin_kankan(self, browser_manager):
"""从微信看一看页面获取热门内容"""
url = "https://k.weixin.qq.com/"
        try:
            # Fetch the page content
            page_source, driver = browser_manager.get_page_content(url, wait_time=10)
            # Wait for the hot content to load
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
                )
            except Exception:
                # If the wait times out, still try to read the content
                pass
            # Click the "热点" (hot) tab to switch to hot content
            try:
                hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']")
                hot_tab.click()
                time.sleep(3)  # Wait for the content to load
            except Exception:
                # If the hot tab is missing, read the current page instead
                pass
            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # Article list
            articles = driver.find_elements(By.CSS_SELECTOR, ".article-item")
            if not articles:
                # Try an alternative selector
                articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item")
                if not articles:
                    # One more fallback selector
                    articles = driver.find_elements(By.CSS_SELECTOR, ".item")
for article in articles:
                try:
                    # Article title and link
                    title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title")
                    title = title_elem.text.strip()
                    # Try to read the link
                    link = None
                    try:
                        link_elem = article.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        # If no direct link, record the article id so a link can be built later
                        try:
                            article_id = article.get_attribute("data-id") or article.get_attribute("id")
                            link = f"https://k.weixin.qq.com/article?id={article_id}"
                        except Exception:
                            link = "https://k.weixin.qq.com/"
                    # Source account
                    source = ""
                    try:
                        source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source")
                        source = source_elem.text.strip()
                    except Exception:
                        pass
                    # Summary
                    summary = ""
                    try:
                        summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p")
                        summary = summary_elem.text.strip()
                    except Exception:
                        pass
news = {
'title': title,
'url': link,
'content': f"来源: {source} | 摘要: {summary[:50] if summary else '无摘要'}",
'source': 'weixin',
'publish_time': current_time
}
result.append(news)
                    # Limit to the first 20 items
                    if len(result) >= 20:
                        break
                except Exception:
                    continue
return result
except Exception as e:
return []
def _fetch_from_weixin_dushu(self, browser_manager):
"""从微信读书获取热门书评"""
url = "https://weread.qq.com/web/category/all"
        try:
            # Fetch the page content
            page_source, driver = browser_manager.get_page_content(url, wait_time=8)
            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # Try to click the ranking ("排行榜") tab
            try:
                rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]")
                rank_tab.click()
                time.sleep(3)  # Wait for the content to load
            except Exception:
                # If the ranking tab is missing, read the current page instead
                pass
            # Hot book list
books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")
for book in books:
                try:
                    # Book title and link
                    title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3")
                    title = title_elem.text.strip()
                    # Try to read the link
                    link = "https://weread.qq.com/web/category/all"
                    try:
                        link_elem = book.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        book_id = book.get_attribute("data-bid") or book.get_attribute("id")
                        if book_id:
                            link = f"https://weread.qq.com/web/reader/{book_id}"
                    # Author
                    author = ""
                    try:
                        author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer")
                        author = author_elem.text.strip()
                    except Exception:
                        pass
                    # Intro / summary
                    intro = ""
                    try:
                        intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc")
                        intro = intro_elem.text.strip()
                    except Exception:
                        pass
news = {
'title': f"热门书籍: {title}",
'url': link,
'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}",
'source': 'weixin',
'publish_time': current_time
}
result.append(news)
                    # Limit to the first 20 items
                    if len(result) >= 20:
                        break
                except Exception:
                    continue
return result
except Exception as e:
return []
def crawler_name(self):
return "weixin"

app/services/sites/xueqiu.py

@@ -0,0 +1,155 @@
import json
import datetime
import requests
import urllib3
import re
from requests.sessions import Session
from .crawler import Crawler
from ...core import cache
urllib3.disable_warnings()
class XueqiuCrawler(Crawler):
"""雪球"""
def __init__(self):
super().__init__()
self.session = Session()
self._init_session()
def _init_session(self):
try:
            # Step 1: visit the homepage to obtain base cookies
main_url = "https://xueqiu.com"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1'
}
resp = self.session.get(main_url, headers=headers, verify=False, timeout=self.timeout)
if resp.status_code == 200:
html_content = resp.text
                # Try to extract the auth token
token_match = re.search(r'window\.SNB\s*=\s*\{[^}]*token["\']?\s*:\s*["\']([^"\']+)["\']', html_content)
if token_match:
token = token_match.group(1)
self.session.headers.update({'X-Requested-With': 'XMLHttpRequest'})
hot_page_url = "https://xueqiu.com/hot_event"
hot_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'https://xueqiu.com/',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1'
}
hot_resp = self.session.get(hot_page_url, headers=hot_headers, verify=False, timeout=self.timeout)
                    if hot_resp.status_code == 200:
                        print("Xueqiu hot page loaded; full auth info obtained")
                    else:
                        print(f"Xueqiu hot page request failed: {hot_resp.status_code}")
            else:
                print(f"Xueqiu homepage request failed: {resp.status_code}")
        except Exception as e:
            print(f"Failed to initialize the Xueqiu session: {e}")
def fetch(self, date_str) -> list:
current_time = datetime.datetime.now()
url = "https://xueqiu.com/hot_event/list.json?count=10"
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Referer': 'https://xueqiu.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'
}
try:
resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)
            if resp.status_code != 200:
                print(f"Xueqiu request failed, status: {resp.status_code}")
                self._init_session()
                resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)
                if resp.status_code != 200:
                    print(f"Xueqiu still failing after retry, status: {resp.status_code}")
                    return []
json_data = resp.json()
            if 'list' not in json_data:
                print("Unexpected Xueqiu response format")
                return []
result = []
cache_list = []
            for idx, item in enumerate(json_data['list'][:10]):  # First 10 items
try:
tag = item.get('tag', '').strip()
if tag.startswith('#') and tag.endswith('#'):
title = tag[1:-1]
else:
title = tag
if not title:
continue
                    item_id = item.get('id')
                    url_link = "https://xueqiu.com/"
content = item.get('content', '').strip()
if len(content) > 200:
content = content[:200] + '...'
status_count = item.get('status_count', 0)
hot_value = item.get('hot', 0)
news = {
'title': title,
'url': url_link,
'content': content,
'source': 'xueqiu',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
'score': status_count if status_count > 0 else 1000 - idx,
'rank': idx + 1
}
result.append(news)
cache_list.append(news)
                except Exception as e:
                    print(f"Failed to parse a Xueqiu news item: {e}")
                    continue
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
        except Exception as e:
            print(f"Failed to fetch Xueqiu data: {e}")
return []
def crawler_name(self):
return "xueqiu"

app/services/sites/zhihu.py

@@ -0,0 +1,64 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class ZhiHuCrawler(Crawler):
"""知乎"""
def fetch(self, date_str):
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://www.zhihu.com/api/v3/explore/guest/feeds?limit=30&ws_qiangzhisafe=0"
resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
if resp.status_code != 200:
print(f"request failed, status: {resp.status_code}")
return []
try:
json_data = resp.json()
data = json_data.get('data', [])
result = []
cache_list = []
for item in data:
target = item.get('target', {})
question = target.get('question', {})
title = question.get('title', '')
url = f"https://www.zhihu.com/question/{question.get('id')}"
excerpt = target.get('excerpt', '')
news = {
'title': title,
'url': url,
'content': excerpt,
'source': 'zhihu',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
}
result.append(news)
cache_list.append(news)
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
return result
except Exception as e:
print(f"Error parsing JSON: {e}")
return []
def crawler_name(self):
return "zhihu"