init
This commit is contained in:
43
app/services/__init__.py
Normal file
43
app/services/__init__.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.executors.pool import ThreadPoolExecutor, ProcessPoolExecutor
from apscheduler.jobstores.memory import MemoryJobStore
import pytz

from app.services.sites.factory import CrawlerRegister
from app.utils.logger import log
from app.core.config import get_scheduler_config

# Registry of all site crawlers (name -> crawler instance), built once at import.
crawler_factory = CrawlerRegister().register()

# Scheduler settings loaded from the application configuration.
scheduler_config = get_scheduler_config()

# Scheduler wiring: an in-memory job store plus thread/process executor pools.
jobstores = {'default': MemoryJobStore()}

executors = {
    'default': ThreadPoolExecutor(scheduler_config.thread_pool_size),
    'processpool': ProcessPoolExecutor(scheduler_config.process_pool_size),
}

job_defaults = {
    'coalesce': scheduler_config.coalesce,
    'max_instances': scheduler_config.max_instances,
    'misfire_grace_time': scheduler_config.misfire_grace_time,
}

# Create, configure and immediately start the background scheduler.
# NOTE(review): starting at import time is a module-level side effect;
# importing this package spins up scheduler threads.
_scheduler = BackgroundScheduler(
    jobstores=jobstores,
    executors=executors,
    job_defaults=job_defaults,
    timezone=pytz.timezone(scheduler_config.timezone),
)
_scheduler.start()

log.info(f"Scheduler started with timezone: {scheduler_config.timezone}")
|
||||
121
app/services/browser_manager.py
Normal file
121
app/services/browser_manager.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import threading
|
||||
import time
|
||||
import os
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from app.utils.logger import log
|
||||
|
||||
class BrowserManager:
    """Process-wide singleton that shares one headless Chrome instance.

    The browser is created lazily on first use and reclaimed automatically
    by a background watchdog thread once it has sat idle for too long.
    """

    _instance = None
    _lock = threading.Lock()
    _driver = None
    _driver_path = None
    _last_activity = 0
    _max_idle_time = 1800  # seconds of inactivity before the browser is released

    def __new__(cls, *args, **kwargs):
        """Return the shared instance (double-checked locking construction)."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super(BrowserManager, cls).__new__(cls)
                    cls._instance._init_driver_path()
                    cls._instance._start_idle_monitor()
        return cls._instance

    def _init_driver_path(self):
        """Resolve (downloading if necessary) the ChromeDriver binary path."""
        try:
            self._driver_path = ChromeDriverManager().install()
            log.info(f"ChromeDriver已安装: {self._driver_path}")
        except Exception as e:
            log.error(f"ChromeDriver安装失败: {str(e)}")
            raise

    def _start_idle_monitor(self):
        """Spawn the daemon watchdog that quits an idle browser."""
        def watchdog():
            while True:
                time.sleep(60)  # poll once a minute
                try:
                    with self._lock:
                        if self._driver is not None:
                            idle_for = time.time() - self._last_activity
                            if idle_for > self._max_idle_time:
                                log.info(f"浏览器空闲超过{self._max_idle_time}秒,释放资源")
                                self._quit_driver()
                except Exception as e:
                    log.error(f"浏览器监控线程异常: {str(e)}")

        threading.Thread(target=watchdog, daemon=True).start()
        log.info("浏览器空闲监控线程已启动")

    def get_driver(self):
        """Return the shared Chrome driver, creating it on first use."""
        with self._lock:
            self._last_activity = time.time()
            if self._driver is None:
                self._create_driver()
            return self._driver

    def _create_driver(self):
        """Instantiate a headless Chrome with memory-friendly flags."""
        log.info("创建新的Chrome浏览器实例")
        options = webdriver.ChromeOptions()
        # Headless base flags, memory-usage mitigations, and quiet logging.
        for flag in (
            "--headless",
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-extensions",
            "--disable-application-cache",
            "--js-flags=--expose-gc",
            "--memory-pressure-off",
            "--disable-default-apps",
            "--log-level=3",
        ):
            options.add_argument(flag)

        self._driver = webdriver.Chrome(
            service=Service(self._driver_path),
            options=options,
        )
        self._driver.set_page_load_timeout(30)

    def _quit_driver(self):
        """Quit the current driver; shutdown errors are logged, not raised."""
        if self._driver:
            try:
                self._driver.quit()
                log.info("浏览器实例已关闭")
            except Exception as e:
                log.error(f"关闭浏览器实例出错: {str(e)}")
            finally:
                self._driver = None

    def release_driver(self):
        """Mark the browser as recently used (resets the idle clock)."""
        with self._lock:
            self._last_activity = time.time()

    def get_page_content(self, url, wait_time=5):
        """Load *url*, wait *wait_time* seconds, and return (page_source, driver).

        NOTE(review): the returned driver is the shared instance — callers
        must not quit it themselves.
        """
        driver = self.get_driver()
        try:
            driver.get(url)
            time.sleep(wait_time)  # crude fixed wait for the page to render
            page_source = driver.page_source
            self.release_driver()
            return page_source, driver
        except Exception as e:
            log.error(f"获取页面内容失败: {str(e)}")
            self.release_driver()
            raise

    def shutdown(self):
        """Dispose of the shared browser instance."""
        with self._lock:
            self._quit_driver()
|
||||
240
app/services/crawler.py
Normal file
240
app/services/crawler.py
Normal file
@@ -0,0 +1,240 @@
|
||||
import time
|
||||
import traceback
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from functools import wraps
|
||||
import pytz
|
||||
import signal
|
||||
from typing import List, Dict, Any, Optional, Callable
|
||||
|
||||
from app.services import crawler_factory, _scheduler
|
||||
from app.utils.logger import log
|
||||
from app.core import db, cache
|
||||
from app.core.config import get_crawler_config
|
||||
from app.utils.notification import notification_manager
|
||||
|
||||
# Crawler settings pulled once from the application configuration.
crawler_config = get_crawler_config()

# Module-level constants derived from that configuration.
CRAWLER_INTERVAL = crawler_config.interval          # seconds between scheduled runs
CRAWLER_TIMEOUT = crawler_config.timeout            # per-run time budget (seconds)
MAX_RETRY_COUNT = crawler_config.max_retry_count    # retry budget for failing crawlers
SHANGHAI_TZ = pytz.timezone('Asia/Shanghai')        # all timestamps use Shanghai time
|
||||
|
||||
class CrawlerTimeoutError(Exception):
    """Raised when a crawler run exceeds its configured time budget."""
|
||||
|
||||
def timeout_handler(func: Callable, timeout: int = CRAWLER_TIMEOUT) -> Callable:
    """Decorator that aborts *func* with CrawlerTimeoutError after *timeout* seconds.

    The wrapped callable runs in a daemon worker thread; the caller waits at
    most *timeout* seconds for it to finish.  If the worker raised, that
    exception is logged and re-raised; otherwise the worker's return value
    is passed through.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Shared cell for the worker thread's outcome.
        outcome = {"value": None, "error": None, "done": False}

        def runner():
            try:
                outcome["value"] = func(*args, **kwargs)
            except Exception as e:
                outcome["error"] = e
            finally:
                outcome["done"] = True

        worker = threading.Thread(target=runner)
        worker.daemon = True
        worker.start()
        worker.join(timeout)

        if not outcome["done"]:
            # Worker is still running past the budget; abandon it (daemon
            # thread) and signal the timeout to the caller.
            error_msg = f"Function {func.__name__} timed out after {timeout} seconds"
            log.error(error_msg)
            raise CrawlerTimeoutError(error_msg)

        if outcome["error"]:
            log.error(f"Function {func.__name__} raised an exception: {outcome['error']}")
            raise outcome["error"]

        return outcome["value"]

    return wrapper
|
||||
|
||||
def safe_fetch(crawler_name: str, crawler, date_str: str, is_retry: bool = False) -> List[Dict[str, Any]]:
    """Run one crawler's fetch, caching results and reporting failures.

    Returns the fetched news list, or [] on error / empty result.  Errors
    are logged and forwarded to the notifier; this function never raises.
    """
    retry_tag = 'Second time ' if is_retry else ''
    try:
        news_list = crawler.fetch(date_str)
    except Exception as e:
        log.error(f"{retry_tag}crawler {crawler_name} error: {traceback.format_exc()}")
        # Best-effort DingTalk notification — never let it break the job.
        try:
            notification_manager.notify_crawler_error(
                crawler_name=crawler_name,
                error_msg=str(e),
                date_str=date_str,
                is_retry=is_retry,
            )
        except Exception as notify_error:
            log.error(f"Failed to send notification for crawler {crawler_name}: {notify_error}")
        return []

    if news_list:
        # Cache the day's results indefinitely under a per-crawler key.
        cache.set_cache(key=f"crawler:{crawler_name}:{date_str}", value=news_list, expire=0)
        log.info(f"{crawler_name} fetch success, {len(news_list)} news fetched")
        return news_list

    log.info(f"{retry_tag}crawler {crawler_name} failed. 0 news fetched")
    return []
|
||||
|
||||
def run_data_analysis(date_str: str):
    """Run every analysis stage for *date_str* and cache the results.

    Stages run sequentially in a fixed order; any failure aborts the
    remaining stages, is logged, and triggers a best-effort notification.
    """
    log.info(f"Starting data analysis for date {date_str}")
    try:
        # Imported here to avoid a circular dependency at module load time.
        from app.analysis.trend_analyzer import TrendAnalyzer
        from app.analysis.predictor import TrendPredictor

        analyzer = TrendAnalyzer()
        predictor = TrendPredictor()

        # (log message, stage callable) in execution order.
        stages = [
            ("Generating keyword cloud data...",
             lambda: analyzer.get_keyword_cloud(date_str, refresh=True)),
            ("Generating trend analysis data...",
             lambda: analyzer.get_analysis(date_str, analysis_type="main")),
            ("Generating cross-platform analysis data...",
             lambda: analyzer.get_cross_platform_analysis(date_str, refresh=True)),
            ("Generating trend prediction data...",
             lambda: predictor.get_prediction(date_str)),
            ("Generating platform comparison data...",
             lambda: analyzer.get_platform_comparison(date_str)),
            ("Generating advanced analysis data...",
             lambda: analyzer.get_advanced_analysis(date_str, refresh=True)),
            ("Generating data visualization analysis...",
             lambda: analyzer.get_data_visualization(date_str, refresh=True)),
            ("Generating trend forecast data...",
             lambda: analyzer.get_trend_forecast(date_str, refresh=True)),
        ]
        for message, run_stage in stages:
            log.info(message)
            run_stage()

        log.info(f"All data analysis completed for date {date_str}")
    except Exception as e:
        log.error(f"Error during data analysis: {str(e)}")
        log.error(traceback.format_exc())
        # Best-effort notification about the analysis failure.
        try:
            notification_manager.notify_analysis_error(
                error_msg=str(e),
                date_str=date_str,
            )
        except Exception as notify_error:
            log.error(f"Failed to send analysis error notification: {notify_error}")
|
||||
|
||||
@_scheduler.scheduled_job('interval', id='crawlers_logic', seconds=CRAWLER_INTERVAL,
                          max_instances=crawler_config.max_instances,
                          misfire_grace_time=crawler_config.misfire_grace_time)
def crawlers_logic():
    """Scheduled crawler job with timeout protection and error reporting.

    Runs every registered crawler once, retries the failures a second time,
    sends a summary notification, then kicks off data analysis in a
    background thread.  Returns the number of crawlers that fetched at
    least one item, or 0 on timeout / unexpected error.
    """
    # BUG FIX: date_str used to be defined only inside crawler_work(), so the
    # except-handlers below raised NameError when building the timeout/error
    # notifications.  Compute it once up front and share it via closure.
    now_time = datetime.now(SHANGHAI_TZ)
    date_str = now_time.strftime("%Y-%m-%d")

    @timeout_handler
    def crawler_work():
        log.info(f"Starting crawler job at {now_time.strftime('%Y-%m-%d %H:%M:%S')}")

        retry_crawler = []
        success_count = 0
        failed_crawlers = []

        # First pass over every registered crawler.
        for crawler_name, crawler in crawler_factory.items():
            news_list = safe_fetch(crawler_name, crawler, date_str)
            if news_list:
                success_count += 1
            else:
                retry_crawler.append(crawler_name)
                failed_crawlers.append(crawler_name)

        # Second pass: retry only the crawlers that returned nothing.
        if retry_crawler:
            log.info(f"Retrying {len(retry_crawler)} failed crawlers")
            retry_failed = []
            for crawler_name in retry_crawler:
                news_list = safe_fetch(crawler_name, crawler_factory[crawler_name], date_str, is_retry=True)
                if news_list:
                    success_count += 1
                    # A successful retry is no longer a failure.
                    if crawler_name in failed_crawlers:
                        failed_crawlers.remove(crawler_name)
                else:
                    retry_failed.append(crawler_name)

        end_time = datetime.now(SHANGHAI_TZ)
        duration = (end_time - now_time).total_seconds()
        log.info(f"Crawler job finished at {end_time.strftime('%Y-%m-%d %H:%M:%S')}, "
                 f"duration: {duration:.2f}s, success: {success_count}/{len(crawler_factory)}")

        # Best-effort summary notification.
        try:
            notification_manager.notify_crawler_summary(
                success_count=success_count,
                total_count=len(crawler_factory),
                failed_crawlers=failed_crawlers,
                duration=duration,
                date_str=date_str
            )
        except Exception as notify_error:
            log.error(f"Failed to send crawler notification: {notify_error}")

        # Run analysis on a daemon thread so the scheduled job returns promptly.
        log.info("Crawler job completed, starting data analysis...")
        threading.Thread(target=run_data_analysis, args=(date_str,), daemon=True).start()

        return success_count

    try:
        return crawler_work()
    except CrawlerTimeoutError as e:
        log.error(f"Crawler job timeout: {str(e)}")
        # Best-effort timeout notification.
        try:
            notification_manager.notify_crawler_timeout(
                timeout_seconds=CRAWLER_TIMEOUT,
                date_str=date_str
            )
        except Exception as notify_error:
            log.error(f"Failed to send timeout notification: {notify_error}")
        return 0
    except Exception as e:
        log.error(f"Crawler job error: {str(e)}")
        log.error(traceback.format_exc())
        # Best-effort generic error notification.
        try:
            notification_manager.notify_crawler_error(
                crawler_name="crawler_job",
                error_msg=str(e),
                date_str=date_str
            )
        except Exception as notify_error:
            log.error(f"Failed to send error notification: {notify_error}")
        return 0
|
||||
0
app/services/sites/__init__.py
Normal file
0
app/services/sites/__init__.py
Normal file
99
app/services/sites/baidu.py
Normal file
99
app/services/sites/baidu.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class BaiduNewsCrawler(Crawler):
    """Crawler for Baidu's realtime hot-search board."""

    def fetch(self, date_str) -> list:
        """Fetch the realtime board via the JSON API; returns the news list."""
        current_time = datetime.datetime.now()

        url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"

        # BUG FIX: self.header was being passed as `params=` (i.e. sent as
        # query-string values), so no request headers were actually sent.
        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        json_data = resp.json()
        contents = json_data.get("data")["cards"][0]["content"][0]["content"]
        result = []
        cache_list = []
        for content in contents:
            title = content.get("word")
            url = content.get("url")
            desc = content.get("desc")
            score = content.get("hotScore")  # currently unused, kept for reference

            # Prefer the desktop site over the mobile one.
            url = url.replace("m.", "www.")
            news = {
                'title': title,
                'url': url,
                'content': desc,
                'source': 'baidu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }
            result.append(news)
            cache_list.append(news)

        # Cache today's list under this crawler's name.
        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        return "baidu"

    @staticmethod
    def fetch_v0():
        """Legacy HTML-scraping variant, kept for reference."""
        current_time = datetime.datetime.now()

        url = "https://top.baidu.com/board?tab=realtime"
        proxies = {
            # "http": "http://127.0.0.1:7890",
            # "https": "http://127.0.0.0:7890"
        }

        header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            # Header values must be strings (was the int 1).
            "upgrade-insecure-requests": "1",
            "host": "www.baidu.com",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36"
        }
        # BUG FIX: headers were passed as `params=`; send them as `headers=`.
        html = requests.get(url=url, headers=header, verify=False, proxies=proxies)
        html.encoding = "utf-8"
        html_text = html.text
        soup = BeautifulSoup(html_text, "html.parser")
        main_content = soup.find_all("main")[0]
        news_main_content = main_content.find("div", style='margin-bottom:20px')

        div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')

        result = []
        for div_element in div_elements:
            hot_index = div_element.find(class_='hot-index_1Bl1a').text.strip()  # currently unused
            news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
            news_link = div_element.find('a', class_='title_dIF3B')['href']

            news = {
                'title': news_title,
                'url': news_link,
                'content': "",
                'source': 'baidu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }
            result.append(news)

        return result
|
||||
64
app/services/sites/bilibili.py
Normal file
64
app/services/sites/bilibili.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class BilibiliCrawler(Crawler):
    """Crawler for Bilibili's popular-videos ranking."""

    def fetch(self, date_str):
        """Fetch the popular list via the web API and cache it; returns the news list."""
        now = datetime.datetime.now()

        api_url = "https://api.bilibili.com/x/web-interface/popular"

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "Chrome/122.0.0.0 Safari/537.36"
                "AppleWebKit/537.36 (KHTML, like Gecko) "
            ),
            "Referer": "https://www.bilibili.com/",
        }

        resp = requests.get(url=api_url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        # Bilibili wraps errors in a non-zero `code` field.
        if data["code"] != 0:
            print(f"API error: {data['message']}")
            return []

        timestamp = now.strftime('%Y-%m-%d %H:%M:%S')
        collected = [
            {
                'title': item.get("title", ""),
                'url': f"https://www.bilibili.com/video/{item.get('bvid', '')}",
                'content': item.get("desc", ""),
                'source': 'bilibili',
                'publish_time': timestamp,
            }
            for item in data["data"].get("list", [])
        ]

        # Cache today's list under this crawler's name.
        cache.hset(date_str, self.crawler_name(), json.dumps(collected, ensure_ascii=False))
        return collected

    def crawler_name(self):
        return "bilibili"
|
||||
100
app/services/sites/cls.py
Normal file
100
app/services/sites/cls.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import json
|
||||
import datetime
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class CLSCrawler(Crawler):
    """Cailianshe (财联社) featured-columns crawler.

    Best-effort: any failure (network, parsing) yields an empty list.
    """

    def fetch(self, date_str) -> list:
        """Fetch featured columns; returns up to 20 ranked news dicts."""
        current_time = datetime.datetime.now()

        try:
            params = {
                'app': 'CailianpressWeb',
                'os': 'web',
                'sv': '8.4.6',
                'sign': '9f8797a1f4de66c2370f7a03990d2737'
            }

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Referer': 'https://www.cls.cn/',
                'Origin': 'https://www.cls.cn'
            }

            response = requests.get(
                "https://www.cls.cn/featured/v1/column/list",
                params=params,
                headers=headers,
                timeout=self.timeout,
                verify=False
            )
            response.raise_for_status()

            data = response.json()
            if data.get('errno') != 0:
                return []

            column_list = data.get('data', {}).get('column_list', [])

            result = []
            cache_list = []

            for idx, column in enumerate(column_list[:20]):
                try:
                    title = column.get('title', '').strip()
                    # Skip entries with missing / degenerate titles.
                    if not title or len(title) < 2:
                        continue

                    # All entries link to the telegraph page; only the display
                    # title and content depend on whether the column carries an
                    # article.  (The original had three branches, two of which
                    # were byte-identical — deduplicated here.)
                    url = "https://www.cls.cn/telegraph"
                    display_title = title
                    content = column.get('brief', '').strip()

                    article = column.get('article_list', {})
                    if article:
                        article_title = article.get('title', '').strip()
                        if article_title:
                            display_title = f"[{title}] {article_title}"
                            brief = article.get('brief', '').strip()
                            content = brief if brief else article_title

                    news = {
                        'title': display_title,
                        'url': url,
                        'content': content,
                        'source': 'cls',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
                        'score': 1000 - idx,   # rank-derived pseudo-score
                        'rank': idx + 1
                    }

                    result.append(news)
                    cache_list.append(news)

                except Exception:
                    continue  # skip malformed entries

            if cache_list:
                cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception:
            # Deliberate best-effort swallow; the scheduler treats [] as failure.
            return []

    def crawler_name(self):
        return "cls"
|
||||
23
app/services/sites/crawler.py
Normal file
23
app/services/sites/crawler.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Any
|
||||
|
||||
class Crawler(ABC):
    """Abstract base class for all site crawlers.

    Provides browser-like default request headers and a shared HTTP timeout;
    subclasses implement `fetch` and `crawler_name`.
    """

    def __init__(self):
        # Default headers shared by subclasses that issue plain HTTP requests.
        self.header = {
            "accept": (
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
            ),
            "upgrade-insecure-requests": "1",
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/86.0.4240.183 Safari/537.36"
            ),
        }
        # Default network timeout (seconds) for HTTP requests.
        self.timeout = 10

    @abstractmethod
    def fetch(self, date_str: str) -> List[Dict[str, Any]]:
        """Return the list of news items for *date_str*."""
        ...

    @abstractmethod
    def crawler_name(self) -> str:
        """Return this crawler's unique registry name."""
        ...
|
||||
79
app/services/sites/douban.py
Normal file
79
app/services/sites/douban.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import json
|
||||
import re
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class DouBanCrawler(Crawler):
    """Crawler for Douban's group-explore topic list."""

    def fetch(self, date_str):
        """Scrape the explore page and cache it; returns the news list."""
        now = datetime.datetime.now()

        page_url = "https://www.douban.com/group/explore"

        # Start from the shared defaults, then layer on browser-fetch headers.
        req_headers = dict(self.header)
        req_headers.update({
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-encoding": "",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "host": "www.douban.com",
            "referer": "https://www.douban.com/group/explore",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
        })

        resp = requests.get(url=page_url, headers=req_headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        soup = BeautifulSoup(resp.text, "html.parser")
        timestamp = now.strftime('%Y-%m-%d %H:%M:%S')

        result = []
        cache_list = []
        for topic in soup.find_all('div', class_='channel-item'):
            heading = topic.find('h3')
            anchor = heading.find('a') if heading else None
            if anchor is None:
                continue  # entry without a titled link — skip

            body = topic.find('div', class_='content')
            news = {
                'title': anchor.text.strip(),
                'url': anchor.get('href'),
                'content': body.text.strip() if body else "",
                'source': 'douban',
                'publish_time': timestamp,
            }
            result.append(news)
            cache_list.append(news)

        # Cache today's list under this crawler's name.
        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        return "douban"
|
||||
111
app/services/sites/douyin.py
Normal file
111
app/services/sites/douyin.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
|
||||
import requests
|
||||
from selenium.webdriver.common.by import By
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
from ..browser_manager import BrowserManager
|
||||
|
||||
|
||||
class DouYinCrawler(Crawler):
    """Douyin hot-search crawler; the JSON API variant (v2) is active."""

    def fetch(self, date_str):
        """Entry point; delegates to the API-based fetch_v2."""
        return self.fetch_v2(date_str)

    def fetch_v1(self, date_str):
        """Legacy Selenium-based scraper of the /hot page (unused by fetch)."""
        current_time = datetime.datetime.now()
        url = "https://www.douyin.com/hot"
        browser_manager = BrowserManager()

        try:
            # Shared headless browser renders the page for us.
            page_source, driver = browser_manager.get_page_content(url, wait_time=5)

            result = []
            cache_list = []

            # Hot-board entries are <li> elements containing a /video/ link.
            items = driver.find_elements(By.XPATH, '//li[a[contains(@href, "/video/")]]')

            for item in items:
                try:
                    # Title: a div with a "#" tag or reasonably long text.
                    title_elem = item.find_element(By.XPATH, './/div[contains(text(), "#") or string-length(text()) > 10]')
                    link_elem = item.find_element(By.XPATH, './/a[contains(@href, "/video/")]')
                    # Hotness: a span carrying a 万/亿 count.
                    hot_elem = item.find_element(By.XPATH, './/span[contains(text(), "万") or contains(text(), "亿")]')

                    title = title_elem.text.strip()
                    # BUG FIX: Selenium's get_attribute("href") returns the
                    # already-resolved absolute URL; prepending the domain
                    # produced broken links like "https://www.douyin.comhttps://...".
                    item_url = link_elem.get_attribute("href")
                    hot = hot_elem.text.strip()

                    news = {
                        'title': title,
                        'url': item_url,
                        'content': f"热度: {hot}",
                        'source': 'douyin',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
                    }

                    result.append(news)
                    cache_list.append(news)
                except Exception:
                    continue  # skip malformed entries

            if cache_list:
                cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception:
            # Best-effort: rendering/scraping failures yield an empty list.
            return []

    def fetch_v2(self, date_str):
        """Fetch the hot-search word list from Douyin's web API."""
        current_time = datetime.datetime.now()
        url = "https://www.douyin.com/aweme/v1/web/hot/search/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&detail_list=1&source=6&pc_client_type=1&pc_libra_divert=Windows&support_h265=1&support_dash=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=136.0.0.0&browser_online=true&engine_name=Blink&engine_version=136.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7490997798633555467"

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "Chrome/122.0.0.0 Safari/537.36"
                "AppleWebKit/537.36 (KHTML, like Gecko) "
            ),
            "Referer": "https://www.douyin.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        # Example detail link:
        # https://www.douyin.com/hot/2094286?&trending_topic=...&previous_page=main_page&...
        result = []
        cache_list = []

        for item in data["data"]["word_list"]:
            title = item["word"]
            url = f"https://www.douyin.com/hot/{item['sentence_id']}?&trending_topic={item['word']}&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue={item['hot_value']}"

            news = {
                'title': title,
                'url': url,
                'content': title,
                'source': 'douyin',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }

            result.append(news)
            cache_list.append(news)

        cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
        return result

    def crawler_name(self):
        return "douyin"
|
||||
88
app/services/sites/eastmoney.py
Normal file
88
app/services/sites/eastmoney.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import json
|
||||
import datetime
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class EastMoneyCrawler(Crawler):
|
||||
"""东方财富网"""
|
||||
|
||||
def fetch(self, date_str) -> list:
|
||||
current_time = datetime.datetime.now()
|
||||
|
||||
try:
|
||||
params = {
|
||||
'client': 'web',
|
||||
'biz': 'web_724',
|
||||
'fastColumn': '102',
|
||||
'sortEnd': '',
|
||||
'pageSize': '50',
|
||||
'req_trace': str(int(current_time.timestamp() * 1000)) # 使用当前时间戳
|
||||
}
|
||||
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Accept': 'application/json, text/plain, */*',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||||
'Referer': 'https://kuaixun.eastmoney.com/',
|
||||
'Origin': 'https://kuaixun.eastmoney.com'
|
||||
}
|
||||
|
||||
response = requests.get(
|
||||
"https://np-weblist.eastmoney.com/comm/web/getFastNewsList",
|
||||
params=params,
|
||||
headers=headers,
|
||||
timeout=self.timeout,
|
||||
verify=False
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
data = response.json()
|
||||
if data.get('code') != '1':
|
||||
return []
|
||||
fast_news_list = data.get('data', {}).get('fastNewsList', [])
|
||||
|
||||
result = []
|
||||
cache_list = []
|
||||
|
||||
for idx, news_item in enumerate(fast_news_list[:20]): # 取前20条
|
||||
try:
|
||||
title = news_item.get('title', '').strip()
|
||||
if not title:
|
||||
continue
|
||||
|
||||
summary = news_item.get('summary', '').strip()
|
||||
show_time = news_item.get('showTime', '').strip()
|
||||
code = news_item.get('code', '').strip()
|
||||
url = f"https://finance.eastmoney.com/a/{code}" if code else "https://kuaixun.eastmoney.com/"
|
||||
|
||||
news = {
|
||||
'title': title,
|
||||
'url': url,
|
||||
'content': summary,
|
||||
'source': 'eastmoney',
|
||||
'publish_time': show_time if show_time else current_time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'score': 1000 - idx,
|
||||
'rank': idx + 1
|
||||
}
|
||||
|
||||
result.append(news)
|
||||
cache_list.append(news)
|
||||
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if cache_list:
|
||||
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
return []
|
||||
|
||||
    def crawler_name(self):
        """Return this crawler's registry/cache key (used as the hash field in cache.hset)."""
        return "eastmoney"
|
||||
64
app/services/sites/factory.py
Normal file
64
app/services/sites/factory.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from typing import Dict, Type
|
||||
|
||||
from .baidu import BaiduNewsCrawler
|
||||
from .bilibili import BilibiliCrawler
|
||||
from .crawler import Crawler
|
||||
from .douban import DouBanCrawler
|
||||
from .douyin import DouYinCrawler
|
||||
from .ftpojie import FtPoJieCrawler
|
||||
from .github import GithubCrawler
|
||||
from .hackernews import HackerNewsCrawler
|
||||
from .hupu import HuPuCrawler
|
||||
from .jinritoutiao import JinRiTouTiaoCrawler
|
||||
from .juejin import JueJinCrawler
|
||||
from .sspai import ShaoShuPaiCrawler
|
||||
from .stackoverflow import StackOverflowCrawler
|
||||
from .tenxunwang import TenXunWangCrawler
|
||||
from .tieba import TieBaCrawler
|
||||
from .tskr import TsKrCrawler
|
||||
from .vtex import VtexCrawler
|
||||
from .weibo import WeiboCrawler
|
||||
from .weixin import WeiXinCrawler
|
||||
from .zhihu import ZhiHuCrawler
|
||||
from .sina_finance import SinaFinanceCrawler
|
||||
from .eastmoney import EastMoneyCrawler
|
||||
from .xueqiu import XueqiuCrawler
|
||||
from .cls import CLSCrawler
|
||||
|
||||
|
||||
class CrawlerRegister:
    """Builds and holds the mapping from site key to crawler instance."""

    def __init__(self):
        # site key -> Crawler instance; populated by register()
        self.crawlers = {}

    def register(self) -> Dict[str, Crawler]:
        """Build (or rebuild) the site-key -> crawler mapping and return it."""
        crawler_map = {
            "baidu": BaiduNewsCrawler(),
            "shaoshupai": ShaoShuPaiCrawler(),
            "weibo": WeiboCrawler(),
            "zhihu": ZhiHuCrawler(),
            "36kr": TsKrCrawler(),
            "52pojie": FtPoJieCrawler(),
            "bilibili": BilibiliCrawler(),
            "douban": DouBanCrawler(),
            "hupu": HuPuCrawler(),
            "tieba": TieBaCrawler(),
            "juejin": JueJinCrawler(),
            "douyin": DouYinCrawler(),
            "v2ex": VtexCrawler(),
            "jinritoutiao": JinRiTouTiaoCrawler(),
            "tenxunwang": TenXunWangCrawler(),
            "stackoverflow": StackOverflowCrawler(),
            "github": GithubCrawler(),
            "hackernews": HackerNewsCrawler(),
            "sina_finance": SinaFinanceCrawler(),
            "eastmoney": EastMoneyCrawler(),
            "xueqiu": XueqiuCrawler(),
            "cls": CLSCrawler(),
        }

        self.crawlers = crawler_map
        return self.crawlers

    def get_crawlers(self):
        """Return all registered crawler instances.

        Fix: the original always called register() here, re-instantiating
        every crawler (including ones that start browsers) on each access.
        Reuse the cached mapping and only build it when empty.
        """
        if not self.crawlers:
            self.register()
        return self.crawlers.values()
|
||||
69
app/services/sites/ftpojie.py
Normal file
69
app/services/sites/ftpojie.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
import re
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class FtPoJieCrawler(Crawler):
    """52pojie (吾爱破解) forum hot-thread crawler."""

    def fetch(self, date_str):
        """Fetch the forum's hot-thread page, cache the parsed entries, return them.

        Fixes over the original: a title anchor without an ``href`` no longer
        crashes the string concatenation with a TypeError, and an href with a
        leading '/' no longer produces a double slash in the final URL.
        """
        current_time = datetime.datetime.now()
        url = "https://www.52pojie.cn/forum.php?mod=guide&view=hot"

        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        resp.encoding = 'gbk'  # 52pojie serves GBK-encoded pages
        soup = BeautifulSoup(resp.text, "html.parser")

        # Hot threads are rendered as <tbody id="normalthread_...">.
        hot_threads = soup.find_all('tbody', id=lambda x: x and x.startswith('normalthread_'))

        result = []

        for thread in hot_threads:
            title_elem = thread.find('a', class_='xst')
            if not title_elem:
                continue

            href = title_elem.get('href')
            if not href:
                # Fix: the original concatenated None here and raised TypeError.
                continue

            title = title_elem.text.strip()
            thread_url = "https://www.52pojie.cn/" + href.lstrip('/')

            # Last-post info cell ("by ..."), best-effort.
            info_elem = thread.find('td', class_='by')
            info = info_elem.text.strip() if info_elem else ""

            result.append({
                'title': title,
                'url': thread_url,
                'content': info,
                'source': '52pojie',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            })

        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry/cache key."""
        return "52pojie"
|
||||
58
app/services/sites/github.py
Normal file
58
app/services/sites/github.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class GithubCrawler(Crawler):
    """GitHub most-starred repositories crawler (search API)."""

    def fetch(self, date_str):
        """Fetch the top repositories by stars, cache them, and return the list.

        Fixes over the original:
        - Repositories with a null description stored None: the API returns
          ``"description": null`` (key present), so ``get("description", "")``
          yielded None. Coalesce with ``or ""``.
        - An error/rate-limit response body has no ``items`` key and raised
          KeyError; use ``.get("items", [])`` instead.
        - Drop the unused ``enumerate`` index.
        """
        current_time = datetime.datetime.now()

        url = "https://api.github.com/search/repositories?q=stars:%3E1&sort=stars"

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "Chrome/122.0.0.0 Safari/537.36"
                "AppleWebKit/537.36 (KHTML, like Gecko) "
            ),
            "Referer": "https://github.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        result = []

        for item in data.get("items", []):
            result.append({
                'title': item.get("full_name", ""),
                'url': item.get("html_url", ""),
                # "or ''" handles the explicit null the API sends for
                # repositories without a description.
                'content': item.get("description") or "",
                'source': self.crawler_name(),
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            })

        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry/cache key."""
        return "github"
|
||||
235
app/services/sites/hackernews.py
Normal file
235
app/services/sites/hackernews.py
Normal file
@@ -0,0 +1,235 @@
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib3
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
from ..browser_manager import BrowserManager
|
||||
|
||||
# 禁用SSL警告
|
||||
urllib3.disable_warnings()
|
||||
|
||||
class HackerNewsCrawler(Crawler):
    """Hacker News front-page crawler.

    Tries a plain HTTP request first; if that yields nothing, falls back to
    a Selenium-driven browser via BrowserManager. Results are cached under
    (date_str, crawler_name) as JSON. Any failure yields an empty list.
    """
    def fetch(self, date_str):
        """Fetch up to 30 front-page stories and cache them; [] on failure."""
        # NOTE(review): current_time is computed but never used in this method.
        current_time = datetime.datetime.now()

        try:
            # First try fetching the content with a direct HTTP request.
            result = self._fetch_with_requests()

            if result and len(result) > 0:
                # Cache the data.
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

            # If the plain-request approach failed, fall back to browser emulation.
            browser_manager = BrowserManager()
            result = self._fetch_with_browser(browser_manager)
            if result and len(result) > 0:
                # Cache the data.
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

        except Exception as e:
            # On any error, return an empty list (best-effort crawler).
            return []

        # All approaches failed; return an empty list.
        return []

    def _fetch_with_requests(self):
        """Fetch and parse the Hacker News front page with requests + BeautifulSoup.

        Returns a list of news dicts (title/url/content/source/publish_time),
        at most 30 entries; [] on any failure.
        """
        url = "https://news.ycombinator.com/"

        try:
            # Send the HTTP request.
            response = requests.get(url, headers=self.header, timeout=self.timeout)
            if response.status_code != 200:
                return []

            # Parse the HTML content.
            soup = BeautifulSoup(response.text, 'html.parser')

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Each story row is a <tr class="athing">.
            items = soup.select("tr.athing")

            for item in items:
                try:
                    # The row id links the story to its metadata row below.
                    item_id = item.get('id')
                    if not item_id:
                        continue

                    # Title and link.
                    title_element = item.select_one(".titleline a")
                    if not title_element:
                        continue

                    title = title_element.text.strip()
                    url = title_element.get('href')

                    # Relative URLs (e.g. "item?id=...") point back to HN itself.
                    if url and not url.startswith('http'):
                        url = f"https://news.ycombinator.com/{url}"

                    # Source site of the story, if shown.
                    site_element = item.select_one(".sitestr")
                    site = site_element.text.strip() if site_element else ""

                    # The next <tr> holds the metadata (score, user, comments).
                    metadata = item.find_next_sibling('tr')
                    if not metadata:
                        continue

                    # Score.
                    score_element = metadata.select_one(".score")
                    score = score_element.text.strip() if score_element else "0 points"

                    # Author.
                    user_element = metadata.select_one(".hnuser")
                    user = user_element.text.strip() if user_element else "unknown"

                    # Comment count; "discuss" means no comments yet.
                    comments_element = metadata.select_one("a:last-child")
                    comments = comments_element.text.strip() if comments_element else "0 comments"
                    if "discuss" in comments:
                        comments = "0 comments"

                    # Build the content summary (runtime string, kept as-is).
                    content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"

                    news = {
                        'title': title,
                        'url': url,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Limit to the first 30 stories.
                    if len(result) >= 30:
                        break

                except Exception as e:
                    continue

            return result

        except Exception as e:
            return []

    def _fetch_with_browser(self, browser_manager):
        """Fetch the Hacker News front page via a shared Selenium browser.

        Same output shape as _fetch_with_requests; [] on any failure.
        """
        url = "https://news.ycombinator.com/"

        try:
            # Load the page through the shared browser instance.
            page_source, driver = browser_manager.get_page_content(url, wait_time=5)

            # Wait for the story rows to render.
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".athing"))
                )
            except:
                # Even on timeout, still attempt to read whatever loaded.
                pass

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # All story rows.
            items = driver.find_elements(By.CSS_SELECTOR, "tr.athing")

            for item in items:
                try:
                    # Row id, used to locate the metadata sibling row.
                    item_id = item.get_attribute("id")
                    if not item_id:
                        continue

                    # Title and link.
                    title_element = item.find_element(By.CSS_SELECTOR, ".titleline a")
                    title = title_element.text.strip()
                    url = title_element.get_attribute("href")

                    # Source site, if present.
                    site = ""
                    try:
                        site_element = item.find_element(By.CSS_SELECTOR, ".sitestr")
                        site = site_element.text.strip()
                    except:
                        pass

                    # Metadata (score, user, comments) lives in the next <tr>.
                    try:
                        metadata = driver.find_element(By.XPATH, f"//tr[@id='{item_id}']/following-sibling::tr[1]")

                        # Score.
                        score = "0 points"
                        try:
                            score_element = metadata.find_element(By.CSS_SELECTOR, ".score")
                            score = score_element.text.strip()
                        except:
                            pass

                        # Author.
                        user = "unknown"
                        try:
                            user_element = metadata.find_element(By.CSS_SELECTOR, ".hnuser")
                            user = user_element.text.strip()
                        except:
                            pass

                        # Comment count; "discuss" means no comments yet.
                        comments = "0 comments"
                        try:
                            comments_element = metadata.find_element(By.XPATH, ".//a[last()]")
                            comments = comments_element.text.strip()
                            if "discuss" in comments:
                                comments = "0 comments"
                        except:
                            pass

                        # Build the content summary (runtime string, kept as-is).
                        content = f"来源: {site} | 得分: {score} | 作者: {user} | 评论: {comments}"
                    except:
                        content = f"来源: {site}"

                    news = {
                        'title': title,
                        'url': url,
                        'content': content,
                        'source': 'hackernews',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Limit to the first 30 stories.
                    if len(result) >= 30:
                        break

                except Exception as e:
                    continue

            return result

        except Exception as e:
            return []

    def crawler_name(self):
        """Registry/cache key."""
        return "hackernews"
|
||||
72
app/services/sites/hupu.py
Normal file
72
app/services/sites/hupu.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
import re
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class HuPuCrawler(Crawler):
    """Hupu (虎扑) BBS hot-post crawler."""

    def fetch(self, date_str):
        """Fetch the all-gambia hot-post list, cache it, and return it.

        Fix over the original: ``link_elem.get('href')`` can be None, and the
        original called ``.startswith`` on it, crashing the whole fetch.
        """
        current_time = datetime.datetime.now()
        url = "https://bbs.hupu.com/all-gambia"

        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        soup = BeautifulSoup(resp.text, "html.parser")

        # Each hot post is a <div class="t-info"> block.
        post_list = soup.find_all('div', class_='t-info')

        result = []

        for post in post_list:
            title_elem = post.find('span', class_='t-title')
            if not title_elem:
                continue

            link_elem = post.find('a')
            if not link_elem:
                continue

            href = link_elem.get('href')
            if not href:
                # Fix: the original called None.startswith here and crashed.
                continue

            title = title_elem.text.strip()
            post_url = "https://bbs.hupu.com" + href if href.startswith('/') else href

            # Reply-count info, best-effort.
            info_elem = post.find('span', class_='t-replies')
            info = info_elem.text.strip() if info_elem else ""

            result.append({
                'title': title,
                'url': post_url,
                'content': info,
                'source': 'hupu',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            })

        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry/cache key."""
        return "hupu"
|
||||
63
app/services/sites/jinritoutiao.py
Normal file
63
app/services/sites/jinritoutiao.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# -- coding: utf-8 --
|
||||
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class JinRiTouTiaoCrawler(Crawler):
    """Toutiao (今日头条) PC hot-event board crawler."""

    def fetch(self, date_str):
        """Fetch the hot-event board, cache the entries under date_str, return them."""
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        board_url = "https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc"

        resp = requests.get(url=board_url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            entries = resp.json().get('data', [])

            collected = []
            for entry in entries:
                collected.append({
                    'title': entry.get('Title', ''),
                    'url': entry.get('Url', ''),
                    'content': f"热度: {entry.get('HotValue', '')}",
                    'source': 'jinritoutiao',
                    'publish_time': timestamp,
                })

            cache.hset(date_str, self.crawler_name(), json.dumps(collected, ensure_ascii=False))
            return collected

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Registry/cache key."""
        return "jinritoutiao"
|
||||
63
app/services/sites/juejin.py
Normal file
63
app/services/sites/juejin.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class JueJinCrawler(Crawler):
    """Juejin (掘金) hot-article ranking crawler."""

    def fetch(self, date_str):
        """Fetch Juejin's hot article rank, cache it under date_str, return it."""
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        rank_url = "https://api.juejin.cn/content_api/v1/content/article_rank?category_id=1&type=hot"

        resp = requests.get(url=rank_url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            entries = resp.json().get('data', [])

            collected = []
            for entry in entries:
                content_info = entry.get('content', {})
                headline = content_info.get('title', '')
                post_url = f"https://juejin.cn/post/{content_info.get('content_id', '')}"
                collected.append({
                    'title': headline,
                    'url': post_url,
                    'content': headline,
                    'source': 'juejin',
                    'publish_time': timestamp,
                })

            cache.hset(date_str, self.crawler_name(), json.dumps(collected, ensure_ascii=False))
            return collected

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Registry/cache key."""
        return "juejin"
|
||||
20
app/services/sites/models.py
Normal file
20
app/services/sites/models.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import datetime
|
||||
|
||||
from sqlalchemy import Column, String, Integer, DateTime
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
|
||||
class DailyNews(Base):
    """SQLAlchemy model for one aggregated daily-news entry."""

    __tablename__ = 'tab_daily_news'

    id = Column(Integer, primary_key=True)
    # Headline text.
    title = Column(String(255))
    # Short description / summary of the entry.
    desc = Column(String(255))
    # Link to the original article.
    link = Column(String(255))
    # Integer category/type discriminator — meaning defined by callers, TODO confirm.
    type = Column(Integer, default=0)
    # Ranking score; defaults to 0.
    score = Column(Integer, default=0)
    # Counter field; presumably how many times the entry was seen — verify against callers.
    times = Column(Integer, default=0)
    # Row timestamps, set on insert and refreshed on update.
    create_time = Column(DateTime, default=datetime.datetime.now)
    update_time = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
||||
75
app/services/sites/sina_finance.py
Normal file
75
app/services/sites/sina_finance.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import json
|
||||
import datetime
|
||||
import requests
|
||||
import urllib3
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class SinaFinanceCrawler(Crawler):
    """Sina Finance (新浪财经) live-feed crawler."""

    def fetch(self, date_str):
        """Fetch the finance live feed, cache the entries, and return them.

        Fix over the original: the bare ``except:`` around the ext-JSON parse
        also swallowed SystemExit/KeyboardInterrupt; it is narrowed to the
        exceptions json.loads can actually raise here.
        """
        current_time = datetime.datetime.now()

        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Referer': 'https://finance.sina.com.cn/',
                'Origin': 'https://finance.sina.com.cn'
            }

            response = requests.get(
                "https://zhibo.sina.com.cn/api/zhibo/feed?page=1&page_size=20&zhibo_id=152&tag_id=0&dire=f&dpc=1&pagesize=20",
                headers=headers,
                timeout=self.timeout,
                verify=False
            )
            response.raise_for_status()

            data = response.json()
            # API signals success with status code 0.
            if data.get('result', {}).get('status', {}).get('code') != 0:
                return []

            feed_list = data.get('result', {}).get('data', {}).get('feed', {}).get('list', [])
            result = []

            for item in feed_list:
                try:
                    title = item.get('rich_text', '').strip()
                    if not title:
                        continue

                    # The article URL is nested inside the JSON-encoded 'ext' field;
                    # fall back to the top-level docurl when it fails to parse.
                    ext_str = item.get('ext', '{}')
                    try:
                        ext_data = json.loads(ext_str)
                        doc_url = ext_data.get('docurl', '')
                    except (json.JSONDecodeError, TypeError):
                        doc_url = item.get('docurl', '').strip(' "')

                    result.append({
                        'title': title,
                        'url': doc_url,
                        'content': title,
                        'source': 'sina_finance',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
                    })

                except Exception:
                    continue

            if result:
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
            return result
        except Exception:
            # Best-effort crawler: any transport/parse failure yields [].
            return []

    def crawler_name(self):
        """Registry/cache key."""
        return "sina_finance"
|
||||
60
app/services/sites/sspai.py
Normal file
60
app/services/sites/sspai.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class ShaoShuPaiCrawler(Crawler):
    """SSPai (少数派) front-page article crawler."""

    def fetch(self, date_str):
        """Fetch the latest SSPai articles, cache them under date_str, return them."""
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        api_url = "https://sspai.com/api/v1/article/index/page/get?limit=20&offset=0&created_at=0"

        resp = requests.get(url=api_url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            articles = resp.json().get('data', [])

            collected = [
                {
                    'title': article.get('title', ''),
                    'url': f"https://sspai.com/post/{article.get('id', '')}",
                    'content': article.get('summary', ''),
                    'source': 'sspai',
                    'publish_time': timestamp,
                }
                for article in articles
            ]

            cache.hset(date_str, self.crawler_name(), json.dumps(collected, ensure_ascii=False))
            return collected

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Registry/cache key."""
        return "shaoshupai"
|
||||
58
app/services/sites/stackoverflow.py
Normal file
58
app/services/sites/stackoverflow.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class StackOverflowCrawler(Crawler):
    """Stack Overflow hot-questions crawler (Stack Exchange API)."""

    def fetch(self, date_str):
        """Fetch the hot questions list, cache it, and return it.

        Fixes over the original: an error response (e.g. throttle violation)
        carries no ``items`` key and raised KeyError — use ``.get``; the
        unused ``enumerate`` index is dropped.
        """
        current_time = datetime.datetime.now()

        url = "https://api.stackexchange.com/2.3/questions?order=desc&sort=hot&site=stackoverflow"

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "Chrome/122.0.0.0 Safari/537.36"
                "AppleWebKit/537.36 (KHTML, like Gecko) "
            ),
            "Referer": "https://stackoverflow.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        result = []

        for item in data.get("items", []):
            # NOTE(review): API titles are HTML-entity-escaped (e.g. &amp;);
            # stored as received, matching the original behavior.
            title = item.get("title", "")

            result.append({
                'title': title,
                'url': item.get("link", ""),
                'content': title,
                'source': 'stackoverflow',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            })

        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry/cache key."""
        return "stackoverflow"
|
||||
65
app/services/sites/tenxunwang.py
Normal file
65
app/services/sites/tenxunwang.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import json
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class TenXunWangCrawler(Crawler):
    """Tencent News (腾讯网) hot-ranking crawler."""

    def fetch(self, date_str):
        """Fetch the hot-ranking list, cache it, and return it.

        Fix over the original: ``data["idlist"][0]`` raised KeyError/IndexError
        when the API returned an error body or an empty list; guarded access
        returns [] instead of crashing.
        """
        current_time = datetime.datetime.now()

        url = "https://i.news.qq.com/gw/event/pc_hot_ranking_list?ids_hash=&offset=0&page_size=51&appver=15.5_qqnews_7.1.60&rank_id=hot"

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "Chrome/122.0.0.0 Safari/537.36"
                "AppleWebKit/537.36 (KHTML, like Gecko) "
            ),
            "Referer": "https://news.qq.com/",
        }

        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        data = resp.json()
        result = []

        # Guarded access: error bodies may lack "idlist" or return it empty.
        rank_groups = data.get("idlist") or []
        news_items = rank_groups[0].get("newslist", []) if rank_groups else []

        for i, item in enumerate(news_items):
            if i == 0:
                # The first entry is the board header ("hot topics users care
                # about, refreshed every 10 minutes"), not a news item — skip.
                continue

            result.append({
                'title': item.get("title", ""),
                'url': item.get("url", ""),
                'content': item.get("abstract", ""),
                'source': 'tenxunwang',
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            })

        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Registry/cache key."""
        return "tenxunwang"
|
||||
65
app/services/sites/tieba.py
Normal file
65
app/services/sites/tieba.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class TieBaCrawler(Crawler):
    """Baidu Tieba (百度贴吧) hot-topic crawler."""

    def fetch(self, date_str):
        """Fetch Tieba's hot-topic board, cache the entries under date_str, return them."""
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        board_url = "http://tieba.baidu.com/hottopic/browse/topicList"

        resp = requests.get(url=board_url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            topics = resp.json().get('data', {}).get('bang_topic', {}).get('topic_list', [])

            collected = []
            for topic in topics:
                topic_url = topic.get('topic_url', '')
                # Relative links point back to tieba itself.
                if topic_url and not topic_url.startswith('http'):
                    topic_url = f"http://tieba.baidu.com{topic_url}"

                collected.append({
                    'title': topic.get('topic_name', ''),
                    'url': topic_url,
                    'content': topic.get('topic_desc', ''),
                    'source': 'tieba',
                    'publish_time': timestamp,
                })

            cache.hset(date_str, self.crawler_name(), json.dumps(collected, ensure_ascii=False))
            return collected

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Registry/cache key."""
        return "tieba"
|
||||
83
app/services/sites/tskr.py
Normal file
83
app/services/sites/tskr.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class TsKrCrawler(Crawler):
    """36kr (36氪) hot-rank crawler."""

    def fetch(self, date_str):
        """Fetch the 36kr hot rank, cache the entries, and return them.

        Fixes over the original: the endpoint URL was an f-string with no
        placeholders (lint F541) — now a plain string; the redundant
        duplicate cache list is removed.
        """
        current_time = datetime.datetime.now()
        url = "https://gateway.36kr.com/api/mis/nav/home/nav/rank/hot"
        headers = {
            "Content-Type": "application/json; charset=utf-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
        }

        # The gateway expects a POST with a wap partner payload.
        body = {
            "partner_id": "wap",
            "param": {
                "siteId": 1,
                "platformId": 2,
            },
            "timestamp": int(time.time() * 1000),
        }

        try:
            resp = requests.post(
                url=url,
                headers=headers,
                json=body,
                verify=False,
                timeout=self.timeout
            )

            if resp.status_code != 200:
                print(f"request failed, status: {resp.status_code}")
                return []

            json_data = resp.json()
            data_list = json_data.get("data", {}).get("hotRankList", [])

            result = []

            for item in data_list:
                template_material = item.get("templateMaterial", {})
                item_id = item.get("itemId", "")

                title = template_material.get("widgetTitle", "")
                article_url = f"https://www.36kr.com/p/{item_id}"

                result.append({
                    'title': title,
                    'url': article_url,
                    'content': title,
                    'source': '36kr',
                    'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
                })

            cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error fetching 36kr data: {e}")
            return []

    def crawler_name(self):
        """Registry/cache key."""
        return "36kr"
|
||||
71
app/services/sites/vtex.py
Normal file
71
app/services/sites/vtex.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class VtexCrawler(Crawler):
    """V2EX hot-topics crawler."""

    def fetch(self, date_str):
        """Fetch the V2EX hot tab, cache the parsed topics under date_str, return them."""
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        hot_url = "https://www.v2ex.com/?tab=hot"

        resp = requests.get(url=hot_url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        page = BeautifulSoup(resp.text, "html.parser")

        collected = []

        # Each hot topic is a <div class="cell item"> block.
        for cell in page.find_all('div', class_='cell item'):
            title_span = cell.find('span', class_='item_title')
            if not title_span:
                continue

            anchor = title_span.find('a')
            if not anchor:
                continue

            topic_title = anchor.text.strip()
            topic_url = "https://www.v2ex.com" + anchor.get('href')

            info_span = cell.find('span', class_='topic_info')
            topic_info = info_span.text.strip() if info_span else ""

            collected.append({
                'title': topic_title,
                'url': topic_url,
                'content': topic_info,
                'source': 'v2ex',
                'publish_time': timestamp,
            })

        cache.hset(date_str, self.crawler_name(), json.dumps(collected, ensure_ascii=False))
        return collected

    def crawler_name(self):
        """Registry/cache key."""
        return "v2ex"
|
||||
68
app/services/sites/weibo.py
Normal file
68
app/services/sites/weibo.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from ...core import cache
|
||||
from .crawler import Crawler
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class WeiboCrawler(Crawler):
    """Crawler for the Weibo realtime hot-search list.

    Uses the public ajax endpoint (no login required) and caches the
    result list under ``date_str`` -> "weibo".
    """

    def fetch(self, date_str):
        """Fetch the current hot-search entries.

        Args:
            date_str: cache hash key (normally today's date string).

        Returns:
            list[dict]: items with title/url/content/source/publish_time;
            an empty list on any request or parse failure.
        """
        # Local import keeps the module's dependency surface unchanged.
        from urllib.parse import quote

        # One timestamp applied uniformly to every item in this run.
        current_time = datetime.datetime.now()

        # The endpoint rejects requests without browser-like headers.
        header = self.header.copy()
        header.update({
            "accept": "application/json, text/javascript, */*; q=0.01",
            "host": "weibo.com",
            "Referer": "https://weibo.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        })

        url = "https://weibo.com/ajax/side/hotSearch"

        resp = requests.get(url=url, headers=header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            json_data = resp.json()
            data = json_data.get('data', {}).get('realtime', [])

            result = []
            cache_list = []

            for item in data:
                title = item.get('word', '')
                # Percent-encode the term: hot-search words routinely contain
                # Chinese characters and punctuation that break a raw query
                # string. %23 is the literal '#' wrapping a Weibo topic.
                search_url = f"https://s.weibo.com/weibo?q=%23{quote(title)}%23"

                news = {
                    'title': title,
                    'url': search_url,
                    'content': title,
                    'source': 'weibo',
                    'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
                }

                result.append(news)
                cache_list.append(news)

            cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Return the source identifier / cache field name for this crawler."""
        return "weibo"
|
||||
228
app/services/sites/weixin.py
Normal file
228
app/services/sites/weixin.py
Normal file
@@ -0,0 +1,228 @@
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib3
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
from .crawler import Crawler
|
||||
from ..browser_manager import BrowserManager
|
||||
|
||||
# 禁用SSL警告
|
||||
urllib3.disable_warnings()
|
||||
|
||||
class WeiXinCrawler(Crawler):
    """WeChat trending-content crawler.

    Scrapes the WeChat "Top Stories" (看一看) page through the shared
    Selenium browser, falling back to the WeRead (微信读书) popular-book
    list when the primary source yields nothing.
    """

    def fetch(self, date_str):
        """Fetch trending WeChat content.

        Args:
            date_str: cache hash key (normally today's date string).

        Returns:
            list[dict]: up to 20 items from whichever source succeeded;
            an empty list when both sources fail.
        """
        browser_manager = BrowserManager()

        try:
            # Primary source: the Top Stories hot list.
            result = self._fetch_from_weixin_kankan(browser_manager)

            if result:
                # Cache the payload under today's hash for later aggregation.
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

            # Fallback source: WeRead popular books.
            result = self._fetch_from_weixin_dushu(browser_manager)
            if result:
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
                return result

        except Exception as e:
            # Log instead of swallowing silently so failures are diagnosable.
            print(f"Error fetching weixin data: {e}")
            return []

        # Both sources came back empty.
        return []

    def _fetch_from_weixin_kankan(self, browser_manager):
        """Scrape the WeChat Top Stories hot list; return [] on failure."""
        url = "https://k.weixin.qq.com/"

        try:
            # Load the page through the shared browser instance.
            page_source, driver = browser_manager.get_page_content(url, wait_time=10)

            # Wait for the hot-list container to render.
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".hot"))
                )
            except Exception:
                # On timeout, still attempt to read whatever rendered.
                pass

            # Click the "热点" (hot) tab to switch to trending content.
            try:
                hot_tab = driver.find_element(By.XPATH, "//div[contains(text(), '热点') and @class='tab']")
                hot_tab.click()
                time.sleep(3)  # allow the tab content to load
            except Exception:
                # Tab not found: scrape whatever view is currently shown.
                pass

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # The article selector varies between page versions; try the
            # known candidates in order until one matches.
            articles = driver.find_elements(By.CSS_SELECTOR, ".article-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".doc-item")
            if not articles:
                articles = driver.find_elements(By.CSS_SELECTOR, ".item")

            for article in articles:
                try:
                    # Article title (required; skip the card otherwise).
                    title_elem = article.find_element(By.CSS_SELECTOR, "h3, .title")
                    title = title_elem.text.strip()

                    # Prefer a direct anchor; otherwise rebuild a link from
                    # the element id, falling back to the site root.
                    link = None
                    try:
                        link_elem = article.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        try:
                            article_id = article.get_attribute("data-id") or article.get_attribute("id")
                            link = f"https://k.weixin.qq.com/article?id={article_id}"
                        except Exception:
                            link = "https://k.weixin.qq.com/"

                    # Publishing account (optional).
                    source = ""
                    try:
                        source_elem = article.find_element(By.CSS_SELECTOR, ".account, .source")
                        source = source_elem.text.strip()
                    except Exception:
                        pass

                    # Abstract / summary text (optional).
                    summary = ""
                    try:
                        summary_elem = article.find_element(By.CSS_SELECTOR, ".desc, .summary, p")
                        summary = summary_elem.text.strip()
                    except Exception:
                        pass

                    news = {
                        'title': title,
                        'url': link,
                        'content': f"来源: {source} | 摘要: {summary[:50] if summary else '无摘要'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Keep at most the top 20 entries.
                    if len(result) >= 20:
                        break

                except Exception:
                    # A malformed card should not abort the whole scrape.
                    continue

            return result

        except Exception as e:
            print(f"Error scraping weixin kankan: {e}")
            return []

    def _fetch_from_weixin_dushu(self, browser_manager):
        """Scrape popular books from WeRead as a fallback; [] on failure."""
        url = "https://weread.qq.com/web/category/all"

        try:
            # Load the page through the shared browser instance.
            page_source, driver = browser_manager.get_page_content(url, wait_time=8)

            result = []
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Try switching to the "排行榜" (ranking) tab for hotter titles.
            try:
                rank_tab = driver.find_element(By.XPATH, "//a[contains(text(), '排行榜')]")
                rank_tab.click()
                time.sleep(3)  # allow the tab content to load
            except Exception:
                # Tab not found: scrape whatever view is currently shown.
                pass

            # Book cards; selector differs between page versions.
            books = driver.find_elements(By.CSS_SELECTOR, ".shelf-item, .book-item")

            for book in books:
                try:
                    # Book title (required; skip the card otherwise).
                    title_elem = book.find_element(By.CSS_SELECTOR, ".title, h3")
                    title = title_elem.text.strip()

                    # Prefer a direct anchor; otherwise rebuild a reader link
                    # from the book id, defaulting to the category page.
                    link = "https://weread.qq.com/web/category/all"
                    try:
                        link_elem = book.find_element(By.TAG_NAME, "a")
                        link = link_elem.get_attribute("href")
                    except Exception:
                        book_id = book.get_attribute("data-bid") or book.get_attribute("id")
                        if book_id:
                            link = f"https://weread.qq.com/web/reader/{book_id}"

                    # Author (optional).
                    author = ""
                    try:
                        author_elem = book.find_element(By.CSS_SELECTOR, ".author, .writer")
                        author = author_elem.text.strip()
                    except Exception:
                        pass

                    # Short introduction (optional).
                    intro = ""
                    try:
                        intro_elem = book.find_element(By.CSS_SELECTOR, ".intro, .desc")
                        intro = intro_elem.text.strip()
                    except Exception:
                        pass

                    news = {
                        'title': f"热门书籍: {title}",
                        'url': link,
                        'content': f"作者: {author} | 简介: {intro[:50] if intro else '无简介'}",
                        'source': 'weixin',
                        'publish_time': current_time
                    }

                    result.append(news)

                    # Keep at most the top 20 entries.
                    if len(result) >= 20:
                        break

                except Exception:
                    # A malformed card should not abort the whole scrape.
                    continue

            return result

        except Exception as e:
            print(f"Error scraping weread fallback: {e}")
            return []

    def crawler_name(self):
        """Return the source identifier / cache field name for this crawler."""
        return "weixin"
|
||||
155
app/services/sites/xueqiu.py
Normal file
155
app/services/sites/xueqiu.py
Normal file
@@ -0,0 +1,155 @@
|
||||
import json
|
||||
import datetime
|
||||
import requests
|
||||
import urllib3
|
||||
import re
|
||||
from requests.sessions import Session
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class XueqiuCrawler(Crawler):
    """Crawler for Xueqiu (雪球) hot events.

    The hot-event JSON endpoint requires cookies issued while browsing the
    site, so a persistent requests ``Session`` is warmed up against the home
    page and the hot-event page before the API is called.
    """

    def __init__(self):
        super().__init__()
        # Dedicated session so the auth cookies persist across fetches.
        self.session = Session()
        self._init_session()

    def _init_session(self):
        """Prime the session with the cookies the hot-event API requires."""
        try:
            # Step 1: hit the home page to obtain the baseline cookies.
            main_url = "https://xueqiu.com"
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache',
                'Sec-Fetch-Dest': 'document',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'none',
                'Sec-Fetch-User': '?1',
                'Upgrade-Insecure-Requests': '1'
            }

            resp = self.session.get(main_url, headers=headers, verify=False, timeout=self.timeout)
            if resp.status_code == 200:
                html_content = resp.text

                # If the page embeds an API token, mark the session as an
                # XHR client so subsequent JSON calls are accepted.
                if re.search(r'window\.SNB\s*=\s*\{[^}]*token["\']?\s*:\s*["\']([^"\']+)["\']', html_content):
                    self.session.headers.update({'X-Requested-With': 'XMLHttpRequest'})

                # Step 2: visit the hot-event page itself to complete the
                # cookie handshake used by the list endpoint.
                hot_page_url = "https://xueqiu.com/hot_event"
                hot_headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                    'Referer': 'https://xueqiu.com/',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'same-origin',
                    'Sec-Fetch-User': '?1',
                    'Upgrade-Insecure-Requests': '1'
                }

                hot_resp = self.session.get(hot_page_url, headers=hot_headers, verify=False, timeout=self.timeout)
                if hot_resp.status_code == 200:
                    print("雪球热门页面访问成功,已获取完整认证信息")
                else:
                    print(f"雪球热门页面访问失败: {hot_resp.status_code}")

            else:
                print(f"雪球主页访问失败: {resp.status_code}")

        except Exception as e:
            print(f"初始化雪球会话失败: {e}")

    def fetch(self, date_str) -> list:
        """Fetch the current hot-event list (top 10).

        Args:
            date_str: cache hash key (normally today's date string).

        Returns:
            list[dict]: items with title/url/content/source/publish_time
            plus score/rank; an empty list on failure.
        """
        current_time = datetime.datetime.now()

        url = "https://xueqiu.com/hot_event/list.json?count=10"
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Referer': 'https://xueqiu.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }

        try:
            resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)

            if resp.status_code != 200:
                # Cookies likely expired: re-prime the session and retry once.
                print(f"雪球请求失败, status: {resp.status_code}")
                self._init_session()
                resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)
                if resp.status_code != 200:
                    print(f"雪球重试后仍失败, status: {resp.status_code}")
                    return []

            json_data = resp.json()
            if 'list' not in json_data:
                print("雪球响应格式异常")
                return []

            result = []
            cache_list = []

            for idx, item in enumerate(json_data['list'][:10]):  # top 10 only
                try:
                    # Topics arrive as "#topic#"; strip the hash wrappers.
                    tag = item.get('tag', '').strip()
                    if tag.startswith('#') and tag.endswith('#'):
                        title = tag[1:-1]
                    else:
                        title = tag

                    if not title:
                        continue

                    content = item.get('content', '').strip()
                    if len(content) > 200:
                        content = content[:200] + '...'

                    status_count = item.get('status_count', 0)

                    news = {
                        'title': title,
                        'url': "https://xueqiu.com/",
                        'content': content,
                        'source': 'xueqiu',
                        'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
                        # Fall back to a rank-derived score when the item
                        # carries no discussion count.
                        'score': status_count if status_count > 0 else 1000 - idx,
                        'rank': idx + 1
                    }
                    result.append(news)
                    cache_list.append(news)

                except Exception as e:
                    print(f"解析雪球新闻项失败: {e}")
                    continue

            cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"获取雪球数据失败: {e}")
            return []

    def crawler_name(self):
        """Return the source identifier / cache field name for this crawler."""
        return "xueqiu"
|
||||
64
app/services/sites/zhihu.py
Normal file
64
app/services/sites/zhihu.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import json
|
||||
import datetime # 添加datetime导入
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
# 移除 SQLAlchemy 导入
|
||||
# from sqlalchemy.sql.functions import now
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
from ...db.mysql import News
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class ZhiHuCrawler(Crawler):
    """Crawler for the Zhihu guest explore feed (no login required)."""

    def fetch(self, date_str):
        """Fetch trending questions from the explore feed.

        Args:
            date_str: cache hash key (normally today's date string).

        Returns:
            list[dict]: items with title/url/content/source/publish_time;
            an empty list on any request or parse failure.
        """
        # One timestamp applied uniformly to every item in this run.
        current_time = datetime.datetime.now()

        url = "https://www.zhihu.com/api/v3/explore/guest/feeds?limit=30&ws_qiangzhisafe=0"

        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        try:
            json_data = resp.json()
            data = json_data.get('data', [])

            result = []
            cache_list = []

            for item in data:
                target = item.get('target', {})
                question = target.get('question', {})
                title = question.get('title', '')
                question_id = question.get('id')

                # Feed entries that are not answers carry no question
                # payload; skip them instead of emitting items with empty
                # titles and "question/None" links.
                if not title or not question_id:
                    continue

                question_url = f"https://www.zhihu.com/question/{question_id}"
                excerpt = target.get('excerpt', '')

                news = {
                    'title': title,
                    'url': question_url,
                    'content': excerpt,
                    'source': 'zhihu',
                    'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
                }

                result.append(news)
                cache_list.append(news)

            cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
            return result

        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return []

    def crawler_name(self):
        """Return the source identifier / cache field name for this crawler."""
        return "zhihu"
|
||||
Reference in New Issue
Block a user