init
This commit is contained in:
155
app/services/sites/xueqiu.py
Normal file
155
app/services/sites/xueqiu.py
Normal file
@@ -0,0 +1,155 @@
|
||||
import json
|
||||
import datetime
|
||||
import requests
|
||||
import urllib3
|
||||
import re
|
||||
from requests.sessions import Session
|
||||
|
||||
from .crawler import Crawler
|
||||
from ...core import cache
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class XueqiuCrawler(Crawler):
|
||||
"""雪球"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.session = Session()
|
||||
self._init_session()
|
||||
|
||||
def _init_session(self):
|
||||
try:
|
||||
# 第一步:访问主页获取基础cookies
|
||||
main_url = "https://xueqiu.com"
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||||
'Cache-Control': 'no-cache',
|
||||
'Pragma': 'no-cache',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Upgrade-Insecure-Requests': '1'
|
||||
}
|
||||
|
||||
resp = self.session.get(main_url, headers=headers, verify=False, timeout=self.timeout)
|
||||
if resp.status_code == 200:
|
||||
html_content = resp.text
|
||||
|
||||
# 尝试提取token
|
||||
token_match = re.search(r'window\.SNB\s*=\s*\{[^}]*token["\']?\s*:\s*["\']([^"\']+)["\']', html_content)
|
||||
if token_match:
|
||||
token = token_match.group(1)
|
||||
self.session.headers.update({'X-Requested-With': 'XMLHttpRequest'})
|
||||
|
||||
hot_page_url = "https://xueqiu.com/hot_event"
|
||||
hot_headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||||
'Referer': 'https://xueqiu.com/',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Upgrade-Insecure-Requests': '1'
|
||||
}
|
||||
|
||||
hot_resp = self.session.get(hot_page_url, headers=hot_headers, verify=False, timeout=self.timeout)
|
||||
if hot_resp.status_code == 200:
|
||||
print("雪球热门页面访问成功,已获取完整认证信息")
|
||||
else:
|
||||
print(f"雪球热门页面访问失败: {hot_resp.status_code}")
|
||||
|
||||
else:
|
||||
print(f"雪球主页访问失败: {resp.status_code}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"初始化雪球会话失败: {e}")
|
||||
|
||||
def fetch(self, date_str) -> list:
|
||||
current_time = datetime.datetime.now()
|
||||
|
||||
url = "https://xueqiu.com/hot_event/list.json?count=10"
|
||||
headers = {
|
||||
'Accept': 'application/json, text/plain, */*',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||||
'Cache-Control': 'no-cache',
|
||||
'Connection': 'keep-alive',
|
||||
'Pragma': 'no-cache',
|
||||
'Referer': 'https://xueqiu.com/',
|
||||
'Sec-Fetch-Dest': 'empty',
|
||||
'Sec-Fetch-Mode': 'cors',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
|
||||
'X-Requested-With': 'XMLHttpRequest'
|
||||
}
|
||||
|
||||
try:
|
||||
resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)
|
||||
|
||||
if resp.status_code != 200:
|
||||
print(f"雪球请求失败, status: {resp.status_code}")
|
||||
self._init_session()
|
||||
resp = self.session.get(url=url, headers=headers, verify=False, timeout=self.timeout)
|
||||
if resp.status_code != 200:
|
||||
print(f"雪球重试后仍失败, status: {resp.status_code}")
|
||||
return []
|
||||
|
||||
json_data = resp.json()
|
||||
if 'list' not in json_data:
|
||||
print("雪球响应格式异常")
|
||||
return []
|
||||
|
||||
result = []
|
||||
cache_list = []
|
||||
|
||||
for idx, item in enumerate(json_data['list'][:10]): # 取前10条
|
||||
try:
|
||||
tag = item.get('tag', '').strip()
|
||||
if tag.startswith('#') and tag.endswith('#'):
|
||||
title = tag[1:-1]
|
||||
else:
|
||||
title = tag
|
||||
|
||||
if not title:
|
||||
continue
|
||||
|
||||
item_id = item.get('id')
|
||||
url_link = f"https://xueqiu.com/"
|
||||
|
||||
content = item.get('content', '').strip()
|
||||
if len(content) > 200:
|
||||
content = content[:200] + '...'
|
||||
|
||||
status_count = item.get('status_count', 0)
|
||||
hot_value = item.get('hot', 0)
|
||||
|
||||
news = {
|
||||
'title': title,
|
||||
'url': url_link,
|
||||
'content': content,
|
||||
'source': 'xueqiu',
|
||||
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'score': status_count if status_count > 0 else 1000 - idx,
|
||||
'rank': idx + 1
|
||||
}
|
||||
result.append(news)
|
||||
cache_list.append(news)
|
||||
|
||||
except Exception as e:
|
||||
print(f"解析雪球新闻项失败: {e}")
|
||||
continue
|
||||
|
||||
cache.hset(date_str, self.crawler_name(), json.dumps(cache_list, ensure_ascii=False))
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取雪球数据失败: {e}")
|
||||
return []
|
||||
|
||||
def crawler_name(self):
|
||||
return "xueqiu"
|
||||
Reference in New Issue
Block a user