Files
hot-news-api/app/services/sites/baidu.py
2026-03-26 15:04:59 +08:00

100 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class BaiduNewsCrawler(Crawler):
    """Crawler for Baidu's realtime hot-search board (https://top.baidu.com)."""

    def fetch(self, date_str) -> list:
        """Fetch the realtime hot list, cache it, and return it.

        Args:
            date_str: Cache hash key (date string) under which the serialized
                result is stored via ``cache.hset``.

        Returns:
            A list of news dicts with keys ``title``/``url``/``content``/
            ``source``/``publish_time``; empty list on any request or
            payload error.
        """
        # One timestamp for the whole batch so every item shares publish_time.
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"
        # BUGFIX: self.header was passed as ``params`` (i.e. appended to the
        # query string); it is a header dict and must be sent as ``headers``.
        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []
        try:
            # Navigate the board payload; any shape change is treated as
            # an empty result instead of an unhandled exception.
            contents = resp.json()["data"]["cards"][0]["content"][0]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as exc:
            print(f"unexpected response payload: {exc}")
            return []
        result = []
        for content in contents:
            item_url = content.get("url") or ""
            news = {
                'title': content.get("word"),
                # Normalize mobile links to the desktop domain.
                'url': item_url.replace("m.", "www."),
                'content': content.get("desc"),
                'source': 'baidu',
                'publish_time': publish_time,
            }
            result.append(news)
        # The cached value is exactly the returned list, serialized once
        # (the original kept two identical lists; one is enough).
        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Identifier used as the cache field name for this crawler."""
        return "baidu"
@staticmethod
def fetch_v0():
# 获取当前时间
current_time = datetime.datetime.now()
url = "https://top.baidu.com/board?tab=realtime"
proxies = {
# "http": "http://127.0.0.1:7890",
# "https": "http://127.0.0.0:7890"
}
header = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
"*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"upgrade-insecure-requests": 1,
"host": "www.baidu.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/86.0.4240.183 Safari/537.36"
}
html = requests.get(url=url, params=header, verify=False, proxies=proxies)
html.encoding = "utf-8"
html_text = html.text
soup = BeautifulSoup(html_text, "html.parser")
main_content = soup.find_all("main")[0]
news_main_content = main_content.find("div", style='margin-bottom:20px')
div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')
result = []
for div_element in div_elements:
hot_index = div_element.find(class_='hot-index_1Bl1a').text.strip()
news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
news_link = div_element.find('a', class_='title_dIF3B')['href']
news = {
'title': news_title,
'url': news_link,
'content': "",
'source': 'baidu',
'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S') # 使用格式化的时间字符串
}
result.append(news)
return result