This commit is contained in:
2026-03-26 15:04:59 +08:00
commit e0af97ac7f
65 changed files with 7366 additions and 0 deletions

View File

@@ -0,0 +1,99 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class BaiduNewsCrawler(Crawler):
    """Crawler for Baidu's realtime hot-search board (top.baidu.com).

    Two strategies are provided:

    * :meth:`fetch` — the current implementation, hitting the JSON board
      API and caching the result.
    * :meth:`fetch_v0` — a legacy HTML-scraping variant kept for reference;
      it returns the same item shape but does not write to the cache.
    """

    def fetch(self, date_str) -> list:
        """Fetch the realtime hot list from Baidu's board API.

        Args:
            date_str: Date key (e.g. ``"2024-01-01"``) under which the
                serialized result list is cached via ``cache.hset``.

        Returns:
            A list of dicts with keys ``title``, ``url``, ``content``,
            ``source`` and ``publish_time``; an empty list on HTTP
            failure or an unexpected response payload.
        """
        # One timestamp for the whole batch, formatted once.
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"
        # BUG FIX: self.header must be sent as HTTP headers; passing it as
        # ``params`` leaked the header fields into the query string and the
        # request went out without a User-Agent at all.
        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []
        try:
            contents = resp.json()["data"]["cards"][0]["content"][0]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # Malformed JSON or unexpected structure: treat like a failed
            # request instead of crashing with an opaque traceback.
            print(f"unexpected response payload: {e}")
            return []
        result = []
        for content in contents:
            item_url = content.get("url")
            if item_url:
                # Rewrite the mobile host ("m.") to the desktop one.
                item_url = item_url.replace("m.", "www.")
            result.append({
                'title': content.get("word"),
                'url': item_url,
                'content': content.get("desc"),
                'source': 'baidu',
                'publish_time': publish_time
            })
        # Cache the whole list as a single JSON string, keyed by crawler
        # name inside the per-date hash.
        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Identifier used as the cache hash-field for this crawler."""
        return "baidu"

    @staticmethod
    def fetch_v0():
        """Legacy HTML-scraping variant of :meth:`fetch`.

        Parses the board page with BeautifulSoup instead of calling the
        JSON API. Returns the same list-of-dicts shape as :meth:`fetch`
        (``content`` is always empty) and does NOT write to the cache.
        """
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        url = "https://top.baidu.com/board?tab=realtime"
        proxies = {
            # "http": "http://127.0.0.1:7890",
            # "https": "http://127.0.0.1:7890"
        }
        # BUG FIX: header values must be strings when sent as real HTTP
        # headers, and a hard-coded "host: www.baidu.com" is wrong for
        # top.baidu.com — requests sets the Host header itself.
        header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36"
        }
        # BUG FIX: ``headers=`` (not ``params=``); also add a timeout so a
        # stalled connection cannot hang the crawler forever.
        resp = requests.get(url=url, headers=header, verify=False, proxies=proxies, timeout=10)
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "html.parser")
        main_content = soup.find_all("main")[0]
        news_main_content = main_content.find("div", style='margin-bottom:20px')
        # Class names are build artifacts of Baidu's frontend bundle and
        # break whenever the site is redeployed — hence the v0 status.
        div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')
        result = []
        for div_element in div_elements:
            news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
            news_link = div_element.find('a', class_='title_dIF3B')['href']
            result.append({
                'title': news_title,
                'url': news_link,
                'content': "",
                'source': 'baidu',
                'publish_time': publish_time
            })
        return result