import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class BaiduNewsCrawler(Crawler):
    """Crawler for the Baidu realtime hot-search board (top.baidu.com)."""

    def fetch(self, date_str) -> list:
        """Fetch the realtime hot list via Baidu's JSON board API.

        Args:
            date_str: Date string used as the cache hash key under which
                this crawler's results are stored.

        Returns:
            A list of news dicts with keys ``title``, ``url``, ``content``,
            ``source`` and ``publish_time``; an empty list on request or
            parse failure.
        """
        # All items from one crawl share a single timestamp.
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"

        # BUGFIX: the header dict must be sent as HTTP headers, not as
        # query-string parameters (the original used `params=`, so the
        # User-Agent was never actually sent).
        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        json_data = resp.json()
        # Defensive: the API occasionally returns a payload without "data";
        # the original chained lookup would raise TypeError/KeyError here.
        data = json_data.get("data")
        if not data:
            return []
        try:
            contents = data["cards"][0]["content"][0]["content"]
        except (KeyError, IndexError, TypeError):
            print("unexpected response structure from baidu board api")
            return []

        result = []
        for content in contents:
            title = content.get("word")
            item_url = content.get("url")
            desc = content.get("desc")
            # Rewrite the mobile host to the desktop one.
            # NOTE(review): naive substring replace — this rewrites the
            # first "m." anywhere in the URL, not only a host prefix.
            item_url = item_url.replace("m.", "www.")
            result.append(self._make_news(title, item_url, desc, publish_time))

        # Cache the whole list as one JSON string under this crawler's field.
        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Return the cache field name identifying this crawler."""
        return "baidu"

    @staticmethod
    def _make_news(title, url, content, publish_time):
        """Build the news record shared by fetch() and fetch_v0()."""
        return {
            'title': title,
            'url': url,
            'content': content,
            'source': 'baidu',
            'publish_time': publish_time,
        }

    @staticmethod
    def fetch_v0():
        """Legacy HTML-scraping fetcher, kept for reference.

        Scrapes https://top.baidu.com/board?tab=realtime with BeautifulSoup
        and returns a list of news dicts; ``content`` is always empty here.
        """
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        url = "https://top.baidu.com/board?tab=realtime"
        proxies = {
            # "http": "http://127.0.0.1:7890",
            # "https": "http://127.0.0.1:7890",
        }
        header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            # BUGFIX: header values must be strings (was the int 1, which
            # requests rejects once the dict is sent as real headers).
            "upgrade-insecure-requests": "1",
            # NOTE(review): Host "www.baidu.com" does not match the target
            # host top.baidu.com — confirm whether this is intentional.
            "host": "www.baidu.com",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36",
        }
        # BUGFIX: send the header dict as headers, not as query parameters.
        html = requests.get(url=url, headers=header, verify=False, proxies=proxies)
        html.encoding = "utf-8"

        soup = BeautifulSoup(html.text, "html.parser")
        main_content = soup.find_all("main")[0]
        news_main_content = main_content.find("div", style='margin-bottom:20px')
        div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')

        result = []
        for div_element in div_elements:
            news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
            news_link = div_element.find('a', class_='title_dIF3B')['href']
            result.append(BaiduNewsCrawler._make_news(news_title, news_link, "", publish_time))

        return result