import datetime
import json

import requests
import urllib3
from bs4 import BeautifulSoup

from .crawler import Crawler
from ...core import cache
from ...db.mysql import News

# We deliberately request with verify=False below; silence the TLS warnings.
urllib3.disable_warnings()


class BaiduNewsCrawler(Crawler):
    """Crawler for Baidu's realtime hot-search board (top.baidu.com).

    Relies on attributes provided by the ``Crawler`` base class:
    ``self.header`` (request headers) and ``self.timeout`` (seconds).
    """

    def fetch(self, date_str) -> list:
        """Fetch the realtime hot list via Baidu's JSON board API.

        Args:
            date_str: Cache key (a date string); the serialized result list
                is stored under it in the shared cache hash, keyed by
                ``self.crawler_name()``.

        Returns:
            A list of news dicts with keys ``title``/``url``/``content``/
            ``source``/``publish_time``; an empty list on request failure
            or an unexpected payload shape.
        """
        current_time = datetime.datetime.now()
        url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"
        # BUG FIX: the header dict must be sent as HTTP headers — the
        # original passed it via `params=`, which appended it to the query
        # string and sent no headers at all.
        resp = requests.get(url=url, headers=self.header, verify=False,
                            timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        json_data = resp.json()
        try:
            contents = json_data["data"]["cards"][0]["content"][0]["content"]
        except (KeyError, IndexError, TypeError):
            # Payload shape changed upstream (or the request was rejected
            # with a 200 body) — fail soft, matching the status-check path.
            print("unexpected response structure from baidu api")
            return []

        result = []
        for content in contents:
            # Guard against a missing "url" key (original crashed on None).
            item_url = content.get("url") or ""
            news = {
                'title': content.get("word"),
                # Prefer the desktop site over the mobile one. Replace only
                # the mobile hostname — the original replaced every "m."
                # substring, which could corrupt query strings.
                'url': item_url.replace("m.baidu.com", "www.baidu.com", 1),
                'content': content.get("desc"),
                'source': 'baidu',
                # Crawl timestamp, formatted; the API gives no publish time.
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            }
            result.append(news)

        # Serialize the whole list once; `result` already holds every item,
        # so the original's parallel cache_list was redundant.
        cache.hset(date_str, self.crawler_name(),
                   json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Return the source identifier used for cache keys and news rows."""
        return "baidu"

    @staticmethod
    def fetch_v0():
        """Legacy HTML-scraping fallback for the realtime board.

        Scrapes the rendered board page instead of the JSON API. The CSS
        class names are Baidu build artifacts and may break without notice.

        Returns:
            A list of news dicts in the same shape as :meth:`fetch`
            (``content`` is always empty here).
        """
        current_time = datetime.datetime.now()
        url = "https://top.baidu.com/board?tab=realtime"
        proxies = {
            # "http": "http://127.0.0.1:7890",
            # "https": "http://127.0.0.1:7890"
        }
        header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            # BUG FIX: header values must be strings — requests rejects ints.
            "upgrade-insecure-requests": "1",
            # BUG FIX: Host must match the site actually requested; the
            # original sent "www.baidu.com" while hitting top.baidu.com.
            "host": "top.baidu.com",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36"
        }
        # BUG FIX: headers were passed as `params=`; also add a timeout so a
        # stalled connection cannot hang the crawler forever.
        html = requests.get(url=url, headers=header, verify=False,
                            proxies=proxies, timeout=10)
        html.encoding = "utf-8"
        soup = BeautifulSoup(html.text, "html.parser")

        main_content = soup.find_all("main")[0]
        news_main_content = main_content.find("div", style='margin-bottom:20px')
        div_elements = news_main_content.find_all(
            'div', class_='category-wrap_iQLoo horizontal_1eKyQ')

        result = []
        for div_element in div_elements:
            news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
            news_link = div_element.find('a', class_='title_dIF3B')['href']
            result.append({
                'title': news_title,
                'url': news_link,
                'content': "",
                'source': 'baidu',
                # Crawl timestamp, formatted; the page shows no publish time.
                'publish_time': current_time.strftime('%Y-%m-%d %H:%M:%S')
            })
        return result