import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class BaiduNewsCrawler(Crawler):
    """Crawler for the Baidu realtime hot-search board (top.baidu.com)."""

    def fetch(self, date_str) -> list:
        """Fetch the realtime hot list via Baidu's JSON board API.

        Args:
            date_str: Date string used as the cache hash key under which
                this crawler's results are stored.

        Returns:
            A list of news dicts with keys ``title``, ``url``, ``content``,
            ``source`` and ``publish_time``; an empty list on request or
            parse failure.
        """
        # All items from one crawl share a single timestamp.
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"

        # BUGFIX: the header dict must be sent as HTTP headers, not as
        # query-string parameters (the original used `params=`, so the
        # User-Agent was never actually sent).
        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        json_data = resp.json()
        # Defensive: the API occasionally returns a payload without "data";
        # the original chained lookup would raise TypeError/KeyError here.
        data = json_data.get("data")
        if not data:
            return []
        try:
            contents = data["cards"][0]["content"][0]["content"]
        except (KeyError, IndexError, TypeError):
            print("unexpected response structure from baidu board api")
            return []

        result = []
        for content in contents:
            title = content.get("word")
            item_url = content.get("url")
            desc = content.get("desc")
            # Rewrite the mobile host to the desktop one.
            # NOTE(review): naive substring replace — this rewrites the
            # first "m." anywhere in the URL, not only a host prefix.
            item_url = item_url.replace("m.", "www.")
            result.append(self._make_news(title, item_url, desc, publish_time))

        # Cache the whole list as one JSON string under this crawler's field.
        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Return the cache field name identifying this crawler."""
        return "baidu"

    @staticmethod
    def _make_news(title, url, content, publish_time):
        """Build the news record shared by fetch() and fetch_v0()."""
        return {
            'title': title,
            'url': url,
            'content': content,
            'source': 'baidu',
            'publish_time': publish_time,
        }

    @staticmethod
    def fetch_v0():
        """Legacy HTML-scraping fetcher, kept for reference.

        Scrapes https://top.baidu.com/board?tab=realtime with BeautifulSoup
        and returns a list of news dicts; ``content`` is always empty here.
        """
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        url = "https://top.baidu.com/board?tab=realtime"
        proxies = {
            # "http": "http://127.0.0.1:7890",
            # "https": "http://127.0.0.1:7890",
        }
        header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            # BUGFIX: header values must be strings (was the int 1, which
            # requests rejects once the dict is sent as real headers).
            "upgrade-insecure-requests": "1",
            # NOTE(review): Host "www.baidu.com" does not match the target
            # host top.baidu.com — confirm whether this is intentional.
            "host": "www.baidu.com",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36",
        }
        # BUGFIX: send the header dict as headers, not as query parameters.
        html = requests.get(url=url, headers=header, verify=False, proxies=proxies)
        html.encoding = "utf-8"

        soup = BeautifulSoup(html.text, "html.parser")
        main_content = soup.find_all("main")[0]
        news_main_content = main_content.find("div", style='margin-bottom:20px')
        div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')

        result = []
        for div_element in div_elements:
            news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
            news_link = div_element.find('a', class_='title_dIF3B')['href']
            result.append(BaiduNewsCrawler._make_news(news_title, news_link, "", publish_time))

        return result