This commit is contained in:
2026-03-26 15:04:59 +08:00
commit e0af97ac7f
65 changed files with 7366 additions and 0 deletions

View File

@@ -0,0 +1,99 @@
import json
import datetime
import requests
import urllib3
from bs4 import BeautifulSoup
from .crawler import Crawler
from ...core import cache
from ...db.mysql import News
urllib3.disable_warnings()
class BaiduNewsCrawler(Crawler):
    """Crawler for Baidu's realtime hot-search board (top.baidu.com).

    Two strategies are provided:

    * :meth:`fetch` — the current implementation, hitting the JSON board
      API and caching the result.
    * :meth:`fetch_v0` — a legacy HTML-scraping variant kept for reference;
      it returns the same item shape but does not write to the cache.
    """

    def fetch(self, date_str) -> list:
        """Fetch the realtime hot list from Baidu's board API.

        Args:
            date_str: Date key (e.g. ``"2024-01-01"``) under which the
                serialized result list is cached via ``cache.hset``.

        Returns:
            A list of dicts with keys ``title``, ``url``, ``content``,
            ``source`` and ``publish_time``; an empty list on HTTP
            failure or an unexpected response payload.
        """
        # One timestamp for the whole batch, formatted once.
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        url = "https://top.baidu.com/api/board?platform=wise&tab=realtime"
        # BUG FIX: self.header must be sent as HTTP headers; passing it as
        # ``params`` leaked the header fields into the query string and the
        # request went out without a User-Agent at all.
        resp = requests.get(url=url, headers=self.header, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []
        try:
            contents = resp.json()["data"]["cards"][0]["content"][0]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # Malformed JSON or unexpected structure: treat like a failed
            # request instead of crashing with an opaque traceback.
            print(f"unexpected response payload: {e}")
            return []
        result = []
        for content in contents:
            item_url = content.get("url")
            if item_url:
                # Rewrite the mobile host ("m.") to the desktop one.
                item_url = item_url.replace("m.", "www.")
            result.append({
                'title': content.get("word"),
                'url': item_url,
                'content': content.get("desc"),
                'source': 'baidu',
                'publish_time': publish_time
            })
        # Cache the whole list as a single JSON string, keyed by crawler
        # name inside the per-date hash.
        cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    def crawler_name(self):
        """Identifier used as the cache hash-field for this crawler."""
        return "baidu"

    @staticmethod
    def fetch_v0():
        """Legacy HTML-scraping variant of :meth:`fetch`.

        Parses the board page with BeautifulSoup instead of calling the
        JSON API. Returns the same list-of-dicts shape as :meth:`fetch`
        (``content`` is always empty) and does NOT write to the cache.
        """
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        url = "https://top.baidu.com/board?tab=realtime"
        proxies = {
            # "http": "http://127.0.0.1:7890",
            # "https": "http://127.0.0.1:7890"
        }
        # BUG FIX: header values must be strings when sent as real HTTP
        # headers, and a hard-coded "host: www.baidu.com" is wrong for
        # top.baidu.com — requests sets the Host header itself.
        header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.36"
        }
        # BUG FIX: ``headers=`` (not ``params=``); also add a timeout so a
        # stalled connection cannot hang the crawler forever.
        resp = requests.get(url=url, headers=header, verify=False, proxies=proxies, timeout=10)
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "html.parser")
        main_content = soup.find_all("main")[0]
        news_main_content = main_content.find("div", style='margin-bottom:20px')
        # Class names are build artifacts of Baidu's frontend bundle and
        # break whenever the site is redeployed — hence the v0 status.
        div_elements = news_main_content.find_all('div', class_='category-wrap_iQLoo horizontal_1eKyQ')
        result = []
        for div_element in div_elements:
            news_title = div_element.find(class_='c-single-text-ellipsis').text.strip()
            news_link = div_element.find('a', class_='title_dIF3B')['href']
            result.append({
                'title': news_title,
                'url': news_link,
                'content': "",
                'source': 'baidu',
                'publish_time': publish_time
            })
        return result