# hot-news-api/app/services/sites/douyin.py

import datetime
import json
import logging
import time
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from ...core import cache
from ...db.mysql import News
from .crawler import Crawler
from ..browser_manager import BrowserManager


class DouYinCrawler(Crawler):
    """Crawler for the Douyin hot-search list.

    Two strategies are provided:

    * ``fetch_v1`` renders the hot page in a browser and scrapes the DOM.
    * ``fetch_v2`` calls the web JSON endpoint directly (used by ``fetch``).

    Both return a list of news dicts with the keys ``title``, ``url``,
    ``content``, ``source`` and ``publish_time``. A non-empty result is also
    stored in the cache hash ``date_str`` under the field ``crawler_name()``.
    On failure both return an empty list instead of raising.
    """

    _logger = logging.getLogger(__name__)
    _BASE_URL = "https://www.douyin.com"
    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def fetch(self, date_str):
        """Fetch the hot list for ``date_str`` using the JSON API strategy."""
        return self.fetch_v2(date_str)

    def fetch_v1(self, date_str):
        """Scrape the hot list from the rendered ``/hot`` page.

        Args:
            date_str: Cache hash key under which the result is stored.

        Returns:
            A list of news dicts; ``[]`` when the page cannot be loaded or
            parsed. List entries that lack a title, link or heat element are
            skipped.
        """
        current_time = datetime.datetime.now()
        url = "https://www.douyin.com/hot"
        browser_manager = BrowserManager()

        try:
            # Load the page through the browser manager.
            # NOTE(review): the returned driver is never released here;
            # confirm that BrowserManager owns the driver's lifecycle.
            _page_source, driver = browser_manager.get_page_content(url, wait_time=5)

            publish_time = current_time.strftime(self._TIME_FORMAT)
            result = []

            # Hot-list entries: <li> elements that contain a /video/ link.
            items = driver.find_elements(By.XPATH, '//li[a[contains(@href, "/video/")]]')

            for item in items:
                try:
                    # Title: a div containing "#" or a longer piece of text.
                    title_elem = item.find_element(By.XPATH, './/div[contains(text(), "#") or string-length(text()) > 10]')
                    # Link to the video.
                    link_elem = item.find_element(By.XPATH, './/a[contains(@href, "/video/")]')
                    # Heat value (text containing the 万 / 亿 unit suffix).
                    hot_elem = item.find_element(By.XPATH, './/span[contains(text(), "万") or contains(text(), "亿")]')

                    href = link_elem.get_attribute("href")
                    if not href:
                        continue

                    title = title_elem.text.strip()
                    hot = hot_elem.text.strip()
                    news = {
                        'title': title,
                        # Selenium resolves href to an absolute URL, so
                        # blindly prefixing the origin would duplicate it;
                        # urljoin handles both absolute and relative values.
                        'url': urljoin(self._BASE_URL, href),
                        'content': f"热度: {hot}",
                        'source': 'douyin',
                        'publish_time': publish_time
                    }
                except Exception:
                    # Deliberate best effort: skip entries that do not have
                    # the expected structure.
                    continue

                result.append(news)

            # Cache only non-empty results, then return them.
            if result:
                cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
            return result

        except Exception:
            self._logger.exception("douyin: failed to scrape %s", url)
            return []

    def fetch_v2(self, date_str):
        """Fetch the hot list from the Douyin web JSON endpoint.

        Args:
            date_str: Cache hash key under which the result is stored.

        Returns:
            A list of news dicts, one per entry of ``data.word_list`` in the
            response. Returns ``[]`` when the request fails, the status is
            not 200, or the payload does not have the expected shape.
        """
        current_time = datetime.datetime.now()
        api_url = "https://www.douyin.com/aweme/v1/web/hot/search/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&detail_list=1&source=6&pc_client_type=1&pc_libra_divert=Windows&support_h265=1&support_dash=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=136.0.0.0&browser_online=true&engine_name=Blink&engine_version=136.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7490997798633555467"

        headers = {
            # Parts must be in this order and space-separated to form a
            # well-formed Chrome User-Agent string.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://www.douyin.com/",
        }

        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as in the original, confirm it is really required.
            resp = requests.get(url=api_url, headers=headers, verify=False, timeout=self.timeout)
        except requests.RequestException:
            self._logger.exception("douyin: hot list request failed")
            return []

        if resp.status_code != 200:
            self._logger.warning("request failed, status: %s", resp.status_code)
            return []

        try:
            word_list = resp.json()["data"]["word_list"]
        except (ValueError, KeyError, TypeError):
            # Non-JSON body or a payload without data.word_list.
            self._logger.exception("douyin: unexpected hot list payload")
            return []

        publish_time = current_time.strftime(self._TIME_FORMAT)
        result = []

        for item in word_list or []:
            try:
                title = item["word"]
                item_url = self._topic_url(item)
            except (KeyError, TypeError):
                # Skip malformed entries instead of failing the whole list.
                continue

            result.append({
                'title': title,
                'url': item_url,
                'content': title,
                'source': 'douyin',
                'publish_time': publish_time
            })

        # Do not overwrite an existing cache entry with an empty list.
        if result:
            cache.hset(date_str, self.crawler_name(), json.dumps(result, ensure_ascii=False))
        return result

    @staticmethod
    def _topic_url(item):
        """Build the hot-topic detail URL for one ``word_list`` entry.

        The topic text is percent-encoded so that non-ASCII characters,
        spaces, ``&`` and ``#`` cannot corrupt the query string.

        Raises:
            KeyError: if ``word``, ``sentence_id`` or ``hot_value`` is missing.
        """
        topic = quote(str(item['word']), safe="")
        return (
            f"https://www.douyin.com/hot/{item['sentence_id']}"
            f"?&trending_topic={topic}"
            "&previous_page=main_page&enter_method=trending_topic"
            "&modeFrom=hotDetail&tab_name=trend&position=1"
            f"&hotValue={item['hot_value']}"
        )

    def crawler_name(self):
        """Return the identifier used as the cache field name."""
        return "douyin"