Files
hot-news-api/app/services/sites/douyin.py
2026-03-26 15:04:59 +08:00

112 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import datetime
import json
import time
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from ...core import cache
from ...db.mysql import News
from ..browser_manager import BrowserManager
from .crawler import Crawler
class DouYinCrawler(Crawler):
    """Crawler for the Douyin hot list.

    Two strategies are provided: ``fetch_v1`` scrapes the rendered hot page
    through a browser, ``fetch_v2`` calls the web JSON endpoint directly.
    ``fetch`` currently delegates to ``fetch_v2``.
    """

    def fetch(self, date_str):
        """Return today's hot-list entries; see ``fetch_v2`` for details."""
        return self.fetch_v2(date_str)

    def fetch_v1(self, date_str):
        """Scrape https://www.douyin.com/hot with a browser.

        Args:
            date_str: Hash name under which the serialized list is cached.

        Returns:
            A list of news dicts (``title``, ``url``, ``content``, ``source``,
            ``publish_time``). An empty list is returned when anything fails.
        """
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        base_url = "https://www.douyin.com"
        url = "https://www.douyin.com/hot"
        browser_manager = BrowserManager()
        try:
            # Load the page through the browser manager; only the driver is
            # used below, the raw page source is not needed.
            # NOTE(review): the driver is never released here -- confirm that
            # BrowserManager owns its lifetime.
            _page_source, driver = browser_manager.get_page_content(url, wait_time=5)
            news_list = []
            # Hot-list entries: <li> elements whose direct <a> links to /video/.
            items = driver.find_elements(By.XPATH, '//li[a[contains(@href, "/video/")]]')
            for item in items:
                try:
                    # Title: a div containing '#' or with text longer than 10 chars.
                    title_elem = item.find_element(By.XPATH, './/div[contains(text(), "#") or string-length(text()) > 10]')
                    # Link to the video.
                    link_elem = item.find_element(By.XPATH, './/a[contains(@href, "/video/")]')
                    # Heat value.
                    # NOTE(review): contains(text(), "") is always true, so this
                    # matches the first <span>; a unit character was probably
                    # lost from the first literal -- confirm against the page.
                    hot_elem = item.find_element(By.XPATH, './/span[contains(text(), "") or contains(text(), "亿")]')
                    href = link_elem.get_attribute("href")
                    if not href:
                        continue
                    title = title_elem.text.strip()
                    hot = hot_elem.text.strip()
                    news_list.append({
                        'title': title,
                        # WebDriver reports href already resolved to an absolute
                        # URL; urljoin leaves absolute URLs untouched and still
                        # resolves a relative one, instead of blindly
                        # prefixing the origin a second time.
                        'url': urljoin(base_url, href),
                        'content': f"热度: {hot}",
                        'source': 'douyin',
                        'publish_time': publish_time,
                    })
                except Exception:
                    # Best effort: skip entries that lack an expected element.
                    continue
            # Only overwrite the cache when something was actually scraped.
            if news_list:
                cache.hset(date_str, self.crawler_name(), json.dumps(news_list, ensure_ascii=False))
            return news_list
        except Exception as e:
            print(f"douyin fetch_v1 failed: {e}")
            return []

    def fetch_v2(self, date_str):
        """Fetch the hot list from the Douyin web JSON endpoint.

        Args:
            date_str: Hash name under which the serialized list is cached.

        Returns:
            A list of news dicts (``title``, ``url``, ``content``, ``source``,
            ``publish_time``). An empty list is returned when the request
            fails, the status is not 200, or the payload is not the expected
            JSON shape.
        """
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        url = "https://www.douyin.com/aweme/v1/web/hot/search/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&detail_list=1&source=6&pc_client_type=1&pc_libra_divert=Windows&support_h265=1&support_dash=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=136.0.0.0&browser_online=true&engine_name=Blink&engine_version=136.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7490997798633555467"
        headers = {
            # Fragments are concatenated in order, so they must follow the
            # conventional Mozilla / AppleWebKit / Chrome / Safari sequence.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://www.douyin.com/",
        }
        try:
            # NOTE(review): verify=False disables TLS certificate validation;
            # kept as in the original, but confirm it is really required.
            # self.timeout is not defined in this class (presumably provided
            # by the Crawler base).
            resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        except requests.RequestException as e:
            print(f"request failed: {e}")
            return []
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []
        try:
            word_list = resp.json()["data"]["word_list"] or []
        except (ValueError, KeyError, TypeError) as e:
            # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
            print(f"unexpected response: {e!r}")
            return []

        # Example detail URL:
        # https://www.douyin.com/hot/2094286?&trending_topic=%E5%A4%8F%E5%A4%A9%E7%9A%84%E5%91%B3%E9%81%93%E5%9C%A8%E6%8A%96%E9%9F%B3&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue=11892557
        news_list = []
        for item in word_list:
            try:
                title = item["word"]
                sentence_id = item["sentence_id"]
                hot_value = item["hot_value"]
            except (KeyError, TypeError):
                # Skip malformed entries rather than dropping the whole list.
                continue
            # Percent-encode the topic so the query string stays valid for
            # non-ASCII titles and for titles containing '&', '#', '=' etc.
            # NOTE(review): position is hard-coded to 1 for every entry, as in
            # the original -- confirm whether it should carry the entry's rank.
            item_url = (
                f"https://www.douyin.com/hot/{sentence_id}"
                f"?&trending_topic={quote(str(title), safe='')}"
                "&previous_page=main_page&enter_method=trending_topic"
                "&modeFrom=hotDetail&tab_name=trend&position=1"
                f"&hotValue={hot_value}"
            )
            news_list.append({
                'title': title,
                'url': item_url,
                'content': title,
                'source': 'douyin',
                'publish_time': publish_time,
            })
        # Same rule as fetch_v1: never replace cached data with an empty list.
        if news_list:
            cache.hset(date_str, self.crawler_name(), json.dumps(news_list, ensure_ascii=False))
        return news_list

    def crawler_name(self):
        """Return the identifier used as the cache field and news source."""
        return "douyin"