init
This commit is contained in:
111
app/services/sites/douyin.py
Normal file
111
app/services/sites/douyin.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import datetime
import json
import time
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from ...core import cache
from ...db.mysql import News
from ..browser_manager import BrowserManager
from .crawler import Crawler
|
||||
|
||||
|
||||
class DouYinCrawler(Crawler):
    """Crawler for the Douyin hot-search list.

    ``fetch`` delegates to ``fetch_v2``, which reads the JSON hot-search
    endpoint. ``fetch_v1`` is an alternative that scrapes the rendered
    ``/hot`` page through a browser session.

    Both variants return a list of dicts with the keys ``title``, ``url``,
    ``content``, ``source`` and ``publish_time``, and store the same list
    (JSON-encoded) in the cache hash ``date_str`` under ``crawler_name()``.
    """

    def fetch(self, date_str):
        """Fetch the hot list for *date_str* using the JSON endpoint.

        Args:
            date_str: Name of the cache hash the result is stored under.

        Returns:
            The list of news dicts produced by ``fetch_v2``.
        """
        return self.fetch_v2(date_str)

    def fetch_v1(self, date_str):
        """Scrape the rendered hot page with a browser.

        Args:
            date_str: Name of the cache hash the result is stored under.

        Returns:
            A list of news dicts; an empty list if anything goes wrong
            while loading or parsing the page.
        """
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        base_url = "https://www.douyin.com"
        url = "https://www.douyin.com/hot"
        browser_manager = BrowserManager()

        try:
            # Load the page through the browser manager. Only the driver is
            # used below; the returned page source is not needed.
            # NOTE(review): the driver is never released here — confirm that
            # BrowserManager owns its lifecycle.
            _page_source, driver = browser_manager.get_page_content(url, wait_time=5)

            news_list = []

            # Hot-list entries: <li> elements that contain a /video/ link.
            items = driver.find_elements(By.XPATH, '//li[a[contains(@href, "/video/")]]')

            for item in items:
                try:
                    # Title: a div containing "#" or a reasonably long text.
                    title_elem = item.find_element(By.XPATH, './/div[contains(text(), "#") or string-length(text()) > 10]')
                    # Link to the video.
                    link_elem = item.find_element(By.XPATH, './/a[contains(@href, "/video/")]')
                    # Popularity figure.
                    hot_elem = item.find_element(By.XPATH, './/span[contains(text(), "万") or contains(text(), "亿")]')

                    href = link_elem.get_attribute("href")
                    if not href:
                        continue

                    title = title_elem.text.strip()
                    hot = hot_elem.text.strip()

                    news_list.append({
                        'title': title,
                        # The href may already be absolute; urljoin handles
                        # both absolute and site-relative values, whereas
                        # plain concatenation would duplicate the host.
                        'url': urljoin(base_url, href),
                        'content': f"热度: {hot}",
                        'source': 'douyin',
                        'publish_time': publish_time,
                    })
                except Exception:
                    # Best effort: skip entries that lack any expected element.
                    continue

            # Cache only when something was actually scraped.
            if news_list:
                cache.hset(date_str, self.crawler_name(), json.dumps(news_list, ensure_ascii=False))
            return news_list

        except Exception as e:
            # Keep the "return [] on failure" contract, but do not hide the cause.
            print(f"douyin fetch_v1 failed: {e}")
            return []

    def fetch_v2(self, date_str):
        """Fetch the hot list from the JSON hot-search endpoint.

        Args:
            date_str: Name of the cache hash the result is stored under.

        Returns:
            A list of news dicts; an empty list when the response status is
            not 200 or the payload carries no ``data.word_list``.

        Raises:
            Whatever ``requests.get`` / ``Response.json`` raise on network
            or decoding failures; these are not caught here.
        """
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        url = "https://www.douyin.com/aweme/v1/web/hot/search/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&detail_list=1&source=6&pc_client_type=1&pc_libra_divert=Windows&support_h265=1&support_dash=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=136.0.0.0&browser_online=true&engine_name=Blink&engine_version=136.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7490997798633555467"

        headers = {
            # Product tokens in the conventional order, space-separated.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Referer": "https://www.douyin.com/",
        }

        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original — confirm this is intentional.
        resp = requests.get(url=url, headers=headers, verify=False, timeout=self.timeout)
        if resp.status_code != 200:
            print(f"request failed, status: {resp.status_code}")
            return []

        payload = resp.json()
        data = payload.get("data") if isinstance(payload, dict) else None
        word_list = data.get("word_list") if isinstance(data, dict) else None
        if not word_list:
            print("request failed, no word_list in response")
            return []

        # Example of a topic URL:
        # https://www.douyin.com/hot/2094286?&trending_topic=%E5%A4%8F%E5%A4%A9%E7%9A%84%E5%91%B3%E9%81%93%E5%9C%A8%E6%8A%96%E9%9F%B3&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue=11892557
        news_list = []
        for item in word_list:
            try:
                title = item["word"]
                sentence_id = item["sentence_id"]
                hot_value = item["hot_value"]
            except (KeyError, TypeError):
                # Skip entries that lack a required field.
                continue

            # Percent-encode the topic so non-ASCII titles yield a valid URL.
            topic = quote(str(title), safe="")
            # NOTE(review): position is hard-coded to 1 for every entry, as
            # in the original — confirm whether it should reflect the rank.
            item_url = f"https://www.douyin.com/hot/{sentence_id}?&trending_topic={topic}&previous_page=main_page&enter_method=trending_topic&modeFrom=hotDetail&tab_name=trend&position=1&hotValue={hot_value}"

            news_list.append({
                'title': title,
                'url': item_url,
                'content': title,
                'source': 'douyin',
                'publish_time': publish_time,
            })

        # Do not overwrite the cache entry with an empty list.
        if news_list:
            cache.hset(date_str, self.crawler_name(), json.dumps(news_list, ensure_ascii=False))
        return news_list

    def crawler_name(self):
        """Return the identifier used as this crawler's cache field name."""
        return "douyin"
|
||||
Reference in New Issue
Block a user