# app/api/endpoints/website_meta.py
"""Endpoint for extracting website metadata (meta tags, Open Graph, favicon)."""

import json
import time
from urllib.parse import urljoin, urlparse

import cloudscraper
import requests
from bs4 import BeautifulSoup
from fastapi import APIRouter, Query
from fastapi.responses import JSONResponse

from app.core import cache
from app.utils.logger import log

router = APIRouter()
@router.get(
    "/",
    summary="获取网站元数据",
    description="提取指定网页的元数据信息,包括标题、描述、关键词、Open Graph 标签、Twitter Card 标签和 favicon",
    response_description="返回包含网站元数据的 JSON 对象",
    responses={
        200: {"description": "成功获取网站元数据"},
        404: {"description": "URL 参数缺失或无法访问"}
    }
)
def get_meta(
    url: str = Query(
        default=...,
        description="要获取元数据的网页 URL",
        example="https://www.example.com"
    )
):
    """
    Fetch a web page and extract its metadata.

    Collects standard meta tags (title, description, keywords, author),
    Open Graph (``og:*``) fields, Twitter Card (``twitter:*``) fields and
    the favicon URL.

    Caching: results are stored for 60 seconds keyed by URL; the ``cache``
    flag in the response marks whether the data came from the cache.

    Notes:
    - Sites behind anti-bot protection are retried via cloudscraper.
    - Content rendered client-side (JavaScript) may be missing.

    Returns a dict with ``status`` ("200"/"404" as strings), ``data``,
    ``msg`` and, on success, ``cache``.
    """
    # Defensive guard; FastAPI already rejects a missing required query param
    # with a 422 before this function runs.
    if not url:
        return {
            "status": "404",
            "data": [],
            "msg": "`url` is required"
        }

    # Serve from cache when possible so repeated lookups don't re-fetch.
    cached_metadata = cache.get(url)
    if cached_metadata:
        return {
            "status": "200",
            "data": json.loads(cached_metadata),
            "msg": "success",
            "cache": True
        }

    page_content = _fetch_page(url)
    if not page_content:
        return {
            "status": "404",
            "data": [],
            "msg": "No content"
        }

    soup = BeautifulSoup(page_content, "html.parser")
    metadata = {
        "meta_info": _extract_meta_info(soup, url),
        "favicon_url": _find_favicon_url(soup, url)
    }

    cache.set(url, json.dumps(metadata, ensure_ascii=False), ex=60)
    return {
        "status": "200",
        "data": metadata,
        "msg": "Success",
        "cache": False
    }


# Browser-like headers so ordinary sites serve the regular HTML page.
_BROWSER_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,ar;q=0.7,en;q=0.6",
    "cache-control": "max-age=0",
    "priority": "u=0, i",
    "sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
}

# <meta name="..."> attributes copied verbatim into the result, in two
# priority tiers so a pathological tag carrying both name and property
# attributes resolves the same way the original elif chain did:
# standard names -> og properties -> twitter names.
_STANDARD_NAME_KEYS = frozenset({"description", "keywords", "author"})
_OG_PROPERTY_KEYS = frozenset({"og:title", "og:description", "og:image", "og:url"})
_TWITTER_NAME_KEYS = frozenset({
    "twitter:card", "twitter:title", "twitter:description", "twitter:image"
})


def _fetch_page(url: str) -> bytes:
    """Download *url*; fall back to cloudscraper when plain requests fails.

    Returns the raw response body, or ``b""`` when both attempts fail
    (previously a cloudscraper failure escaped as an unhandled 500).
    """
    try:
        response = requests.get(url, headers=_BROWSER_HEADERS, timeout=10)
        response.raise_for_status()
        return response.content
    except requests.RequestException as exc:
        # Likely an anti-bot challenge (e.g. Cloudflare) or transient error;
        # retry through cloudscraper.
        # NOTE(review): assumes `log` exposes warning/error like stdlib
        # logging / loguru — confirm against app.utils.logger.
        log.warning(f"requests failed for {url} ({exc}); retrying with cloudscraper")
        try:
            scraper = cloudscraper.create_scraper(delay=100)
            # Bound the fallback too; the primary path already has timeout=10.
            response = scraper.get(url, timeout=30)
            return response.content
        except Exception as scraper_exc:  # cloudscraper raises assorted types
            log.error(f"cloudscraper also failed for {url}: {scraper_exc}")
            return b""


def _extract_meta_info(soup, url: str) -> dict:
    """Collect title, standard, Open Graph and Twitter meta fields."""
    # soup.title.string is None for an empty/complex <title>; guard both cases
    # so "title" is never None in the JSON response.
    title = soup.title.string if soup.title and soup.title.string else "No title"
    meta_info = {
        "title": title,
        "description": "",
        "keywords": "",
        "author": "",
        "og:title": "",
        "og:description": "",
        "og:image": "",
        "og:url": url,  # default to the requested URL until og:url is found
        "twitter:card": "",
        "twitter:title": "",
        "twitter:description": "",
        "twitter:image": ""
    }

    for meta_tag in soup.find_all("meta"):
        name_attr = meta_tag.get("name", "").lower()
        property_attr = meta_tag.get("property", "").lower()
        content = meta_tag.get("content", "")

        if name_attr in _STANDARD_NAME_KEYS:
            meta_info[name_attr] = content
        elif property_attr in _OG_PROPERTY_KEYS:
            meta_info[property_attr] = content
        elif name_attr in _TWITTER_NAME_KEYS:
            meta_info[name_attr] = content

    return meta_info


def _find_favicon_url(soup, url: str) -> str:
    """Resolve the favicon URL, preferring an explicit <link rel=icon> tag."""
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    link_tag = soup.find("link", rel=["icon", "shortcut icon"])
    if link_tag:
        return urljoin(base_url, link_tag.get("href", "favicon.ico"))
    # Conventional default location at the site root.
    return urljoin(base_url, "favicon.ico")