# app/api/endpoints/website_meta.py
import json
import time
from urllib.parse import urlparse, urljoin

import cloudscraper
import requests
from bs4 import BeautifulSoup
from fastapi import APIRouter, Query
from fastapi.responses import JSONResponse

from app.core import cache
from app.utils.logger import log

router = APIRouter()

# Seconds to wait on each outbound HTTP request before giving up.
_REQUEST_TIMEOUT = 10

# Value passed as `ex=` to `cache.set`; the endpoint documents this as a
# 60-second TTL.
_CACHE_TTL = 60

# Browser-like request headers sent with the first (plain `requests`) attempt.
_BROWSER_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,ar;q=0.7,en;q=0.6",
    "cache-control": "max-age=0",
    "priority": "u=0, i",
    "sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
}

# <meta name="..."> values copied verbatim into the result.
_NAME_FIELDS = frozenset({"description", "keywords", "author"})
# <meta property="..."> Open Graph values copied into the result.
_OG_FIELDS = frozenset({"og:title", "og:description", "og:image", "og:url"})
# <meta name="..."> Twitter Card values copied into the result.
_TWITTER_FIELDS = frozenset(
    {"twitter:card", "twitter:title", "twitter:description", "twitter:image"}
)


def _is_fetchable_url(url: str) -> bool:
    """Return True when *url* is an absolute http(s) URL with a host part."""
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
        return False
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)


def _fetch_page(url: str) -> bytes:
    """Download *url* and return the raw response body.

    A plain ``requests`` call with browser-like headers is tried first. If it
    fails (network error or non-2xx status), the request is retried through
    cloudscraper. Any exception from the fallback propagates to the caller.
    """
    try:
        response = requests.get(
            url, headers=_BROWSER_HEADERS, timeout=_REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.content
    except requests.RequestException:
        # Fall through to the cloudscraper retry below.
        pass

    # NOTE(review): `delay=100` is kept from the original code; confirm the
    # intended unit against the cloudscraper documentation.
    scraper = cloudscraper.create_scraper(delay=100)
    # The fallback previously had no timeout and could hang the worker.
    # As before, the fallback response status is not checked (best effort).
    response = scraper.get(url, timeout=_REQUEST_TIMEOUT)
    return response.content


def _extract_meta_info(soup, url: str) -> dict:
    """Collect title, standard meta, Open Graph and Twitter Card fields.

    ``og:url`` defaults to the requested *url*; every other field defaults to
    an empty string (``title`` defaults to ``"No title"`` when the page has no
    ``<title>`` tag). When several tags target the same field, the last one in
    document order wins.
    """
    title_tag = soup.title
    if title_tag is None:
        title = "No title"
    else:
        # `.string` is None for an empty <title> or one with nested nodes;
        # fall back to the concatenated text so the field is always a string.
        title = title_tag.string
        if title is None:
            title = title_tag.get_text()
        title = str(title)

    meta_info = {
        "title": title,
        "description": "",
        "keywords": "",
        "author": "",
        "og:title": "",
        "og:description": "",
        "og:image": "",
        "og:url": url,
        "twitter:card": "",
        "twitter:title": "",
        "twitter:description": "",
        "twitter:image": "",
    }

    for meta_tag in soup.find_all("meta"):
        name_attr = (meta_tag.get("name") or "").lower()
        property_attr = (meta_tag.get("property") or "").lower()

        # Precedence matches the original if/elif chain: standard names first,
        # then Open Graph properties, then Twitter names.
        if name_attr in _NAME_FIELDS:
            field = name_attr
        elif property_attr in _OG_FIELDS:
            field = property_attr
        elif name_attr in _TWITTER_FIELDS:
            field = name_attr
        else:
            continue
        meta_info[field] = meta_tag.get("content", "")

    return meta_info


def _resolve_favicon(soup, url: str) -> str:
    """Return the absolute favicon URL for the page at *url*.

    Uses the first ``<link rel="icon">`` / ``<link rel="shortcut icon">`` when
    present, otherwise ``/favicon.ico`` at the site root.
    """
    default_favicon = urljoin(url, "/favicon.ico")

    link_tag = soup.find("link", rel=["icon", "shortcut icon"])
    if not link_tag:
        return default_favicon

    href = link_tag.get("href") or "/favicon.ico"
    try:
        # Resolve against the page URL, not just the site root, so that a
        # relative href such as "icon.png" on /blog/post maps to
        # /blog/icon.png as a browser would resolve it.
        return urljoin(url, href)
    except ValueError:
        # Malformed href; fall back to the conventional location.
        return default_favicon


@router.get(
    "/",
    summary="获取网站元数据",
    description="提取指定网页的元数据信息,包括标题、描述、关键词、Open Graph 标签、Twitter Card 标签和 favicon",
    response_description="返回包含网站元数据的 JSON 对象",
    responses={
        200: {"description": "成功获取网站元数据"},
        404: {"description": "URL 参数缺失或无法访问"}
    }
)
def get_meta(
    url: str = Query(
        default=...,
        description="要获取元数据的网页 URL",
        example="https://www.example.com"
    )
):
    """Fetch a web page and return its metadata.

    Extracts the page title, the standard ``description`` / ``keywords`` /
    ``author`` meta tags, Open Graph (``og:*``) properties, Twitter Card
    (``twitter:*``) tags and the favicon URL.

    Caching:
        The result is stored through ``app.core.cache`` under the raw URL with
        ``ex=60``. A response served from the cache carries ``"cache": True``;
        a freshly scraped one carries ``"cache": False``.

    Notes:
        - The page is fetched with ``requests`` first; on failure the request
          is retried with cloudscraper.
        - Only the static HTML is parsed, so content rendered client-side may
          be missing.
        - Errors are reported in the JSON body (``"status": "404"``), not via
          the HTTP status code.

    Args:
        url: Absolute http(s) URL of the page to inspect.

    Returns:
        A dict with ``status``, ``data``, ``msg`` and, on success, ``cache``.
        On success ``data`` holds ``meta_info`` and ``favicon_url``.
    """
    if not url:
        return {
            "status": "404",
            "data": [],
            "msg": "`url` is required"
        }

    # Reject inputs that neither fetcher can handle instead of letting them
    # surface as an unhandled exception.
    # NOTE(security): this endpoint fetches arbitrary caller-supplied URLs
    # (SSRF risk, e.g. internal addresses). Consider restricting allowed hosts.
    if not _is_fetchable_url(url):
        return {
            "status": "404",
            "data": [],
            "msg": "`url` must be an absolute http(s) URL"
        }

    # Serve from cache when possible.
    cached_metadata = cache.get(url)
    if cached_metadata:
        return {
            "status": "200",
            "data": json.loads(cached_metadata),
            "msg": "success",
            "cache": True
        }

    try:
        page_content = _fetch_page(url)
    except Exception as exc:
        # Endpoint boundary: cloudscraper can raise errors outside the
        # `requests` hierarchy, so catch broadly and report a clean failure.
        return {
            "status": "404",
            "data": [],
            "msg": f"Failed to fetch `url`: {type(exc).__name__}"
        }

    if not page_content:
        return {
            "status": "404",
            "data": [],
            "msg": "No content"
        }

    soup = BeautifulSoup(page_content, "html.parser")

    metadata = {
        "meta_info": _extract_meta_info(soup, url),
        "favicon_url": _resolve_favicon(soup, url)
    }

    cache.set(url, json.dumps(metadata, ensure_ascii=False), ex=_CACHE_TTL)

    return {
        "status": "200",
        "data": metadata,
        "msg": "Success",
        "cache": False
    }