This commit is contained in:
2026-03-26 15:04:59 +08:00
commit e0af97ac7f
65 changed files with 7366 additions and 0 deletions

138
app/api/v1/web_tools.py Normal file
View File

@@ -0,0 +1,138 @@
# app/api/endpoints/website_meta.py
import json
import time
from urllib.parse import urlparse, urljoin
import cloudscraper
from app.utils.logger import log
import requests
from bs4 import BeautifulSoup
from fastapi import APIRouter
from app.core import cache
router = APIRouter()
@router.get("/")
def get_meta(url: str = None):
if not url:
return {
"status": "404",
"data": [],
"msg": "`url` is required"
}
# get from cache
cached_metadata = cache.get(url)
if cached_metadata:
return {
"status": "200",
"data": json.loads(cached_metadata),
"msg": "success",
"cache": True
}
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,ar;q=0.7,en;q=0.6",
"cache-control": "max-age=0",
"priority": "u=0, i",
"sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
}
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
page_content = response.content
except requests.RequestException as e:
scraper = cloudscraper.create_scraper(delay=100)
response = scraper.get(url)
page_content = response.content
if not page_content:
return {
"status": "404",
"data": [],
"msg": "No content"
}
soup = BeautifulSoup(page_content, "html.parser")
meta_info = {
"title": soup.title.string if soup.title else "No title",
"description": "",
"keywords": "",
"author": "",
"og:title": "",
"og:description": "",
"og:image": "",
"og:url": url,
"twitter:card": "",
"twitter:title": "",
"twitter:description": "",
"twitter:image": ""
}
for meta_tag in soup.find_all("meta"):
name_attr = meta_tag.get("name", "").lower()
property_attr = meta_tag.get("property", "").lower()
content = meta_tag.get("content", "")
if name_attr == "description":
meta_info["description"] = content
elif name_attr == "keywords":
meta_info["keywords"] = content
elif name_attr == "author":
meta_info["author"] = content
elif property_attr == "og:title":
meta_info["og:title"] = content
elif property_attr == "og:description":
meta_info["og:description"] = content
elif property_attr == "og:image":
meta_info["og:image"] = content
elif property_attr == "og:url":
meta_info["og:url"] = content
elif name_attr == "twitter:card":
meta_info["twitter:card"] = content
elif name_attr == "twitter:title":
meta_info["twitter:title"] = content
elif name_attr == "twitter:description":
meta_info["twitter:description"] = content
elif name_attr == "twitter:image":
meta_info["twitter:image"] = content
parsed_url = urlparse(url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
favicon_url = urljoin(base_url, "favicon.ico") # 默认 favicon 路径
link_tag = soup.find("link", rel=["icon", "shortcut icon"])
if link_tag:
favicon_url = urljoin(base_url, link_tag.get("href", "favicon.ico"))
metadata = {
"meta_info": meta_info,
"favicon_url": favicon_url
}
cache.set(url, json.dumps(metadata, ensure_ascii=False), ex=60)
result = {
"status": "200",
"data": metadata,
"msg": "Success",
"cache": False
}
return result