init

2026-03-26 15:04:59 +08:00
commit e0af97ac7f
65 changed files with 7366 additions and 0 deletions
--- a/app/api/v1/web_tools.py
+++ b/app/api/v1/web_tools.py
@@ -0,0 +1,138 @@
+# app/api/v1/web_tools.py
+import json
+import time
+from urllib.parse import urlparse, urljoin
+
+import cloudscraper
+
+from app.utils.logger import log
+
+import requests
+from bs4 import BeautifulSoup
+from fastapi import APIRouter
+
+from app.core import cache
+
+# Router on which this module's endpoint(s) are registered via `@router.get`.
+router = APIRouter()
+
+
+# Request headers imitating a desktop Chrome navigation request.
+_BROWSER_HEADERS = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "accept-language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,ar;q=0.7,en;q=0.6",
+    "cache-control": "max-age=0",
+    "priority": "u=0, i",
+    "sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"',
+    "sec-fetch-dest": "document",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "same-origin",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
+}
+
+# Keys of `meta_info` that are filled from <meta name=...> / <meta property=...> tags.
+_META_KEYS = frozenset((
+    "description",
+    "keywords",
+    "author",
+    "og:title",
+    "og:description",
+    "og:image",
+    "og:url",
+    "twitter:card",
+    "twitter:title",
+    "twitter:description",
+    "twitter:image",
+))
+
+# Timeout passed to both HTTP attempts (requests interprets it in seconds).
+_FETCH_TIMEOUT = 10
+
+# Value passed as `ex` to `cache.set`.
+# NOTE(review): presumably an expiry in seconds -- confirm against app.core.cache.
+_CACHE_EXPIRY = 60
+
+
+def _error(status, msg):
+    """Build the error envelope returned by this endpoint."""
+    return {
+        "status": status,
+        "data": [],
+        "msg": msg,
+    }
+
+
+def _is_http_url(url):
+    """Return True when *url* is an absolute http(s) URL with a host part."""
+    try:
+        parsed = urlparse(url)
+    except ValueError:
+        # urlparse rejects some malformed inputs (e.g. an unbalanced IPv6 literal).
+        return False
+    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
+
+
+def _fetch_page(url):
+    """Download *url* and return ``(body_bytes, final_url)``.
+
+    A plain ``requests`` call is tried first. If it raises any
+    ``requests.RequestException`` (including an HTTP error status via
+    ``raise_for_status``), the request is retried once through cloudscraper.
+    The fallback response is not status-checked, so an error page body may be
+    returned. Exceptions raised by the fallback propagate to the caller.
+    """
+    try:
+        response = requests.get(url, headers=_BROWSER_HEADERS, timeout=_FETCH_TIMEOUT)
+        response.raise_for_status()
+    except requests.RequestException:
+        # NOTE(review): delay=100 is kept from the original code; confirm the
+        # intended value against the cloudscraper documentation.
+        scraper = cloudscraper.create_scraper(delay=100)
+        # The timeout keeps a stalled fallback from hanging the request forever.
+        response = scraper.get(url, timeout=_FETCH_TIMEOUT)
+    # Prefer the post-redirect URL so relative links resolve against the page
+    # that was actually served.
+    return response.content, response.url or url
+
+
+def _extract_meta_info(soup, url):
+    """Collect title, description, Open Graph and Twitter card fields from *soup*.
+
+    ``og:url`` defaults to the requested *url*; every other field defaults to
+    an empty string, except ``title`` which defaults to ``"No title"``.
+    """
+    # `.string` is None when <title> is empty or contains nested markup.
+    title_text = soup.title.string if soup.title else None
+    meta_info = {
+        "title": str(title_text) if title_text is not None else "No title",
+        "description": "",
+        "keywords": "",
+        "author": "",
+        "og:title": "",
+        "og:description": "",
+        "og:image": "",
+        "og:url": url,
+        "twitter:card": "",
+        "twitter:title": "",
+        "twitter:description": "",
+        "twitter:image": "",
+    }
+
+    for meta_tag in soup.find_all("meta"):
+        content = meta_tag.get("content", "")
+        # A single tag may carry both attributes (e.g. name="twitter:title"
+        # property="og:title"), and sites are inconsistent about which attribute
+        # they use, so each attribute is checked independently.
+        for attr in ("name", "property"):
+            key = meta_tag.get(attr, "").lower()
+            if key in _META_KEYS:
+                meta_info[key] = content
+
+    return meta_info
+
+
+def _resolve_favicon(soup, page_url):
+    """Return the absolute favicon URL for the page at *page_url*.
+
+    Uses the first ``<link rel="icon">`` / ``<link rel="shortcut icon">`` href
+    when present and non-empty, otherwise ``/favicon.ico`` on the page's origin.
+    """
+    # Default favicon path.
+    default_favicon = urljoin(page_url, "/favicon.ico")
+
+    link_tag = soup.find("link", rel=["icon", "shortcut icon"])
+    href = (link_tag.get("href") or "").strip() if link_tag else ""
+    if not href:
+        return default_favicon
+    # Relative hrefs are relative to the page itself, not to the site root.
+    return urljoin(page_url, href)
+
+
+@router.get("/")
+def get_meta(url: str = None):
+    """Fetch a web page and return its metadata and favicon URL.
+
+    Args:
+        url: Absolute http(s) URL of the page to inspect.
+
+    Returns:
+        A dict with ``status``, ``data`` and ``msg`` keys. On success ``data``
+        holds ``meta_info`` and ``favicon_url`` and a ``cache`` flag says
+        whether the result came from the cache. On failure ``data`` is an
+        empty list and ``status`` is ``"404"`` (missing url / empty page),
+        ``"400"`` (not an http(s) URL) or ``"502"`` (page could not be fetched).
+    """
+    if not url:
+        return _error("404", "`url` is required")
+
+    # SECURITY NOTE(review): this endpoint fetches a caller-supplied URL
+    # server-side. Only the scheme is restricted here; internal/private hosts
+    # are not filtered, so SSRF protection must be handled before exposing it.
+    if not _is_http_url(url):
+        return _error("400", "`url` must be an absolute http(s) URL")
+
+    # get from cache
+    cached_metadata = cache.get(url)
+    if cached_metadata:
+        return {
+            "status": "200",
+            "data": json.loads(cached_metadata),
+            "msg": "success",
+            "cache": True,
+        }
+
+    try:
+        page_content, page_url = _fetch_page(url)
+    except Exception as exc:
+        # Endpoint boundary: the fallback client can raise its own exception
+        # types, so report a structured error instead of an unhandled 500.
+        return _error("502", f"Failed to fetch `url` ({type(exc).__name__})")
+
+    if not page_content:
+        return _error("404", "No content")
+
+    soup = BeautifulSoup(page_content, "html.parser")
+    metadata = {
+        "meta_info": _extract_meta_info(soup, url),
+        "favicon_url": _resolve_favicon(soup, page_url),
+    }
+
+    cache.set(url, json.dumps(metadata, ensure_ascii=False), ex=_CACHE_EXPIRY)
+    return {
+        "status": "200",
+        "data": metadata,
+        "msg": "Success",
+        "cache": False,
+    }