[docs] Download 3rd party embeds to local files
Add a script to download 3rd party embedded resources (CSS, JS, etc.) to
local files. This ensures that the documentation can be viewed offline
and reduces external dependencies.

The script:
1. Detects external URLs in HTML/CSS files
2. Downloads the resources to _static/downloads
3. Updates the HTML/CSS files to use the local copies (see the sketch below)
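
To make step 1 concrete, here is a minimal sketch of how the new parser flags a third-party resource (the HTML snippet and CDN URL are hypothetical, and it assumes the docs/ directory is on the Python path so the new module can be imported):

from download_3rdparty_embeds import detect_html_external_urls

html = '<link rel="stylesheet" href="https://cdn.example.com/css/theme.min.css">'
print(detect_html_external_urls(html))
# ['https://cdn.example.com/css/theme.min.css'] -- flagged because its domain
# differs from https://tvm.apache.org; steps 2 and 3 then download it into
# _static/downloads and rewrite the page to reference the local copy.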
Hzfengsy authored and tqchen committed Feb 5, 2025
1 parent 9404fb5 commit 9898039
Showing 5 changed files with 319 additions and 274 deletions.
4 changes: 4 additions & 0 deletions docs/Makefile
@@ -111,6 +111,10 @@ html: staging
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

htmldeploy: html
	python3 $(PWD)/download_3rdparty_embeds.py -v
	@echo "Replaced external URLs with local files."

dirhtml: staging
	cd $(STAGINGDIR) && $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(PWD)/$(BUILDDIR)/dirhtml
	@echo
314 changes: 314 additions & 0 deletions docs/download_3rdparty_embeds.py
@@ -0,0 +1,314 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=redefined-outer-name, missing-module-docstring
import argparse
import hashlib
import os
import re
from html.parser import HTMLParser
from typing import Callable, Dict, List, Optional, Set, Tuple, Union
from urllib.parse import urlparse

import requests

# NOTE: This script is invoked by the Makefile via `make htmldeploy`.
# It is not run on every CI docs build; instead, it is meant to be run only
# during the deployment stage, after the HTML docs have been built (it can
# also be run manually at that point). Deferring it to deployment avoids the
# overhead of downloading the external resources on every docs build.
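#
# Typical invocations (a sketch; `make htmldeploy` builds the HTML docs first
# and then runs this script, while the direct call assumes _build/html already
# exists from a prior `make html`):
#
#   make -C docs htmldeploy
#   python3 docs/download_3rdparty_embeds.py -v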

# Base URL of the hosted documentation and paths to the built HTML output
BASE_URL = "https://tvm.apache.org"
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
HTML_DIR = os.path.join(SCRIPT_DIR, "_build/html")


class ExternalURLParser(HTMLParser):
    """HTML Parser to find external URLs in HTML content."""

    def __init__(self):
        super().__init__()
        self.external_urls: List[str] = []
        self.base_domain = urlparse(BASE_URL).netloc
        # Tags and their attributes that might contain external resources
        self.tags_to_check = {
            "img": "src",
            "script": "src",
            "iframe": "src",
            "video": "src",
            "audio": "src",
            "link": "href",
            "source": "src",
            "embed": "src",
        }

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Handle HTML start tags to find external URLs."""
        if tag not in self.tags_to_check:
            return

        attr_name = self.tags_to_check[tag]
        for name, value in attrs:
            if name != attr_name or not value:
                continue

            if value.startswith(("http://", "https://")):
                domain = urlparse(value).netloc
                if domain and domain != self.base_domain:
                    self.external_urls.append(value)


def detect_html_external_urls(html_content: str) -> List[str]:
    """
    Detect third-party embedded resources in HTML content.

    Parameters
    ----------
    html_content : str
        The HTML content to analyze

    Returns
    -------
    List[str]
        List of external URLs found in the HTML content
    """
    parser = ExternalURLParser()
    parser.feed(html_content)
    return parser.external_urls


def detect_css_external_urls(css_content: str) -> List[str]:
    """
    Detect external URLs in CSS content.

    Parameters
    ----------
    css_content : str
        The CSS content to analyze

    Returns
    -------
    List[str]
        List of external URLs found in the CSS content
    """
    external_urls: List[str] = []
    # Regex to find URLs in CSS
    url_pattern = re.compile(r'url\(["\']?(.*?)["\']?\)')
    matches = url_pattern.findall(css_content)
    for match in matches:
        if match.startswith(("http://", "https://")) and not match.startswith(BASE_URL):
            external_urls.append(match)
    return external_urls


def all_files_in_dir(path: str) -> List[str]:
    """
    Get a list of all files in a directory and its subdirectories.

    Parameters
    ----------
    path : str
        The root directory path to search

    Returns
    -------
    List[str]
        List of full paths to all files found
    """
    return [os.path.join(root, file) for root, _, files in os.walk(path) for file in files]


def detect_urls(files: List[str], verbose: bool = False) -> List[str]:
    """
    Detect external URLs in the given HTML and CSS files.

    Parameters
    ----------
    files : List[str]
        List of file paths to check for external URLs
    verbose : bool, optional
        Whether to print verbose output, by default False

    Returns
    -------
    List[str]
        List of external URLs found in the files
    """

    external_urls: Set[str] = set()
    for file in files:
        f_detect: Optional[Callable[[str], List[str]]] = None
        if file.endswith(".html"):
            f_detect = detect_html_external_urls
        elif file.endswith(".css"):
            f_detect = detect_css_external_urls
        else:
            continue
        with open(file, "r", encoding="utf-8") as f:
            content = f.read()
        urls = f_detect(content)
        if verbose:
            print(f"Processing {file}")
            exist_urls, new_urls = 0, 0
            for url in urls:
                if url in external_urls:
                    exist_urls += 1
                else:
                    new_urls += 1
                    if verbose:
                        print(f"Found new {url}")
            print(f"Found {exist_urls} existing resources and {new_urls} new resources")
        external_urls.update(urls)
    if verbose:
        print(f"Total {len(external_urls)} external resources")
        print("External resources:")
        print("\n".join(external_urls))

    return list(external_urls)


def download_external_urls(
    external_urls: List[str], verbose: bool = False
) -> Tuple[Dict[str, str], List[str]]:
    """
    Download external URLs and save them to docs/_static/downloads.

    Parameters
    ----------
    external_urls : List[str]
        List of external URLs to download
    verbose : bool, optional
        Whether to print verbose output, by default False

    Returns
    -------
    Tuple[Dict[str, str], List[str]]
        A tuple containing:
        - Dictionary mapping original URLs to their downloaded file paths
        - List of paths to all downloaded files (including source maps)
    """
    download_dir = os.path.join(HTML_DIR, "_static/downloads")
    os.makedirs(download_dir, exist_ok=True)
    used_file_names: Set[str] = set()
    downloaded_files: List[str] = []
    remap_urls: Dict[str, str] = {}
    for url in external_urls:
        query = urlparse(url).query
        if url.startswith("https://fonts.googleapis.com/css2"):
            file_name = f"{hashlib.md5(url.encode()).hexdigest()}.css"
        elif query:
            raise ValueError(f"Unsupported URL with query: {url}")
        else:
            file_name = urlparse(url).path.split("/")[-1]
        if verbose:
            print(f"remapping {url} to {file_name}")
        if file_name in used_file_names:
            raise ValueError(f"File name {file_name} already exists")
        used_file_names.add(file_name)
        response = requests.get(url, timeout=30)
        body = response.content
        with open(os.path.join(download_dir, file_name), "wb") as f:
            f.write(body)
        remap_urls[url] = os.path.join(download_dir, file_name)
        downloaded_files.append(os.path.join(download_dir, file_name))

        # Also download the sourceMappingURL
        if not url.startswith("https://fonts.googleapis.com/css2"):
            map_file_name = f"{file_name}.map"
            response = requests.get(f"{url}.map", timeout=30)
            if response.status_code == 200:
                body = response.content
                with open(os.path.join(download_dir, map_file_name), "wb") as f:
                    f.write(body)
                if verbose:
                    print(f"Downloaded {map_file_name} for {url}")
                downloaded_files.append(os.path.join(download_dir, map_file_name))

    return remap_urls, downloaded_files


def replace_urls_in_files(remap_urls: Dict[str, str], verbose: bool = False):
    """
    Replace external URLs with their downloaded versions in HTML/CSS files.

    Parameters
    ----------
    remap_urls : Dict[str, str]
        Dictionary mapping original URLs to their downloaded file paths
    verbose : bool, optional
        Whether to print verbose output, by default False
    """
    for root, _, files in os.walk(HTML_DIR):
        for file in files:
            if not (file.endswith(".html") or file.endswith(".css")):
                continue

            file_path = os.path.join(root, file)
            if verbose:
                print(f"Processing {file_path}")

            # Calculate relative path from current file to _static/downloads
            rel_path = os.path.relpath(
                os.path.join(HTML_DIR, "_static/downloads"), os.path.dirname(file_path)
            )

            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            new_content = content
            for original_url, new_path in remap_urls.items():
                relative_url = os.path.join(rel_path, os.path.basename(new_path))
                new_content = new_content.replace(original_url, relative_url)

            if new_content != content:
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(new_content)
                if verbose:
                    print(f"Updated {file_path}")


def download_and_replace_urls(files: Optional[List[str]] = None, verbose: bool = False):
    """
    Download external URLs found in files and replace them with local copies.
    Recursively processes any new external URLs found in downloaded content.

    Parameters
    ----------
    files : Optional[List[str]], optional
        List of files to check for external URLs. If None, checks all files under HTML_DIR
    verbose : bool, optional
        Whether to print verbose output, by default False
    """
    if files is None:
        files = all_files_in_dir(HTML_DIR)
    remap_urls = {}
    while True:
        external_urls = detect_urls(files, verbose=verbose)
        if not external_urls:
            break
        round_remap_urls, files = download_external_urls(external_urls, verbose=verbose)
        remap_urls.update(round_remap_urls)

    replace_urls_in_files(remap_urls, verbose=verbose)


if __name__ == "__main__":
    args = argparse.ArgumentParser()
    args.add_argument("-v", "--verbose", action="store_true")
    args = args.parse_args()
    download_and_replace_urls(verbose=args.verbose)