Skip to content

Commit

Permalink
correctly escape the url
Browse files Browse the repository at this point in the history
  • Loading branch information
Frefreak committed Aug 22, 2024
1 parent 6fab8df commit 1652eb4
Showing 1 changed file with 18 additions and 2 deletions.
20 changes: 18 additions & 2 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import List, Tuple
from shutil import copytree, rmtree
from urllib.request import urlretrieve
from urllib.parse import urlsplit, quote, urlunsplit

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
Expand Down Expand Up @@ -139,7 +140,7 @@ def get_zip_and_extract(browser):

eprint(f"downloading {name} with url {url}")

urlretrieve(url, ZIP_FILE)
urlretrieve(escape_url(url), ZIP_FILE)
ignore_files = [
".git",
".github",
Expand Down Expand Up @@ -200,9 +201,24 @@ def get_changelog(browser):
r = date
eprint(f"downloading {name} with url {url}")

urlretrieve(url, CHANGELOG_FILE)
urlretrieve(escape_url(url), CHANGELOG_FILE)
return r

def escape_url(url: str) -> str:
    """Return *url* with its path and query percent-encoded.

    The URL is split into components; the scheme, network location and
    fragment are passed through unchanged, while the path and the query
    are percent-encoded so that spaces and non-ASCII characters become
    ``%XX`` escapes.

    Escapes that are already present in *url* (``%`` followed by two hex
    digits) are preserved rather than encoded a second time, so the
    function is idempotent: ``escape_url(escape_url(u)) == escape_url(u)``.
    A ``%`` that does not start a valid escape is treated as literal data
    and encoded as ``%25``.

    Args:
        url: The URL to escape; it may be raw, partially escaped, or
            fully escaped.

    Returns:
        The reassembled URL with an escaped path and query.
    """
    # Function-scope import keeps this block self-contained.
    import re

    def _encode(component: str, safe: str) -> str:
        # First turn stray "%" characters (not followed by two hex digits)
        # into "%25"; afterwards every remaining "%" starts a valid escape,
        # so "%" can be marked safe to stop quote() from double-encoding it.
        component = re.sub(r"%(?![0-9A-Fa-f]{2})", "%25", component)
        return quote(component, safe=safe + "%")

    url_parts = urlsplit(url)

    # Reassemble the URL; "/" stays literal in the path, and "=" / "&"
    # stay literal in the query so key/value structure is kept intact.
    return urlunsplit((
        url_parts.scheme,
        url_parts.netloc,
        _encode(url_parts.path, "/"),
        _encode(url_parts.query, "=&"),
        url_parts.fragment,
    ))

def main():
chrome_options = Options()
Expand Down

0 comments on commit 1652eb4

Please sign in to comment.