Bug #23 fixed

artiomn · Apr 27, 2024 · 9c4ca86 · 9c4ca86
1 parent 69bf14d
commit 9c4ca86
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 5 deletions.
diff --git a/markdown_toolset/image_downloader.py b/markdown_toolset/image_downloader.py
@@ -137,12 +137,16 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
                         )
                         continue
 
+                    logging.debug('Image is URL: %s', is_url(image_download_url))
+
                     image_filename, image_content = (
                         self._get_remote_image(image_download_url, image_num, images_count)
                         if is_url(image_download_url)
                         else ImageDownloader._get_local_image(Path(image_download_url))
                     )
 
+                    logging.debug('Guessed image filename: %s', image_filename)
+
                     if image_filename is None:
                         logging.warning(
                             'Empty image filename, probably this is incorrect link: "%s".', image_download_url

diff --git a/markdown_toolset/www_tools.py b/markdown_toolset/www_tools.py
@@ -7,6 +7,7 @@
 from mimetypes import guess_extension
 import os
 import re
+from urllib.parse import urlparse, urlunparse
 import requests
 
 from .string_tools import slugify
@@ -15,8 +16,7 @@
 NECESSARY_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0'}
 
 __protocol_prefix_replace_regex = re.compile(r'^\s*(:?(?:(?:http|ftp)+s?|file)://)', re.IGNORECASE)
-
-# TODO: Use urllib!!!
+__protocol_prefix_slashes_replace_regex = re.compile(r'^\s*:?//', re.IGNORECASE)
 
 
 def is_url(url: str, allowed_url_prefixes=('http', 'ftp', 'https', 'ftps')) -> bool:
@@ -37,7 +37,7 @@ def remove_protocol_prefix(url: str) -> str:
     Remove prefixes like http, ftp, HTTPS, and other from the URL.
     """
 
-    return __protocol_prefix_replace_regex.sub('', url)
+    return __protocol_prefix_slashes_replace_regex.sub('', str(urlunparse(urlparse(url)._replace(scheme=''))))
 
 
 def download_from_url(url: str, timeout: float = None):
@@ -48,6 +48,7 @@ def download_from_url(url: str, timeout: float = None):
     :raise OSError: when HTTP status is not 200.
     """
 
+    # todo: Add urlparse()?
     url = url.split()[0]
 
     try:
@@ -70,8 +71,11 @@ def get_filename_from_url(req: requests.Response) -> Optional[str]:
     Get filename from url and, if not found, try to get from content-disposition.
     """
 
+    logging.debug('URL from request: %s', req.url)
+
     if req and req.url.find('/'):
-        result = req.url.rsplit('/', 1)[1]
+        result = urlparse(req.url).path
+        logging.debug('Filename from URL: %s', result)
     else:
         cd = req.headers.get('content-disposition')
 
@@ -80,6 +84,8 @@ def get_filename_from_url(req: requests.Response) -> Optional[str]:
 
         file_name = re.findall('filename=(.+)', cd)
 
+        logging.debug('Filename from "filename=" part: %s', file_name)
+
         if len(file_name) == 0:
             return None
 

diff --git a/tests/data/image_mime_incorrect.md b/tests/data/image_mime_incorrect.md
@@ -0,0 +1,3 @@
+![](https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpicx.zhimg.com%2F50%2Fv2-53de590b6bb3f42d1a06d28c806c698d_720w.jpg%3Fsource%3D1940ef5c)
+![](https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpica.zhimg.com%2F50%2Fv2-872d10f75dfa52172835fe6fbf22c5fe_720w.jpg%3Fsource%3D1940ef5c)
+![](https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpic1.zhimg.com%2F50%2Fv2-c4b89a30d2a3fe1897cfe24388ec935e_720w.jpg%3Fsource%3D1940ef5c)
diff --git a/tests/test_www_tools.py b/tests/test_www_tools.py
@@ -1,4 +1,6 @@
-from markdown_toolset.www_tools import remove_protocol_prefix, is_url
+import requests
+
+from markdown_toolset.www_tools import remove_protocol_prefix, is_url, get_filename_from_url, download_from_url
 
 
 class TestProtocolPrefixesFunctions:
@@ -18,3 +20,22 @@ def test_url_checker(self):
         assert is_url('Https://test') == True  # noqa
         assert is_url('FTPS://test') == True  # noqa
         assert is_url('file://test') == False  # noqa
+
+    def test_get_filename_from_url(self):
+        # Mock response.
+        req = requests.Response()
+        req.status_code = 200
+        req.headers['content-type'] = 'image/jpg'
+
+        req.url = 'https://image.cubox.pro/cardImg/26p25dhia8yismewd0i3zptqzluz1ydufavhzlog6yjr6b6yle.jpg?imageMogr2/quality/90/ignore-error/1'
+        assert get_filename_from_url(req) == 'cardimg26p25dhia8yismewd0i3zptqzluz1ydufavhzlog6yjr6b6yle.jpg'
+
+        req.url = 'https://image.cubox.pro/cardImg/53fjbjlzb8a72slatcat03qmae7rw44qh3rvyck9548bqg06a2.jpg?imageMogr2/quality/90/ignore-error/1'
+        assert get_filename_from_url(req) == 'cardimg53fjbjlzb8a72slatcat03qmae7rw44qh3rvyck9548bqg06a2.jpg'
+
+        url = (
+            'https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpic1.zhimg.com'
+            '%2F50%2Fv2-c4b89a30d2a3fe1897cfe24388ec935e_720w.jpg%3Fsource%3D1940ef5c'
+        )
+        req = download_from_url(url)
+        assert get_filename_from_url(req) == 'cardimgo2sqp98phc0gflafoxr829sjojo4vouo8twjaqycdtakasiqc.jpg'