Skip to content

Commit

Permalink
Bug #23 fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
Artiom N. committed Apr 27, 2024
1 parent 69bf14d commit 9c4ca86
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 5 deletions.
4 changes: 4 additions & 0 deletions markdown_toolset/image_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,16 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict:
)
continue

logging.debug('Image is URL: %s', is_url(image_download_url))

image_filename, image_content = (
self._get_remote_image(image_download_url, image_num, images_count)
if is_url(image_download_url)
else ImageDownloader._get_local_image(Path(image_download_url))
)

logging.debug('Guessed image filename: %s', image_filename)

if image_filename is None:
logging.warning(
'Empty image filename, probably this is incorrect link: "%s".', image_download_url
Expand Down
14 changes: 10 additions & 4 deletions markdown_toolset/www_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from mimetypes import guess_extension
import os
import re
from urllib.parse import urlparse, urlunparse
import requests

from .string_tools import slugify
Expand All @@ -15,8 +16,7 @@
NECESSARY_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0'}

__protocol_prefix_replace_regex = re.compile(r'^\s*(:?(?:(?:http|ftp)+s?|file)://)', re.IGNORECASE)

# TODO: Use urllib!!!
__protocol_prefix_slashes_replace_regex = re.compile(r'^\s*:?//', re.IGNORECASE)


def is_url(url: str, allowed_url_prefixes=('http', 'ftp', 'https', 'ftps')) -> bool:
Expand All @@ -37,7 +37,7 @@ def remove_protocol_prefix(url: str) -> str:
Remove protocol prefixes such as http, ftp, HTTPS, and others from the URL.
"""

return __protocol_prefix_replace_regex.sub('', url)
return __protocol_prefix_slashes_replace_regex.sub('', str(urlunparse(urlparse(url)._replace(scheme=''))))


def download_from_url(url: str, timeout: float = None):
Expand All @@ -48,6 +48,7 @@ def download_from_url(url: str, timeout: float = None):
:raise OSError: when HTTP status is not 200.
"""

# todo: Add urlparse()?
url = url.split()[0]

try:
Expand All @@ -70,8 +71,11 @@ def get_filename_from_url(req: requests.Response) -> Optional[str]:
Get the filename from the URL and, if it is not found there, try to get it from the Content-Disposition header.
"""

logging.debug('URL from request: %s', req.url)

if req and req.url.find('/'):
result = req.url.rsplit('/', 1)[1]
result = urlparse(req.url).path
logging.debug('Filename from URL: %s', result)
else:
cd = req.headers.get('content-disposition')

Expand All @@ -80,6 +84,8 @@ def get_filename_from_url(req: requests.Response) -> Optional[str]:

file_name = re.findall('filename=(.+)', cd)

logging.debug('Filename from "filename=" part: %s', file_name)

if len(file_name) == 0:
return None

Expand Down
3 changes: 3 additions & 0 deletions tests/data/image_mime_incorrect.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
![](https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpicx.zhimg.com%2F50%2Fv2-53de590b6bb3f42d1a06d28c806c698d_720w.jpg%3Fsource%3D1940ef5c)
![](https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpica.zhimg.com%2F50%2Fv2-872d10f75dfa52172835fe6fbf22c5fe_720w.jpg%3Fsource%3D1940ef5c)
![](https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpic1.zhimg.com%2F50%2Fv2-c4b89a30d2a3fe1897cfe24388ec935e_720w.jpg%3Fsource%3D1940ef5c)
23 changes: 22 additions & 1 deletion tests/test_www_tools.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from markdown_toolset.www_tools import remove_protocol_prefix, is_url
import requests

from markdown_toolset.www_tools import remove_protocol_prefix, is_url, get_filename_from_url, download_from_url


class TestProtocolPrefixesFunctions:
Expand All @@ -18,3 +20,22 @@ def test_url_checker(self):
assert is_url('Https://test') == True # noqa
assert is_url('FTPS://test') == True # noqa
assert is_url('file://test') == False # noqa

def test_get_filename_from_url(self):
# Check get_filename_from_url() against hand-built responses and one downloaded response.
# Mock response: a bare requests.Response on which only the status code,
# the content-type header and (below) the URL are set by hand.
req = requests.Response()
req.status_code = 200
req.headers['content-type'] = 'image/jpg'

# The URL carries a query string after the ".jpg" name; the expected filename
# drops the query, is lower-cased and has the path separators removed.
req.url = 'https://image.cubox.pro/cardImg/26p25dhia8yismewd0i3zptqzluz1ydufavhzlog6yjr6b6yle.jpg?imageMogr2/quality/90/ignore-error/1'
assert get_filename_from_url(req) == 'cardimg26p25dhia8yismewd0i3zptqzluz1ydufavhzlog6yjr6b6yle.jpg'

# Same URL shape with a different image name.
req.url = 'https://image.cubox.pro/cardImg/53fjbjlzb8a72slatcat03qmae7rw44qh3rvyck9548bqg06a2.jpg?imageMogr2/quality/90/ignore-error/1'
assert get_filename_from_url(req) == 'cardimg53fjbjlzb8a72slatcat03qmae7rw44qh3rvyck9548bqg06a2.jpg'

# URL whose path has no file name; the image location is passed in the
# percent-encoded "imageUrl" query parameter.
url = (
'https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpic1.zhimg.com'
'%2F50%2Fv2-c4b89a30d2a3fe1897cfe24388ec935e_720w.jpg%3Fsource%3D1940ef5c'
)
# NOTE(review): this case goes through download_from_url(), so it presumably
# needs network access. The expected name does not appear in the requested
# URL, so it presumably comes from the final (redirected) response URL —
# confirm, and consider whether the remote value is stable.
req = download_from_url(url)
assert get_filename_from_url(req) == 'cardimgo2sqp98phc0gflafoxr829sjojo4vouo8twjaqycdtakasiqc.jpg'

0 comments on commit 9c4ca86

Please sign in to comment.