From f0a5ba9fe098439660fc2a221b42e3405398c7c4 Mon Sep 17 00:00:00 2001 From: Maria Carmina Date: Thu, 16 Nov 2023 18:45:26 +0200 Subject: [PATCH 1/3] Fix fileinfo for arweave file. Added check for file path validation inside UrlFile class. --- ocean_provider/file_types/definitions.py | 22 +++++----------------- ocean_provider/file_types/file_types.py | 12 ++++++++++++ ocean_provider/utils/test/test_util.py | 11 +++++++++++ tests/test_fileinfo.py | 1 + 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/ocean_provider/file_types/definitions.py b/ocean_provider/file_types/definitions.py index 883cd48d..ccc45b26 100644 --- a/ocean_provider/file_types/definitions.py +++ b/ocean_provider/file_types/definitions.py @@ -72,7 +72,10 @@ def check_details(self, with_checksum=False): if result: status_code = result.status_code headers = copy.deepcopy(result.headers) - files_url = copy.deepcopy(result.url) + if "arweave" in result.url: + files_url = "" + else: + files_url = copy.deepcopy(result.url) # always close requests session, see https://requests.readthedocs.io/en/latest/user/advanced/#body-content-workflow result.close() if status_code == 200: @@ -85,18 +88,7 @@ def check_details(self, with_checksum=False): file_name = None if files_url: - filename = urlparse(files_url).path.split("/")[-1] - try: - if not self._validate_filename(filename): - msg = "Invalid file name format. It was not possible to get the file name." - logger.error(msg) - return False, {"error": msg} - - file_name = filename - except Exception as e: - msg = f"It was not possible to get the file name. {e}" - logger.warning(msg) - return False, {"error": msg} + file_name = urlparse(files_url).path.split("/")[-1] if not content_length and content_range: # sometimes servers send content-range instead @@ -128,10 +120,6 @@ def check_details(self, with_checksum=False): return False, {} - def _validate_filename(self, header: str) -> bool: - pattern = re.compile(r"\\|\.\.|/") - return not bool(pattern.findall(header)) - def _get_result_from_url(self, with_checksum=False): func, func_args = self._get_func_and_args() diff --git a/ocean_provider/file_types/file_types.py b/ocean_provider/file_types/file_types.py index dea36f39..81591742 100644 --- a/ocean_provider/file_types/file_types.py +++ b/ocean_provider/file_types/file_types.py @@ -1,6 +1,7 @@ import json import logging import os +import re from typing import Any, Optional, Tuple from urllib.parse import urljoin from uuid import uuid4 @@ -34,11 +35,22 @@ def validate_dict(self) -> Tuple[bool, Any]: if self.method not in ["get", "post"]: return False, f"Unsafe method {self.method}." + if not self.validate_url(self.url): + msg = "Invalid file name format. It was not possible to get the file name." + logger.error(msg) + return False, msg + return True, self def get_download_url(self): return self.url + def validate_url(self, url: str) -> bool: + pattern = re.compile(r"^(.+)\/([^/]+)$") + if url.startswith("http://") or url.startswith("https://"): + return True + return not bool(pattern.findall(url)) + @enforce_types def get_filename(self) -> str: return self.url.split("/")[-1] diff --git a/ocean_provider/utils/test/test_util.py b/ocean_provider/utils/test/test_util.py index 7189251d..19d71f7f 100644 --- a/ocean_provider/utils/test/test_util.py +++ b/ocean_provider/utils/test/test_util.py @@ -390,6 +390,17 @@ def test_validate_url_object(): assert result is False assert message == "Unsafe method delete." + url_object = { + "url": "./../dir", + "type": "url", + "method": "GET", + } + result, message = FilesTypeFactory.validate_and_create(url_object) + assert result is False + assert ( + message == "Invalid file name format. It was not possible to get the file name." + ) + @pytest.mark.unit def test_build_download_response_ipfs(): diff --git a/tests/test_fileinfo.py b/tests/test_fileinfo.py index 84069965..adb4a90a 100644 --- a/tests/test_fileinfo.py +++ b/tests/test_fileinfo.py @@ -171,6 +171,7 @@ def test_check_arweave_good(client): assert file_info["contentType"] == "application/octet-stream" assert file_info["valid"] is True assert file_info["type"] == "arweave" + assert file_info["name"] == "" @pytest.mark.unit From eff94e1d7e0f541dc15089b153c31690a82b2482 Mon Sep 17 00:00:00 2001 From: Maria Carmina Date: Thu, 16 Nov 2023 19:04:48 +0200 Subject: [PATCH 2/3] Fix key for tests. --- tests/test_fileinfo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_fileinfo.py b/tests/test_fileinfo.py index adb4a90a..119bb0a7 100644 --- a/tests/test_fileinfo.py +++ b/tests/test_fileinfo.py @@ -171,7 +171,7 @@ def test_check_arweave_good(client): assert file_info["contentType"] == "application/octet-stream" assert file_info["valid"] is True assert file_info["type"] == "arweave" - assert file_info["name"] == "" + assert file_info["filename"] == "" @pytest.mark.unit From dc655533e45f70d4549d2f873d214383daa8dcac Mon Sep 17 00:00:00 2001 From: Maria Carmina Date: Thu, 16 Nov 2023 19:50:59 +0200 Subject: [PATCH 3/3] Overwrite check_details in Arweave class. --- ocean_provider/file_types/definitions.py | 6 +- ocean_provider/file_types/file_types.py | 72 +++++++++++++++++++++++- 2 files changed, 72 insertions(+), 6 deletions(-) diff --git a/ocean_provider/file_types/definitions.py b/ocean_provider/file_types/definitions.py index ccc45b26..4a487cc7 100644 --- a/ocean_provider/file_types/definitions.py +++ b/ocean_provider/file_types/definitions.py @@ -4,7 +4,6 @@ import logging import mimetypes import os -import re from abc import abstractmethod from cgi import parse_header from typing import Protocol, Tuple @@ -72,10 +71,7 @@ def check_details(self, with_checksum=False): if result: status_code = result.status_code headers = copy.deepcopy(result.headers) - if "arweave" in result.url: - files_url = "" - else: - files_url = copy.deepcopy(result.url) + files_url = copy.deepcopy(result.url) # always close requests session, see https://requests.readthedocs.io/en/latest/user/advanced/#body-content-workflow result.close() if status_code == 200: diff --git a/ocean_provider/file_types/file_types.py b/ocean_provider/file_types/file_types.py index 81591742..9592c2f0 100644 --- a/ocean_provider/file_types/file_types.py +++ b/ocean_provider/file_types/file_types.py @@ -2,12 +2,15 @@ import logging import os import re +import copy from typing import Any, Optional, Tuple -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse from uuid import uuid4 from enforce_typing import enforce_types from ocean_provider.file_types.definitions import EndUrlType, FilesType +import requests +from ocean_provider.utils.url import is_safe_url logger = logging.getLogger(__name__) @@ -118,6 +121,73 @@ def get_download_url(self): def get_filename(self): return uuid4().hex + def check_details(self, with_checksum=False): + """ + If the url argument is invalid, returns False and empty dictionary. + Otherwise it returns True and a dictionary containing contentType and + contentLength. File name remains empty. + """ + + url = self.get_download_url() + + try: + if not is_safe_url(url): + return False, {} + status_code = None + headers = None + files_url = None + for _ in range(int(os.getenv("REQUEST_RETRIES", 1))): + result, extra_data = self._get_result_from_url( + with_checksum=with_checksum, + ) + if result: + status_code = result.status_code + headers = copy.deepcopy(result.headers) + files_url = "" + # always close requests session, see https://requests.readthedocs.io/en/latest/user/advanced/#body-content-workflow + result.close() + if status_code == 200: + break + + if status_code == 200: + content_type = headers.get("Content-Type") + content_length = headers.get("Content-Length") + content_range = headers.get("Content-Range") + file_name = None + + if files_url: + file_name = urlparse(files_url).path.split("/")[-1] + + if not content_length and content_range: + # sometimes servers send content-range instead + try: + content_length = content_range.split("-")[1] + except IndexError: + pass + + if content_type: + try: + content_type = content_type.split(";")[0] + except IndexError: + pass + + if content_type or content_length or file_name: + details = { + "contentLength": content_length or "", + "contentType": content_type or "", + "filename": file_name or "", + } + + if extra_data: + details.update(extra_data) + + self.checked_details = details + return True, details + except requests.exceptions.RequestException: + pass + + return False, {} + class GraphqlQuery(EndUrlType, FilesType): @enforce_types