-
Notifications
You must be signed in to change notification settings - Fork 26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[FDS-2127] Update URL validation to use requests.options to verify connectivity #1469
Changes from all commits
6778273
a8b413d
847593e
e48b0fd
38e635a
6ac74a8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,15 @@ | ||
import builtins | ||
import logging | ||
import re | ||
from copy import deepcopy | ||
from time import perf_counter | ||
|
||
# allows specifying explicit variable types | ||
from typing import Any, Literal, Optional, Union | ||
from urllib import error | ||
from urllib.parse import urlparse | ||
from urllib.request import Request, urlopen | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import requests | ||
from jsonschema import ValidationError | ||
from synapseclient.core.exceptions import SynapseNoCredentialsError | ||
|
||
|
@@ -1127,16 +1125,16 @@ def type_validation( | |
def url_validation( | ||
self, | ||
val_rule: str, | ||
manifest_col: str, | ||
manifest_col: pd.Series, | ||
) -> tuple[list[list[str]], list[list[str]]]: | ||
""" | ||
Purpose: | ||
Validate URL's submitted for a particular attribute in a manifest. | ||
Determine if the URL is valid and contains attributes specified in the | ||
schema. | ||
schema. Additionally, the server must be reachable to be deemed as valid. | ||
Input: | ||
- val_rule: str, Validation rule | ||
- manifest_col: pd.core.series.Series, column for a given | ||
- manifest_col: pd.Series, column for a given | ||
attribute in the manifest | ||
Output: | ||
This function will return errors when the user input value | ||
|
@@ -1154,8 +1152,9 @@ def url_validation( | |
) | ||
if entry_has_value: | ||
# Check if a random phrase, string or number was added and | ||
# log the appropriate error. Specifically, Raise an error if the value added is not a string or no part | ||
# of the string can be parsed as a part of a URL. | ||
# log the appropriate error. Specifically, Raise an error if the value | ||
# added is not a string or no part of the string can be parsed as a | ||
# part of a URL. | ||
if not isinstance(url, str) or not ( | ||
urlparse(url).scheme | ||
+ urlparse(url).netloc | ||
|
@@ -1186,10 +1185,13 @@ def url_validation( | |
try: | ||
# Check that the URL points to a working webpage | ||
# if not log the appropriate error. | ||
request = Request(url) | ||
response = urlopen(request) | ||
valid_url = True | ||
response_code = response.getcode() | ||
response = requests.options(url, allow_redirects=True) | ||
logger.debug( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For the FAIR team, was there a specific reason to use I think something like this works too
note: I prefer using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. I also prefer using requests package. What Brian has there seems perfect to me. |
||
"Validated URL [URL: %s, status_code: %s]", | ||
url, | ||
response.status_code, | ||
) | ||
except: | ||
valid_url = False | ||
url_error = "invalid_url" | ||
|
@@ -1207,7 +1209,7 @@ def url_validation( | |
errors.append(vr_errors) | ||
if vr_warnings: | ||
warnings.append(vr_warnings) | ||
if valid_url == True: | ||
if valid_url: | ||
# If the URL works, check to see if it contains the proper arguments | ||
# as specified in the schema. | ||
for arg in url_args: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -142,3 +142,10 @@ def temporary_file_copy(request, helpers: Helpers) -> Generator[str, None, None] | |
# Teardown | ||
if os.path.exists(temp_csv_path): | ||
os.remove(temp_csv_path) | ||
|
||
|
||
@pytest.fixture(name="dmge", scope="function") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I moved this up to conftest so I could use it in my new tests as well |
||
def DMGE(helpers: Helpers) -> DataModelGraphExplorer: | ||
"""Fixture to instantiate a DataModelGraphExplorer object.""" | ||
dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld") | ||
return dmge |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import pandas as pd | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the start of moving some files into an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I appreciate the additive nature of these changes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks Bryan! I like the change. |
||
|
||
from schematic.models.validate_attribute import ValidateAttribute | ||
from schematic.schemas.data_model_graph import DataModelGraphExplorer | ||
|
||
CHECK_URL_NODE_NAME = "Check URL" | ||
VALIDATION_RULE_URL = "url" | ||
|
||
|
||
class TestValidateAttribute: | ||
"""Integration tests for the ValidateAttribute class.""" | ||
|
||
def test_url_validation_valid_url(self, dmge: DataModelGraphExplorer) -> None: | ||
# GIVEN a valid URL: | ||
url = "https://github.com/Sage-Bionetworks/schematic" | ||
|
||
# AND a pd.core.series.Series that contains this URL | ||
content = pd.Series(data=[url], name=CHECK_URL_NODE_NAME) | ||
|
||
# AND a validation attribute | ||
validator = ValidateAttribute(dmge=dmge) | ||
|
||
# WHEN the URL is validated | ||
result = validator.url_validation( | ||
val_rule=VALIDATION_RULE_URL, manifest_col=content | ||
) | ||
|
||
# THEN the result should pass validation | ||
assert result == ([], []) | ||
|
||
def test_url_validation_valid_doi(self, dmge: DataModelGraphExplorer) -> None: | ||
# GIVEN a valid URL: | ||
url = "https://doi.org/10.1158/0008-5472.can-23-0128" | ||
|
||
# AND a pd.core.series.Series that contains this URL | ||
content = pd.Series(data=[url], name=CHECK_URL_NODE_NAME) | ||
|
||
# AND a validation attribute | ||
validator = ValidateAttribute(dmge=dmge) | ||
|
||
# WHEN the URL is validated | ||
result = validator.url_validation( | ||
val_rule=VALIDATION_RULE_URL, manifest_col=content | ||
) | ||
|
||
# THEN the result should pass validation | ||
assert result == ([], []) | ||
|
||
def test_url_validation_invalid_url(self, dmge: DataModelGraphExplorer) -> None: | ||
# GIVEN an invalid URL: | ||
url = "http://googlef.com/" | ||
|
||
# AND a pd.core.series.Series that contains this URL | ||
content = pd.Series(data=[url], name=CHECK_URL_NODE_NAME) | ||
|
||
# AND a validation attribute | ||
validator = ValidateAttribute(dmge=dmge) | ||
|
||
# WHEN the URL is validated | ||
result = validator.url_validation( | ||
val_rule=VALIDATION_RULE_URL, manifest_col=content | ||
) | ||
|
||
# THEN the result should not pass validation | ||
assert result == ( | ||
[ | ||
[ | ||
"2", | ||
"Check URL", | ||
"For the attribute 'Check URL', on row 2, the URL provided (http://googlef.com/) does not conform to the standards of a URL. Please make sure you are entering a real, working URL as required by the Schema.", | ||
"http://googlef.com/", | ||
] | ||
], | ||
[], | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have a markdown plugin in VSCODE that automatically creates/updates these. If you don't wish to have this update I will delete it from the PR.