diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..be1da87d --- /dev/null +++ b/.env.example @@ -0,0 +1,28 @@ +## CREDENTIALS +# can also be configured using the other standard methods for the provider, e.g. logging in to their CLI +AWS_ACCESS_KEY_ID=your_access_key_id +AWS_SECRET_ACCESS_KEY=your_secret_access_key + +AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=your_account_name;AccountKey=your_account_key;EndpointSuffix=core.windows.net + +GOOGLE_APPLICATION_CREDENTIALS=.gscreds.json +# or +GCP_PROJECT_ID=your_project_id +GCP_SA_KEY=your_service_account_key + +# Custom S3, e.g. MinIO +CUSTOM_S3_KEY_ID=your_custom_s3_key_id +CUSTOM_S3_SECRET_KEY=your_custom_s3_secret_key + + +## BUCKETS +# Used when running live tests; you will need read/write access to these buckets to run those tests +LIVE_S3_BUCKET=a-bucket-you-can-access + +LIVE_AZURE_CONTAINER=a-container-you-can-access + +LIVE_GS_BUCKET=a-bucket-you-can-access + +# Custom S3, e.g. MinIO +CUSTOM_S3_BUCKET=a-bucket-you-can-access +CUSTOM_S3_ENDPOINT=your_custom_s3_endpoint diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aa517758..c309ea2f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,6 +81,11 @@ Finally, you may want to run your tests against live servers to ensure that the make test-live-cloud ``` +You can copy `.env.example` to `.env` and fill in the credentials and bucket/container names for the providers you want to test against. **Note that the live tests will create and delete files on the cloud provider.** + +You can also skip providers you do not have accounts for by commenting them out in the `rig` and `s3_like_rig` variables defined at the end of `tests/conftest.py`. + + ### Test rigs Since we want behavior parity across providers, nearly all of the tests are written in a provider-agnositc way. Each test is passed a test rig as a fixture, and the rig provides the correct way for generating cloudpaths for testing. 
The test rigs are defined in [`conftest.py`](tests/conftest.py). @@ -135,6 +140,8 @@ To build the latest version of the documentation, you can run: make docs ``` +If you add/remove a method on the `CloudPath` class or any subclass, run `python docs/make_support_table.py` and paste the updated table into README.md. + ### Serving While you are developing, you can serve a local version of the docs to see what your changes look like. This will auto-reload for most changes to the docs: diff --git a/HISTORY.md b/HISTORY.md index 051266ba..1cfa97a6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## UNRELEASED - Implement sliced downloads in GSClient. (Issue [#387](https://github.com/drivendataorg/cloudpathlib/issues/387), PR [#389](https://github.com/drivendataorg/cloudpathlib/pull/389)) +- Implement `as_url` with presigned parameter for all backends. (Issue [#235](https://github.com/drivendataorg/cloudpathlib/issues/235), PR [#236](https://github.com/drivendataorg/cloudpathlib/pull/236)) ## 0.17.0 (2023-12-21) diff --git a/README.md b/README.md index ffde2073..c421ee60 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,6 @@ Most methods and properties from `pathlib.Path` are supported except for the one | `is_absolute` | ✅ | ✅ | ✅ | | `is_dir` | ✅ | ✅ | ✅ | | `is_file` | ✅ | ✅ | ✅ | -| `is_junction` | ✅ | ✅ | ✅ | | `is_relative_to` | ✅ | ✅ | ✅ | | `iterdir` | ✅ | ✅ | ✅ | | `joinpath` | ✅ | ✅ | ✅ | @@ -161,9 +160,7 @@ Most methods and properties from `pathlib.Path` are supported except for the one | `suffixes` | ✅ | ✅ | ✅ | | `touch` | ✅ | ✅ | ✅ | | `unlink` | ✅ | ✅ | ✅ | -| `walk` | ✅ | ✅ | ✅ | | `with_name` | ✅ | ✅ | ✅ | -| `with_segments` | ✅ | ✅ | ✅ | | `with_stem` | ✅ | ✅ | ✅ | | `with_suffix` | ✅ | ✅ | ✅ | | `write_bytes` | ✅ | ✅ | ✅ | @@ -183,11 +180,13 @@ Most methods and properties from `pathlib.Path` are supported except for the one | `is_socket` | ❌ | ❌ | ❌ | | `is_symlink` | ❌ | ❌ | ❌ | | `lchmod` | ❌ | ❌ | ❌ | +| 
`link_to` | ❌ | ❌ | ❌ | | `lstat` | ❌ | ❌ | ❌ | | `owner` | ❌ | ❌ | ❌ | | `readlink` | ❌ | ❌ | ❌ | | `root` | ❌ | ❌ | ❌ | | `symlink_to` | ❌ | ❌ | ❌ | +| `as_url` | ✅ | ✅ | ✅ | | `clear_cache` | ✅ | ✅ | ✅ | | `cloud_prefix` | ✅ | ✅ | ✅ | | `copy` | ✅ | ✅ | ✅ | @@ -195,10 +194,13 @@ Most methods and properties from `pathlib.Path` are supported except for the one | `download_to` | ✅ | ✅ | ✅ | | `etag` | ✅ | ✅ | ✅ | | `fspath` | ✅ | ✅ | ✅ | +| `is_junction` | ✅ | ✅ | ✅ | | `is_valid_cloudpath` | ✅ | ✅ | ✅ | | `rmtree` | ✅ | ✅ | ✅ | | `upload_from` | ✅ | ✅ | ✅ | | `validate` | ✅ | ✅ | ✅ | +| `walk` | ✅ | ✅ | ✅ | +| `with_segments` | ✅ | ✅ | ✅ | | `blob` | ✅ | ❌ | ✅ | | `bucket` | ❌ | ✅ | ✅ | | `container` | ✅ | ❌ | ❌ | diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py index 4db96183..dcda3e6a 100644 --- a/cloudpathlib/azure/azblobclient.py +++ b/cloudpathlib/azure/azblobclient.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta import mimetypes import os from pathlib import Path, PurePosixPath @@ -14,7 +14,13 @@ try: from azure.core.exceptions import ResourceNotFoundError - from azure.storage.blob import BlobServiceClient, BlobProperties, ContentSettings + from azure.storage.blob import ( + BlobSasPermissions, + BlobServiceClient, + BlobProperties, + ContentSettings, + generate_blob_sas, + ) except ModuleNotFoundError: implementation_registry["azure"].dependencies_loaded = False @@ -271,5 +277,25 @@ def _upload_file( return cloud_path + def _get_public_url(self, cloud_path: AzureBlobPath) -> str: + blob_client = self.service_client.get_blob_client( + container=cloud_path.container, blob=cloud_path.blob + ) + return blob_client.url + + def _generate_presigned_url( + self, cloud_path: AzureBlobPath, expire_seconds: int = 60 * 60 + ) -> str: + sas_token = generate_blob_sas( + self.service_client.account_name, + container_name=cloud_path.container, + blob_name=cloud_path.blob, + 
account_key=self.service_client.credential.account_key, + permission=BlobSasPermissions(read=True), + expiry=datetime.utcnow() + timedelta(seconds=expire_seconds), + ) + url = f"{self._get_public_url(cloud_path)}?{sas_token}" + return url + AzureBlobClient.AzureBlobPath = AzureBlobClient.CloudPath # type: ignore diff --git a/cloudpathlib/client.py b/cloudpathlib/client.py index 831fcd4b..1b6c32eb 100644 --- a/cloudpathlib/client.py +++ b/cloudpathlib/client.py @@ -175,3 +175,13 @@ def _upload_file( self, local_path: Union[str, os.PathLike], cloud_path: BoundedCloudPath ) -> BoundedCloudPath: pass + + @abc.abstractmethod + def _get_public_url(self, cloud_path: BoundedCloudPath) -> str: + pass + + @abc.abstractmethod + def _generate_presigned_url( + self, cloud_path: BoundedCloudPath, expire_seconds: int = 60 * 60 + ) -> str: + pass diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py index 6a12a21f..0b65f338 100644 --- a/cloudpathlib/cloudpath.py +++ b/cloudpathlib/cloudpath.py @@ -383,6 +383,13 @@ def touch(self, exist_ok: bool = True) -> None: """Should be implemented using the client API to create and update modified time""" pass + def as_url(self, presign: bool = False, expire_seconds: int = 60 * 60) -> str: + if presign: + url = self.client._generate_presigned_url(self, expire_seconds=expire_seconds) + else: + url = self.client._get_public_url(self) + return url + # ====================== IMPLEMENTED FROM SCRATCH ====================== # Methods with their own implementations that work generically def __rtruediv__(self, other: Any) -> None: diff --git a/cloudpathlib/gs/gsclient.py b/cloudpathlib/gs/gsclient.py index d75fd714..2d8fa73e 100644 --- a/cloudpathlib/gs/gsclient.py +++ b/cloudpathlib/gs/gsclient.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta import mimetypes import os from pathlib import Path, PurePosixPath @@ -271,5 +271,18 @@ def _upload_file(self, local_path: Union[str, os.PathLike], 
cloud_path: GSPath) blob.upload_from_filename(str(local_path), **extra_args) return cloud_path + def _get_public_url(self, cloud_path: GSPath) -> str: + bucket = self.client.get_bucket(cloud_path.bucket) + blob = bucket.blob(cloud_path.blob) + return blob.public_url + + def _generate_presigned_url(self, cloud_path: GSPath, expire_seconds: int = 60 * 60) -> str: + bucket = self.client.get_bucket(cloud_path.bucket) + blob = bucket.blob(cloud_path.blob) + url = blob.generate_signed_url( + version="v4", expiration=timedelta(seconds=expire_seconds), method="GET" + ) + return url + GSClient.GSPath = GSClient.CloudPath # type: ignore diff --git a/cloudpathlib/local/localclient.py b/cloudpathlib/local/localclient.py index ad3bfa73..b3f829d8 100644 --- a/cloudpathlib/local/localclient.py +++ b/cloudpathlib/local/localclient.py @@ -156,6 +156,14 @@ def _get_metadata(self, cloud_path: "LocalPath") -> Dict: "content_type": content_type_method(str(self._cloud_path_to_local(cloud_path)))[0], } + def _get_public_url(self, cloud_path: "LocalPath") -> str: + return cloud_path.as_uri() + + def _generate_presigned_url( + self, cloud_path: "LocalPath", expire_seconds: int = 60 * 60 + ) -> str: + raise NotImplementedError("Cannot generate a presigned URL for a local path.") + _temp_dirs_to_clean: List[TemporaryDirectory] = [] diff --git a/cloudpathlib/s3/s3client.py b/cloudpathlib/s3/s3client.py index 9c1b5658..ac2ecaa6 100644 --- a/cloudpathlib/s3/s3client.py +++ b/cloudpathlib/s3/s3client.py @@ -126,6 +126,7 @@ def __init__( for k in ["RequestPayer", "ExpectedBucketOwner"] if k in self._extra_args } + self._endpoint_url = endpoint_url super().__init__( local_cache_dir=local_cache_dir, @@ -349,5 +350,30 @@ def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: S3Path) obj.upload_file(str(local_path), Config=self.boto3_transfer_config, ExtraArgs=extra_args) return cloud_path + def _get_public_url(self, cloud_path: S3Path) -> str: + """Apparently the best way to get the 
public URL is to generate a presigned URL + with the unsigned config set. This creates a temporary unsigned client to generate + the correct URL + See: https://stackoverflow.com/a/48197877 + """ + unsigned_config = Config(signature_version=botocore.UNSIGNED) + unsigned_client = self.sess.client( + "s3", endpoint_url=self._endpoint_url, config=unsigned_config + ) + url: str = unsigned_client.generate_presigned_url( + "get_object", + Params={"Bucket": cloud_path.bucket, "Key": cloud_path.key}, + ExpiresIn=0, + ) + return url + + def _generate_presigned_url(self, cloud_path: S3Path, expire_seconds: int = 60 * 60) -> str: + url: str = self.client.generate_presigned_url( + "get_object", + Params={"Bucket": cloud_path.bucket, "Key": cloud_path.key}, + ExpiresIn=expire_seconds, + ) + return url + S3Client.S3Path = S3Client.CloudPath # type: ignore diff --git a/tests/mock_clients/mock_azureblob.py b/tests/mock_clients/mock_azureblob.py index 689da039..3ddb7eee 100644 --- a/tests/mock_clients/mock_azureblob.py +++ b/tests/mock_clients/mock_azureblob.py @@ -6,6 +6,7 @@ from azure.storage.blob import BlobProperties +from azure.storage.blob._shared.authentication import SharedKeyCredentialPolicy from azure.core.exceptions import ResourceNotFoundError from .utils import delete_empty_parents_up_to_root @@ -30,6 +31,23 @@ def __init__(self, *args, **kwargs): def from_connection_string(cls, *args, **kwargs): return cls() + @property + def account_name(self) -> str: + """Returns well-known account name used by Azurite + See: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio%2Cblob-storage#well-known-storage-account-and-key + """ + return "devstoreaccount1" + + @property + def credential(self): + """Returns well-known account key used by Azurite + See: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio%2Cblob-storage#well-known-storage-account-and-key + """ + return SharedKeyCredentialPolicy( + 
self.account_name, + "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==", + ) + def __del__(self): self.tmp.cleanup() diff --git a/tests/mock_clients/mock_gs.py b/tests/mock_clients/mock_gs.py index 93056d7e..c48517f7 100644 --- a/tests/mock_clients/mock_gs.py +++ b/tests/mock_clients/mock_gs.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta from pathlib import Path, PurePosixPath import shutil from tempfile import TemporaryDirectory @@ -38,6 +38,9 @@ def bucket(self, bucket): def list_buckets(self): return [DEFAULT_GS_BUCKET_NAME] + def get_bucket(self, bucket): + return MockBucket(self.tmp_path, bucket, client=self) + return MockClient @@ -106,6 +109,13 @@ def updated(self): def content_type(self): return self.client.metadata_cache.get(self.bucket / self.name, None) + @property + def public_url(self) -> str: + return f"https://storage.googleapis.com/{self.bucket}/{self.name}" + + def generate_signed_url(self, version: str, expiration: timedelta, method: str): + return f"https://storage.googleapis.com/{self.bucket}/{self.name}?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=TEST&X-Goog-Date=20240131T185515Z&X-Goog-Expires=3600&X-Goog-SignedHeaders=host&X-Goog-Signature=TEST" + class MockBucket: def __init__(self, name, bucket_name, client=None): diff --git a/tests/mock_clients/mock_s3.py b/tests/mock_clients/mock_s3.py index fc1ede18..9b5360bd 100644 --- a/tests/mock_clients/mock_s3.py +++ b/tests/mock_clients/mock_s3.py @@ -231,6 +231,10 @@ def head_object(self, Bucket, Key, **kwargs): else: return {"key": Key} + def generate_presigned_url(self, op: str, Params: dict, ExpiresIn: int): + mock_presigned_url = f"https://{Params['Bucket']}.s3.amazonaws.com/{Params['Key']}?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=TEST%2FTEST%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240131T194721Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=TEST" + return mock_presigned_url + 
@property def exceptions(self): Ex = collections.namedtuple("Ex", "NoSuchKey") diff --git a/tests/test_azure_specific.py b/tests/test_azure_specific.py index 05f4be31..7133d8f8 100644 --- a/tests/test_azure_specific.py +++ b/tests/test_azure_specific.py @@ -2,6 +2,7 @@ import pytest +from urllib.parse import urlparse, parse_qs from cloudpathlib import AzureBlobClient, AzureBlobPath from cloudpathlib.exceptions import MissingCredentialsError from cloudpathlib.local import LocalAzureBlobClient, LocalAzureBlobPath @@ -26,3 +27,21 @@ def test_azureblobpath_nocreds(client_class, monkeypatch): monkeypatch.delenv("AZURE_STORAGE_CONNECTION_STRING", raising=False) with pytest.raises(MissingCredentialsError): client_class() + + +def test_as_url(azure_rig): + p: AzureBlobPath = azure_rig.create_cloud_path("dir_0/file0_0.txt") + + public_url = str(p.as_url()) + public_parts = urlparse(public_url) + + assert public_parts.path.endswith("file0_0.txt") + + presigned_url = p.as_url(presign=True) + parts = urlparse(presigned_url) + query_params = parse_qs(parts.query) + assert parts.path.endswith("file0_0.txt") + assert "se" in query_params + assert "sp" in query_params + assert "sr" in query_params + assert "sig" in query_params diff --git a/tests/test_cloudpath_instantiation.py b/tests/test_cloudpath_instantiation.py index 366607fc..fe0ca3fa 100644 --- a/tests/test_cloudpath_instantiation.py +++ b/tests/test_cloudpath_instantiation.py @@ -31,6 +31,9 @@ def test_dispatch(path_class, cloud_path, monkeypatch): if path_class == AzureBlobPath: monkeypatch.setenv("AZURE_STORAGE_CONNECTION_STRING", "AccountName=fake;AccountKey=fake2;") + if path_class == GSPath: + monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "fake-project") + assert isinstance(CloudPath(cloud_path), path_class) diff --git a/tests/test_gs_specific.py b/tests/test_gs_specific.py index a851abb3..1632acae 100644 --- a/tests/test_gs_specific.py +++ b/tests/test_gs_specific.py @@ -1,11 +1,15 @@ import pytest +from urllib.parse 
import urlparse, parse_qs from cloudpathlib import GSPath from cloudpathlib.local import LocalGSPath @pytest.mark.parametrize("path_class", [GSPath, LocalGSPath]) -def test_gspath_properties(path_class): +def test_gspath_properties(path_class, monkeypatch): + if path_class == GSPath: + monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "fake-project") + p = path_class("gs://mybucket") assert p.blob == "" assert p.bucket == "mybucket" @@ -23,3 +27,25 @@ def test_concurrent_download(gs_rig, tmp_path, worker_type): assert not (dl_dir / p.name).exists() p.download_to(dl_dir) assert (dl_dir / p.name).is_file() + + +def test_as_url(gs_rig): + p: GSPath = gs_rig.create_cloud_path("dir_0/file0_0.txt") + public_url = p.as_url() + public_url_parts = urlparse(public_url) + assert public_url_parts.hostname and public_url_parts.hostname.startswith( + "storage.googleapis.com" + ) + assert public_url_parts.path.endswith("file0_0.txt") + + expire_seconds = 3600 + presigned_url = p.as_url(presign=True, expire_seconds=expire_seconds) + parts = urlparse(presigned_url) + query_params = parse_qs(parts.query) + assert parts.path.endswith("file0_0.txt") + assert query_params["X-Goog-Expires"] == [str(expire_seconds)] + assert "X-Goog-Algorithm" in query_params + assert "X-Goog-Credential" in query_params + assert "X-Goog-Date" in query_params + assert "X-Goog-SignedHeaders" in query_params + assert "X-Goog-Signature" in query_params diff --git a/tests/test_s3_specific.py b/tests/test_s3_specific.py index c529a4f9..4b12f7b9 100644 --- a/tests/test_s3_specific.py +++ b/tests/test_s3_specific.py @@ -1,7 +1,9 @@ from concurrent.futures import ProcessPoolExecutor from itertools import islice from time import sleep +import time +from urllib.parse import urlparse, parse_qs import pytest from boto3.s3.transfer import TransferConfig @@ -247,3 +249,44 @@ def test_aws_endpoint_url_env(monkeypatch): monkeypatch.setenv("AWS_ENDPOINT_URL", localstack_url) s3_client_custom_endpoint = S3Client() assert 
s3_client_custom_endpoint.client.meta.endpoint_url == localstack_url + + +def test_as_url_local(monkeypatch): + path = S3Path("s3://arxiv/pdf") + public_url = path.as_url() + assert public_url == "https://arxiv.s3.amazonaws.com/pdf" + + localstack_url = "http://localhost:4566" + monkeypatch.setenv("AWS_ENDPOINT_URL", localstack_url) + s3_client_custom_endpoint = S3Client() + + path = S3Path("s3://arxiv/pdf", client=s3_client_custom_endpoint) + public_url = path.as_url() + assert public_url == f"{localstack_url}/arxiv/pdf" + + +def test_as_url_presign(s3_rig): + p: S3Path = s3_rig.create_cloud_path("dir_0/file0_0.txt") + expire_seconds = 3600 + expire_timestamp = int(time.time()) + expire_seconds + presigned_url = p.as_url(presign=True, expire_seconds=expire_seconds) + parts = urlparse(presigned_url) + query_params = parse_qs(parts.query) + + assert parts.path.endswith("file0_0.txt") + + # v4 presigned URL (version depends on region and config) + # https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + if "X-Amz-Expires" in query_params: + assert query_params["X-Amz-Expires"] == [str(expire_seconds)] + assert "X-Amz-Algorithm" in query_params + assert "X-Amz-Credential" in query_params + assert "X-Amz-Date" in query_params + assert "X-Amz-SignedHeaders" in query_params + assert "X-Amz-Signature" in query_params + elif "Expires" in query_params: + assert query_params["Expires"] == [str(expire_timestamp)] + assert "AWSAccessKeyId" in query_params + assert "Signature" in query_params + else: + assert False, "Unknown presigned URL format"