From ec0e754b84c2ca213a1fe976c352318040c8048f Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 10 Dec 2024 13:38:08 -0800 Subject: [PATCH] Fix reading manifest file when it's only one line and also served by GitHub (#1323) * Fix reading manifest file when it's only one line and also served by github * Easier to read --- .../build_soma/manifest.py | 28 +++++++++++------ .../tests/test_manifest.py | 30 ++++++++++++------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py index a18de824d..78b55426e 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py @@ -111,15 +111,25 @@ def load_blocklist(dataset_id_blocklist_uri: str | None) -> set[str]: logger.error(msg) raise ValueError(msg) - with fsspec.open(dataset_id_blocklist_uri, "rt") as fp: - for line in fp: - line = line.strip() - if len(line) == 0 or line.startswith("#"): - # strip blank lines and comments (hash is never in a UUID) - continue - blocked_dataset_ids.add(line) - - logger.info(f"Dataset blocklist found, containing {len(blocked_dataset_ids)} ids.") + blocklist = ( + # Figure out protocol to open file + fsspec.filesystem(fsspec.utils.infer_storage_options(dataset_id_blocklist_uri)["protocol"]) + # Read whole file as bytes + .cat_file(dataset_id_blocklist_uri) + # Decode to string and split into lines + .decode("utf-8") + .strip() + .split("\n") + ) + + for line in blocklist: + line = line.strip() + if len(line) == 0 or line.startswith("#"): + # strip blank lines and comments (hash is never in a UUID) + continue + blocked_dataset_ids.add(line) + + logger.info(f"Dataset blocklist found, containing {len(blocked_dataset_ids)} ids.") return blocked_dataset_ids diff --git a/tools/cellxgene_census_builder/tests/test_manifest.py b/tools/cellxgene_census_builder/tests/test_manifest.py index 1228696ac..62d74c1bf 100644 --- a/tools/cellxgene_census_builder/tests/test_manifest.py +++ b/tools/cellxgene_census_builder/tests/test_manifest.py @@ -210,13 +210,23 @@ def test_blocklist_alive_and_well() -> None: # test for existance by reading it. NOTE: if the file moves, this test will fail until # the new file location is merged to main. - with fsspec.open(dataset_id_blocklist_uri, "rt") as fp: - for line in fp: - # each line must be a comment, blank or a UUID - line = line.strip() - if not line or line.startswith("#"): - continue - - # UUID() raises ValueError upon malformed UUID - # Equality check enforces formatting (i.e., dashes) - assert line == str(uuid.UUID(hex=line)) + blocklist = ( + # Figure out protocol to open file + fsspec.filesystem(fsspec.utils.infer_storage_options(dataset_id_blocklist_uri)["protocol"]) + # Read whole file as bytes + .cat_file(dataset_id_blocklist_uri) + # Decode to string and split into lines + .decode("utf-8") + .strip() + .split("\n") + ) + + for line in blocklist: + # each line must be a comment, blank or a UUID + line = line.strip() + if not line or line.startswith("#"): + continue + + # UUID() raises ValueError upon malformed UUID + # Equality check enforces formatting (i.e., dashes) + assert line == str(uuid.UUID(hex=line))