Skip to content

Commit

Permalink
Fix reading manifest file when it's only one line and also served by …
Browse files Browse the repository at this point in the history
…GitHub (#1323)

* Fix reading manifest file when it's only one line and also served by GitHub

* Easier to read
  • Loading branch information
ivirshup authored Dec 10, 2024
1 parent 46bf846 commit ec0e754
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -111,15 +111,25 @@ def load_blocklist(dataset_id_blocklist_uri: str | None) -> set[str]:
logger.error(msg)
raise ValueError(msg)

with fsspec.open(dataset_id_blocklist_uri, "rt") as fp:
for line in fp:
line = line.strip()
if len(line) == 0 or line.startswith("#"):
# strip blank lines and comments (hash is never in a UUID)
continue
blocked_dataset_ids.add(line)

logger.info(f"Dataset blocklist found, containing {len(blocked_dataset_ids)} ids.")
blocklist = (
# Figure out protocol to open file
fsspec.filesystem(fsspec.utils.infer_storage_options(dataset_id_blocklist_uri)["protocol"])
# Read whole file as bytes
.cat_file(dataset_id_blocklist_uri)
# Decode to string and split into lines
.decode("utf-8")
.strip()
.split("\n")
)

for line in blocklist:
line = line.strip()
if len(line) == 0 or line.startswith("#"):
# strip blank lines and comments (hash is never in a UUID)
continue
blocked_dataset_ids.add(line)

logger.info(f"Dataset blocklist found, containing {len(blocked_dataset_ids)} ids.")

return blocked_dataset_ids

Expand Down
30 changes: 20 additions & 10 deletions tools/cellxgene_census_builder/tests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,23 @@ def test_blocklist_alive_and_well() -> None:

# test for existence by reading it. NOTE: if the file moves, this test will fail until
# the new file location is merged to main.
with fsspec.open(dataset_id_blocklist_uri, "rt") as fp:
for line in fp:
# each line must be a comment, blank or a UUID
line = line.strip()
if not line or line.startswith("#"):
continue

# UUID() raises ValueError upon malformed UUID
# Equality check enforces formatting (i.e., dashes)
assert line == str(uuid.UUID(hex=line))
blocklist = (
# Figure out protocol to open file
fsspec.filesystem(fsspec.utils.infer_storage_options(dataset_id_blocklist_uri)["protocol"])
# Read whole file as bytes
.cat_file(dataset_id_blocklist_uri)
# Decode to string and split into lines
.decode("utf-8")
.strip()
.split("\n")
)

for line in blocklist:
# each line must be a comment, blank or a UUID
line = line.strip()
if not line or line.startswith("#"):
continue

# UUID() raises ValueError upon malformed UUID
# Equality check enforces formatting (i.e., dashes)
assert line == str(uuid.UUID(hex=line))

0 comments on commit ec0e754

Please sign in to comment.