Skip to content

Commit

Permalink
Add default excludes for datasets we know contain broken URIs (#54)
Browse files Browse the repository at this point in the history
* Add default excludes for individual datasets

* Add newline

* Update openneuro/default_excludes.json

* Add default_excludes.json to sdist

* Actually include the JSON …

* Add changelog

* Add test and simplify
  • Loading branch information
hoechenberger authored Oct 2, 2021
1 parent a0ed4b4 commit f130df9
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
- Don't crash if the local `dataset_description.json` file is empty when trying
to resume an aborted download.

- By default, we now exclude certain files from the download that are known to
be invalid for specific datasets. Once the datasets have been fixed on
OpenNeuro, we will revert these exclusions.

## 2021.8

- Retry downloads if a `ReadError` has occurred.
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
include LICENSE
include README.md
include openneuro/default_excludes.json
22 changes: 22 additions & 0 deletions openneuro/default_excludes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[
{
"datasetName": "ds003104",
"excludeFiles": [
"derivatives/freesurfer/subjects/01/mri/aparc+aseg.mgz",
"derivatives/freesurfer/subjects/01/mri/aparc.DKTatlas+aseg.mgz",
"derivatives/freesurfer/subjects/01/mri/aparc.a2009s+aseg.mgz"
]
},
{
"datasetName": "ds000248",
"excludeFiles": [
"derivatives/freesurfer/subjects/fsaverage/mri/aparc.a2005s+aseg.mgz",
"derivatives/freesurfer/subjects/fsaverage/mri/aparc+aseg.mgz",
"derivatives/freesurfer/subjects/fsaverage/mri/aparc.a2009s+aseg.mgz",
"derivatives/freesurfer/subjects/fsaverage/xhemi/mri/aparc+aseg.mgz",
"derivatives/freesurfer/subjects/sub-01/mri/aparc+aseg.mgz",
"derivatives/freesurfer/subjects/sub-01/mri/aparc.DKTatlas+aseg.mgz",
"derivatives/freesurfer/subjects/sub-01/mri/aparc.a2009s+aseg.mgz"
]
}
]
17 changes: 17 additions & 0 deletions openneuro/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
from . import __version__
from .config import default_base_url

# Default exclusion lists, read once at import time from the JSON file that
# sits next to this module. Each entry carries a "datasetName" and the
# "excludeFiles" paths to skip for that dataset.
DEFAULT_EXCLUDES = json.loads(
    Path(__file__)
    .with_name('default_excludes.json')
    .read_text(encoding='utf-8')
)


if sys.stdout.encoding.lower() == 'utf-8':
stdout_unicode = True
Expand Down Expand Up @@ -573,6 +578,18 @@ def download(*,
exclude = [exclude] if isinstance(exclude, str) else exclude
exclude = [] if exclude is None else list(exclude)

for dataset_default_excludes in DEFAULT_EXCLUDES:
if dataset_default_excludes['datasetName'] == dataset:
msg = f'Adding default excludes for dataset {dataset}: \n '
msg += '\n '.join(dataset_default_excludes['excludeFiles'])
if stdout_unicode:
msg = f'🤕 {msg}'
tqdm.write(msg)
exclude = list(
set(exclude + dataset_default_excludes['excludeFiles'])
)
break

retry_backoff = 0.5 # seconds
metadata = _get_download_metadata(base_url=default_base_url,
dataset_id=dataset,
Expand Down
10 changes: 10 additions & 0 deletions openneuro/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,13 @@ def test_resume_download(tmp_path: Path):
include = ['sub-0001/meg/sub-0001_coordsystem.json']
download(dataset=dataset, tag=tag, target_dir=tmp_path,
include=include)


def test_ds000248(tmp_path: Path):
    """Download from a dataset for which we ship default excludes."""
    # Only a single small file is requested from the dataset.
    download(
        dataset='ds000248',
        target_dir=tmp_path,
        include=['participants.tsv'],
    )
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ install_requires =
sgqlc
importlib-metadata; python_version < "3.8"
typing-extensions; python_version < "3.8"
include_package_data = True

[options.entry_points]
console_scripts =
Expand Down

0 comments on commit f130df9

Please sign in to comment.