Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add json schemas to docs #1314

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
19d2942
add schemas
mvpatel2000 Jul 25, 2022
c5a55be
update conf
mvpatel2000 Jul 26, 2022
2f8a2b7
checkdown
mvpatel2000 Jul 26, 2022
a452faa
generate schemas
mvpatel2000 Jul 26, 2022
3092902
add json schema doctest
mvpatel2000 Jul 26, 2022
9fc271c
schema
mvpatel2000 Jul 26, 2022
acf6b80
schema sorted
mvpatel2000 Jul 26, 2022
593cd5c
remove unused required field
mvpatel2000 Jul 26, 2022
5502823
remove formatting requirements in json schema
mvpatel2000 Jul 26, 2022
5bcc34b
update schema
mvpatel2000 Jul 26, 2022
c1d51dd
update schemas and remove hook
mvpatel2000 Jul 26, 2022
a45eb09
checkdown jsons
mvpatel2000 Jul 26, 2022
f1dd48e
simplify schemas
mvpatel2000 Jul 26, 2022
c9c25a1
clean up jsons
mvpatel2000 Jul 26, 2022
2f17756
remove docs for optimizers
mvpatel2000 Jul 26, 2022
c64a736
misspelled name
mvpatel2000 Jul 26, 2022
bd5a6a3
update jsons
mvpatel2000 Jul 27, 2022
387e341
add yaml test and start fixing yamls
mvpatel2000 Jul 27, 2022
f8b9832
disable yaml test for now
mvpatel2000 Jul 27, 2022
ef683b7
generate schemas
mvpatel2000 Jul 27, 2022
c1a3e9d
empty commit
mvpatel2000 Jul 27, 2022
123cdfe
Merge branch 'dev' into mvpatel2000/add-json-schemas
mvpatel2000 Jul 27, 2022
e4842ae
enable yaml tests
mvpatel2000 Jul 27, 2022
362a0bf
check if sort_json causes errors
mvpatel2000 Jul 27, 2022
e7982d9
restore sortjson
mvpatel2000 Jul 27, 2022
87a9e8f
remove timeout
mvpatel2000 Jul 27, 2022
c1e4cdb
Merge branch 'dev' into mvpatel2000/add-json-schemas
mvpatel2000 Jul 28, 2022
9b2203a
remove yaml test to run docs
mvpatel2000 Jul 28, 2022
3a3f99b
merge
mvpatel2000 Jul 28, 2022
12d42b8
test docs
mvpatel2000 Jul 29, 2022
38c5163
merge
mvpatel2000 Jul 29, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .ci/test_lint_doctests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# Running these checks through pytest allows us to report any errors in Junit format,
# which is posted directly on the PR

import json
import os
import pathlib
import shutil
Expand Down Expand Up @@ -78,6 +79,36 @@ def test_docker_build_matrix():
assert existing_build_matrix == f.read()


def sort_json(obj):
    """Return a canonical, order-insensitive representation of a JSON value.

    Dicts are converted to sorted lists of ``(key, value)`` tuples and lists
    are sorted, recursing into nested containers, so two schemas that differ
    only in key or element ordering compare equal.

    Args:
        obj: A parsed JSON value (dict, list, or scalar).

    Returns:
        A recursively sorted structure suitable for equality comparison.
    """
    # NOTE: parameter renamed from ``json`` so it no longer shadows the
    # imported ``json`` module used elsewhere in this file.
    if isinstance(obj, dict):
        return sorted((key, sort_json(value)) for key, value in obj.items())
    if isinstance(obj, list):
        return sorted(sort_json(elem) for elem in obj)
    return obj


def test_json_schemas():
    """Test that the checked-in JSON schemas are up to date.

    Re-runs ``generate_json_schemas.py`` in place and asserts that every
    schema file is unchanged (up to key/element ordering) from what was
    checked in, and that the set of schema files itself did not change.
    """
    schemas_folder = pathlib.Path(os.path.dirname(__file__)) / '..' / 'composer' / 'json_schemas'

    def _schema_filenames():
        # All schema files in the folder, excluding the generator script itself.
        return sorted(name for name in os.listdir(schemas_folder) if name != 'generate_json_schemas.py')

    # Snapshot the checked-in schemas keyed by filename, so each regenerated
    # file is compared against the correct original. (The previous approach of
    # zipping a *filtered* schema list against the *unfiltered* directory
    # listing mis-paired every file sorting after ``generate_json_schemas.py``
    # and silently skipped the last schema.)
    existing_schemas = {}
    for filename in _schema_filenames():
        with open(os.path.join(schemas_folder, filename), 'r') as f:
            existing_schemas[filename] = json.load(f)

    # Run the generator script.
    check_output(
        subprocess.run(['python', 'generate_json_schemas.py'], cwd=schemas_folder, capture_output=True, text=True))

    # Assert no schema files were added or removed, and none changed.
    regenerated_filenames = _schema_filenames()
    assert regenerated_filenames == sorted(existing_schemas)
    for filename in regenerated_filenames:
        with open(os.path.join(schemas_folder, filename), 'r') as f:
            assert sort_json(existing_schemas[filename]) == sort_json(json.load(f)), f'schema changed: {filename}'


@pytest.mark.parametrize('example', [1, 2])
def test_release_tests_reflect_readme(example: int):
"""Test that example_1.py and example_2.py in release_tests reflect the README.md."""
Expand Down
4 changes: 4 additions & 0 deletions composer/datasets/ade20k_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ class ADE20kDatasetHparams(DatasetHparams, SyntheticHparamsMixin):
ignore_background (bool): if true, ignore the background class when calculating the training loss.
Default: ``true``.
datadir (str): The path to the data directory.

.. jsonschema:: ../json_schemas/ade20kdataset_hparams.json
"""

split: str = hp.optional("Which split of the dataset to use. Either ['train', 'val', 'test']", default='train')
Expand Down Expand Up @@ -159,6 +161,8 @@ class StreamingADE20kHparams(DatasetHparams):
final_size (int): the final size of the image and target. Default: ``512``.
ignore_background (bool): if true, ignore the background class when calculating the training loss.
Default: ``true``.

.. jsonschema:: ../json_schemas/streamingade20k_hparams.json
"""

remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
Expand Down
2 changes: 2 additions & 0 deletions composer/datasets/brats_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class BratsDatasetHparams(DatasetHparams):

Args:
oversampling (float): The oversampling ratio to use. Default: ``0.33``.

.. jsonschema:: ../json_schemas/bratsdataset_hparams.json
"""

oversampling: float = hp.optional('oversampling', default=0.33)
Expand Down
4 changes: 4 additions & 0 deletions composer/datasets/c4_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class StreamingC4Hparams(DatasetHparams):
mlm_probability (float): If ``mlm==True``, the probability that tokens are masked. Default: ``0.15``.
max_retries (int): Number of download re-attempts before giving up. Default: 2.
timeout (float): How long to wait for shard to download before raising an exception. Default: 120 sec.

.. jsonschema:: ../json_schemas/streamingc4_hparams.json
"""

remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
Expand Down Expand Up @@ -121,6 +123,8 @@ class C4DatasetHparams(DatasetHparams):
drop_last (bool): Whether to drop the last samples for the last batch. Default: ``True``.
Returns:
DataLoader: A PyTorch :class:`~torch.utils.data.DataLoader` object.

.. jsonschema:: ../json_schemas/c4dataset_hparams.json
"""

split: Optional[str] = hp.optional('What split of the dataset to use. Either `train` or `validation`.',
Expand Down
4 changes: 4 additions & 0 deletions composer/datasets/cifar_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ class CIFAR10DatasetHparams(DatasetHparams, SyntheticHparamsMixin):
datadir (str): The path to the data directory.
is_train (bool): Whether to load the training data or validation data. Default:
``True``.

.. jsonschema:: ../json_schemas/cifar10dataset_hparams.json
"""
download: bool = hp.optional('whether to download the dataset, if needed', default=True)
use_ffcv: bool = hp.optional('whether to use ffcv for faster dataloading', default=False)
Expand Down Expand Up @@ -188,6 +190,8 @@ class StreamingCIFAR10Hparams(DatasetHparams):
local (str): Local filesystem directory where dataset is cached during operation.
Default: ``'/tmp/mds-cache/mds-cifar10/'``
split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train'``.

.. jsonschema:: ../json_schemas/streamingcifar10_hparams.json
"""

remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
Expand Down
4 changes: 4 additions & 0 deletions composer/datasets/coco_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ class COCODatasetHparams(DatasetHparams):
datadir (str): The path to the data directory.
is_train (bool): Whether to load the training data or validation data. Default:
``True``.

.. jsonschema:: ../json_schemas/cocodataset_hparams.json
"""

is_train: bool = hp.optional('Whether to load the training data (the default) or validation data.', default=True)
Expand Down Expand Up @@ -79,6 +81,8 @@ class StreamingCOCOHparams(DatasetHparams):
local (str): Local filesystem directory where dataset is cached during operation.
Default: ``'/tmp/mds-cache/mds-coco/```
split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train```.

.. jsonschema:: ../json_schemas/streamingcoco_hparams.json
"""

remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
Expand Down
4 changes: 4 additions & 0 deletions composer/datasets/dataset_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ class DataLoaderHparams(hp.Hparams):
If ``num_workers = 0``, then the ``pin_memory`` must be ``False``. Default: ``True``.
timeout (float): Timeout, in seconds, for collecting a batch from workers. Set to ``0`` for no timeout.
Default: ``0``.

.. jsonschema:: ../json_schemas/dataloader_hparams.json
"""

num_workers: int = hp.optional(textwrap.dedent("""\
Expand Down Expand Up @@ -115,6 +117,8 @@ class DatasetHparams(hp.Hparams, abc.ABC):
whether to drop the last batch or pad the last batch with zeros. Default:
``True``.
shuffle (bool): Whether to shuffle the dataset. Default: ``True``.

.. jsonschema:: ../json_schemas/dataset_hparams.json
"""

drop_last: bool = hp.optional(textwrap.dedent("""\
Expand Down
2 changes: 2 additions & 0 deletions composer/datasets/glue_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ class GLUEHparams(DatasetHparams, SyntheticHparamsMixin):

Returns:
DataLoader: A PyTorch :class:`~torch.utils.data.DataLoader` object.

.. jsonschema:: ../json_schemas/glue_hparams.json
"""

task: Optional[str] = hp.optional(
Expand Down
4 changes: 4 additions & 0 deletions composer/datasets/imagenet_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ class ImagenetDatasetHparams(DatasetHparams, SyntheticHparamsMixin):
datadir (str): The path to the data directory.
is_train (bool): Whether to load the training data or validation data. Default:
``True``.

.. jsonschema:: ../json_schemas/imagenetdataset_hparams.json
"""
resize_size: int = hp.optional('resize size. Set to -1 to not resize', default=-1)
crop_size: int = hp.optional('crop size', default=224)
Expand Down Expand Up @@ -218,6 +220,8 @@ class StreamingImageNet1kHparams(DatasetHparams):
split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train```.
resize_size (int, optional): The resize size to use. Use -1 to not resize. Default: ``-1``.
crop size (int): The crop size to use. Default: ``224``.

.. jsonschema:: ../json_schemas/streamingimagenet1k_hparams.json
"""

remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
Expand Down
2 changes: 2 additions & 0 deletions composer/datasets/lm_dataset_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ class LMDatasetHparams(DatasetHparams, SyntheticHparamsMixin):
Default: ``1024``.
val_sequence_length (int, optional): Sequence length for validation dataset.
Default: ``1024``.

.. jsonschema:: ../json_schemas/lmdataset_hparams.json
"""

# TODO(moin): Switch datadir to be a string, rather than a list of strings, to be similar to the
Expand Down
2 changes: 2 additions & 0 deletions composer/datasets/mnist_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class MNISTDatasetHparams(DatasetHparams, SyntheticHparamsMixin):
datadir (str): The path to the data directory.
is_train (bool): Whether to load the training data or validation data. Default:
``True``.

.. jsonschema:: ../json_schemas/mnistdataset_hparams.json
"""
download: bool = hp.optional('whether to download the dataset, if needed', default=True)
is_train: bool = hp.optional('Whether to load the training data (the default) or validation data.', default=True)
Expand Down
2 changes: 2 additions & 0 deletions composer/datasets/synthetic_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class SyntheticHparamsMixin(hp.Hparams, abc.ABC):
synthetic_memory_format: The :class:`~.core.types.MemoryFormat` to use.
Ignored if :attr:`use_synthetic` is ``False``. Default:
``'CONTIGUOUS_FORMAT'``.

.. jsonschema:: ../json_schemas/syntheticmixin_hparams.json
"""

use_synthetic: bool = hp.optional('Whether to use synthetic data. Defaults to False.', default=False)
Expand Down
36 changes: 36 additions & 0 deletions composer/json_schemas/adam_hparams.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"additionalProperties": false,
"properties": {
"amsgrad": {
"description": "whether to use the AMSGrad variant of this\nalgorithm from the paper `On the Convergence of Adam and Beyond`_\n(default: False)",
"type": "boolean"
},
"betas": {
"description": "coefficients used for computing\nrunning averages of gradient and its square (default: (0.9, 0.999))",
"oneOf": [
{
"type": "number"
},
{
"items": {
"type": "number"
},
"type": "array"
}
]
},
"eps": {
"description": "term added to the denominator to improve\nnumerical stability (default: 1e-8)",
"type": "number"
},
"lr": {
"description": "learning rate (default: 1e-3)",
"type": "number"
},
"weight_decay": {
"description": "weight decay (L2 penalty) (default: 0)",
"type": "number"
}
},
"type": "object"
}
36 changes: 36 additions & 0 deletions composer/json_schemas/adamw_hparams.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"additionalProperties": false,
"properties": {
"amsgrad": {
"description": "whether to use the AMSGrad variant of this\nalgorithm from the paper `On the Convergence of Adam and Beyond`_\n(default: False)",
"type": "boolean"
},
"betas": {
"description": "coefficients used for computing\nrunning averages of gradient and its square (default: (0.9, 0.999))",
"oneOf": [
{
"type": "number"
},
{
"items": {
"type": "number"
},
"type": "array"
}
]
},
"eps": {
"description": "term added to the denominator to improve\nnumerical stability (default: 1e-8)",
"type": "number"
},
"lr": {
"description": "learning rate (default: 1e-3)",
"type": "number"
},
"weight_decay": {
"description": "weight decay coefficient (default: 1e-2)",
"type": "number"
}
},
"type": "object"
}
87 changes: 87 additions & 0 deletions composer/json_schemas/ade20kdataset_hparams.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"$defs": {
"MemoryFormat": {
"oneOf": [
{
"pattern": "(?i)^CHANNELS_LAST$",
"type": "string"
},
{
"pattern": "(?i)^CHANNELS_LAST_3D$",
"type": "string"
},
{
"pattern": "(?i)^CONTIGUOUS_FORMAT$",
"type": "string"
},
{
"pattern": "(?i)^PRESERVE_FORMAT$",
"type": "string"
}
]
}
},
"additionalProperties": false,
"properties": {
"base_size": {
"description": "Initial size of the image and target before other augmentations",
"type": "integer"
},
"datadir": {
"description": "The path to the data directory",
"oneOf": [
{
"type": "string"
},
{
"type": "null"
}
]
},
"drop_last": {
"description": "If the number of samples is not divisible by the batch size,\nwhether to drop the last batch (the default) or pad the last batch with zeros.",
"type": "boolean"
},
"final_size": {
"description": "Final size of the image and target",
"type": "integer"
},
"ignore_background": {
"description": "If true, ignore the background class in training loss",
"type": "boolean"
},
"max_resize_scale": {
"description": "Maximum value that the image and target can be scaled",
"type": "number"
},
"min_resize_scale": {
"description": "Minimum value that the image and target can be scaled",
"type": "number"
},
"shuffle": {
"description": "Whether to shuffle the dataset for each epoch. Defaults to True.",
"type": "boolean"
},
"split": {
"description": "Which split of the dataset to use. Either ['train', 'val', 'test']",
"type": "string"
},
"synthetic_device": {
"description": "Device to store the sample pool. Should be `cuda` or `cpu`. Defaults to `cpu`.",
"type": "string"
},
"synthetic_memory_format": {
"$ref": "#/$defs/MemoryFormat",
"description": "Memory format. Defaults to contiguous format."
},
"synthetic_num_unique_samples": {
"description": "The number of unique samples to allocate memory for.",
"type": "integer"
},
"use_synthetic": {
"description": "Whether to use synthetic data. Defaults to False.",
"type": "boolean"
}
},
"type": "object"
}
Loading