fixing TensorBoard (Lightning-AI#687)
* flake8

* fix typo

* fix tensorboardlogger
drop test_tube dependence

* formatting

* fix tensorboard & tests

* upgrade Tensorboard

* test formatting separately

* try to fix JIT issue

* add tests for 1.4
Borda authored and williamFalcon committed Jan 16, 2020
1 parent 6fdfa12 commit f72e354
Showing 17 changed files with 132 additions and 68 deletions.
34 changes: 30 additions & 4 deletions .circleci/config.yml
@@ -16,14 +16,21 @@ references:
sudo pip install pytest pytest-cov pytest-flake8
pip install -r ./tests/requirements.txt --user
tests_format: &tests_format
tests: &tests
run:
name: Tests and formating
name: Testing
command: |
python --version ; pip --version ; pip list
py.test pytorch_lightning tests pl_examples -v --doctest-modules --junitxml=test-reports/pytest_junit.xml --flake8
py.test pytorch_lightning tests pl_examples -v --doctest-modules --junitxml=test-reports/pytest_junit.xml
no_output_timeout: 15m

format: &format
run:
name: Formatting
command: |
python --version ; pip --version ; pip list
flake8
make_docs: &make_docs
run:
name: Make Documentation
@@ -43,6 +50,16 @@ jobs:
- checkout
- *make_docs

Formatting:
docker:
- image: circleci/python:3.7
environment:
- TORCH_VERSION: "torch"
steps:
- checkout
- *install_deps
- *format

PyTorch:
docker:
- image: circleci/python:3.7
@@ -52,7 +69,7 @@
- checkout

- *install_deps
- *tests_format
- *tests

- store_test_results:
path: test-reports
@@ -80,11 +97,20 @@ jobs:
- TORCH_VERSION: "torch>=1.3, <1.4"
steps: *steps

PyTorch-v1.4:
docker:
- image: circleci/python:3.6
environment:
- TORCH_VERSION: "torch>=1.4, <1.5"
steps: *steps

workflows:
version: 2
build:
jobs:
- Formatting
- Build-Docs
- PyTorch-v1.1
- PyTorch-v1.2
- PyTorch-v1.3
- PyTorch-v1.4
3 changes: 2 additions & 1 deletion pytorch_lightning/core/hooks.py
@@ -1,5 +1,6 @@
"""
# Hooks
Hooks
=====
There are cases when you might want to do something different at different parts of the training/validation loop.
To enable a hook, simply override the method in your LightningModule and the trainer will call it at the correct time.
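The hooks docstring above says a hook is enabled simply by overriding the method in your LightningModule. A minimal sketch of what that looks like, assuming the `on_epoch_start` hook and the `current_epoch` attribute available in this version of Lightning (other required LightningModule methods omitted):

    import pytorch_lightning as pl

    class LitModel(pl.LightningModule):
        def on_epoch_start(self):
            # the trainer calls this at the start of every epoch;
            # no registration needed beyond overriding the method
            print(f'starting epoch {self.current_epoch}')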
15 changes: 8 additions & 7 deletions pytorch_lightning/core/lightning.py
@@ -135,7 +135,7 @@ def training_step(self, batch, batch_idx):
logger_logs = {'training_loss': loss} # optional (MUST ALL BE TENSORS)
# if using TestTubeLogger or TensorboardLogger you can nest scalars
# if using TestTubeLogger or TensorBoardLogger you can nest scalars
logger_logs = {'losses': logger_logs} # optional (MUST ALL BE TENSORS)
output = {
@@ -665,9 +665,10 @@ def configure_optimizers(self):
.. note:: If you use multiple optimizers, training_step will have an additional `optimizer_idx` parameter.
.. note:: If you use LBFGS lightning handles the closure function automatically for you.
.. note:: If you use multiple optimizers, gradients will be calculated only for the parameters of current optimizer at each training step.
.. note:: If you use LBFGS lightning handles the closure function automatically for you
.. note:: If you use multiple optimizers, gradients will be calculated only
for the parameters of current optimizer at each training step.
Example
-------
@@ -939,9 +940,9 @@ def load_from_metrics(cls, weights_path, tags_csv, map_location=None):
for mapping storage {'cuda:1':'cuda:0'}
:return: The pretrained LightningModule
If you're using test tube, there is an alternate method which uses the meta_tags.csv
file from test-tube to rebuild the model. The meta_tags.csv file can be found in the
test-tube experiment save_dir.
If you're using `test-tube`, there is an alternate method which uses the meta_tags.csv
file from test-tube to rebuild the model. The `meta_tags.csv` file can be found in the
`test-tube` experiment save_dir.
.. code-block:: python
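A hedged usage sketch of the `load_from_metrics` path described in the docstring above, with placeholder paths and a `LitModel` LightningModule subclass assumed (the docstring's own example is truncated in this view):

    # both paths are illustrative; point them at your own run
    weights_path = 'checkpoints/_ckpt_epoch_5.ckpt'
    tags_csv = 'lightning_logs/version_0/meta_tags.csv'

    model = LitModel.load_from_metrics(
        weights_path=weights_path,
        tags_csv=tags_csv,
        map_location={'cuda:1': 'cuda:0'},  # optional device remapping, as in the docstring
    )
    model.eval()  # restored model ready for inference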
2 changes: 1 addition & 1 deletion pytorch_lightning/logging/__init__.py
@@ -166,8 +166,8 @@ def __init__(self, hparams):
"""

from os import environ
from .base import LightningLoggerBase, rank_zero_only

from .base import LightningLoggerBase, rank_zero_only
from .tensorboard import TensorBoardLogger

try:
39 changes: 28 additions & 11 deletions pytorch_lightning/logging/tensorboard.py
@@ -1,8 +1,10 @@
import os
from warnings import warn
from argparse import Namespace
from pkg_resources import parse_version

import torch
from pkg_resources import parse_version
import pandas as pd
from torch.utils.tensorboard import SummaryWriter

from .base import LightningLoggerBase, rank_zero_only
@@ -28,8 +30,8 @@ class TensorBoardLogger(LightningLoggerBase):
directory for existing versions, then automatically assigns the next available version.
:param \**kwargs: Other arguments are passed directly to the :class:`SummaryWriter` constructor.
"""
NAME_CSV_TAGS = 'meta_tags.csv'

def __init__(self, save_dir, name="default", version=None, **kwargs):
super().__init__()
@@ -38,6 +40,7 @@ def __init__(self, save_dir, name="default", version=None, **kwargs):
self._version = version

self._experiment = None
self.tags = {}
self.kwargs = kwargs

@property
@@ -57,22 +60,25 @@ def experiment(self):

@rank_zero_only
def log_hyperparams(self, params):
if params is None:
return

# in case converting from namespace
if isinstance(params, Namespace):
params = vars(params)
params = dict(params)

if parse_version(torch.__version__) < parse_version("1.3.0"):
warn(
f"Hyperparameter logging is not available for Torch version {torch.__version__}."
" Skipping log_hyperparams. Upgrade to Torch 1.3.0 or above to enable"
" hyperparameter logging."
)
# TODO: some alternative should be added
return
try:
# in case converting from namespace, todo: rather test if it is namespace
params = vars(params)
except TypeError:
pass
if params is not None:
else:
# `add_hparams` requires both - hparams and metric
self.experiment.add_hparams(hparam_dict=dict(params), metric_dict={})
self.experiment.add_hparams(hparam_dict=params, metric_dict={})
# some alternative should be added
self.tags.update(params)

@rank_zero_only
def log_metrics(self, metrics, step=None):
@@ -89,6 +95,17 @@ def save(self):
# you are using PT version (<v1.2) which does not have implemented flush
self.experiment._get_file_writer().flush()

# create a pseudo standard path a la test-tube
dir_path = os.path.join(self.save_dir, self.name, 'version_%s' % self.version)
if not os.path.isdir(dir_path):
dir_path = self.save_dir
# prepare the file path
meta_tags_path = os.path.join(dir_path, self.NAME_CSV_TAGS)
# save the metatags file
df = pd.DataFrame({'key': list(self.tags.keys()),
'value': list(self.tags.values())})
df.to_csv(meta_tags_path, index=False)

@rank_zero_only
def finalize(self, status):
self.save()
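Putting the pieces of the new logger together, a short usage sketch based on the constructor and methods shown above (directory and experiment names are illustrative):

    from argparse import Namespace

    from pytorch_lightning import Trainer
    from pytorch_lightning.logging import TensorBoardLogger

    logger = TensorBoardLogger(save_dir='lightning_logs', name='my_experiment')
    logger.log_hyperparams(Namespace(learning_rate=0.02, batch_size=32))

    # handing the logger to the Trainer lets it call log_metrics / save / finalize
    trainer = Trainer(logger=logger)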
3 changes: 2 additions & 1 deletion pytorch_lightning/trainer/__init__.py
@@ -1,5 +1,6 @@
"""
# Trainer
Trainer
=======
The lightning trainer abstracts best practices for running a training, val, test routine.
It calls parts of your model when it wants to hand over full control and otherwise makes
4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/callback_config.py
@@ -2,7 +2,7 @@
from abc import ABC

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.logging import TensorboardLogger
from pytorch_lightning.logging import TensorBoardLogger


class TrainerCallbackConfigMixin(ABC):
@@ -69,7 +69,7 @@ def configure_early_stopping(self, early_stop_callback, logger):
# configure logger
if logger is True:
# default logger
self.logger = TensorboardLogger(
self.logger = TensorBoardLogger(
save_dir=self.default_save_path,
version=self.slurm_job_id,
name='lightning_logs'
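With this change, the default logger (what the trainer builds when `logger` is left as `True`) is a `TensorBoardLogger`. A hedged illustration, assuming the `default_save_path` Trainer argument from this Lightning version:

    from pytorch_lightning import Trainer

    # with logger=True the trainer builds a TensorBoardLogger itself,
    # saving under default_save_path with name 'lightning_logs'
    trainer = Trainer(logger=True, default_save_path='some/save/dir')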
3 changes: 2 additions & 1 deletion pytorch_lightning/trainer/evaluation_loop.py
@@ -1,5 +1,6 @@
"""
# Validation loop
Validation loop
===============
The lightning validation loop handles everything except the actual computations of your model.
To decide what will happen in your validation loop, define the `validation_step` function.
10 changes: 5 additions & 5 deletions pytorch_lightning/trainer/training_io.py
@@ -96,7 +96,9 @@
from subprocess import call
import logging
from abc import ABC
from argparse import Namespace

import pandas as pd
import torch
import torch.distributed as dist

@@ -268,7 +270,6 @@ def save_checkpoint(self, filepath):
torch.save(checkpoint, filepath)

def restore(self, checkpoint_path, on_gpu):

# if on_gpu:
# checkpoint = torch.load(checkpoint_path)
# else:
@@ -461,14 +462,13 @@ def max_ckpt_in_folder(self, path, name_key='ckpt_'):


def load_hparams_from_tags_csv(tags_csv):
from argparse import Namespace
import pandas as pd
if not os.path.isfile(tags_csv):
logging.warning(f'Missing Tags: {tags_csv}.')
return Namespace()

tags_df = pd.read_csv(tags_csv)
dic = tags_df.to_dict(orient='records')

ns_dict = {row['key']: convert(row['value']) for row in dic}

ns = Namespace(**ns_dict)
return ns

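For reference, the `meta_tags.csv` that `load_hparams_from_tags_csv` reads (and that `TensorBoardLogger.save` now also writes) is a plain two-column key/value file. A sketch of the round trip, with an illustrative path and assuming the module-level helper shown above is importable from its module:

    # meta_tags.csv looks roughly like:
    #   key,value
    #   learning_rate,0.02
    #   batch_size,32

    from pytorch_lightning.trainer.training_io import load_hparams_from_tags_csv

    hparams = load_hparams_from_tags_csv('lightning_logs/my_experiment/version_0/meta_tags.csv')
    print(hparams.learning_rate)  # values come back as attributes of an argparse.Namespace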
2 changes: 1 addition & 1 deletion pytorch_lightning/trainer/training_loop.py
@@ -458,7 +458,7 @@ def run_training_batch(self, batch, batch_idx):

# call training_step once per optimizer
for opt_idx, optimizer in enumerate(self.optimizers):
# make sure only the gradients of the current optimizer's parameters are calculated
# make sure only the gradients of the current optimizer's parameters are calculated
# in the training step to prevent dangling gradients in multiple-optimizer setup.
for param in self.get_model().parameters():
param.requires_grad = False
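The comment above belongs to the multiple-optimizer path described in the `configure_optimizers` note earlier: gradients are computed only for the current optimizer's parameters at each step. A hedged sketch of such a setup on the user side (the generator/discriminator modules and loss helpers are assumed to exist):

    import torch
    import pytorch_lightning as pl

    class LitGAN(pl.LightningModule):
        # self.generator / self.discriminator are assumed to be built in __init__

        def configure_optimizers(self):
            opt_g = torch.optim.Adam(self.generator.parameters(), lr=2e-4)
            opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=2e-4)
            # returning two optimizers means training_step receives optimizer_idx
            return [opt_g, opt_d]

        def training_step(self, batch, batch_idx, optimizer_idx):
            if optimizer_idx == 0:
                loss = self.generator_loss(batch)       # assumed helper
            else:
                loss = self.discriminator_loss(batch)   # assumed helper
            return {'loss': loss}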
4 changes: 2 additions & 2 deletions requirements.txt
@@ -2,7 +2,7 @@ scikit-learn>=0.20.2
tqdm>=4.35.0
numpy>=1.16.4
torch>=1.1
torchvision>=0.4.0
torchvision>=0.4.0, < 0.5  # the 0.5 release has some issues with torch JIT
pandas>=0.24 # lower version do not support py3.7
test-tube>=0.7.5
tensorboard>=1.14
future>=0.17.1 # required for builtins in setup.py
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -5,7 +5,7 @@ pytest>=3.0.5
pytest-cov
flake8
check-manifest
# test_tube # already installed in main req.
test-tube>=0.7.5
mlflow
comet_ml
wandb
6 changes: 3 additions & 3 deletions tests/test_cpu_models.py
@@ -29,7 +29,7 @@ def test_early_stopping_cpu_model(tmpdir):
show_progress_bar=True,
logger=tutils.get_test_tube_logger(tmpdir),
train_percent_check=0.1,
val_percent_check=0.1
val_percent_check=0.1,
)

model, hparams = tutils.get_model()
@@ -51,7 +51,7 @@ def test_lbfgs_cpu_model(tmpdir):
show_progress_bar=False,
weights_summary='top',
train_percent_check=1.0,
val_percent_check=0.2
val_percent_check=0.2,
)

model, hparams = tutils.get_model(use_test_model=True, lbfgs=True)
@@ -70,7 +70,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir):
print_nan_grads=True,
show_progress_bar=False,
train_percent_check=0.01,
val_percent_check=0.01
val_percent_check=0.01,
)

model, hparams = tutils.get_model()
5 changes: 5 additions & 0 deletions tests/test_logging.py
@@ -192,6 +192,7 @@ def test_comet_pickle(tmpdir, monkeypatch):
trainer2 = pickle.loads(pkl_bytes)
trainer2.logger.log_metrics({"acc": 1.0})


def test_wandb_logger(tmpdir):
"""Verify that basic functionality of wandb logger works."""
tutils.reset_seed()
@@ -201,6 +202,7 @@ def test_wandb_logger(tmpdir):
wandb_dir = os.path.join(tmpdir, "wandb")
logger = WandbLogger(save_dir=wandb_dir, anonymous=True)


def test_neptune_logger(tmpdir):
"""Verify that basic functionality of neptune logger works."""
tutils.reset_seed()
@@ -223,13 +225,16 @@ def test_neptune_logger(tmpdir):
print('result finished')
assert result == 1, "Training failed"


def test_wandb_pickle(tmpdir):
"""Verify that pickling trainer with wandb logger works."""
tutils.reset_seed()

from pytorch_lightning.logging import WandbLogger
wandb_dir = str(tmpdir)
logger = WandbLogger(save_dir=wandb_dir, anonymous=True)
assert logger is not None


def test_neptune_pickle(tmpdir):
"""Verify that pickling trainer with neptune logger works."""