fixing TensorBoard (Lightning-AI#687)
* flake8

* fix typo

* fix tensorboardlogger
drop test_tube dependence

* formatting

* fix tensorboard & tests

* upgrade Tensorboard

* test formatting separately

* try to fix JIT issue

* add tests for 1.4
Borda authored and williamFalcon committed Jan 16, 2020
1 parent 6fdfa12 commit f72e354
Showing 17 changed files with 132 additions and 68 deletions.
34 changes: 30 additions & 4 deletions .circleci/config.yml
@@ -16,14 +16,21 @@ references:
sudo pip install pytest pytest-cov pytest-flake8
pip install -r ./tests/requirements.txt --user
tests_format: &tests_format
tests: &tests
run:
name: Tests and formating
name: Testing
command: |
python --version ; pip --version ; pip list
py.test pytorch_lightning tests pl_examples -v --doctest-modules --junitxml=test-reports/pytest_junit.xml --flake8
py.test pytorch_lightning tests pl_examples -v --doctest-modules --junitxml=test-reports/pytest_junit.xml
no_output_timeout: 15m

format: &format
run:
name: Formatting
command: |
python --version ; pip --version ; pip list
flake8
make_docs: &make_docs
run:
name: Make Documentation
@@ -43,6 +50,16 @@ jobs:
- checkout
- *make_docs

Formatting:
docker:
- image: circleci/python:3.7
environment:
- TORCH_VERSION: "torch"
steps:
- checkout
- *install_deps
- *format

PyTorch:
docker:
- image: circleci/python:3.7
@@ -52,7 +69,7 @@
- checkout

- *install_deps
- *tests_format
- *tests

- store_test_results:
path: test-reports
@@ -80,11 +97,20 @@ jobs:
- TORCH_VERSION: "torch>=1.3, <1.4"
steps: *steps

PyTorch-v1.4:
docker:
- image: circleci/python:3.6
environment:
- TORCH_VERSION: "torch>=1.4, <1.5"
steps: *steps

workflows:
version: 2
build:
jobs:
- Formatting
- Build-Docs
- PyTorch-v1.1
- PyTorch-v1.2
- PyTorch-v1.3
- PyTorch-v1.4
3 changes: 2 additions & 1 deletion pytorch_lightning/core/hooks.py
@@ -1,5 +1,6 @@
"""
# Hooks
Hooks
=====
There are cases when you might want to do something different at different parts of the training/validation loop.
To enable a hook, simply override the method in your LightningModule and the trainer will call it at the correct time.
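The hooks docstring above says a hook is enabled simply by overriding the method in your LightningModule. A minimal sketch of what that looks like, assuming the `on_epoch_start` hook and the `current_epoch` attribute available in this version of Lightning (other required LightningModule methods omitted):

    import pytorch_lightning as pl

    class LitModel(pl.LightningModule):
        def on_epoch_start(self):
            # the trainer calls this at the start of every epoch;
            # no registration needed beyond overriding the method
            print(f'starting epoch {self.current_epoch}')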
15 changes: 8 additions & 7 deletions pytorch_lightning/core/lightning.py
@@ -135,7 +135,7 @@ def training_step(self, batch, batch_idx):
logger_logs = {'training_loss': loss} # optional (MUST ALL BE TENSORS)
# if using TestTubeLogger or TensorboardLogger you can nest scalars
# if using TestTubeLogger or TensorBoardLogger you can nest scalars
logger_logs = {'losses': logger_logs} # optional (MUST ALL BE TENSORS)
output = {
@@ -665,9 +665,10 @@ def configure_optimizers(self):
.. note:: If you use multiple optimizers, training_step will have an additional `optimizer_idx` parameter.
.. note:: If you use LBFGS lightning handles the closure function automatically for you.
.. note:: If you use multiple optimizers, gradients will be calculated only for the parameters of current optimizer at each training step.
.. note:: If you use LBFGS lightning handles the closure function automatically for you
.. note:: If you use multiple optimizers, gradients will be calculated only
for the parameters of current optimizer at each training step.
Example
-------
@@ -939,9 +940,9 @@ def load_from_metrics(cls, weights_path, tags_csv, map_location=None):
for mapping storage {'cuda:1':'cuda:0'}
:return: The pretrained LightningModule
If you're using test tube, there is an alternate method which uses the meta_tags.csv
file from test-tube to rebuild the model. The meta_tags.csv file can be found in the
test-tube experiment save_dir.
If you're using `test-tube`, there is an alternate method which uses the meta_tags.csv
file from test-tube to rebuild the model. The `meta_tags.csv` file can be found in the
`test-tube` experiment save_dir.
.. code-block:: python
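A hedged usage sketch of the `load_from_metrics` path described in the docstring above, with placeholder paths and a `LitModel` LightningModule subclass assumed (the docstring's own example is truncated in this view):

    # both paths are illustrative; point them at your own run
    weights_path = 'checkpoints/_ckpt_epoch_5.ckpt'
    tags_csv = 'lightning_logs/version_0/meta_tags.csv'

    model = LitModel.load_from_metrics(
        weights_path=weights_path,
        tags_csv=tags_csv,
        map_location={'cuda:1': 'cuda:0'},  # optional device remapping, as in the docstring
    )
    model.eval()  # restored model ready for inference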
2 changes: 1 addition & 1 deletion pytorch_lightning/logging/__init__.py
@@ -166,8 +166,8 @@ def __init__(self, hparams):
"""

from os import environ
from .base import LightningLoggerBase, rank_zero_only

from .base import LightningLoggerBase, rank_zero_only
from .tensorboard import TensorBoardLogger

try:
39 changes: 28 additions & 11 deletions pytorch_lightning/logging/tensorboard.py
@@ -1,8 +1,10 @@
import os
from warnings import warn
from argparse import Namespace
from pkg_resources import parse_version

import torch
from pkg_resources import parse_version
import pandas as pd
from torch.utils.tensorboard import SummaryWriter

from .base import LightningLoggerBase, rank_zero_only
@@ -28,8 +30,8 @@ class TensorBoardLogger(LightningLoggerBase):
directory for existing versions, then automatically assigns the next available version.
:param \**kwargs: Other arguments are passed directly to the :class:`SummaryWriter` constructor.
"""
NAME_CSV_TAGS = 'meta_tags.csv'

def __init__(self, save_dir, name="default", version=None, **kwargs):
super().__init__()
@@ -38,6 +40,7 @@ def __init__(self, save_dir, name="default", version=None, **kwargs):
self._version = version

self._experiment = None
self.tags = {}
self.kwargs = kwargs

@property
@@ -57,22 +60,25 @@ def experiment(self):

@rank_zero_only
def log_hyperparams(self, params):
if params is None:
return

# in case converting from namespace
if isinstance(params, Namespace):
params = vars(params)
params = dict(params)

if parse_version(torch.__version__) < parse_version("1.3.0"):
warn(
f"Hyperparameter logging is not available for Torch version {torch.__version__}."
" Skipping log_hyperparams. Upgrade to Torch 1.3.0 or above to enable"
" hyperparameter logging."
)
# TODO: some alternative should be added
return
try:
# in case converting from namespace, todo: rather test if it is namespace
params = vars(params)
except TypeError:
pass
if params is not None:
else:
# `add_hparams` requires both - hparams and metric
self.experiment.add_hparams(hparam_dict=dict(params), metric_dict={})
self.experiment.add_hparams(hparam_dict=params, metric_dict={})
# some alternative should be added
self.tags.update(params)

@rank_zero_only
def log_metrics(self, metrics, step=None):
@@ -89,6 +95,17 @@ def save(self):
# you are using PT version (<v1.2) which does not have implemented flush
self.experiment._get_file_writer().flush()

# create a pseudo standard path a la test-tube
dir_path = os.path.join(self.save_dir, self.name, 'version_%s' % self.version)
if not os.path.isdir(dir_path):
dir_path = self.save_dir
# prepare the file path
meta_tags_path = os.path.join(dir_path, self.NAME_CSV_TAGS)
# save the metatags file
df = pd.DataFrame({'key': list(self.tags.keys()),
'value': list(self.tags.values())})
df.to_csv(meta_tags_path, index=False)

@rank_zero_only
def finalize(self, status):
self.save()
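Putting the pieces of the new logger together, a short usage sketch based on the constructor and methods shown above (directory and experiment names are illustrative):

    from argparse import Namespace

    from pytorch_lightning import Trainer
    from pytorch_lightning.logging import TensorBoardLogger

    logger = TensorBoardLogger(save_dir='lightning_logs', name='my_experiment')
    logger.log_hyperparams(Namespace(learning_rate=0.02, batch_size=32))

    # handing the logger to the Trainer lets it call log_metrics / save / finalize
    trainer = Trainer(logger=logger)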
3 changes: 2 additions & 1 deletion pytorch_lightning/trainer/__init__.py
@@ -1,5 +1,6 @@
"""
# Trainer
Trainer
=======
The lightning trainer abstracts best practices for running a training, val, test routine.
It calls parts of your model when it wants to hand over full control and otherwise makes
4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/callback_config.py
@@ -2,7 +2,7 @@
from abc import ABC

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.logging import TensorboardLogger
from pytorch_lightning.logging import TensorBoardLogger


class TrainerCallbackConfigMixin(ABC):
@@ -69,7 +69,7 @@ def configure_early_stopping(self, early_stop_callback, logger):
# configure logger
if logger is True:
# default logger
self.logger = TensorboardLogger(
self.logger = TensorBoardLogger(
save_dir=self.default_save_path,
version=self.slurm_job_id,
name='lightning_logs'
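With this change, the default logger (what the trainer builds when `logger` is left as `True`) is a `TensorBoardLogger`. A hedged illustration, assuming the `default_save_path` Trainer argument from this Lightning version:

    from pytorch_lightning import Trainer

    # with logger=True the trainer builds a TensorBoardLogger itself,
    # saving under default_save_path with name 'lightning_logs'
    trainer = Trainer(logger=True, default_save_path='some/save/dir')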
3 changes: 2 additions & 1 deletion pytorch_lightning/trainer/evaluation_loop.py
@@ -1,5 +1,6 @@
"""
# Validation loop
Validation loop
===============
The lightning validation loop handles everything except the actual computations of your model.
To decide what will happen in your validation loop, define the `validation_step` function.
10 changes: 5 additions & 5 deletions pytorch_lightning/trainer/training_io.py
@@ -96,7 +96,9 @@
from subprocess import call
import logging
from abc import ABC
from argparse import Namespace

import pandas as pd
import torch
import torch.distributed as dist

@@ -268,7 +270,6 @@ def save_checkpoint(self, filepath):
torch.save(checkpoint, filepath)

def restore(self, checkpoint_path, on_gpu):

# if on_gpu:
# checkpoint = torch.load(checkpoint_path)
# else:
@@ -461,14 +462,13 @@ def max_ckpt_in_folder(self, path, name_key='ckpt_'):


def load_hparams_from_tags_csv(tags_csv):
from argparse import Namespace
import pandas as pd
if not os.path.isfile(tags_csv):
logging.warning(f'Missing Tags: {tags_csv}.')
return Namespace()

tags_df = pd.read_csv(tags_csv)
dic = tags_df.to_dict(orient='records')

ns_dict = {row['key']: convert(row['value']) for row in dic}

ns = Namespace(**ns_dict)
return ns

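For reference, the `meta_tags.csv` that `load_hparams_from_tags_csv` reads (and that `TensorBoardLogger.save` now also writes) is a plain two-column key/value file. A sketch of the round trip, with an illustrative path and assuming the module-level helper shown above is importable from its module:

    # meta_tags.csv looks roughly like:
    #   key,value
    #   learning_rate,0.02
    #   batch_size,32

    from pytorch_lightning.trainer.training_io import load_hparams_from_tags_csv

    hparams = load_hparams_from_tags_csv('lightning_logs/my_experiment/version_0/meta_tags.csv')
    print(hparams.learning_rate)  # values come back as attributes of an argparse.Namespace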
2 changes: 1 addition & 1 deletion pytorch_lightning/trainer/training_loop.py
@@ -458,7 +458,7 @@ def run_training_batch(self, batch, batch_idx):

# call training_step once per optimizer
for opt_idx, optimizer in enumerate(self.optimizers):
# make sure only the gradients of the current optimizer's parameters are calculated
# make sure only the gradients of the current optimizer's parameters are calculated
# in the training step to prevent dangling gradients in multiple-optimizer setup.
for param in self.get_model().parameters():
param.requires_grad = False
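The comment above belongs to the multiple-optimizer path described in the `configure_optimizers` note earlier: gradients are computed only for the current optimizer's parameters at each step. A hedged sketch of such a setup on the user side (the generator/discriminator modules and loss helpers are assumed to exist):

    import torch
    import pytorch_lightning as pl

    class LitGAN(pl.LightningModule):
        # self.generator / self.discriminator are assumed to be built in __init__

        def configure_optimizers(self):
            opt_g = torch.optim.Adam(self.generator.parameters(), lr=2e-4)
            opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=2e-4)
            # returning two optimizers means training_step receives optimizer_idx
            return [opt_g, opt_d]

        def training_step(self, batch, batch_idx, optimizer_idx):
            if optimizer_idx == 0:
                loss = self.generator_loss(batch)       # assumed helper
            else:
                loss = self.discriminator_loss(batch)   # assumed helper
            return {'loss': loss}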
4 changes: 2 additions & 2 deletions requirements.txt
@@ -2,7 +2,7 @@ scikit-learn>=0.20.2
tqdm>=4.35.0
numpy>=1.16.4
torch>=1.1
torchvision>=0.4.0
torchvision>=0.4.0, < 0.5  # the 0.5 release has some issues with torch JIT
pandas>=0.24 # lower version do not support py3.7
test-tube>=0.7.5
tensorboard>=1.14
future>=0.17.1 # required for builtins in setup.py
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -5,7 +5,7 @@ pytest>=3.0.5
pytest-cov
flake8
check-manifest
# test_tube # already installed in main req.
test-tube>=0.7.5
mlflow
comet_ml
wandb
6 changes: 3 additions & 3 deletions tests/test_cpu_models.py
@@ -29,7 +29,7 @@ def test_early_stopping_cpu_model(tmpdir):
show_progress_bar=True,
logger=tutils.get_test_tube_logger(tmpdir),
train_percent_check=0.1,
val_percent_check=0.1
val_percent_check=0.1,
)

model, hparams = tutils.get_model()
@@ -51,7 +51,7 @@ def test_lbfgs_cpu_model(tmpdir):
show_progress_bar=False,
weights_summary='top',
train_percent_check=1.0,
val_percent_check=0.2
val_percent_check=0.2,
)

model, hparams = tutils.get_model(use_test_model=True, lbfgs=True)
@@ -70,7 +70,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir):
print_nan_grads=True,
show_progress_bar=False,
train_percent_check=0.01,
val_percent_check=0.01
val_percent_check=0.01,
)

model, hparams = tutils.get_model()
5 changes: 5 additions & 0 deletions tests/test_logging.py
@@ -192,6 +192,7 @@ def test_comet_pickle(tmpdir, monkeypatch):
trainer2 = pickle.loads(pkl_bytes)
trainer2.logger.log_metrics({"acc": 1.0})


def test_wandb_logger(tmpdir):
"""Verify that basic functionality of wandb logger works."""
tutils.reset_seed()
@@ -201,6 +202,7 @@ def test_wandb_logger(tmpdir):
wandb_dir = os.path.join(tmpdir, "wandb")
logger = WandbLogger(save_dir=wandb_dir, anonymous=True)


def test_neptune_logger(tmpdir):
"""Verify that basic functionality of neptune logger works."""
tutils.reset_seed()
@@ -223,13 +225,16 @@ def test_neptune_logger(tmpdir):
print('result finished')
assert result == 1, "Training failed"


def test_wandb_pickle(tmpdir):
"""Verify that pickling trainer with wandb logger works."""
tutils.reset_seed()

from pytorch_lightning.logging import WandbLogger
wandb_dir = str(tmpdir)
logger = WandbLogger(save_dir=wandb_dir, anonymous=True)
assert logger is not None


def test_neptune_pickle(tmpdir):
"""Verify that pickling trainer with neptune logger works."""