Add warning to trainstep output #7779

Merged · 19 commits · Jun 4, 2021
12 changes: 11 additions & 1 deletion pytorch_lightning/trainer/training_loop.py
@@ -16,7 +16,7 @@
from contextlib import contextmanager, suppress
from copy import copy
from functools import partial, update_wrapper
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union

import numpy as np
import torch
@@ -269,6 +269,16 @@ def _check_training_step_output(self, training_step_output):
            if training_step_output.grad_fn is None:
                # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ...
                raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor")
        elif self.trainer.lightning_module.automatic_optimization:
            if not any((
                isinstance(training_step_output, torch.Tensor),
                (isinstance(training_step_output, Mapping)
                 and 'loss' in training_step_output), training_step_output is None
            )):
                raise MisconfigurationException(
                    "In automatic optimization, `training_step` must either return a Tensor, "
                    "a dict with key 'loss' or None (where the step will be skipped)."
                )
Comment on lines +272 to +281
Contributor

How does this validation work with distributed training?

E.g., I noticed a few examples where the loss is not calculated in `training_step` but is instead computed in `training_step_end` after collating the outputs of `training_step`.

Should we maybe swap L299 and L301?

cc @ananthsub
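
For illustration, a minimal sketch of the pattern the reviewer describes, where `training_step` returns raw per-device outputs and the loss is only computed in `training_step_end` after collation. The module and dict keys here are hypothetical, not from this PR:

```python
import torch
from pytorch_lightning import LightningModule


class DPExample(LightningModule):  # hypothetical module, for illustration only
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        # No 'loss' key yet -- with the check placed as in this diff,
        # this return value would be rejected before training_step_end runs.
        return {'pred': self.layer(batch)}

    def training_step_end(self, step_outputs):
        # The loss is only computed after the per-device outputs are collated.
        return {'loss': step_outputs['pred'].sum()}
```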


    def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
        # give the PL module a result for logging
24 changes: 24 additions & 0 deletions tests/trainer/loops/test_training_loop.py
@@ -11,10 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

import pytest
import torch

from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.helpers import BoringModel


@@ -142,3 +145,24 @@ def validation_step(self, *args):
    assert trainer.current_epoch == 0
    assert trainer.global_step == 5
    assert model.validation_called_at == (0, 4)


@pytest.mark.parametrize(['output'], [(5., ), ({'a': 5}, )])
def test_warning_invalid_trainstep_output(tmpdir, output):

    class TestModel(BoringModel):

        def training_step(self, batch, batch_idx):
            return output

    model = TestModel()

    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1)
    with pytest.raises(
        MisconfigurationException,
        match=re.escape(
            "In automatic optimization, `training_step` must either return a Tensor, "
            "a dict with key 'loss' or None (where the step will be skipped)."
        )
    ):
        trainer.fit(model)
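
As a usage note on the new check: in automatic optimization, `training_step` may return a Tensor, a `Mapping` with a `'loss'` key, or `None`. A minimal sketch of a `training_step` that satisfies it, assuming `BoringModel`'s `layer` and `loss` helpers from `tests.helpers`:

```python
def training_step(self, batch, batch_idx):
    output = self.layer(batch)
    loss = self.loss(batch, output)
    # Any of these return values satisfies the new check:
    #   return loss            # a plain Tensor
    #   return None            # the step is skipped
    return {'loss': loss}      # a dict with the 'loss' key
```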