matrix contains invalid numeric entries #1720
arrrrr3186 started this conversation in General
Hello @hbredin @mogwai,
I am trying to train a segmentation model on my custom dataset, following the adapting_pretrained_pipeline.ipynb notebook.
While training the model, I sometimes run into the issue below, but not always.
```
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[16], line 46
41 from pytorch_lightning import Trainer
42 trainer = Trainer(accelerator="gpu",
43 callbacks=callbacks,
44 max_epochs=20,
45 gradient_clip_val=0.5)
---> 46 trainer.fit(model)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py:544, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
542 self.state.status = TrainerStatus.RUNNING
543 self.training = True
--> 544 call._call_and_handle_interrupt(
545 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
546 )
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/trainer/call.py:44, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
42 if trainer.strategy.launcher is not None:
43 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
---> 44 return trainer_fn(*args, **kwargs)
46 except _TunerExitException:
47 _call_teardown_hook(trainer)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py:580, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
573 assert self.state.fn is not None
574 ckpt_path = self._checkpoint_connector._select_ckpt_path(
575 self.state.fn,
576 ckpt_path,
577 model_provided=True,
578 model_connected=self.lightning_module is not None,
579 )
--> 580 self._run(model, ckpt_path=ckpt_path)
582 assert self.state.stopped
583 self.training = False
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py:987, in Trainer._run(self, model, ckpt_path)
982 self._signal_connector.register_signal_handlers()
984 # ----------------------------
985 # RUN THE TRAINER
986 # ----------------------------
--> 987 results = self._run_stage()
989 # ----------------------------
990 # POST-Training CLEAN UP
991 # ----------------------------
992 log.debug(f"{self.__class__.__name__}: trainer tearing down")
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py:1033, in Trainer._run_stage(self)
1031 self._run_sanity_check()
1032 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1033 self.fit_loop.run()
1034 return None
1035 raise RuntimeError(f"Unexpected state {self.state}")
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:205, in _FitLoop.run(self)
203 try:
204 self.on_advance_start()
--> 205 self.advance()
206 self.on_advance_end()
207 self._restarting = False
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:363, in _FitLoop.advance(self)
361 with self.trainer.profiler.profile("run_training_epoch"):
362 assert self._data_fetcher is not None
--> 363 self.epoch_loop.run(self._data_fetcher)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/loops/training_epoch_loop.py:140, in _TrainingEpochLoop.run(self, data_fetcher)
138 while not self.done:
139 try:
--> 140 self.advance(data_fetcher)
141 self.on_advance_end(data_fetcher)
142 self._restarting = False
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/loops/training_epoch_loop.py:250, in _TrainingEpochLoop.advance(self, data_fetcher)
247 with trainer.profiler.profile("run_training_batch"):
248 if trainer.lightning_module.automatic_optimization:
249 # in automatic optimization, there can only be one optimizer
--> 250 batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
251 else:
252 batch_output = self.manual_optimization.run(kwargs)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/loops/optimization/automatic.py:190, in _AutomaticOptimization.run(self, optimizer, batch_idx, kwargs)
183 closure()
185 # ------------------------------
186 # BACKWARD PASS
187 # ------------------------------
188 # gradient update with accumulated gradients
189 else:
--> 190 self._optimizer_step(batch_idx, closure)
192 result = closure.consume_result()
193 if result.loss is None:
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/loops/optimization/automatic.py:268, in _AutomaticOptimization._optimizer_step(self, batch_idx, train_step_and_backward_closure)
265 self.optim_progress.optimizer.step.increment_ready()
267 # model hook
--> 268 call._call_lightning_module_hook(
269 trainer,
270 "optimizer_step",
271 trainer.current_epoch,
272 batch_idx,
273 optimizer,
274 train_step_and_backward_closure,
275 )
277 if not should_accumulate:
278 self.optim_progress.optimizer.step.increment_completed()
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/trainer/call.py:157, in _call_lightning_module_hook(trainer, hook_name, pl_module, *args, **kwargs)
154 pl_module._current_fx_name = hook_name
156 with trainer.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"):
--> 157 output = fn(*args, **kwargs)
159 # restore current_fx when nested context
160 pl_module._current_fx_name = prev_fx_name
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/core/module.py:1303, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure)
1264 def optimizer_step(
1265 self,
1266 epoch: int,
(...)
1269 optimizer_closure: Optional[Callable[[], Any]] = None,
1270 ) -> None:
1271 r"""Override this method to adjust the default way the :class:`~pytorch_lightning.trainer.trainer.Trainer` calls
1272 the optimizer.
1273
(...)
1301
1302 """
-> 1303 optimizer.step(closure=optimizer_closure)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/core/optimizer.py:152, in LightningOptimizer.step(self, closure, **kwargs)
149 raise MisconfigurationException("When `optimizer.step(closure)` is called, the closure should be callable")
151 assert self._strategy is not None
--> 152 step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
154 self._on_after_step()
156 return step_output
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/strategies/strategy.py:239, in Strategy.optimizer_step(self, optimizer, closure, model, **kwargs)
237 # TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed
238 assert isinstance(model, pl.LightningModule)
--> 239 return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/plugins/precision/precision.py:122, in Precision.optimizer_step(self, optimizer, model, closure, **kwargs)
120 """Hook to run the optimizer step."""
121 closure = partial(self._wrap_closure, model, optimizer, closure)
--> 122 return optimizer.step(closure=closure, **kwargs)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/torch/optim/optimizer.py:385, in Optimizer.profile_hook_step.<locals>.wrapper(*args, **kwargs)
380 else:
381 raise RuntimeError(
382 f"{func} must return None or a tuple of (new_args, new_kwargs), but got {result}."
383 )
--> 385 out = func(*args, **kwargs)
386 self._optimizer_step_code()
388 # call optimizer step post hooks
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/torch/optim/optimizer.py:76, in _use_grad_for_differentiable.<locals>._use_grad(self, *args, **kwargs)
74 torch.set_grad_enabled(self.defaults['differentiable'])
75 torch._dynamo.graph_break()
---> 76 ret = func(self, *args, **kwargs)
77 finally:
78 torch._dynamo.graph_break()
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/torch/optim/adam.py:146, in Adam.step(self, closure)
144 if closure is not None:
145 with torch.enable_grad():
--> 146 loss = closure()
148 for group in self.param_groups:
149 params_with_grad = []
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/plugins/precision/precision.py:108, in Precision._wrap_closure(self, model, optimizer, closure)
95 def _wrap_closure(
96 self,
97 model: "pl.LightningModule",
98 optimizer: Optimizer,
99 closure: Callable[[], Any],
100 ) -> Any:
101 """This double-closure allows makes sure the
closure
is executed before theon_before_optimizer_step
102 hook is called.
103
(...)
106
107 """
--> 108 closure_result = closure()
109 self._after_closure(model, optimizer)
110 return closure_result
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/loops/optimization/automatic.py:144, in Closure.__call__(self, *args, **kwargs)
142 @override
143 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]:
--> 144 self._result = self.closure(*args, **kwargs)
145 return self._result.loss
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/loops/optimization/automatic.py:129, in Closure.closure(self, *args, **kwargs)
126 @override
127 @torch.enable_grad()
128 def closure(self, *args: Any, **kwargs: Any) -> ClosureResult:
--> 129 step_output = self._step_fn()
131 if step_output.closure_loss is None:
132 self.warning_cache.warn("`training_step` returned `None`. If this was on purpose, ignore this warning...")
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/loops/optimization/automatic.py:318, in _AutomaticOptimization._training_step(self, kwargs)
315 trainer = self.trainer
317 # manually capture logged metrics
--> 318 training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
319 self.trainer.strategy.post_training_step() # unused hook - call anyway for backward compatibility
321 return self.output_result_cls.from_training_step_output(training_step_output, trainer.accumulate_grad_batches)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/trainer/call.py:309, in _call_strategy_hook(trainer, hook_name, *args, **kwargs)
306 return None
308 with trainer.profiler.profile(f"[Strategy]{trainer.strategy.__class__.__name__}.{hook_name}"):
--> 309 output = fn(*args, **kwargs)
311 # restore current_fx when nested context
312 pl_module._current_fx_name = prev_fx_name
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pytorch_lightning/strategies/strategy.py:391, in Strategy.training_step(self, *args, **kwargs)
389 if self.model != self.lightning_module:
390 return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
--> 391 return self.lightning_module.training_step(*args, **kwargs)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pyannote/audio/core/model.py:358, in Model.training_step(self, batch, batch_idx)
357 def training_step(self, batch, batch_idx):
--> 358 return self.task.training_step(batch, batch_idx)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pyannote/audio/tasks/segmentation/speaker_diarization.py:562, in SpeakerDiarization.training_step(self, batch, batch_idx)
557 seg_loss = self.segmentation_loss(
558 prediction, permutated_target_powerset, weight=weight
559 )
561 else:
--> 562 permutated_prediction, _ = permutate(target, prediction)
563 seg_loss = self.segmentation_loss(
564 permutated_prediction, target, weight=weight
565 )
567 self.model.log(
568 "loss/train/segmentation",
569 seg_loss,
(...)
573 logger=True,
574 )
File ~/anaconda3/envs/wisper/lib/python3.12/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
905 if not args:
906 raise TypeError(f'{funcname} requires at least '
907 '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)
File ~/anaconda3/envs/wisper/lib/python3.12/site-packages/pyannote/audio/utils/permutation.py:153, in permutate_torch(y1, y2, cost_func, return_cost)
150 padded_cost = cost
152 permutation = [None] * num_classes_1
--> 153 for k1, k2 in zip(*linear_sum_assignment(padded_cost.cpu())):
154 if k1 < num_classes_1:
155 permutation[k1] = k2
ValueError: matrix contains invalid numeric entries
```
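For what it's worth, the failing call at the bottom of the traceback is `scipy.optimize.linear_sum_assignment`, which raises this exact ValueError whenever its cost matrix contains NaN or Inf values. A minimal reproduction (not from the training run itself, just to illustrate where the message comes from):

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# A NaN anywhere in the cost matrix triggers the same error as in the traceback.
cost = np.array([[0.1, 0.2],
                 [np.nan, 0.4]])
linear_sum_assignment(cost)  # ValueError: matrix contains invalid numeric entries
```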
Earlier I thought it was an issue with my dataset, so I thoroughly checked it against the AMI dataset format as well.
I am not sure where to look next or what the next step should be to handle this issue.
Could you please guide me on this?
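In case it helps narrow things down: would scanning the training audio for NaN/Inf or fully silent files, roughly along these lines, be a reasonable next step? (The directory and file pattern below are placeholders for my dataset layout.)

```python
import glob
import torch
import torchaudio

# Placeholder location of the training audio -- adjust to the actual dataset layout.
for path in sorted(glob.glob("my_dataset/audio/*.wav")):
    waveform, sample_rate = torchaudio.load(path)
    if not torch.isfinite(waveform).all():
        print(f"{path}: contains NaN/Inf samples")
    elif waveform.abs().max() == 0:
        print(f"{path}: completely silent")
```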