From 00a1f86da1a5bfdbbac44bfeda177de9439f4c73 Mon Sep 17 00:00:00 2001
From: Shay Aharon <80472096+shaydeci@users.noreply.github.com>
Date: Wed, 1 May 2024 15:57:08 +0300
Subject: [PATCH 1/5] Set average_best_models to False when save_model is false (#1976)

* average best models disabled when save_model is set to false

---
 src/super_gradients/training/sg_trainer/sg_trainer.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py
index 5f856da732..7ba08998ae 100755
--- a/src/super_gradients/training/sg_trainer/sg_trainer.py
+++ b/src/super_gradients/training/sg_trainer/sg_trainer.py
@@ -1421,6 +1421,14 @@ def get_finetune_lr_dict(self, lr: float) -> Dict[str, float]:
 
         self.ckpt_best_name = self.training_params.ckpt_best_name
 
+        if self.training_params.average_best_models and not self.training_params.save_model:
+            logger.warning(
+                "'training_params.average_best_models' is enabled, but 'training_params.save_model' is disabled. \n"
+                "Model averaging requires saving snapshot checkpoints to function properly. As a result, "
+                "'training_params.average_best_models' will be disabled. "
+            )
+            self.training_params.average_best_models = False
+
         self.max_train_batches = self.training_params.max_train_batches
         self.max_valid_batches = self.training_params.max_valid_batches
 

From 3548fd5e285ac01efe99bb67fee9c80755d8f431 Mon Sep 17 00:00:00 2001
From: Shay Aharon <80472096+shaydeci@users.noreply.github.com>
Date: Thu, 2 May 2024 11:00:21 +0300
Subject: [PATCH 2/5] Removed log_dir arg, which is None by default (#1977)

Co-authored-by: Ofri Masad
---
 src/super_gradients/training/utils/distributed_training_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/super_gradients/training/utils/distributed_training_utils.py b/src/super_gradients/training/utils/distributed_training_utils.py
index c4dbd55e80..dda9913777 100755
--- a/src/super_gradients/training/utils/distributed_training_utils.py
+++ b/src/super_gradients/training/utils/distributed_training_utils.py
@@ -345,7 +345,6 @@ def restart_script_with_ddp(num_gpus: int = None):
         max_restarts=0,
         monitor_interval=5,
         start_method="spawn",
-        log_dir=None,
         redirects=Std.NONE,
         tee=Std.NONE,
         metrics_cfg={},

From 292e38cea4900523bb40ebd9ff60896c56c78eba Mon Sep 17 00:00:00 2001
From: Talhaa Hussain <73853725+talhaahussain@users.noreply.github.com>
Date: Thu, 2 May 2024 19:09:32 +0100
Subject: [PATCH 3/5] Fixed issue with saved pose images (#1972) (#1973)

Co-authored-by: Eugene Khvedchenya
---
 .../utils/predict/prediction_pose_estimation_results.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/super_gradients/training/utils/predict/prediction_pose_estimation_results.py b/src/super_gradients/training/utils/predict/prediction_pose_estimation_results.py
index 2019a5ba17..345c7d15ee 100644
--- a/src/super_gradients/training/utils/predict/prediction_pose_estimation_results.py
+++ b/src/super_gradients/training/utils/predict/prediction_pose_estimation_results.py
@@ -119,7 +119,14 @@ def save(
         :param show_confidence: Whether to show confidence scores on the image.
         :param box_thickness: (Optional) Thickness of bounding boxes. If None, will adapt to the box size.
""" - image = self.draw(box_thickness=box_thickness, show_confidence=show_confidence) + image = self.draw( + edge_colors=edge_colors, + joint_thickness=joint_thickness, + keypoint_colors=keypoint_colors, + keypoint_radius=keypoint_radius, + box_thickness=box_thickness, + show_confidence=show_confidence, + ) save_image(image=image, path=output_path) From 9e737925e7e9ac34eae4c9c21322dc9836da87a3 Mon Sep 17 00:00:00 2001 From: Shay Aharon <80472096+shaydeci@users.noreply.github.com> Date: Wed, 8 May 2024 11:02:09 +0300 Subject: [PATCH 4/5] more deprecated default args removed from LaunchConfig (#1982) --- .../training/utils/distributed_training_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/super_gradients/training/utils/distributed_training_utils.py b/src/super_gradients/training/utils/distributed_training_utils.py index dda9913777..6d587466a1 100755 --- a/src/super_gradients/training/utils/distributed_training_utils.py +++ b/src/super_gradients/training/utils/distributed_training_utils.py @@ -10,7 +10,6 @@ from torch import distributed as dist from torch.cuda.amp import autocast from torch.distributed import get_rank, all_gather_object -from torch.distributed.elastic.multiprocessing import Std from torch.distributed.elastic.multiprocessing.errors import record from torch.distributed.launcher.api import LaunchConfig, elastic_launch @@ -345,8 +344,6 @@ def restart_script_with_ddp(num_gpus: int = None): max_restarts=0, monitor_interval=5, start_method="spawn", - redirects=Std.NONE, - tee=Std.NONE, metrics_cfg={}, ) From f8cc94a77e6eee520d82e76e18d9cfef6105403e Mon Sep 17 00:00:00 2001 From: Eugene Khvedchenya Date: Wed, 15 May 2024 13:38:15 +0300 Subject: [PATCH 5/5] Fixed issue of logging wrong config (#1988) --- src/super_gradients/training/sg_trainer/sg_trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 7ba08998ae..566228bc64 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -239,13 +239,15 @@ def train_from_config(cls, cfg: Union[DictConfig, dict]) -> Tuple[nn.Module, Tup :return: the model and the output of trainer.train(...) (i.e results tuple) """ - # TODO: bind checkpoint_run_id setup_device( device=core_utils.get_param(cfg, "device"), multi_gpu=core_utils.get_param(cfg, "multi_gpu"), num_gpus=core_utils.get_param(cfg, "num_gpus"), ) + # Create resolved config before instantiation + recipe_logged_cfg = {"recipe_config": OmegaConf.to_container(cfg, resolve=True)} + # INSTANTIATE ALL OBJECTS IN CFG cfg = hydra.utils.instantiate(cfg) @@ -283,7 +285,6 @@ def train_from_config(cls, cfg: Union[DictConfig, dict]) -> Tuple[nn.Module, Tup test_loaders = maybe_instantiate_test_loaders(cfg) - recipe_logged_cfg = {"recipe_config": OmegaConf.to_container(cfg, resolve=True)} # TRAIN res = trainer.train( model=model,