Skip to content

Commit

Permalink
fix redundant checkpointing in example training scripts (huggingface#33131)
Browse files Browse the repository at this point in the history

* fix redundant checkpointing in example scripts

* Update examples/pytorch/image-classification/run_image_classification_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/translation/run_translation_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/token-classification/run_ner_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/text-classification/run_glue_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/summarization/run_summarization_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/language-modeling/run_mlm_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/language-modeling/run_fim_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/language-modeling/run_clm_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/image-pretraining/run_mim_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/multiple-choice/run_swag_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/question-answering/run_qa_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/object-detection/run_object_detection_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
  • Loading branch information
2 people authored and BernardZach committed Dec 5, 2024
1 parent 65c7162 commit 9df6aa9
Show file tree
Hide file tree
Showing 15 changed files with 15 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ def collate_fn(examples):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/image-pretraining/run_mim_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,7 +723,7 @@ def preprocess_images(examples):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,7 @@ def main():
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/language-modeling/run_clm_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ def group_texts(examples):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/language-modeling/run_fim_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,7 @@ def apply_fim(examples):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/language-modeling/run_mlm_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,7 @@ def group_texts(examples):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/multiple-choice/run_swag_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@ def preprocess_function(examples):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -677,7 +677,7 @@ def main():
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -879,7 +879,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
accelerator.save_state(f"step_{completed_steps}")

if completed_steps >= args.max_train_steps:
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/question-answering/run_qa_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,7 +894,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ def preprocess_batch(example_batch, transforms: A.Compose):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ def postprocess_text(preds, labels):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ def preprocess_function(examples):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,7 @@ def compute_metrics():
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/translation/run_translation_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ def postprocess_text(preds, labels):
completed_steps += 1

if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
Expand Down

0 comments on commit 9df6aa9

Please sign in to comment.