From 4af01fb1c013eefa3d49f09eeb619c339816c8a9 Mon Sep 17 00:00:00 2001
From: pphuc25
Date: Tue, 19 Sep 2023 01:25:04 +0700
Subject: [PATCH 1/6] docs: change assert to raise and some small doc fixes

---
 .../language-modeling/run_clm_no_trainer.py | 15 +++++++++++----
 .../language-modeling/run_mlm_no_trainer.py |  5 +++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 3de3c7219c63..160e660c9789 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -246,13 +246,16 @@ def parse_args():
     else:
         if args.train_file is not None:
             extension = args.train_file.split(".")[-1]
-            assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
+            if extension not in ["csv", "json", "txt"]:
+                raise ValueError("`train_file` should be a csv, json or txt file.")
         if args.validation_file is not None:
             extension = args.validation_file.split(".")[-1]
-            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
+            if extension not in ["csv", "json", "txt"]:
+                raise ValueError("`validation_file` should be a csv, json or txt file.")
 
     if args.push_to_hub:
-        assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
+        if args.output_dir is None:
+            raise ValueError("Need an `output_dir` to create a repo when `--push_to_hub` is passed.")
 
     return args
 
@@ -514,6 +517,9 @@ def group_texts(examples):
     ]
     optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
 
+    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length
+    # will be shorter in a multi-process setup)
+
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -636,6 +642,7 @@ def group_texts(examples):
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
                     accelerator.save_state(output_dir)
+
             if completed_steps >= args.max_train_steps:
                 break
 
@@ -655,7 +662,7 @@ def group_texts(examples):
         except OverflowError:
             perplexity = float("inf")
 
-        logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")
+        logger.info(f"epoch {epoch}: perplexity: {perplexity} | eval_loss: {eval_loss}")
 
         if args.with_tracking:
             accelerator.log(
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 3d035fded543..0a9fee6b230c 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -261,7 +261,8 @@ def parse_args():
             raise ValueError("`validation_file` should be a csv, json or txt file.")
 
     if args.push_to_hub:
-        assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
+        if args.output_dir is None:
+            raise ValueError("Need an `output_dir` to create a repo when `--push_to_hub` is passed.")
 
     return args
 
@@ -694,7 +695,7 @@ def group_texts(examples):
         except OverflowError:
             perplexity = float("inf")
 
-        logger.info(f"epoch {epoch}: perplexity: {perplexity}")
+        logger.info(f"epoch {epoch}: perplexity: {perplexity} | eval_loss: {eval_loss}")
 
         if args.with_tracking:
             accelerator.log(

From d9ca970f81e3b59c584fe928057450759ba909b5 Mon Sep 17 00:00:00 2001
From: pphuc25
Date: Tue, 19 Sep 2023 01:57:57 +0700
Subject: [PATCH 2/6] docs: add a guard for small datasets and some documentation

---
 examples/pytorch/language-modeling/run_clm_no_trainer.py | 8 +++++---
 examples/pytorch/language-modeling/run_mlm_no_trainer.py | 4 +++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 160e660c9789..bdd598e622c0 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -490,9 +490,11 @@ def group_texts(examples):
     train_dataset = lm_datasets["train"]
     eval_dataset = lm_datasets["validation"]
 
-    # Log a few random samples from the training set:
-    for index in random.sample(range(len(train_dataset)), 3):
-        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+    # Conditional for small test subsets
+    if len(train_dataset) > 3:
+        # Log a few random samples from the training set:
+        for index in random.sample(range(len(train_dataset)), 3):
+            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
 
     # DataLoaders creation:
     train_dataloader = DataLoader(
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 0a9fee6b230c..0e8c90340980 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -533,7 +533,9 @@ def group_texts(examples):
     train_dataloader = DataLoader(
         train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
     )
-    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
+    eval_dataloader = DataLoader(
+        eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
+    )
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
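Background on the guard added in PATCH 2: random.sample(population, k) raises ValueError when k is larger than the population, so unconditionally logging three random samples crashes on training sets with fewer than three examples (for instance, a tiny debugging subset). The sketch below only illustrates that behaviour; it is not part of the diffs, and the variable names are hypothetical.

    import random

    tiny_train_dataset = ["example 0", "example 1"]  # hypothetical two-example debug subset

    # Unguarded: random.sample(range(len(tiny_train_dataset)), 3) would raise
    # ValueError: Sample larger than population or is negative

    # Guarded, as in PATCH 2: skip the sample logging entirely for tiny datasets.
    if len(tiny_train_dataset) > 3:
        for index in random.sample(range(len(tiny_train_dataset)), 3):
            print(f"Sample {index} of the training set: {tiny_train_dataset[index]}.")

    # An alternative that still logs something when the dataset is tiny:
    for index in random.sample(range(len(tiny_train_dataset)), min(3, len(tiny_train_dataset))):
        print(f"Sample {index} of the training set: {tiny_train_dataset[index]}.")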
From 3761cec1d64ac02b13178c2e3e4a92297175a019 Mon Sep 17 00:00:00 2001
From: pphuc25
Date: Thu, 21 Sep 2023 23:53:04 +0700
Subject: [PATCH 3/6] fix: use nn.Module for PersimmonRotaryEmbedding

---
 src/transformers/models/persimmon/modeling_persimmon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py
index 5c6cde7f8a6d..654660e4fa9f 100644
--- a/src/transformers/models/persimmon/modeling_persimmon.py
+++ b/src/transformers/models/persimmon/modeling_persimmon.py
@@ -72,7 +72,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
 
 
 # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Persimmon
-class PersimmonRotaryEmbedding(torch.nn.Module):
+class PersimmonRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 

From a8c267d23bbbeec9047d140e1d59de2b07e33b3c Mon Sep 17 00:00:00 2001
From: pphuc25
Date: Tue, 26 Sep 2023 12:28:37 +0700
Subject: [PATCH 4/6] fix: revert the eval_dataloader formatting change

---
 examples/pytorch/language-modeling/run_mlm_no_trainer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 0e8c90340980..0a9fee6b230c 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -533,9 +533,7 @@ def group_texts(examples):
     train_dataloader = DataLoader(
         train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
-    eval_dataloader = DataLoader(
-        eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
-    )
+    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
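Background on PATCH 3: the Persimmon modeling file imports `nn` (`from torch import nn`), and `nn.Module` is the same class object as `torch.nn.Module`, so switching the base class is behaviour-preserving; the point is to keep the class textually identical to the `LlamaRotaryEmbedding` source named in the "# Copied from ..." marker, which the repository enforces with a textual copy-consistency check (utils/check_copies.py). PATCH 4 likewise reverts a purely cosmetic reformatting of the eval_dataloader call. The snippet below is only an illustrative sketch of the `nn.Module` equivalence, not code from the patch; the class name is hypothetical.

    import torch
    from torch import nn

    # Both spellings resolve to the same class object, so subclassing either is equivalent.
    assert nn.Module is torch.nn.Module

    class RotaryEmbeddingSketch(nn.Module):  # hypothetical stand-in for PersimmonRotaryEmbedding
        def __init__(self, dim: int):
            super().__init__()
            self.dim = dim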
From 1f9b28c2b01b84ae448161d6a2eab9ed4296f7b4 Mon Sep 17 00:00:00 2001
From: pphuc25
Date: Wed, 27 Sep 2023 17:22:06 +0700
Subject: [PATCH 5/6] chore: revert logging

---
 examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +-
 examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index bdd598e622c0..2d0d67e33e8d 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -664,7 +664,7 @@ def group_texts(examples):
         except OverflowError:
             perplexity = float("inf")
 
-        logger.info(f"epoch {epoch}: perplexity: {perplexity} | eval_loss: {eval_loss}")
+        logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")
 
         if args.with_tracking:
             accelerator.log(
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 0a9fee6b230c..749810cd31df 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -695,7 +695,7 @@ def group_texts(examples):
         except OverflowError:
             perplexity = float("inf")
 
-        logger.info(f"epoch {epoch}: perplexity: {perplexity} | eval_loss: {eval_loss}")
+        logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")
 
         if args.with_tracking:
             accelerator.log(

From 8b6c615365a8ac34df65708b42a420ce86b1e636 Mon Sep 17 00:00:00 2001
From: pphuc25
Date: Wed, 27 Sep 2023 17:44:17 +0700
Subject: [PATCH 6/6] chore: revert

---
 .../pytorch/language-modeling/run_clm_no_trainer.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 2d0d67e33e8d..b02a89e6dfcc 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -490,11 +490,9 @@ def group_texts(examples):
     train_dataset = lm_datasets["train"]
     eval_dataset = lm_datasets["validation"]
 
-    # Conditional for small test subsets
-    if len(train_dataset) > 3:
-        # Log a few random samples from the training set:
-        for index in random.sample(range(len(train_dataset)), 3):
-            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
 
     # DataLoaders creation:
     train_dataloader = DataLoader(
@@ -519,9 +517,6 @@ def group_texts(examples):
     ]
     optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
 
-    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length
-    # will be shorter in a multi-process setup)
-
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -644,7 +639,6 @@ def group_texts(examples):
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
                     accelerator.save_state(output_dir)
-
             if completed_steps >= args.max_train_steps:
                 break
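The recurring pattern across this series is replacing argument-validation `assert` statements with explicit `raise ValueError(...)`. Assertions are stripped when Python runs with the -O/-OO flags (or PYTHONOPTIMIZE), so they cannot be relied on to validate user input, and an explicit exception also gives a clearer message than a bare AssertionError. A condensed, self-contained sketch of the pattern used in both scripts (the helper name is hypothetical, not part of the example scripts):

    def validate_data_file(path: str, arg_name: str) -> None:
        # Explicit exception instead of `assert`: still enforced under `python -O`
        # and produces a readable error message for the user.
        extension = path.split(".")[-1]
        if extension not in ["csv", "json", "txt"]:
            raise ValueError(f"`{arg_name}` should be a csv, json or txt file.")

    validate_data_file("train.json", "train_file")      # passes silently
    # validate_data_file("train.parquet", "train_file") # would raise ValueError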