huggingface · Nicolinho · May 23, 2024 · May 23, 2024 · May 23, 2024
diff --git a/trl/trainer/ppov2_config.py b/trl/trainer/ppov2_config.py
@@ -12,7 +12,7 @@
 
 
 @dataclass
-class PPOv2Config(TrainingArguments, OnpolicyRuntimeConfig):
+class PPOv2Config(OnpolicyRuntimeConfig, TrainingArguments):
     # common config
     exp_name: str = os.path.basename(__file__)[: -len(".py")]
     """the name of this experiment"""

diff --git a/trl/trainer/ppov2_trainer.py b/trl/trainer/ppov2_trainer.py
@@ -285,7 +285,7 @@ def repeat_generator():
                         query_response, logits = generate(
                             unwrapped_model.policy,
                             query,
-                            tokenizer,
+                            tokenizer.pad_token_id,
                             generation_config,
                         )
                         response = query_response[:, context_length:]
@@ -407,7 +407,7 @@ def repeat_generator():
                             mb_query_responses = query_responses[micro_batch_inds]
                             mb_logprobs = logprobs[micro_batch_inds]
 
-                            output, vpred_temp = forward(model, mb_query_responses, tokenizer)
+                            output, vpred_temp = forward(model, mb_query_responses, tokenizer.pad_token_id)
                             logits = output.logits[:, context_length - 1 : -1]
                             logits /= args.temperature + 1e-7
                             new_all_logprobs = F.log_softmax(logits, dim=-1)

diff --git a/trl/trainer/rloo_config.py b/trl/trainer/rloo_config.py
@@ -13,7 +13,7 @@
 
 
 @dataclass
-class RLOOConfig(TrainingArguments, OnpolicyRuntimeConfig):
+class RLOOConfig(OnpolicyRuntimeConfig, TrainingArguments):
     # common config
     exp_name: str = os.path.basename(__file__)[: -len(".py")]
     """the name of this experiment"""