diff --git a/.github/workflows/e2e_lora.yml b/.github/workflows/e2e_lora.yml index bc2b5703..bea3b67b 100644 --- a/.github/workflows/e2e_lora.yml +++ b/.github/workflows/e2e_lora.yml @@ -43,4 +43,4 @@ jobs: - name: Running gsm8k e2e training tests with LoRA run: | ray stop --force - bash examples/sft/gsm8k/run_qwen_05_peft.sh 8 $HOME/ckpts/ \ No newline at end of file + bash tests/sft/run_sft_qwen05_peft.sh 8 $HOME/ckpts/ \ No newline at end of file diff --git a/docs/examples/config.rst b/docs/examples/config.rst index 3fc1906b..b5ccd284 100644 --- a/docs/examples/config.rst +++ b/docs/examples/config.rst @@ -59,60 +59,79 @@ Actor/Rollout/Reference Policy .. code:: yaml actor_rollout_ref: - hybrid_engine: True - model: - path: ~/models/deepseek-llm-7b-chat - external_lib: null - override_config: {} - enable_gradient_checkpointing: False - actor: - strategy: fsdp # This is for backward-compatibility - ppo_mini_batch_size: 256 - ppo_micro_batch_size: 64 - grad_clip: 1.0 - clip_ratio: 0.2 - entropy_coeff: 0.001 - ppo_epochs: 1 - shuffle: True - optim: - lr: 1e-6 - lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime - min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - fsdp_config: - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - param_offload: False - grad_offload: False - optimizer_offload: False - ref: - fsdp_config: - param_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - log_prob_micro_batch_size: 128 - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.5 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor # or dummy_hf or dummy_megatron - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_num_seqs: 1024 - log_prob_micro_batch_size: 128 - # for vllm and hf rollout - do_sample: True + hybrid_engine: True + model: + path: ~/models/deepseek-llm-7b-chat + external_lib: null + override_config: { } + enable_gradient_checkpointing: False + use_remove_padding: False + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 8 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: False # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + grad_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 16 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.5 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 16 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + # for hf rollout + do_sample: True + # number of responses (i.e. num sample times) + n: 1 # > 1 for grpo **Common config for actor, rollout and reference model** @@ -136,11 +155,15 @@ Actor/Rollout/Reference Policy - ``actor_rollout_ref.actor.ppo_mini_batch_size``: One sample is split into multiple sub-batches with batch_size=ppo_mini_batch_size for PPO - updates + updates. The ppo_mini_batch_size is a global size across all workers/GPUs. + +- ``actor_rollout_ref.actor.ppo_micro_batch_size``: [Will be deprecated, use ppo_micro_batch_size_per_gpu] + Similar to gradient accumulation, the micro_batch_size for one forward pass, + trading speed for GPU memory. The value represents the global size across all GPUs. -- ``actor_rollout_ref.actor.ppo_micro_batch_size``: Similar to gradient - accumulation, the micro_batch_size for one forward pass, trading speed - for GPU memory +- ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``: Similar to gradient + accumulation, the micro_batch_size_per_gpu for one forward pass, trading speed + for GPU memory. The value represents the local size per GPU. - ``actor_rollout_ref.actor.grad_clip``: Gradient clipping for actor updates @@ -176,8 +199,12 @@ Actor/Rollout/Reference Policy - ``actor_rollout_ref.ref``: FSDP config same as actor. **For models larger than 7B, it's recommended to turn on offload for ref by default** -- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: The batch size - for one forward pass in the computation of ``ref_log_prob``. + +- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: [Will be deprecated, use log_prob_micro_batch_size_per_gpu] + The batch size for one forward pass in the computation of ``ref_log_prob``. The value represents the global size across all GPUs.
+ +- ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``: The batch size + for one forward pass in the computation of ``ref_log_prob``. The value represents the local size per GPU. **Rollout Model** @@ -201,8 +228,11 @@ Actor/Rollout/Reference Policy - ``tensor_model_parallel_size``: TP size for rollout. Only effective for vllm. -- ``log_prob_micro_batch_size``: Micro_batch_size (The batch size for - one forward pass) for recalculating log_prob. +- ``log_prob_micro_batch_size``: [Will be deprecated, use log_prob_micro_batch_size_per_gpu] + The batch size for one forward pass in the computation of ``log_prob``. The value represents the global size across all GPUs. + +- ``log_prob_micro_batch_size_per_gpu``: Micro batch size per GPU (the batch size for + one forward pass) for recalculating ``log_prob``. The value represents the local size per GPU. - ``do_sample``: Whether to sample. If set to False, the rollout model will perform greedy sampling. We disable ``do_sample`` during @@ -260,7 +290,7 @@ Reward Model fsdp_config: min_num_params: 0 param_offload: False - micro_batch_size: 64 + micro_batch_size_per_gpu: 16 max_length: null - ``reward_model.enable``: Whether to enable reward model. If False, we diff --git a/docs/examples/gsm8k_example.rst b/docs/examples/gsm8k_example.rst index de694cfd..ac4550df 100644 --- a/docs/examples/gsm8k_example.rst +++ b/docs/examples/gsm8k_example.rst @@ -85,7 +85,7 @@ We also provide various training scripts for SFT on GSM8K dataset in `gsm8k sft data.val_files=$HOME/data/gsm8k/test.parquet \ data.prompt_key=question \ data.response_key=answer \ - data.micro_batch_size=8 \ + data.micro_batch_size_per_gpu=8 \ model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \ trainer.default_hdfs_dir=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ \ trainer.project_name=gsm8k-sft \ @@ -136,21 +136,20 @@ The script of run_deepseek7b_llm.sh actor_rollout_ref.model.path=~/models/deepseek-llm-7b-chat \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.micro_batch_size=256 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.path=~/models/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=64 \ + critic.ppo_micro_batch_size_per_gpu=16 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ diff --git a/docs/start/quickstart.rst b/docs/start/quickstart.rst index 881839af..5e0da4a7 100644 --- a/docs/start/quickstart.rst +++ b/docs/start/quickstart.rst @@ -92,14 +92,14 @@ Set the ``data.train_files`` ,\ ``data.val_files``, ``actor_rollout_ref.model.pa actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=8 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ critic.optim.lr=1e-5 \ critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ - critic.ppo_micro_batch_size=4 \ + critic.ppo_micro_batch_size_per_gpu=4 \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.logger=['console'] \ +trainer.val_before_train=False \ @@ -133,8 +133,8 @@ If you encounter out of memory issues with HBM less than 32GB, enable the follow .. code-block:: bash - actor_rollout_ref.actor.ppo_micro_batch_size=1 \ - critic.ppo_micro_batch_size=1 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + critic.ppo_micro_batch_size_per_gpu=1 \ For the full set of configs, please refer to :ref:`config-explain-page` for detailed explaination and performance tuning. diff --git a/examples/grpo_trainer/run_deepseek7b_llm.sh b/examples/grpo_trainer/run_deepseek7b_llm.sh index 1db79510..912f6a34 100644 --- a/examples/grpo_trainer/run_deepseek7b_llm.sh +++ b/examples/grpo_trainer/run_deepseek7b_llm.sh @@ -12,7 +12,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=128 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \ actor_rollout_ref.actor.use_kl_loss=True \ actor_rollout_ref.actor.kl_loss_coef=0.001 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ @@ -20,16 +20,16 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=256 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ actor_rollout_ref.rollout.n=5 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=256 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ - trainer.logger=['console','wandb'] \ + trainer.logger=['console'] \ trainer.project_name='verl_grpo_example_gsm8k' \ trainer.experiment_name='deepseek_llm_7b_function_rm' \ trainer.n_gpus_per_node=8 \ diff --git a/examples/grpo_trainer/run_qwen2-7b.sh b/examples/grpo_trainer/run_qwen2-7b.sh index dc293365..a082c368 100644 --- a/examples/grpo_trainer/run_qwen2-7b.sh +++ b/examples/grpo_trainer/run_qwen2-7b.sh @@ -14,7 +14,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=128 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \ actor_rollout_ref.actor.use_kl_loss=True \ actor_rollout_ref.actor.kl_loss_coef=0.001 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ @@ -22,12 +22,12 
@@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=256 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ actor_rollout_ref.rollout.n=5 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=256 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ diff --git a/examples/ppo_trainer/run_deepseek7b_llm.sh b/examples/ppo_trainer/run_deepseek7b_llm.sh index d4ebdd8a..a34f67a5 100644 --- a/examples/ppo_trainer/run_deepseek7b_llm.sh +++ b/examples/ppo_trainer/run_deepseek7b_llm.sh @@ -11,21 +11,22 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.use_remove_padding=True \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ - critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=32 \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ @@ -37,4 +38,5 @@ python3 -m verl.trainer.main_ppo \ trainer.n_gpus_per_node=8 \ trainer.nnodes=1 \ trainer.save_freq=-1 \ + trainer.test_freq=1 \ trainer.total_epochs=15 $@ diff --git a/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh b/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh index 42b69b4c..ef4db1a7 100644 --- a/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh +++ b/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh @@ -11,24 +11,24 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=128 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \ actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=256 \ + 
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64 \ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=256 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.ulysses_sequence_parallel_size=2 \ critic.model.use_remove_padding=True \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ - critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=64 \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=64 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ diff --git a/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh b/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh index bd2c0bc8..a7b16a7c 100644 --- a/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh +++ b/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh @@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.actor.ppo_mini_batch_size=128 \ - actor_rollout_ref.actor.ppo_micro_batch_size=16 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ actor_rollout_ref.ref.param_offload=False \ critic.optim.lr=1e-5 \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=16 \ + critic.ppo_micro_batch_size_per_gpu=4 \ reward_model.enable=True \ reward_model.megatron.tensor_model_parallel_size=4 \ reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \ - reward_model.micro_batch_size=16 \ + reward_model.micro_batch_size_per_gpu=4 \ reward_model.param_offload=False \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ diff --git a/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh b/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh index c342d526..17b170a1 100644 --- a/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh +++ b/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh @@ -18,16 +18,16 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=32 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=32 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ critic.optim.lr=1e-5 \ 
critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=32 \ + critic.ppo_micro_batch_size_per_gpu=4 \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ trainer.logger=['console','wandb'] \ diff --git a/examples/ppo_trainer/run_deepseek_megatron.sh b/examples/ppo_trainer/run_deepseek_megatron.sh index 1f0f51e7..c838a1bb 100644 --- a/examples/ppo_trainer/run_deepseek_megatron.sh +++ b/examples/ppo_trainer/run_deepseek_megatron.sh @@ -10,16 +10,16 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \ actor_rollout_ref.actor.optim.lr=2e-6 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=64 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ critic.optim.lr=2e-5 \ critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=64 \ + critic.ppo_micro_batch_size_per_gpu=8 \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ trainer.logger=['console','wandb'] \ diff --git a/examples/ppo_trainer/run_gemma.sh b/examples/ppo_trainer/run_gemma.sh index 9fb455a6..5072e04e 100644 --- a/examples/ppo_trainer/run_gemma.sh +++ b/examples/ppo_trainer/run_gemma.sh @@ -11,21 +11,21 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=128 \ - actor_rollout_ref.actor.ppo_micro_batch_size=4 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.use_remove_padding=True \ critic.model.path=google/gemma-2-2b-it \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=4 \ + critic.ppo_micro_batch_size_per_gpu=4 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ diff --git a/examples/ppo_trainer/run_qwen2-7b.sh b/examples/ppo_trainer/run_qwen2-7b.sh index 8e6bb16c..80dcd922 100644 --- a/examples/ppo_trainer/run_qwen2-7b.sh +++ b/examples/ppo_trainer/run_qwen2-7b.sh @@ -19,21 +19,22 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=16 
\ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=16 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.use_remove_padding=True \ critic.model.path=Qwen/Qwen2-7B-Instruct \ - critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=16 \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm.sh b/examples/ppo_trainer/run_qwen2-7b_rm.sh index fc64d36b..35d030ad 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm.sh @@ -23,22 +23,22 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.use_remove_padding=True \ critic.optim.lr_warmup_steps_ratio=0.05 \ critic.model.path=Qwen/Qwen2-7B-Instruct \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=16 \ + critic.ppo_micro_batch_size_per_gpu=16 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ @@ -46,7 +46,7 @@ python3 -m verl.trainer.main_ppo \ reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ reward_model.model.use_remove_padding=True \ reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size=16 \ + reward_model.micro_batch_size_per_gpu=32 \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ trainer.logger=['console','wandb'] \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh index dd1e7327..c626e67c 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh @@ -45,7 +45,7 @@ python3 -m verl.trainer.main_ppo \ 
reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ reward_model.model.use_remove_padding=True \ reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size=16 \ + reward_model.micro_batch_size_per_gpu=32 \ reward_model.use_dynamic_bsz=True \ reward_model.forward_max_token_len_per_gpu=98304 \ algorithm.kl_ctrl.kl_coef=0.001 \ diff --git a/examples/ppo_trainer/run_qwen2.5-32b.sh b/examples/ppo_trainer/run_qwen2.5-32b.sh index 1f3bdc3a..5841d5b1 100644 --- a/examples/ppo_trainer/run_qwen2.5-32b.sh +++ b/examples/ppo_trainer/run_qwen2.5-32b.sh @@ -20,21 +20,22 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.use_remove_padding=True \ critic.model.path=Qwen/Qwen2.5-32B-Instruct \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=32 \ + critic.ppo_micro_batch_size_per_gpu=8 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ diff --git a/examples/ppo_trainer/verl_getting_started.ipynb b/examples/ppo_trainer/verl_getting_started.ipynb index afe9ca35..dfa93789 100644 --- a/examples/ppo_trainer/verl_getting_started.ipynb +++ b/examples/ppo_trainer/verl_getting_started.ipynb @@ -646,7 +646,7 @@ "\u001b[36m(main_task pid=28294)\u001b[0m 'path': '/teamspace/studios/this_studio/models/Qwen2.5-0.5B-Instruct'},\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'ref': {'fsdp_config': {'param_offload': False,\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'wrap_policy': {'min_num_params': 0}},\n", - "\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size': 4},\n", + "\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size_per_gpu': 4},\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'rollout': {'do_sample': True,\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'dtype': 'bfloat16',\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'enforce_eager': True,\n", @@ -654,7 +654,7 @@ "\u001b[36m(main_task pid=28294)\u001b[0m 'gpu_memory_utilization': 0.4,\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'ignore_eos': False,\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'load_format': 'dummy_dtensor',\n", - "\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size': 1,\n", + "\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size_per_gpu': 1,\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'max_num_batched_tokens': 8192,\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'max_num_seqs': 1024,\n", "\u001b[36m(main_task 
pid=28294)\u001b[0m 'n': 1,\n", @@ -671,7 +671,7 @@ "\u001b[36m(main_task pid=28294)\u001b[0m 'kl_penalty': 'kl',\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'lam': 1.0},\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'critic': {'cliprange_value': 0.5,\n", - "\u001b[36m(main_task pid=28294)\u001b[0m 'forward_micro_batch_size': 4,\n", + "\u001b[36m(main_task pid=28294)\u001b[0m 'forward_micro_batch_size_per_gpu': 4,\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'grad_clip': 1.0,\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'model': {'enable_gradient_checkpointing': False,\n", "\u001b[36m(main_task pid=28294)\u001b[0m 'external_lib': None,\n", @@ -1110,10 +1110,10 @@ " actor_rollout_ref.actor.optim.lr=1e-6 \\\n", " actor_rollout_ref.actor.ppo_mini_batch_size=64 \\\n", " actor_rollout_ref.actor.ppo_micro_batch_size=1 \\\n", - " actor_rollout_ref.rollout.log_prob_micro_batch_size=1 \\\n", + " actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \\\n", " actor_rollout_ref.rollout.tensor_model_parallel_size=1 \\\n", " actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \\\n", - " actor_rollout_ref.ref.log_prob_micro_batch_size=4 \\\n", + " actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \\\n", " critic.optim.lr=1e-5 \\\n", " critic.model.path=$HOME/models/Qwen2.5-0.5B-Instruct \\\n", " critic.ppo_micro_batch_size=1 \\\n", diff --git a/examples/sft/gsm8k/run_deepseek_6b7.sh b/examples/sft/gsm8k/run_deepseek_6b7.sh index 8e4d54c6..f11965a6 100644 --- a/examples/sft/gsm8k/run_deepseek_6b7.sh +++ b/examples/sft/gsm8k/run_deepseek_6b7.sh @@ -1,19 +1,29 @@ set -x -hdfs_path=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ # replace to your own hdfs/local path +if [ "$#" -lt 2 ]; then + echo "Usage: run_deepseek_6b7.sh [other_configs...]" + exit 1 +fi nproc_per_node=$1 +save_path=$2 + +# Shift the arguments so $@ refers to the rest +shift 2 torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ -m verl.trainer.fsdp_sft_trainer \ data.train_files=$HOME/data/gsm8k/train.parquet \ data.val_files=$HOME/data/gsm8k/test.parquet \ - data.prompt_key=prompt \ - data.response_key=answer \ - data.micro_batch_size=8 \ + data.prompt_key=extra_info \ + data.response_key=extra_info \ + +data.prompt_dict_keys=['question'] \ + +data.response_dict_keys=['answer'] \ + data.micro_batch_size_per_gpu=4 \ model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \ - trainer.default_hdfs_dir=$hdfs_path \ + trainer.default_local_dir=$save_path \ trainer.project_name=gsm8k-sft \ trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \ trainer.total_epochs=4 \ - trainer.logger=['console','wandb'] \ No newline at end of file + trainer.logger=['console','wandb'] \ + trainer.default_hdfs_dir=null $@ \ No newline at end of file diff --git a/examples/sft/gsm8k/run_gemma_2b.sh b/examples/sft/gsm8k/run_gemma_2b.sh index 7ec85c09..6d7917d9 100644 --- a/examples/sft/gsm8k/run_gemma_2b.sh +++ b/examples/sft/gsm8k/run_gemma_2b.sh @@ -21,7 +21,7 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ data.response_key=extra_info \ +data.prompt_dict_keys=['question'] \ +data.response_dict_keys=['answer'] \ - data.micro_batch_size=8 \ + data.micro_batch_size_per_gpu=4 \ model.partial_pretrain=google/gemma-2b-it \ trainer.default_local_dir=$save_path \ trainer.project_name=gsm8k-sft \ diff --git a/examples/sft/gsm8k/run_gemma_7b.sh b/examples/sft/gsm8k/run_gemma_7b.sh index 9c357926..fdf4435b 100644 --- a/examples/sft/gsm8k/run_gemma_7b.sh +++ 
b/examples/sft/gsm8k/run_gemma_7b.sh @@ -1,8 +1,15 @@ set -x -hdfs_path=hdfs://user/verl/experiments/gsm8k/gemma-1.1-7b-it/ # replace to your own hdfs/local path +if [ "$#" -lt 2 ]; then + echo "Usage: run_gemma_7b.sh [other_configs...]" + exit 1 +fi nproc_per_node=$1 +save_path=$2 + +# Shift the arguments so $@ refers to the rest +shift 2 torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ -m verl.trainer.fsdp_sft_trainer \ @@ -10,10 +17,11 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ data.val_files=$HOME/data/gsm8k/test.parquet \ data.prompt_key=prompt \ data.response_key=answer \ - data.micro_batch_size=8 \ + data.micro_batch_size_per_gpu=4 \ model.partial_pretrain=google/gemma-1.1-7b-it \ - trainer.default_hdfs_dir=$hdfs_path \ + trainer.default_local_dir=$save_path \ trainer.project_name=gsm8k-sft \ trainer.experiment_name=gsm8k-sft-gemma-1.1-7b-it \ trainer.total_epochs=4 \ - trainer.logger=['console','wandb'] \ No newline at end of file + trainer.logger=['console','wandb'] \ + trainer.default_hdfs_dir=null $@ \ No newline at end of file diff --git a/examples/sft/gsm8k/run_qwen_05_peft.sh b/examples/sft/gsm8k/run_qwen_05_peft.sh index be38e5de..3ba61c3a 100755 --- a/examples/sft/gsm8k/run_qwen_05_peft.sh +++ b/examples/sft/gsm8k/run_qwen_05_peft.sh @@ -22,13 +22,13 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ optim.lr=1e-4 \ +data.prompt_dict_keys=['question'] \ +data.response_dict_keys=['answer'] \ - data.micro_batch_size=32 \ + data.micro_batch_size_per_gpu=4 \ model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \ trainer.default_local_dir=$save_path \ trainer.project_name=gsm8k-sft \ trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct \ trainer.logger=['console'] \ - trainer.total_training_steps=1 \ + trainer.total_epochs=1 \ trainer.default_hdfs_dir=null $@ \ model.lora_rank=32\ model.lora_alpha=16 \ diff --git a/examples/split_placement/config/ppo_trainer_split.yaml b/examples/split_placement/config/ppo_trainer_split.yaml index 7984c45a..a475d7af 100644 --- a/examples/split_placement/config/ppo_trainer_split.yaml +++ b/examples/split_placement/config/ppo_trainer_split.yaml @@ -20,7 +20,8 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 256 - ppo_micro_batch_size: 64 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 64 grad_clip: 1.0 clip_ratio: 0.2 entropy_coeff: 0.001 @@ -45,7 +46,8 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - log_prob_micro_batch_size: 128 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 128 rollout: name: vllm temperature: 1.0 @@ -63,7 +65,8 @@ actor_rollout_ref: tensor_model_parallel_size: 2 max_num_batched_tokens: 8192 max_num_seqs: 1024 - log_prob_micro_batch_size: 128 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 128 # for hf rollout do_sample: True # number of responses (i.e. 
num sample times) @@ -91,7 +94,8 @@ critic: # transformer_layer_cls_to_wrap: None min_num_params: 0 ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - ppo_micro_batch_size: 64 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 64 ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} shuffle: ${actor_rollout_ref.actor.shuffle} grad_clip: 1.0 @@ -107,7 +111,8 @@ reward_model: fsdp_config: min_num_params: 0 param_offload: False - micro_batch_size: 64 + micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu + micro_batch_size_per_gpu: 64 max_length: null algorithm: diff --git a/examples/split_placement/run_deepseek7b_llm.sh b/examples/split_placement/run_deepseek7b_llm.sh index a2db960a..c701de85 100644 --- a/examples/split_placement/run_deepseek7b_llm.sh +++ b/examples/split_placement/run_deepseek7b_llm.sh @@ -10,20 +10,20 @@ python3 main_ppo_split.py \ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=32 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=16 \ + critic.ppo_micro_batch_size_per_gpu=8 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ diff --git a/tests/e2e/arithmetic_sequence/rl/config/ray_trainer.yaml b/tests/e2e/arithmetic_sequence/rl/config/ray_trainer.yaml index 088557db..d2c5e056 100644 --- a/tests/e2e/arithmetic_sequence/rl/config/ray_trainer.yaml +++ b/tests/e2e/arithmetic_sequence/rl/config/ray_trainer.yaml @@ -22,7 +22,8 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 200 - ppo_micro_batch_size: 200 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null use_dynamic_bsz: False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} grad_clip: 1.0 @@ -50,7 +51,8 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - micro_batch_size: 200 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: null log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size @@ -65,14 +67,15 @@ actor_rollout_ref: dtype: bfloat16 # should align with FSDP gpu_memory_utilization: 0.1 ignore_eos: False - micro_batch_size: 
200 + micro_batch_size_per_gpu: 200 enforce_eager: True free_cache_engine: True load_format: dummy_dtensor tensor_model_parallel_size: 1 max_num_batched_tokens: 8192 max_num_seqs: 1024 - log_prob_micro_batch_size: 200 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: null log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} # for hf rollout @@ -80,6 +83,7 @@ actor_rollout_ref: # number of responses (i.e. num sample times) n: 1 # > 1 for grpo + critic: strategy: fsdp optim: @@ -100,8 +104,10 @@ critic: # transformer_layer_cls_to_wrap: None min_num_params: 0 ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - ppo_micro_batch_size: 200 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} @@ -128,7 +134,8 @@ reward_model: fsdp_config: min_num_params: 0 fsdp_size: -1 - micro_batch_size: 8 + micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu + micro_batch_size_per_gpu: null # set a number max_length: null ulysses_sequence_parallel_size: 1 # sp size diff --git a/tests/e2e/arithmetic_sequence/rl/main_trainer.py b/tests/e2e/arithmetic_sequence/rl/main_trainer.py index 18fdd457..90e9a9e2 100644 --- a/tests/e2e/arithmetic_sequence/rl/main_trainer.py +++ b/tests/e2e/arithmetic_sequence/rl/main_trainer.py @@ -105,14 +105,6 @@ def main(config): from omegaconf import OmegaConf pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values - dp_size = config.trainer.n_gpus_per_node * config.trainer.nnodes - # normalize batch_size - # TODO: move this inside each role - config.actor_rollout_ref.actor.ppo_mini_batch_size //= dp_size - config.actor_rollout_ref.actor.ppo_micro_batch_size //= dp_size - config.critic.ppo_micro_batch_size //= dp_size - config.actor_rollout_ref.rollout.micro_batch_size //= dp_size - # print the config # print initial config print('Config after normalizing batch_size') diff --git a/tests/e2e/run_qwen_gsm8k_function_rm.sh b/tests/e2e/run_qwen_gsm8k_function_rm.sh index 459fbdb7..107674ea 100644 --- a/tests/e2e/run_qwen_gsm8k_function_rm.sh +++ b/tests/e2e/run_qwen_gsm8k_function_rm.sh @@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ + 
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.use_remove_padding=True \ critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=32 \ + critic.ppo_micro_batch_size_per_gpu=4 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ diff --git a/tests/e2e/run_qwen_gsm8k_function_rm_no_rmpad.sh b/tests/e2e/run_qwen_gsm8k_function_rm_no_rmpad.sh index 5250b813..9b628fbd 100644 --- a/tests/e2e/run_qwen_gsm8k_function_rm_no_rmpad.sh +++ b/tests/e2e/run_qwen_gsm8k_function_rm_no_rmpad.sh @@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=False \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.use_remove_padding=False \ critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=32 \ + critic.ppo_micro_batch_size_per_gpu=4 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ diff --git a/tests/e2e/run_qwen_gsm8k_model_rm.sh b/tests/e2e/run_qwen_gsm8k_model_rm.sh index b7ef53ef..61da215f 100644 --- a/tests/e2e/run_qwen_gsm8k_model_rm.sh +++ b/tests/e2e/run_qwen_gsm8k_model_rm.sh @@ -15,22 +15,22 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.use_remove_padding=True \ critic.optim.lr_warmup_steps_ratio=0.05 \ critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=32 \ + critic.ppo_micro_batch_size_per_gpu=4 \ critic.model.fsdp_config.param_offload=False \ 
critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ @@ -38,7 +38,7 @@ python3 -m verl.trainer.main_ppo \ reward_model.model.path=Qwen/Qwen2.5-0.5B\ reward_model.model.use_remove_padding=True \ reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size=16 \ + reward_model.micro_batch_size_per_gpu=16 \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ trainer.logger=['console'] \ diff --git a/tests/e2e/run_qwen_gsm8k_model_rm_no_rmpad.sh b/tests/e2e/run_qwen_gsm8k_model_rm_no_rmpad.sh index cd06c2f8..e54d5568 100644 --- a/tests/e2e/run_qwen_gsm8k_model_rm_no_rmpad.sh +++ b/tests/e2e/run_qwen_gsm8k_model_rm_no_rmpad.sh @@ -15,22 +15,22 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.model.use_remove_padding=False \ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.model.use_remove_padding=False \ critic.optim.lr_warmup_steps_ratio=0.05 \ critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=32 \ + critic.ppo_micro_batch_size_per_gpu=4 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ @@ -38,7 +38,7 @@ python3 -m verl.trainer.main_ppo \ reward_model.model.path=Qwen/Qwen2.5-0.5B\ reward_model.model.use_remove_padding=False \ reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size=16 \ + reward_model.micro_batch_size_per_gpu=16 \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ +trainer.val_before_train=False \ diff --git a/tests/e2e/run_qwen_gsm8k_model_rm_seq_balance.sh b/tests/e2e/run_qwen_gsm8k_model_rm_seq_balance.sh index e50a95fc..53b3c8cf 100644 --- a/tests/e2e/run_qwen_gsm8k_model_rm_seq_balance.sh +++ b/tests/e2e/run_qwen_gsm8k_model_rm_seq_balance.sh @@ -15,13 +15,13 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ actor_rollout_ref.actor.use_dynamic_bsz=True \ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=12000 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ 
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ @@ -33,7 +33,7 @@ python3 -m verl.trainer.main_ppo \ critic.optim.lr_warmup_steps_ratio=0.05 \ critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=32 \ + critic.ppo_micro_batch_size_per_gpu=4 \ critic.use_dynamic_bsz=True \ critic.ppo_max_token_len_per_gpu=98304 \ critic.model.fsdp_config.param_offload=False \ @@ -43,7 +43,7 @@ python3 -m verl.trainer.main_ppo \ reward_model.model.path=Qwen/Qwen2.5-0.5B\ reward_model.model.use_remove_padding=True \ reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size=16 \ + reward_model.micro_batch_size_per_gpu=16 \ reward_model.use_dynamic_bsz=True \ reward_model.forward_max_token_len_per_gpu=98304 \ algorithm.kl_ctrl.kl_coef=0.001 \ diff --git a/tests/e2e/run_qwen_gsm8k_model_rm_ulysses.sh b/tests/e2e/run_qwen_gsm8k_model_rm_ulysses.sh index 4ba4f9e4..7ab764f8 100644 --- a/tests/e2e/run_qwen_gsm8k_model_rm_ulysses.sh +++ b/tests/e2e/run_qwen_gsm8k_model_rm_ulysses.sh @@ -15,17 +15,17 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ critic.optim.lr=1e-5 \ critic.ulysses_sequence_parallel_size=2 \ @@ -33,7 +33,7 @@ python3 -m verl.trainer.main_ppo \ critic.optim.lr_warmup_steps_ratio=0.05 \ critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.enable_gradient_checkpointing=False \ - critic.ppo_micro_batch_size=32 \ + critic.ppo_micro_batch_size_per_gpu=4 \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ @@ -43,7 +43,7 @@ python3 -m verl.trainer.main_ppo \ reward_model.model.path=Qwen/Qwen2.5-0.5B\ reward_model.model.use_remove_padding=True \ reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size=16 \ + reward_model.micro_batch_size_per_gpu=16 \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ +trainer.val_before_train=False \ diff --git a/tests/e2e/run_ray_trainer.sh b/tests/e2e/run_ray_trainer.sh index 51d18fcc..f06597cc 100644 --- a/tests/e2e/run_ray_trainer.sh +++ b/tests/e2e/run_ray_trainer.sh @@ -11,6 +11,10 @@ python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \ data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \ data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \ actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + 
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + critic.ppo_micro_batch_size_per_gpu=1 \ critic.model.path=tests/e2e/arithmetic_sequence/model | tee $OUTPUT_FILE; python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE diff --git a/tests/sft/run_sft.sh b/tests/sft/run_sft.sh index bb0a18d2..89132832 100644 --- a/tests/sft/run_sft.sh +++ b/tests/sft/run_sft.sh @@ -10,7 +10,7 @@ torchrun --standalone --nnodes=1 --nproc_per_node=8 \ data.response_key=extra_info \ +data.prompt_dict_keys=['question'] \ +data.response_dict_keys=['answer'] \ - data.micro_batch_size=32 \ + data.micro_batch_size_per_gpu=32 \ model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \ trainer.default_local_dir=$HOME/ckpts/ \ trainer.project_name=qwen2.5-sft \ diff --git a/tests/sft/run_sft_qwen05_peft.sh b/tests/sft/run_sft_qwen05_peft.sh new file mode 100644 index 00000000..2d1744f6 --- /dev/null +++ b/tests/sft/run_sft_qwen05_peft.sh @@ -0,0 +1,38 @@ +# Tested with 2 & 4 GPUs + +set -x + +if [ "$#" -lt 2 ]; then + echo "Usage: run_qwen_05_peft.sh [other_configs...]" + exit 1 +fi + +nproc_per_node=$1 +save_path=$2 + +# Shift the arguments so $@ refers to the rest +shift 2 + +torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ + -m verl.trainer.fsdp_sft_trainer \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.prompt_key=extra_info \ + data.response_key=extra_info \ + optim.lr=1e-4 \ + +data.prompt_dict_keys=['question'] \ + +data.response_dict_keys=['answer'] \ + data.micro_batch_size_per_gpu=4 \ + model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \ + trainer.default_local_dir=$save_path \ + trainer.project_name=gsm8k-sft \ + trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct \ + trainer.logger=['console'] \ + trainer.total_training_steps=1 \ + trainer.default_hdfs_dir=null $@ \ + model.lora_rank=32\ + model.lora_alpha=16 \ + model.target_modules=all-linear + + # Or you can do this: + # model.target_modules=[q_proj,v_proj] \ diff --git a/tests/utility/test_tensor_dict_utilities.py b/tests/utility/test_tensor_dict_utilities.py index c952d5a9..344cf3a8 100644 --- a/tests/utility/test_tensor_dict_utilities.py +++ b/tests/utility/test_tensor_dict_utilities.py @@ -41,8 +41,12 @@ def test_union_tensor_dict(): data = union_tensor_dict(data1, data_with_copied_obs) data = np.random.random(100) - a = {'a': data} - b = {'a': data} + data2 = [float('nan') for _ in range(99)] + data2.append('nan') + data2 = np.array(data2, dtype=object) + data3 = np.tile(data2, (2, 1)) + a = {'a': data, 'b': data2, 'c': data3} + b = {'a': data, 'b': data2, 'c': data3} b_ = {'a': np.random.random(100)} union_numpy_dict(a, b) with pytest.raises(AssertionError): diff --git a/verl/protocol.py b/verl/protocol.py index 803da366..80626242 100644 --- a/verl/protocol.py +++ b/verl/protocol.py @@ -18,6 +18,7 @@ import pickle import numpy as np +import pandas as pd import copy from dataclasses import dataclass, field from typing import Callable, Dict, List, Union @@ -82,7 +83,8 @@ def union_numpy_dict(tensor_dict1: dict[np.ndarray], tensor_dict2: dict[np.ndarr if key in tensor_dict1: assert isinstance(tensor_dict2[key], np.ndarray) assert isinstance(tensor_dict1[key], np.ndarray) - assert np.all(tensor_dict2[key] == tensor_dict1[key]), \ + # to properly deal with nan and object type + assert pd.DataFrame(tensor_dict2[key]).equals(pd.DataFrame(tensor_dict1[key])), \ f'{key} in tensor_dict1 and tensor_dict2 are not the same object' tensor_dict1[key] = val diff --git 
a/verl/trainer/config/generation.yaml b/verl/trainer/config/generation.yaml index ed805a8c..27d92116 100644 --- a/verl/trainer/config/generation.yaml +++ b/verl/trainer/config/generation.yaml @@ -23,13 +23,13 @@ rollout: dtype: bfloat16 # should align with FSDP gpu_memory_utilization: 0.5 ignore_eos: False - micro_batch_size: 256 enforce_eager: True free_cache_engine: True load_format: dummy_dtensor tensor_model_parallel_size: 1 max_num_batched_tokens: 8192 max_num_seqs: 1024 - log_prob_micro_batch_size: 8 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 8 # for hf rollout do_sample: True \ No newline at end of file diff --git a/verl/trainer/config/ppo_megatron_trainer.yaml b/verl/trainer/config/ppo_megatron_trainer.yaml index 6ae26851..368d6512 100644 --- a/verl/trainer/config/ppo_megatron_trainer.yaml +++ b/verl/trainer/config/ppo_megatron_trainer.yaml @@ -20,7 +20,9 @@ actor_rollout_ref: actor: strategy: megatron # This is for backward-compatibility ppo_mini_batch_size: 256 - ppo_micro_batch_size: 64 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + use_dynamic_bsz: False clip_ratio: 0.2 entropy_coeff: 0.001 ppo_epochs: 1 @@ -48,7 +50,8 @@ actor_rollout_ref: seed: 1 load_weight: True param_offload: False - log_prob_micro_batch_size: 32 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: null rollout: name: vllm temperature: 1.0 @@ -66,7 +69,8 @@ actor_rollout_ref: tensor_model_parallel_size: 2 max_num_batched_tokens: 8192 max_num_seqs: 1024 - log_prob_micro_batch_size: 2 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: null # for hf rollout do_sample: True layer_name_map: @@ -98,7 +102,9 @@ critic: seed: 1 load_weight: True ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - ppo_micro_batch_size: 2 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} shuffle: ${actor_rollout_ref.actor.shuffle} cliprange_value: 0.5 @@ -121,7 +127,9 @@ reward_model: external_lib: ${actor_rollout_ref.model.external_lib} load_weight: True param_offload: False - micro_batch_size: 64 + micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu + micro_batch_size_per_gpu: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} max_length: null algorithm: diff --git a/verl/trainer/config/ppo_trainer.yaml b/verl/trainer/config/ppo_trainer.yaml index b99c9670..6dd688a0 100644 --- a/verl/trainer/config/ppo_trainer.yaml +++ b/verl/trainer/config/ppo_trainer.yaml @@ -21,7 +21,8 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 256 - ppo_micro_batch_size: 64 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null use_dynamic_bsz: False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} grad_clip: 1.0 @@ -53,7 +54,8 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - log_prob_micro_batch_size: 128 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + 
log_prob_micro_batch_size_per_gpu: null log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size @@ -74,7 +76,8 @@ actor_rollout_ref: tensor_model_parallel_size: 2 max_num_batched_tokens: 8192 max_num_seqs: 1024 - log_prob_micro_batch_size: 128 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: null log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} # for hf rollout @@ -106,8 +109,10 @@ critic: min_num_params: 0 fsdp_size: -1 ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - ppo_micro_batch_size: 64 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: null forward_micro_batch_size: ${critic.ppo_micro_batch_size} + forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} @@ -129,7 +134,8 @@ reward_model: min_num_params: 0 param_offload: False fsdp_size: -1 - micro_batch_size: 64 + micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu + micro_batch_size_per_gpu: null # set a number max_length: null ulysses_sequence_parallel_size: 1 # sp size use_dynamic_bsz: ${critic.use_dynamic_bsz} diff --git a/verl/trainer/config/sft_trainer.yaml b/verl/trainer/config/sft_trainer.yaml index ccdef348..d68e6dcc 100644 --- a/verl/trainer/config/sft_trainer.yaml +++ b/verl/trainer/config/sft_trainer.yaml @@ -1,6 +1,7 @@ data: train_batch_size: 256 - micro_batch_size: 16 # this is also val batch size + micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu + micro_batch_size_per_gpu: 4 # this is also val batch size train_files: ~/data/gsm8k/train.parquet val_files: ~/data/gsm8k/test.parquet prompt_key: question diff --git a/verl/trainer/fsdp_sft_trainer.py b/verl/trainer/fsdp_sft_trainer.py index 6f04fa1b..dc7e677a 100644 --- a/verl/trainer/fsdp_sft_trainer.py +++ b/verl/trainer/fsdp_sft_trainer.py @@ -97,10 +97,10 @@ def _normalize_config_bsz(self): print(f'Normalize batch size by dp {dp_size}') assert self.config.data.train_batch_size % dp_size == 0 - assert self.config.data.micro_batch_size % dp_size == 0 self.config.data.train_batch_size //= dp_size - self.config.data.micro_batch_size //= dp_size + + assert self.config.data.train_batch_size % self.config.data.micro_batch_size_per_gpu == 0 def _build_dataloader(self): config = self.config @@ -143,7 +143,7 @@ def _build_dataloader(self): rank=rank, drop_last=True) self.val_dataloader = DataLoader(dataset=self.val_dataset, - batch_size=config.data.micro_batch_size, + batch_size=config.data.micro_batch_size_per_gpu, sampler=self.val_sampler, num_workers=8, pin_memory=True, @@ -285,7 +285,7 @@ def training_step(self, batch: TensorDict): log_gpu_memory_usage('After optimizer zero_grad', logger=logger) - micro_batches = batch.split(self.config.data.micro_batch_size) + micro_batches = batch.split(self.config.data.micro_batch_size_per_gpu) n_micro_batches = len(micro_batches) step_loss = 0 for micro_batch in micro_batches: @@ -373,7 +373,7 
@@ def fit(self): # Perform final validation val_losses = [] for val_data in self.val_dataloader: - val_data = TensorDict(val_data, batch_size=self.config.data.micro_batch_size).cuda() + val_data = TensorDict(val_data, batch_size=self.config.data.micro_batch_size_per_gpu).cuda() val_loss = self.validation_step(val_data) val_losses.append(val_loss) if rank == 0: @@ -389,7 +389,7 @@ def fit(self): # validation val_losses = [] for data in self.val_dataloader: - data = TensorDict(data, batch_size=self.config.data.micro_batch_size).cuda() + data = TensorDict(data, batch_size=self.config.data.micro_batch_size_per_gpu).cuda() val_loss = self.validation_step(data) val_losses.append(val_loss) if rank == 0: diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 96eb93b4..071562dc 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -337,8 +337,13 @@ def __init__(self, else: self.kl_ctrl = core_algos.FixedKLController(kl_coef=0.) + self._validate_config() self._create_dataloader() + def _validate_config(self): + from verl.utils.config import validate_config + validate_config(self.config) + def _create_dataloader(self): from torch.utils.data import DataLoader # TODO: we have to make sure the batch size is divisible by the dp size diff --git a/verl/utils/config.py b/verl/utils/config.py index 5c9298c4..0dcd73a9 100644 --- a/verl/utils/config.py +++ b/verl/utils/config.py @@ -21,3 +21,69 @@ def update_dict_with_config(dictionary: Dict, config: DictConfig): for key in dictionary: if hasattr(config, key): dictionary[key] = getattr(config, key) + + +def validate_config(config): + # number of GPUs total + n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes + + # 1. Check total batch size for data correctness + real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n + assert real_train_batch_size % n_gpus == 0, \ + f"real_train_batch_size ({real_train_batch_size}) must be divisible by total n_gpus ({n_gpus})." + + # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" + # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". + def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): + if mbs is None and mbs_per_gpu is None: + raise ValueError(f"[{name}] Please set at least one of '{name}.micro_batch_size' or " + f"'{name}.micro_batch_size_per_gpu'.") + + if mbs is not None and mbs_per_gpu is not None: + raise ValueError(f"[{name}] You have set both '{name}.micro_batch_size' AND " + f"'{name}.micro_batch_size_per_gpu'. Please remove '{name}.micro_batch_size' " + f"because only '*_micro_batch_size_per_gpu' is supported (the former is deprecated).") + + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + # actor: ppo_micro_batch_size vs. ppo_micro_batch_size_per_gpu + check_mutually_exclusive(config.actor_rollout_ref.actor.ppo_micro_batch_size, + config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu, "actor_rollout_ref.actor") + + # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive(config.actor_rollout_ref.ref.log_prob_micro_batch_size, + config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.ref") + + # The rollout section also has log_prob_micro_batch_size vs. 
log_prob_micro_batch_size_per_gpu + check_mutually_exclusive(config.actor_rollout_ref.rollout.log_prob_micro_batch_size, + config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.rollout") + + if not config.critic.use_dynamic_bsz: + # Check for critic micro-batch size conflicts + check_mutually_exclusive(config.critic.ppo_micro_batch_size, config.critic.ppo_micro_batch_size_per_gpu, + "critic") + + # Check for reward model micro-batch size conflicts + if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + check_mutually_exclusive(config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, + "reward_model") + + # Actor + # if NOT dynamic_bsz, we must ensure: + # ppo_mini_batch_size is divisible by ppo_micro_batch_size + # ppo_micro_batch_size * sequence_parallel_size >= n_gpus + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + sp_size = config.actor_rollout_ref.actor.ulysses_sequence_parallel_size + if config.actor_rollout_ref.actor.ppo_micro_batch_size is not None: + assert config.actor_rollout_ref.actor.ppo_mini_batch_size % config.actor_rollout_ref.actor.ppo_micro_batch_size == 0 + assert config.actor_rollout_ref.actor.ppo_micro_batch_size * sp_size >= n_gpus + + # critic + if not config.critic.use_dynamic_bsz: + sp_size = config.critic.ulysses_sequence_parallel_size + if config.critic.ppo_micro_batch_size is not None: + assert config.critic.ppo_mini_batch_size % config.critic.ppo_micro_batch_size == 0 + assert config.critic.ppo_micro_batch_size * sp_size >= n_gpus + + print("[validate_config] All configuration checks passed successfully!") diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py index 5b7fef8a..3dde8dd3 100644 --- a/verl/workers/actor/dp_actor.py +++ b/verl/workers/actor/dp_actor.py @@ -204,8 +204,8 @@ def update_policy(self, data: DataProto): # make sure we are in training mode self.actor_module.train() - assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size == 0 - self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size + assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size_per_gpu == 0 + self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size_per_gpu temperature = data.meta_info['temperature'] # temperature must be in the data.meta_info to avoid slient error select_keys = ['responses', 'input_ids', 'attention_mask', 'position_ids', 'old_log_probs', 'advantages'] @@ -226,7 +226,7 @@ def update_policy(self, data: DataProto): micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len) else: # split batch into micro_batches - micro_batches = mini_batch.split(self.config.ppo_micro_batch_size) + micro_batches = mini_batch.split(self.config.ppo_micro_batch_size_per_gpu) self.actor_optimizer.zero_grad() diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index e674a28f..694185a3 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -54,7 +54,7 @@ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, a Args: config (OmegaConf): the basic config that contains the hyper-parameters of PPO Actor. It must contain - ``ppo_micro_batch_size``: minibatch size when updating ppo. + ``ppo_micro_batch_size_per_gpu``: micro batch size when updating ppo. ``ppo_mini_batch_size``: minibatch size when updating ppo using the batch data. 
@@ -232,7 +232,7 @@ def forward_backward_batch(self, data: DataProto, forward_only=False, post_proce if data.meta_info.get('micro_batch_size', None) is not None: batch_size = data.meta_info['micro_batch_size'] else: - batch_size = self.config.ppo_micro_batch_size + batch_size = self.config.ppo_micro_batch_size_per_gpu batches = split_dict_tensor_into_batches(data.batch, batch_size=batch_size) # compute input shapes for pp stages input_shapes = compute_transformers_input_shapes( diff --git a/verl/workers/critic/dp_critic.py b/verl/workers/critic/dp_critic.py index 0842ff4a..bdad6ecf 100644 --- a/verl/workers/critic/dp_critic.py +++ b/verl/workers/critic/dp_critic.py @@ -45,8 +45,8 @@ def __init__(self, config, critic_module: nn.Module, critic_optimizer: optim.Opt self.use_remove_padding = self.config.model.get('use_remove_padding', False) print(f'Critic use_remove_padding={self.use_remove_padding}') - assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size == 0 - self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size + assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size_per_gpu == 0 + self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size_per_gpu self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1) @@ -161,7 +161,7 @@ def update_critic(self, data: DataProto): max_token_len = self.config.ppo_max_token_len_per_gpu * self.ulysses_sequence_parallel_size micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len) else: - micro_batches = mini_batch.split(self.config.ppo_micro_batch_size) + micro_batches = mini_batch.split(self.config.ppo_micro_batch_size_per_gpu) self.critic_optimizer.zero_grad() diff --git a/verl/workers/critic/megatron_critic.py b/verl/workers/critic/megatron_critic.py index a39ad4b4..4545db68 100644 --- a/verl/workers/critic/megatron_critic.py +++ b/verl/workers/critic/megatron_critic.py @@ -118,7 +118,7 @@ def forward_backward_batch(self, data: DataProto, forward_only=False): group=mpu.get_pipeline_model_parallel_group()) # split into micro-batches data.batch['attention_mask'] = data.batch['attention_mask'].to(bool) - batches = split_dict_tensor_into_batches(data.batch, batch_size=self.config.ppo_micro_batch_size) + batches = split_dict_tensor_into_batches(data.batch, batch_size=self.config.ppo_micro_batch_size_per_gpu) n_micro_batch = len(batches) seq_len = batches[0]['input_ids'].shape[1] @@ -182,7 +182,7 @@ def forward_step(batch_iter, model): model=self.critic_module, num_microbatches=n_micro_batch, input_shapes=input_shapes, # must set for flash-attn sequence packing - seq_length=self.config.ppo_micro_batch_size * seq_len, # no use when input_shapes was set + seq_length=self.config.ppo_micro_batch_size_per_gpu * seq_len, # no use when input_shapes was set hidden_size=self.model_config.hidden_size, # no use when input_shapes was set micro_batch_size=1, # no use when input_shapes was set forward_only=forward_only, @@ -193,7 +193,7 @@ def forward_step(batch_iter, model): data_iterator=batch_generator, model=self.critic_module, num_microbatches=n_micro_batch, - seq_length=self.config.ppo_micro_batch_size * seq_len, # in use for pp = 1 + seq_length=self.config.ppo_micro_batch_size_per_gpu * seq_len, # in use for pp = 1 hidden_size=self.model_config.hidden_size, # in use for pp = 1 micro_batch_size=1, # in use for pp = 1 forward_only=forward_only, diff --git a/verl/workers/fsdp_workers.py 
b/verl/workers/fsdp_workers.py index 152c0b23..0dc4da02 100644 --- a/verl/workers/fsdp_workers.py +++ b/verl/workers/fsdp_workers.py @@ -119,18 +119,22 @@ def __init__(self, config: DictConfig, role: str): # normalize config if self._is_actor: self.config.actor.ppo_mini_batch_size //= (self.device_mesh.shape[0] // self.ulysses_sequence_parallel_size) - self.config.actor.ppo_micro_batch_size //= (self.device_mesh.shape[0] // - self.ulysses_sequence_parallel_size) self.config.actor.ppo_mini_batch_size *= self.config.rollout.n - self.config.actor.ppo_micro_batch_size *= self.config.rollout.n - if self._is_rollout: + # micro bsz + if self.config.actor.ppo_micro_batch_size is not None: + self.config.actor.ppo_micro_batch_size //= (self.device_mesh.shape[0] // + self.ulysses_sequence_parallel_size) + self.config.actor.ppo_micro_batch_size_per_gpu = self.config.actor.ppo_micro_batch_size + # normalize rollout config + if self._is_rollout and self.config.rollout.log_prob_micro_batch_size is not None: self.config.rollout.log_prob_micro_batch_size //= (self.device_mesh.shape[0] // self.ulysses_sequence_parallel_size) - self.config.rollout.log_prob_micro_batch_size *= self.config.rollout.n - if self._is_ref: + self.config.rollout.log_prob_micro_batch_size_per_gpu = self.config.rollout.log_prob_micro_batch_size + # normalize ref config + if self._is_ref and self.config.ref.log_prob_micro_batch_size is not None: self.config.ref.log_prob_micro_batch_size //= (self.device_mesh.shape[0] // self.ulysses_sequence_parallel_size) - self.config.ref.log_prob_micro_batch_size *= self.config.rollout.n + self.config.ref.log_prob_micro_batch_size_per_gpu = self.config.ref.log_prob_micro_batch_size def _build_model_optimizer(self, model_path, @@ -424,8 +428,6 @@ def update_actor(self, data: DataProto): @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) def generate_sequences(self, prompts: DataProto): prompts = prompts.to('cuda') - # set to False if it is validation - recompute_log_prob = prompts.meta_info.get('recompute_log_prob', True) assert self._is_rollout if self._is_offload_param: @@ -461,7 +463,7 @@ def compute_log_prob(self, data: DataProto): assert self._is_actor data = data.to('cuda') # we should always recompute old_log_probs when it is HybridEngine - data.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size + data.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size_per_gpu data.meta_info['max_token_len'] = self.config.rollout.log_prob_max_token_len_per_gpu data.meta_info['use_dynamic_bsz'] = self.config.rollout.log_prob_use_dynamic_bsz data.meta_info['temperature'] = self.config.rollout.temperature @@ -489,7 +491,7 @@ def compute_ref_log_prob(self, data: DataProto): data = data.to('cuda') - micro_batch_size = self.config.ref.log_prob_micro_batch_size + micro_batch_size = self.config.ref.log_prob_micro_batch_size_per_gpu data.meta_info['micro_batch_size'] = micro_batch_size data.meta_info['temperature'] = self.config.rollout.temperature data.meta_info['max_token_len'] = self.config.ref.log_prob_max_token_len_per_gpu @@ -573,9 +575,13 @@ def __init__(self, config): # normalize config self.config.ppo_mini_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size) - self.config.ppo_micro_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size) - self.config.forward_micro_batch_size //= (torch.distributed.get_world_size() // + if self.config.ppo_micro_batch_size is not None: + 
self.config.ppo_micro_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size) + self.config.forward_micro_batch_size //= (torch.distributed.get_world_size() // + self.ulysses_sequence_parallel_size) + self.config.ppo_micro_batch_size_per_gpu = self.config.ppo_micro_batch_size + self.config.forward_micro_batch_size_per_gpu = self.config.forward_micro_batch_size def _build_critic_model_optimizer(self, config): # the following line is necessary @@ -724,7 +730,7 @@ def compute_values(self, data: DataProto): load_fsdp_param_and_grad(module=self.critic_module, device_id=torch.cuda.current_device(), load_grad=self._is_offload_grad) - micro_batch_size = self.config.forward_micro_batch_size + micro_batch_size = self.config.forward_micro_batch_size_per_gpu data.meta_info['micro_batch_size'] = micro_batch_size data.meta_info['max_token_len'] = self.config.forward_max_token_len_per_gpu data.meta_info['use_dynamic_bsz'] = self.config.use_dynamic_bsz @@ -838,7 +844,11 @@ def __init__(self, config): self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh) self.use_remove_padding = self.config.model.get('use_remove_padding', False) - self.config.micro_batch_size //= torch.distributed.get_world_size() + + # normalize config + if self.config.micro_batch_size is not None: + self.config.micro_batch_size //= torch.distributed.get_world_size() + self.config.micro_batch_size_per_gpu = self.config.micro_batch_size def _build_model(self, config): # the following line is necessary @@ -1054,7 +1064,7 @@ def compute_rm_score(self, data: DataProto): max_token_len = self.config.forward_max_token_len_per_gpu * self.ulysses_sequence_parallel_size micro_batches, indices = rearrange_micro_batches(batch=rm_data.batch, max_token_len=max_token_len) else: - micro_batches = rm_data.batch.split(self.config.micro_batch_size) + micro_batches = rm_data.batch.split(self.config.micro_batch_size_per_gpu) output = [] for micro_batch in micro_batches: rm_score = self._forward_micro_batch(micro_batch) diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 180a7761..d0ae638e 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -112,13 +112,19 @@ def __init__(self, config: DictConfig, role: str): # normalize config if self._is_actor and self._is_rollout: self.config.actor.ppo_mini_batch_size //= mpu.get_data_parallel_world_size() - self.config.actor.ppo_micro_batch_size //= mpu.get_data_parallel_world_size() - self.config.rollout.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size() + if self.config.actor.ppo_micro_batch_size is not None: + self.config.actor.ppo_micro_batch_size //= mpu.get_data_parallel_world_size() + self.config.rollout.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size() + self.config.actor.ppo_micro_batch_size_per_gpu = self.config.actor.ppo_micro_batch_size + self.config.rollout.log_prob_micro_batch_size_per_gpu = self.config.rollout.log_prob_micro_batch_size + self._is_offload_param = self.config.actor.get('param_offload', False) self._is_offload_grad = self.config.actor.get('grad_offload', False) self._is_offload_optimizer = self.config.actor.get('optimizer_offload', False) elif self._is_ref: - self.config.ref.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size() + if self.config.ref.ppo_micro_batch_size is not None: + self.config.ref.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size() + self.config.ref.ppo_micro_batch_size_per_gpu = 
self.config.ref.ppo_micro_batch_size self._is_offload_param = self.config.ref.get('param_offload', False) def _build_model_optimizer(self, @@ -361,7 +367,7 @@ def generate_sequences(self, prompts: DataProto): validate = prompts.meta_info.get('validate', False) if self._is_actor and not validate: # we should always recompute old_log_probs when it is HybridEngine - output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size + output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size_per_gpu output.meta_info['temperature'] = self.config.rollout.temperature old_log_probs = self.actor.compute_log_prob(data=output) output.batch['old_log_probs'] = old_log_probs @@ -380,7 +386,7 @@ def compute_ref_log_prob(self, data: DataProto): if self._is_offload_param: load_megatron_param_and_grad(self.ref_module, torch.cuda.current_device(), self._is_offload_grad) - micro_batch_size = self.config.rollout.log_prob_micro_batch_size + micro_batch_size = self.config.rollout.log_prob_micro_batch_size_per_gpu data.meta_info['micro_batch_size'] = micro_batch_size data.meta_info['temperature'] = self.config.rollout.temperature output = self.ref_policy.compute_log_prob(data=data) @@ -439,7 +445,9 @@ def __init__(self, config): # normalize config self.config.ppo_mini_batch_size //= mpu.get_data_parallel_world_size() - self.config.ppo_micro_batch_size //= mpu.get_data_parallel_world_size() + if self.config.ppo_micro_batch_size is not None: + self.config.ppo_micro_batch_size //= mpu.get_data_parallel_world_size() + self.config.ppo_micro_batch_size_per_gpu = self.config.ppo_micro_batch_size # TODO(sgm): support critic model offload @@ -609,7 +617,9 @@ def __init__(self, config): set_random_seed(seed=self.config.megatron.seed) # normalize config - self.config.micro_batch_size //= mpu.get_data_parallel_world_size() + if self.config.micro_batch_size is not None: + self.config.micro_batch_size //= mpu.get_data_parallel_world_size() + self.config.micro_batch_size_per_gpu = self.config.micro_batch_size def _build_rm_model(self, model_path, megatron_config: ModelParallelConfig, override_model_config): from megatron.core.models.gpt.gpt_model import ModelType diff --git a/verl/workers/reward_model/megatron/reward_model.py b/verl/workers/reward_model/megatron/reward_model.py index 2c4c1b60..1b58f42c 100644 --- a/verl/workers/reward_model/megatron/reward_model.py +++ b/verl/workers/reward_model/megatron/reward_model.py @@ -196,8 +196,8 @@ def forward_batch(self, data: DataProto): group=mpu.get_pipeline_model_parallel_group()) # split into micro-batches - if self.config is not None and 'ppo_micro_batch_size' in self.config: - infer_batch_size = self.config.ppo_micro_batch_size + if self.config is not None and 'ppo_micro_batch_size_per_gpu' in self.config: + infer_batch_size = self.config.ppo_micro_batch_size_per_gpu else: infer_batch_size = data.batch.batch_size[0]
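
Note: the recurring change in this patch is the move from a single global micro batch size to an explicit per-GPU value, with the old key kept only as a deprecated alias that is normalized by the data-parallel world size before being copied into the new *_per_gpu field. The sketch below is a minimal, self-contained Python illustration of that convention; the function and variable names are hypothetical (they are not verl APIs), and it deliberately ignores the rollout.n multiplier and the Ulysses sequence-parallel factor that the workers also fold in.

# batch_size_convention_sketch.py -- hypothetical illustration, not part of verl
def resolve_micro_bsz_per_gpu(micro_bsz, micro_bsz_per_gpu, dp_size, name):
    """Return the per-GPU micro batch size, enforcing the mutual-exclusivity rule."""
    if micro_bsz is None and micro_bsz_per_gpu is None:
        raise ValueError(f"[{name}] set either micro_batch_size (deprecated) "
                         f"or micro_batch_size_per_gpu")
    if micro_bsz is not None and micro_bsz_per_gpu is not None:
        raise ValueError(f"[{name}] micro_batch_size and micro_batch_size_per_gpu "
                         f"are mutually exclusive; keep only the *_per_gpu key")
    if micro_bsz is not None:
        # legacy global value: normalize by the data-parallel world size
        assert micro_bsz % dp_size == 0, f"[{name}] {micro_bsz} not divisible by dp={dp_size}"
        return micro_bsz // dp_size
    return micro_bsz_per_gpu


if __name__ == "__main__":
    dp_size = 8
    ppo_mini_batch_size = 256  # global mini batch, split across dp ranks
    per_gpu = resolve_micro_bsz_per_gpu(micro_bsz=64, micro_bsz_per_gpu=None,
                                        dp_size=dp_size, name="actor")
    # gradient accumulation mirrors the asserts in dp_actor.py / dp_critic.py:
    # per-rank mini batch divided by the per-GPU micro batch
    grad_accum = (ppo_mini_batch_size // dp_size) // per_gpu
    print(per_gpu, grad_accum)

Run on its own, the example prints 8 and 4: a deprecated global micro batch size of 64 spread over 8 data-parallel ranks becomes 8 samples per GPU, and a 256-sample mini batch then takes 4 gradient-accumulation steps per rank.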