Commit

update

sanjay920 committed Nov 24, 2024
1 parent d4d7ec5 commit d2a7f2d
Showing 30 changed files with 9,866 additions and 3 deletions.
45 changes: 45 additions & 0 deletions configs/llama3.2-1b_4_blocks_coral.yaml
@@ -0,0 +1,45 @@
### model
# model_name_or_path: models/Llama-3.2-1B-Instruct-pro-4
model_name_or_path: saves/llama3.2-1b-pro/4_blocks_coral_v1

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: coral
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama3.2-1b-pro/4_blocks_coral_v1
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 12
gradient_accumulation_steps: 3
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 5.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
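
Note: the keys above (stage, finetuning_type: freeze, freeze_trainable_layers, use_llama_pro, template, cutoff_len, and so on) match the LLaMA-Factory training-config schema, so these YAML files are presumably launched through its CLI. A minimal launch sketch, assuming LLaMA-Factory is installed and the committed config path is used; the wrapper itself is not part of this commit:

    import subprocess

    # Launch the freeze/LLaMA-Pro SFT run described by the config above.
    # Assumption: the `llamafactory-cli` entry point installed by LLaMA-Factory
    # is on PATH; only its `train <config>` subcommand is used here.
    config_path = "configs/llama3.2-1b_4_blocks_coral.yaml"
    subprocess.run(["llamafactory-cli", "train", config_path], check=True)
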
44 changes: 44 additions & 0 deletions configs/llama3.2-1b_4_blocks_coral_pretain.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/Llama-3.2-1B-pro-4

### method
stage: pt
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: pretrain_coral
template: llama3
# cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama32-1b-base-pro/4_blocks_pretrain_experiment_v1
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
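
Note: model_name_or_path here points at models/Llama-3.2-1B-pro-4, a block-expanded ("LLaMA Pro") checkpoint that is not included in this commit. Assuming it was produced with LLaMA-Factory's scripts/llama_pro.py (the usual companion to use_llama_pro: true), the expansion step would look roughly like the sketch below; the base model id and the exact invocation are assumptions, with the number of added blocks chosen to match freeze_trainable_layers: 4.

    import subprocess

    # Sketch of the block-expansion step that would create the *-pro-4 checkpoint.
    # Assumptions: LLaMA-Factory's scripts/llama_pro.py is used and the base model
    # is the upstream Llama-3.2-1B; the real command is not recorded in this commit.
    subprocess.run(
        [
            "python", "scripts/llama_pro.py",
            "--model_name_or_path", "meta-llama/Llama-3.2-1B",
            "--output_dir", "models/Llama-3.2-1B-pro-4",
            "--num_expand", "4",  # four new blocks, trained via freeze_trainable_layers: 4
        ],
        check=True,
    )
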
44 changes: 44 additions & 0 deletions configs/llama3.2-1b_4_blocks_coral_sft.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: saves/llama32-1b-base-pro/4_blocks_pretrain_experiment_v1

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: smoltalk, coral, mmlu_pro_training, longwriter
template: llama3
# cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama32-1b-base-pro/4_blocks_sft_experiment_v1
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 8
gradient_accumulation_steps: 4
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 2.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_coral.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: coral
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_coral_v2
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 12
gradient_accumulation_steps: 3
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_longhorn_example.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: longhorn_sample
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_longhorn
logging_steps: 10
save_steps: 1500000000000000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 80.0
lr_scheduler_type: constant
# warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_longwriter.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: longwriter
template: llama3
cutoff_len: 400000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_longwriter_v1
logging_steps: 10
save_steps: 15000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 20.0
lr_scheduler_type: cosine_with_restarts
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_math.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: mathinstruct
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_math_v2
logging_steps: 10
save_steps: 15000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 6
gradient_accumulation_steps: 8
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 2.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_pretrain_experiment.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: pt
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: pretrain_experiment_elections
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_pretrain_experiment_v1
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500