Commit

update

sanjay920 committed Nov 24, 2024
1 parent d4d7ec5 commit d2a7f2d
Showing 30 changed files with 9,866 additions and 3 deletions.
45 changes: 45 additions & 0 deletions configs/llama3.2-1b_4_blocks_coral.yaml
@@ -0,0 +1,45 @@
### model
# model_name_or_path: models/Llama-3.2-1B-Instruct-pro-4
model_name_or_path: saves/llama3.2-1b-pro/4_blocks_coral_v1

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: coral
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama3.2-1b-pro/4_blocks_coral_v1
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 12
gradient_accumulation_steps: 3
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 5.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
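
Note: the keys above (stage, finetuning_type: freeze, freeze_trainable_layers, use_llama_pro, template, cutoff_len, and so on) match the LLaMA-Factory training-config schema, so these YAML files are presumably launched through its CLI. A minimal launch sketch, assuming LLaMA-Factory is installed and the committed config path is used; the wrapper itself is not part of this commit:

    import subprocess

    # Launch the freeze/LLaMA-Pro SFT run described by the config above.
    # Assumption: the `llamafactory-cli` entry point installed by LLaMA-Factory
    # is on PATH; only its `train <config>` subcommand is used here.
    config_path = "configs/llama3.2-1b_4_blocks_coral.yaml"
    subprocess.run(["llamafactory-cli", "train", config_path], check=True)
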
44 changes: 44 additions & 0 deletions configs/llama3.2-1b_4_blocks_coral_pretain.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/Llama-3.2-1B-pro-4

### method
stage: pt
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: pretrain_coral
template: llama3
# cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama32-1b-base-pro/4_blocks_pretrain_experiment_v1
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
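
Note: model_name_or_path here points at models/Llama-3.2-1B-pro-4, a block-expanded ("LLaMA Pro") checkpoint that is not included in this commit. Assuming it was produced with LLaMA-Factory's scripts/llama_pro.py (the usual companion to use_llama_pro: true), the expansion step would look roughly like the sketch below; the base model id and the exact invocation are assumptions, with the number of added blocks chosen to match freeze_trainable_layers: 4.

    import subprocess

    # Sketch of the block-expansion step that would create the *-pro-4 checkpoint.
    # Assumptions: LLaMA-Factory's scripts/llama_pro.py is used and the base model
    # is the upstream Llama-3.2-1B; the real command is not recorded in this commit.
    subprocess.run(
        [
            "python", "scripts/llama_pro.py",
            "--model_name_or_path", "meta-llama/Llama-3.2-1B",
            "--output_dir", "models/Llama-3.2-1B-pro-4",
            "--num_expand", "4",  # four new blocks, trained via freeze_trainable_layers: 4
        ],
        check=True,
    )
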
44 changes: 44 additions & 0 deletions configs/llama3.2-1b_4_blocks_coral_sft.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: saves/llama32-1b-base-pro/4_blocks_pretrain_experiment_v1

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: smoltalk, coral, mmlu_pro_training, longwriter
template: llama3
# cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama32-1b-base-pro/4_blocks_sft_experiment_v1
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 8
gradient_accumulation_steps: 4
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 2.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_coral.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: coral
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_coral_v2
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 12
gradient_accumulation_steps: 3
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_longhorn_example.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: longhorn_sample
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_longhorn
logging_steps: 10
save_steps: 1500000000000000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 80.0
lr_scheduler_type: constant
# warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_longwriter.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: longwriter
template: llama3
cutoff_len: 400000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_longwriter_v1
logging_steps: 10
save_steps: 15000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 20.0
lr_scheduler_type: cosine_with_restarts
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_math.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: mathinstruct
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_math_v2
logging_steps: 10
save_steps: 15000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 6
gradient_accumulation_steps: 8
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 2.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
44 changes: 44 additions & 0 deletions configs/llama31_4_blocks_pretrain_experiment.yaml
@@ -0,0 +1,44 @@
### model
model_name_or_path: models/llama3.1-8b-pro

### method
stage: pt
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 4
freeze_trainable_modules: all
use_llama_pro: true
use_liger_kernel: true
flash_attn: fa2

### dataset
dataset: pretrain_experiment_elections
template: llama3
cutoff_len: 40000
max_samples: 1000000000
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama31-8b-pro/4_blocks_pretrain_experiment_v1
logging_steps: 10
save_steps: 40000
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 0.00002
max_grad_norm: 3.0
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500