Fp8 integration #1086

Merged · 34 commits · Mar 7, 2023
Commits (34)
68b98fc
Draft of FP8 support
sgugger Dec 7, 2022
f10a4b0
Missing import
sgugger Dec 8, 2022
23b0e04
Fix names
sgugger Dec 9, 2022
12ac9bf
Conversion is inplace
sgugger Dec 9, 2022
ab878ac
Enable fp8 in examples
sgugger Dec 9, 2022
b5578ea
Customization point for Recipe
sgugger Dec 9, 2022
b818aee
Auto-enable FP8 depending on compute capability
sgugger Dec 12, 2022
9e7157e
Fix typo
sgugger Dec 12, 2022
a1a86d1
Put back mixed precision arg
sgugger Dec 12, 2022
94d7101
Add debug script
sgugger Dec 12, 2022
13deee8
Add more tests in debug
sgugger Dec 12, 2022
097d3eb
Add more stuff to debug
sgugger Dec 12, 2022
23d188f
Don't forget train
sgugger Dec 12, 2022
4656b23
Put the train in the right place
sgugger Dec 12, 2022
17d0d38
Add options for selective conversion
sgugger Dec 13, 2022
aa3c0c2
Fix typo
sgugger Dec 13, 2022
8c69cd4
Properly recurse
sgugger Dec 13, 2022
4df14d0
Add more debug utils
sgugger Dec 13, 2022
978ed82
Typo and init
sgugger Dec 14, 2022
a2efc93
Last choice
sgugger Dec 14, 2022
583c5c5
More fixes
sgugger Dec 14, 2022
cb2b2f1
More options in example
sgugger Dec 14, 2022
7deee02
Remove debug scripts
sgugger Jan 26, 2023
e3e2ee9
Clean up debug and new names
sgugger Jan 26, 2023
659e1dc
Add torch.no_grad for conversion
sgugger Jan 27, 2023
9c715b0
Optimizer is disconnected from model?
sgugger Jan 27, 2023
b3f997f
Re-attach model parameters to optimizer
sgugger Jan 27, 2023
2787834
Fix extract
sgugger Jan 27, 2023
e2070df
Style
sgugger Jan 27, 2023
6f21320
Cleanup post-rebase
sgugger Feb 15, 2023
063d33e
Deal with padding
sgugger Feb 15, 2023
9f6b40f
fix examples
sgugger Feb 15, 2023
73e06ac
Update src/accelerate/accelerator.py
sgugger Mar 2, 2023
10f879a
Address comments
sgugger Mar 2, 2023
22 changes: 16 additions & 6 deletions examples/by_feature/automatic_gradient_accumulation.py
@@ -22,7 +22,7 @@
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType
from accelerate import Accelerator
from accelerate.utils import find_executable_batch_size


@@ -84,10 +84,20 @@ def tokenize_function(examples):
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -215,7 +225,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
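
The same padding rule is repeated in every `collate_fn` touched by this PR; pulled out as a standalone helper, it reads as below. `pick_pad_multiple` is a hypothetical name (the examples inline the logic), and the alignment rationale in the comments is the usual tensor-core argument, not something the PR states explicitly.

```python
from typing import Optional


def pick_pad_multiple(mixed_precision: str) -> Optional[int]:
    """Return the pad_to_multiple_of value used by the collate_fn changes above."""
    if mixed_precision == "fp8":
        return 16  # FP8 kernels generally want dimensions divisible by 16
    if mixed_precision != "no":
        return 8   # fp16/bf16 tensor cores are fastest on multiples of 8
    return None    # full precision: just pad to the longest sequence in the batch


# Usage inside a collate_fn (tokenizer being a Hugging Face tokenizer):
# batch = tokenizer.pad(
#     examples,
#     padding="longest",
#     pad_to_multiple_of=pick_pad_multiple(accelerator.mixed_precision),
#     return_tensors="pt",
# )
```
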
21 changes: 17 additions & 4 deletions examples/by_feature/checkpointing.py
@@ -86,9 +86,22 @@ def tokenize_function(examples):

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -269,7 +282,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
21 changes: 17 additions & 4 deletions examples/by_feature/cross_validation.py
@@ -106,9 +106,22 @@ def tokenize_function(examples):

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -251,7 +264,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
21 changes: 17 additions & 4 deletions examples/by_feature/fsdp_with_peak_mem_tracking.py
@@ -147,9 +147,22 @@ def tokenize_function(examples):

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -331,7 +344,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
21 changes: 17 additions & 4 deletions examples/by_feature/gradient_accumulation.py
@@ -81,9 +81,22 @@ def tokenize_function(examples):

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -193,7 +206,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
21 changes: 17 additions & 4 deletions examples/by_feature/memory.py
@@ -86,9 +86,22 @@ def tokenize_function(examples):

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -205,7 +218,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
21 changes: 17 additions & 4 deletions examples/by_feature/multi_process_metrics.py
@@ -88,9 +88,22 @@ def tokenize_function(examples):

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -210,7 +223,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
21 changes: 17 additions & 4 deletions examples/by_feature/tracking.py
@@ -86,9 +86,22 @@ def tokenize_function(examples):

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -237,7 +250,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
2 changes: 1 addition & 1 deletion examples/complete_cv_example.py
@@ -272,7 +272,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
21 changes: 17 additions & 4 deletions examples/complete_nlp_example.py
@@ -109,9 +109,22 @@ def tokenize_function(examples):

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -251,7 +264,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
2 changes: 1 addition & 1 deletion examples/cv_example.py
@@ -190,7 +190,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
23 changes: 18 additions & 5 deletions examples/nlp_example.py
@@ -79,9 +79,22 @@ def tokenize_function(examples):

def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
# When using mixed precision we want round multiples of 8/16
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None

return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)

# Instantiate dataloaders.
train_dataloader = DataLoader(
@@ -120,7 +133,6 @@ def training_function(config, args):
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
model = model.to(accelerator.device)

# Instantiate optimizer
optimizer = AdamW(params=model.parameters(), lr=lr)

@@ -134,6 +146,7 @@ def training_function(config, args):
# Prepare everything
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
# prepare method.

model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
@@ -177,7 +190,7 @@ def main():
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
choices=["no", "fp16", "bf16", "fp8"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",