diff --git a/torchbenchmark/e2e_models/hf_bert/__init__.py b/torchbenchmark/e2e_models/hf_bert/__init__.py
index 77860a11b0..3fe6da001f 100644
--- a/torchbenchmark/e2e_models/hf_bert/__init__.py
+++ b/torchbenchmark/e2e_models/hf_bert/__init__.py
@@ -9,6 +9,7 @@
 from accelerate.utils.dataclasses import DeepSpeedPlugin
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.optim import AdamW
 from torch.utils.data import DataLoader
 from torchbenchmark.tasks import NLP
 from torchbenchmark.util.e2emodel import E2EBenchmarkModel
@@ -22,7 +23,6 @@
     preprocess_dataset,
 )
 from transformers import (
-    AdamW,
     AutoConfig,
     AutoModelForSequenceClassification,
     AutoTokenizer,
@@ -172,7 +172,7 @@ def prep(self, hf_args):
         # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
         # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
         self.data_collator = DataCollatorWithPadding(
-            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
+            tokenizer, pad_to_multiple_of=(8 if accelerator.mixed_precision == "fp16" else None)
         )
 
         train_dataloader = DataLoader(
diff --git a/torchbenchmark/e2e_models/hf_t5/__init__.py b/torchbenchmark/e2e_models/hf_t5/__init__.py
index 30654717ee..c1e9dcc32a 100644
--- a/torchbenchmark/e2e_models/hf_t5/__init__.py
+++ b/torchbenchmark/e2e_models/hf_t5/__init__.py
@@ -256,7 +256,7 @@ def prep(self, hf_args):
             tokenizer,
             model=model,
             label_pad_token_id=label_pad_token_id,
-            pad_to_multiple_of=8 if accelerator.use_fp16 else None,
+            pad_to_multiple_of=8 if accelerator.mixed_precision == "fp16" else None,
         )
 
         train_dataloader = DataLoader(
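
The patch migrates hf_bert and hf_t5 off two deprecated APIs: the AdamW optimizer shipped with transformers (replaced by torch.optim.AdamW) and the Accelerator.use_fp16 flag (replaced by comparing Accelerator.mixed_precision against "fp16"). Below is a minimal sketch of the updated usage, not the benchmark code itself; the bert-base-cased checkpoint, num_labels, and learning rate are illustrative assumptions.

from accelerate import Accelerator
from torch.optim import AdamW
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
)

# Pass mixed_precision="fp16" (requires a CUDA device) to enable fp16 training;
# the default resolves to "no" unless configured otherwise.
accelerator = Accelerator()

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

# Optimizer now comes from torch.optim instead of the deprecated transformers.AdamW.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Pad to a multiple of 8 only when fp16 mixed precision is active, mirroring the
# accelerator.mixed_precision == "fp16" check introduced by the diff.
data_collator = DataCollatorWithPadding(
    tokenizer,
    pad_to_multiple_of=8 if accelerator.mixed_precision == "fp16" else None,
)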