I have been fine-tuning distilgpt2 from the HuggingFace Transformers project. When I call trainer.train(), smdebug ends up calling os.environ.get() somewhere and I get the above error. There are no other messages. The traceback points to this line: /smdebug/core/logger.py", line 51, in get_logger. It happens whether or not I set debugger_hook_config=False.
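For context, the job is launched as a SageMaker training job. The snippet below is only a sketch of how I start it, to show where debugger_hook_config=False gets passed; the role, instance type, framework versions, S3 paths, and entry-point filename are placeholders, not my exact configuration.

# Hypothetical launcher sketch -- all values here are placeholders.
import sagemaker
from sagemaker.huggingface import HuggingFace

estimator = HuggingFace(
    entry_point="train.py",              # the script pasted below (assumed filename)
    role=sagemaker.get_execution_role(),
    instance_type="ml.p3.2xlarge",       # placeholder instance type
    instance_count=1,
    transformers_version="4.6",          # placeholder framework versions
    pytorch_version="1.7",
    py_version="py36",
    debugger_hook_config=False,          # the error occurs with or without this
)
estimator.fit({
    "training": "s3://my-bucket/train",      # placeholder channels matching
    "validation": "s3://my-bucket/valid",    # SM_CHANNEL_TRAINING / SM_CHANNEL_VALIDATION
})

The full training script is below.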
from datasets import load_dataset
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
import argparse
import json
import logging
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
import math
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
#logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)
#logger.addHandler(logging.StreamHandler(sys.stdout))
#logging.basicConfig(
# format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
# datefmt="%m/%d/%Y %H:%M:%S",
# level=logging.WARN,
# )
def group_texts(examples):
    # Concatenate all texts.
    block_size = 64
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model supported it.
    # You can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
def train(args):
    train_file = os.path.join(args.train_dir, "new_train.txt")
    valid_file = os.path.join(args.valid_dir, "new_valid.txt")
    model_checkpoint = "distilgpt2"

    #datasets = load_dataset('text', data_files={'train': train_file, 'test': valid_file, 'valid': valid_file})
    datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, max_length=64)

    tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])

    block_size = 64
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=2,
    )

    model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

    training_args = TrainingArguments(
        "robs-chesterton-results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["validation"],
    )

    print('we made it this far')
    train_result = trainer.train()
    #trainer.save_model(args.model_dir)
    #return trainer, model, train_result
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Data and model checkpoints directories
    parser.add_argument(
        "--num_labels", type=int, default=2, metavar="N", help="number of labels (default: 2)"
    )
    parser.add_argument(
        "--batch-size", type=int, default=4, metavar="N", help="input batch size for training (default: 4)"
    )
    parser.add_argument(
        "--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)"
    )
    parser.add_argument("--epochs", type=int, default=10, metavar="N", help="number of epochs to train (default: 10)")
    parser.add_argument("--lr", type=float, default=0.01, metavar="LR", help="learning rate (default: 0.01)")
    parser.add_argument("--momentum", type=float, default=0.5, metavar="M", help="SGD momentum (default: 0.5)")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=1,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default=None,
        help="backend for distributed training (tcp, gloo on cpu and gloo, nccl on gpu)",
    )

    # Container environment
    parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
    parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--train-dir", type=str, default=os.environ["SM_CHANNEL_TRAINING"])
    parser.add_argument("--valid-dir", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
    parser.add_argument("--num-gpus", type=int, default=os.environ["SM_NUM_GPUS"])

    #trainer.save_model()  # Saves the tokenizer too for easy upload
    #output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
    #model_2_save = model.module if hasattr(model, "module") else model
    #model_2_save.save_pretrained(save_directory=args.model_dir)

    train(parser.parse_args())