3 changes: 3 additions & 0 deletions RecommenderSystems/dlrm/config.py
@@ -61,6 +61,9 @@ def int_list(x):
parser.add_argument("--batch_size", type=int, default=16384)
parser.add_argument("--batch_size_per_proc", type=int, default=None)
parser.add_argument("--learning_rate", type=float, default=1e-3)
parser.add_argument("--warmup_batches", type=int, default=2750)
parser.add_argument("--decay_batches", type=int, default=27772)
parser.add_argument("--decay_start", type=int, default=49315)
parser.add_argument("--vocab_size", type=int, default=1603616)
parser.add_argument("--embedding_vec_size", type=int, default=128)
parser.add_argument("--num_dense_fields", type=int, default=13)
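Taken together, the three new flags describe the intended learning-rate schedule: linear warmup over the first warmup_batches steps, the base learning_rate held until decay_start, then polynomial decay over decay_batches steps (the scheduler itself is added in lr_scheduler.py below). The sketch that follows is illustrative only and not part of the PR; it assumes the DECAY_START environment variable set in make_lr_scheduler delays the decay until decay_start, and the helper name sketch_lr is invented for this example.

# Illustrative sketch of the schedule shape the new defaults appear to describe.
def sketch_lr(step, base_lr=1e-3, warmup_batches=2750,
              decay_start=49315, decay_batches=27772, power=2.0):
    if step < warmup_batches:                  # linear warmup from 0 to base_lr
        return base_lr * step / warmup_batches
    if step < decay_start:                     # hold at the base rate
        return base_lr
    frac = 1.0 - min(step - decay_start, decay_batches) / decay_batches
    return base_lr * frac ** power             # polynomial (power 2) decay to 0

for s in (1000, 2750, 40000, 60000, 80000):
    print(s, sketch_lr(s))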
4 changes: 2 additions & 2 deletions RecommenderSystems/dlrm/graph.py
@@ -22,12 +22,12 @@ def build(self):


class DLRMTrainGraph(flow.nn.Graph):
    def __init__(self, wdl_module, dataloader, bce_loss, optimizer):
    def __init__(self, wdl_module, dataloader, bce_loss, optimizer, lr_scheduler=None):
        super(DLRMTrainGraph, self).__init__()
        self.module = wdl_module
        self.dataloader = dataloader
        self.bce_loss = bce_loss
        self.add_optimizer(optimizer)
        self.add_optimizer(optimizer, lr_sch=lr_scheduler)

    def build(self):
        (
92 changes: 92 additions & 0 deletions RecommenderSystems/dlrm/lr_scheduler.py
@@ -0,0 +1,92 @@
import os
import math
import oneflow as flow
from oneflow.nn.optimizer.lr_scheduler import LrScheduler


class PolynomialLR(LrScheduler):
    """This operator creates a polynomial decay learning rate scheduler.
    The learning rate will be updated as follows:

    If cycle is `True`, the equation is:

    .. math::

        & decay\\_batch = decay\\_batch*ceil(\\frac{current\\_batch}{decay\\_batch})

        & learning\\_rate = (base\\_lr-end\\_lr)*(1-\\frac{current\\_batch}{decay\\_batch})^{pow}+end\\_lr

    If cycle is `False`, the equation is:

    .. math::

        & decay\\_batch = min(decay\\_batch, current\\_batch)

        & learning\\_rate = (base\\_lr-end\\_lr)*(1-\\frac{current\\_batch}{decay\\_batch})^{pow}+end\\_lr

    Args:
        optimizer (Optimizer): the optimizer whose learning rate is scheduled
        steps (int): the number of decay steps
        end_learning_rate (float, optional): the final learning rate. Defaults to 0.0001.
        power (float, optional): the power of the polynomial. Defaults to 1.0.
        cycle (bool, optional): if True, the scheduler decays the learning rate every `steps` steps. Defaults to False.

    For example:

    .. code-block:: python

        import oneflow as flow

        ...
        polynomial_scheduler = PolynomialLR(
            optimizer, steps=5, end_learning_rate=0.00001, power=2
        )

        for epoch in range(num_epoch):
            train(...)
            polynomial_scheduler.step()
    """

    def __init__(
        self,
        optimizer,
        steps: int,
        end_learning_rate: float = 0.0001,
        power: float = 1.0,
        cycle: bool = False,
        last_step=-1,
        verbose=False,
    ):
        assert steps > 0, f"steps must be greater than zero, but got {steps}"
        self.max_decay_steps = steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle
        super().__init__(optimizer, last_step, verbose)

    def get_lr(self):
        decay_batch = self.max_decay_steps
        cur_batch = self.last_step
        if self.cycle:
            decay_batch = decay_batch * math.ceil(cur_batch / decay_batch)
        else:
            cur_batch = min(cur_batch, decay_batch)
        return [
            (base_lr - self.end_learning_rate)
            * ((1 - cur_batch / decay_batch) ** (self.power))
            + self.end_learning_rate
            for base_lr in self.base_lrs
        ]

    def _generate_conf_for_graph(self, opt_confs):
        # PolynomialLR maps to the polynomial decay conf when used in nn.Graph
        for opt_conf in opt_confs:
            learning_rate_decay_conf = opt_conf.mutable_learning_rate_decay()
            learning_rate_decay_conf.mutable_polynomial_conf().set_decay_batches(
                self.max_decay_steps
            )
            learning_rate_decay_conf.mutable_polynomial_conf().set_end_learning_rate(
                self.end_learning_rate
            )
            learning_rate_decay_conf.mutable_polynomial_conf().set_power(self.power)
            learning_rate_decay_conf.mutable_polynomial_conf().set_cycle(self.cycle)


def make_lr_scheduler(args, optimizer):
    warmup_batches = args.warmup_batches
    decay_batches = args.decay_batches

    os.environ["DECAY_START"] = f"{args.decay_start}"
    lr_scheduler = PolynomialLR(optimizer, steps=decay_batches, end_learning_rate=0.0, power=2.0)

    lr_scheduler = flow.optim.lr_scheduler.WarmUpLR(
        lr_scheduler, warmup_factor=0, warmup_iters=warmup_batches, warmup_method="linear"
    )
    return lr_scheduler
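For a quick sanity check, make_lr_scheduler can be exercised outside the training script. The snippet below is a minimal sketch and not part of the PR: a throwaway parameter and an argparse.Namespace stand in for the real model and parsed arguments, and it assumes the optimizer's param groups expose the current rate under "lr" in the PyTorch style.

import argparse
import oneflow as flow
from lr_scheduler import make_lr_scheduler

# Hypothetical smoke test: warmup_factor=0 means the schedule starts from lr 0.
args = argparse.Namespace(warmup_batches=2750, decay_batches=27772, decay_start=49315)
param = flow.nn.Parameter(flow.ones(1))
opt = flow.optim.SGD([param], lr=1e-3)
scheduler = make_lr_scheduler(args, opt)

for step in range(5):
    # a real loop would run forward/backward and opt.step() before this
    scheduler.step()
    print(step, opt.param_groups[0]["lr"])  # assumed PyTorch-style "lr" access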
6 changes: 1 addition & 5 deletions RecommenderSystems/dlrm/models/dlrm.py
@@ -106,11 +106,7 @@ def __init__(self, vocab_size, embed_size):
            # Can't change the embedding_size 128 because the kv store value_length has been set to 128
            "embedding_size": embed_size,
            "dtype": flow.float,
            "encoder": "invalid",
            "partitioning": "invalid",
            "initializer": "invalid",
            "optimizer": "invalid",
            "backend": "invalid",
            "embedding_options": '{"embedding_size": embed_size, "embedding_name":"EmbeddingTest"}',
        }
        super(OneEmbedding, self).__init__(options)

5 changes: 4 additions & 1 deletion RecommenderSystems/dlrm/train.py
@@ -10,6 +10,7 @@
from config import get_args
from models.data import make_data_loader
from models.dlrm import make_dlrm_module
from lr_scheduler import make_lr_scheduler
from oneflow.nn.parallel import DistributedDataParallel as DDP
from graph import DLRMValGraph, DLRMTrainGraph
import warnings
@@ -56,14 +57,16 @@ def __init__(self):
        self.opt = flow.optim.SGD(
            self.dlrm_module.parameters(), lr=args.learning_rate
        )
        self.lr_scheduler = make_lr_scheduler(args, self.opt)

        self.loss = flow.nn.BCELoss(reduction="none").to("cuda")
        if self.execution_mode == "graph":
            self.eval_graph = DLRMValGraph(
                self.dlrm_module, self.val_dataloader
            )
            self.train_graph = DLRMTrainGraph(
                self.dlrm_module, self.train_dataloader, self.loss, self.opt
                self.dlrm_module, self.train_dataloader, self.loss, self.opt,
                self.lr_scheduler
            )

    def init_model(self):
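Note that the hunk above only wires the scheduler into graph mode: DLRMTrainGraph consumes it through add_optimizer(..., lr_sch=...), so the learning rate advances inside the compiled graph. In eager mode the scheduler would have to be stepped by hand. A minimal sketch of what that loop might look like; train_one_step and args.max_iter are hypothetical names, and only self.opt and self.lr_scheduler come from the diff.

# Hypothetical eager-mode loop, not part of the PR.
for step in range(args.max_iter):    # assumed iteration-count argument
    loss = self.train_one_step()     # hypothetical forward/backward helper
    self.opt.step()
    self.opt.zero_grad()
    self.lr_scheduler.step()         # advance warmup / polynomial decay by one batch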