123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
- import torch
- from . import _functional as F
- from .optimizer import Optimizer, _maximize_doc
- __all__ = ['SparseAdam']
- class SparseAdam(Optimizer):
- def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, maximize: bool = False):
- if not 0.0 < lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
- if not 0.0 < eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
- if not 0.0 <= betas[0] < 1.0:
- raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
- if not 0.0 <= betas[1] < 1.0:
- raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
- params = list(params)
- sparse_params = []
- for index, param in enumerate(params):
- if isinstance(param, dict):
- # given param group, convert given params to a list first before iterating
- param['params'] = list(param.get("params", []))
- for d_index, d_param in enumerate(param['params']):
- if d_param.is_sparse:
- sparse_params.append([index, d_index])
- elif param.is_sparse:
- sparse_params.append(index)
- if sparse_params:
- raise ValueError(
- f"Sparse params at indices {sparse_params}: SparseAdam requires dense parameter tensors"
- )
- defaults = dict(lr=lr, betas=betas, eps=eps, maximize=maximize)
- super().__init__(params, defaults)
- @torch.no_grad()
- def step(self, closure=None):
- """Performs a single optimization step.
- Args:
- closure (Callable, optional): A closure that reevaluates the model
- and returns the loss.
- """
- loss = None
- if closure is not None:
- with torch.enable_grad():
- loss = closure()
- for group in self.param_groups:
- params_with_grad = []
- grads = []
- exp_avgs = []
- exp_avg_sqs = []
- state_steps = []
- eps = group['eps']
- lr = group['lr']
- beta1, beta2 = group['betas']
- maximize = group.get('maximize', False)
- for p in group['params']:
- if p.grad is not None:
- params_with_grad.append(p)
- if not p.grad.is_sparse:
- raise RuntimeError('SparseAdam does not support dense gradients, please consider Adam instead')
- grads.append(p.grad)
- state = self.state[p]
- # State initialization
- if len(state) == 0:
- state['step'] = 0
- # Exponential moving average of gradient values
- state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
- # Exponential moving average of squared gradient values
- state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
- exp_avgs.append(state['exp_avg'])
- exp_avg_sqs.append(state['exp_avg_sq'])
- # update the steps for each param group update
- state['step'] += 1
- # record the step after step update
- state_steps.append(state['step'])
- F.sparse_adam(params_with_grad,
- grads,
- exp_avgs,
- exp_avg_sqs,
- state_steps,
- beta1=beta1,
- beta2=beta2,
- lr=group['lr'],
- eps=group['eps'],
- maximize=maximize)
- return loss
- SparseAdam.__doc__ = r"""Implements lazy version of Adam algorithm suitable for sparse tensors.
- In this variant, only moments that show up in the gradient get updated, and
- only those portions of the gradient get applied to the parameters.
- Args:
- params (iterable): iterable of parameters to optimize or dicts defining
- parameter groups
- lr (float, optional): learning rate (default: 1e-3)
- betas (Tuple[float, float], optional): coefficients used for computing
- running averages of gradient and its square (default: (0.9, 0.999))
- eps (float, optional): term added to the denominator to improve
- numerical stability (default: 1e-8)
- {maximize}
- .. _Adam\: A Method for Stochastic Optimization:
- https://arxiv.org/abs/1412.6980
- """.format(maximize=_maximize_doc)
|