PyTorch

1、SGD

Stochastic gradient descent (SGD): at each parameter update, the gradient is computed from only a single sample (in practice, from a small mini-batch) rather than from the full dataset.
Algorithm:
[Figure: SGD (with momentum) update pseudocode]
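The figure is not reproduced here; as a stand-in, one update step of the rule that torch.optim.SGD (and the CustomSGD class below) applies can be written as follows, with learning rate $\eta$, momentum $\mu$, dampening $\tau$, and weight_decay $\lambda$ (the weight-decay and momentum lines apply only when those hyperparameters are non-zero):

    g_t \leftarrow \nabla_\theta \ell(\theta_{t-1})
    g_t \leftarrow g_t + \lambda\,\theta_{t-1}
    b_t \leftarrow \mu\, b_{t-1} + (1-\tau)\, g_t
    g_t \leftarrow g_t + \mu\, b_t \ \text{(Nesterov)} \quad\text{or}\quad g_t \leftarrow b_t
    \theta_t \leftarrow \theta_{t-1} - \eta\, g_t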
Function:

    class torch.optim.SGD(params, lr=<required>, momentum=0, dampening=0, weight_decay=0, nesterov=False)
    '''
    Parameters:
        params (iterable) – iterable of parameters to optimize, or a dict defining parameter groups
        lr (float) – learning rate
        momentum (float, optional) – momentum factor (default: 0)
        weight_decay (float, optional) – weight decay (L2 penalty) (default: 0)
        dampening (float, optional) – dampening for momentum (default: 0)
        nesterov (bool, optional) – enables Nesterov momentum (default: False)
    '''
    http://www.cs.toronto.edu/~hinton/absps/momentum.pdf
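A minimal usage sketch of the built-in optimizer; the linear model and random data below are made up for illustration:

    import torch
    import torch.nn as nn

    model = nn.Linear(10, 1)                          # toy model
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    x, y = torch.randn(32, 10), torch.randn(32, 1)
    for _ in range(100):
        optimizer.zero_grad()                         # clear the old gradients
        loss = nn.functional.mse_loss(model(x), y)
        loss.backward()                               # populate p.grad for every parameter
        optimizer.step()                              # apply the SGD update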

Custom optimizer base class:

    import warnings
    import functools
    from collections import defaultdict

    import torch


    class CustomOptimizer:
        def __init__(self, params, defaults):
            self.defaults = defaults
            # When a missing key is looked up in a defaultdict, it returns a
            # default value instead of raising KeyError; here defaultdict(dict)
            # returns an empty dict for unseen keys.
            self.state = defaultdict(dict)
            self.param_groups = []
            param_groups = list(params)
            if len(param_groups) == 0:
                raise ValueError('optimizer got an empty parameter list')
            if not isinstance(param_groups[0], dict):
                param_groups = [{'params': param_groups}]
            for param_group in param_groups:
                self.add_param_group(param_group)

        def add_param_group(self, param_group):
            """Put param_group into self.param_groups.

            param_group is a dict; its 'params' key holds the parameters,
            i.e. param_groups = list(params).
            """
            assert isinstance(param_group, dict), "param group must be a dict"
            params = param_group['params']
            if isinstance(params, torch.Tensor):
                param_group['params'] = [params]
            else:
                param_group['params'] = list(params)
            # Copy every key/value pair from self.defaults into param_group,
            # unless the group already overrides it.
            for name, default in self.defaults.items():
                param_group.setdefault(name, default)
            params = param_group['params']
            if len(params) != len(set(params)):
                warnings.warn("optimizer contains a parameter group with duplicate parameters; "
                              "in future, this will cause an error; ", stacklevel=3)
            # Check param_group against self.param_groups to make sure no
            # parameter appears in more than one group.
            param_set = set()
            for group in self.param_groups:
                param_set.update(set(group['params']))
            if not param_set.isdisjoint(set(param_group['params'])):
                raise ValueError("some parameters appear in more than one parameter group")
            # Store the dict param_group in the list self.param_groups.
            self.param_groups.append(param_group)

        def __setstate__(self, state):
            self.__dict__.update(state)

        def __getstate__(self):
            return {
                'defaults': self.defaults,
                'state': self.state,
                'param_groups': self.param_groups,
            }

        def step(self, closure):
            raise NotImplementedError

        def zero_grad(self):
            r"""Set the gradients of all parameters to zero."""
            for group in self.param_groups:
                for p in group['params']:
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad.zero_()

        def __repr__(self):
            format_string = self.__class__.__name__ + ' ('
            for i, group in enumerate(self.param_groups):
                format_string += '\n'
                format_string += 'Parameter Group {0}\n'.format(i)
                for key in sorted(group.keys()):
                    if key != 'params':
                        format_string += '    {0}: {1}\n'.format(key, group[key])
            format_string += ')'
            return format_string
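A quick sanity check of the base class (parameter-group handling, __repr__, and zero_grad), using a throwaway linear layer made up for illustration:

    import torch.nn as nn

    layer = nn.Linear(4, 2)
    # Two parameter groups: the weight uses the default lr, the bias overrides it.
    opt = CustomOptimizer(
        [{'params': [layer.weight]},
         {'params': [layer.bias], 'lr': 0.5}],
        defaults={'lr': 0.1},
    )
    print(opt)                              # __repr__ lists each group's hyperparameters
    layer(torch.randn(3, 4)).sum().backward()
    opt.zero_grad()                         # gradients are detached and zeroed in place
    print(layer.weight.grad.abs().sum())    # tensor(0.)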

Custom SGD optimizer:

    class CustomSGD(CustomOptimizer):
        def __init__(self, params, lr, momentum=0, dampening=0,
                     weight_decay=0, nesterov=False, maximize=False):
            # The hyperparameters are packed into a dict named defaults.
            defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                            weight_decay=weight_decay, nesterov=nesterov, maximize=maximize)
            super(CustomSGD, self).__init__(params, defaults)

        def __setstate__(self, state):
            super(CustomSGD, self).__setstate__(state)
            for group in self.param_groups:
                group.setdefault('nesterov', False)
                group.setdefault('maximize', False)

        @torch.no_grad()
        def step(self, closure=None):
            """Perform a single parameter update.

            Arguments:
                closure (callable, optional): A closure that reevaluates the model
                    and returns the loss.
            """
            loss = None
            if closure is not None:
                loss = closure()
            # self.param_groups was built in the parent class's __init__.
            for group in self.param_groups:
                weight_decay = group['weight_decay']
                momentum = group['momentum']
                dampening = group['dampening']
                nesterov = group['nesterov']
                maximize = group['maximize']
                for p in group['params']:
                    if p.grad is None:
                        continue
                    d_p = p.grad.data
                    if weight_decay != 0:
                        d_p.add_(p.data, alpha=weight_decay)
                    if momentum != 0:
                        param_state = self.state[p]
                        if 'momentum_buffer' not in param_state:
                            buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                        else:
                            buf = param_state['momentum_buffer']
                            buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                        if nesterov:
                            d_p = d_p.add(buf, alpha=momentum)
                        else:
                            d_p = buf
                    # Update the parameters.
                    if maximize:
                        p.data.add_(d_p, alpha=group['lr'])
                    else:
                        p.data.add_(d_p, alpha=-group['lr'])
            return loss
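A self-contained smoke test: fit a single scalar parameter with CustomSGD on a toy quadratic objective (purely illustrative):

    torch.manual_seed(0)
    w = torch.randn(1, requires_grad=True)
    opt = CustomSGD([w], lr=0.1, momentum=0.9)

    for _ in range(200):
        opt.zero_grad()
        loss = (w - 3.0).pow(2).sum()    # minimum at w = 3
        loss.backward()
        opt.step()
    print(w)                             # should be close to tensor([3.])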

Comparison:

    from torch.optim import SGD
    w0 = optim(model, optim_fn=SGD, lr=0.1, weight_decay=0.5)
    w0_custom = optim(model, optim_fn=CustomSGD, lr=0.1, weight_decay=0.5)
    plot([w0, w0_custom], titles=['SGD', 'CustomSGD'])
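The optim and plot helpers are defined earlier in the original article and are not shown in this section. A purely hypothetical stand-in, consistent with how optim is called here (train a copy of the model with the given optimizer class and record one weight over time), might look like this; the body assumes an nn.Linear-like model:

    import copy

    def optim(model, optim_fn, **hyperparams):
        # Hypothetical helper: assumes `model` is an nn.Linear-like module.
        m = copy.deepcopy(model)
        optimizer = optim_fn(m.parameters(), **hyperparams)
        torch.manual_seed(0)                          # same data for every optimizer
        x, y = torch.randn(64, m.in_features), torch.randn(64, 1)
        trajectory = []
        for _ in range(50):
            optimizer.zero_grad()
            torch.nn.functional.mse_loss(m(x), y).backward()
            optimizer.step()
            trajectory.append(m.weight.detach().flatten()[0].item())
        return trajectory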

[Figure: weight trajectories under SGD and CustomSGD]

2、Adam

Adam combines the Momentum and RMSProp algorithms: it accumulates gradients with momentum (a first-moment estimate) and rescales the step with a running average of squared gradients (a second-moment estimate), which speeds up convergence while damping oscillation, and it applies bias correction to both estimates.
Algorithm:
[Figure: Adam update pseudocode]
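As above, the algorithm figure is replaced here by the update formulas that the code below implements ($g_t$ is the gradient, $\eta$ the learning rate):

    m_t = \beta_1\, m_{t-1} + (1-\beta_1)\, g_t
    v_t = \beta_2\, v_{t-1} + (1-\beta_2)\, g_t^2
    \hat{m}_t = m_t / (1-\beta_1^t), \qquad \hat{v}_t = v_t / (1-\beta_2^t)
    \theta_t = \theta_{t-1} - \eta\, \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)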
Function:

    class torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
    '''
    Parameters:
        betas (Tuple[float, float], optional) – coefficients used for computing running averages of the gradient and its square (default: (0.9, 0.999))
        eps (float, optional) – term added to the denominator to improve numerical stability (default: 1e-8)
    '''
    https://arxiv.org/abs/1412.6980
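Like the other defaults, betas and eps can also be overridden per parameter group; a small illustrative sketch (the toy model is made up):

    model = torch.nn.Linear(10, 1)                        # toy model
    optimizer = torch.optim.Adam(
        [{'params': model.weight, 'betas': (0.8, 0.99)},  # faster-decaying first moment
         {'params': model.bias}],                         # keeps the defaults above
        lr=0.001, eps=1e-8)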

Custom Adam optimizer:

    import math


    class CustomAdam(CustomOptimizer):
        def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-08,
                     weight_decay=0, amsgrad=False, maximize=False):
            defaults = dict(lr=lr, betas=betas, eps=eps,
                            weight_decay=weight_decay, amsgrad=amsgrad, maximize=maximize)
            super(CustomAdam, self).__init__(params, defaults)

        def __setstate__(self, state):
            super(CustomAdam, self).__setstate__(state)
            for group in self.param_groups:
                group.setdefault('amsgrad', False)
                group.setdefault('maximize', False)

        @torch.no_grad()
        def step(self, closure=None):
            loss = None
            if closure is not None:
                loss = closure()
            for group in self.param_groups:
                for p in group['params']:
                    if p.grad is None:
                        continue
                    if group['maximize']:
                        grad = -p.grad.data
                    else:
                        grad = p.grad.data
                    if group['weight_decay'] != 0:
                        grad.add_(p.data, alpha=group['weight_decay'])
                    # The Adam optimizer only handles dense gradients; to handle
                    # sparse gradients, use the SparseAdam optimizer instead.
                    if grad.is_sparse:
                        raise RuntimeError('Adam does not support sparse gradients, '
                                           'please consider SparseAdam instead')
                    amsgrad = group['amsgrad']
                    state = self.state[p]
                    # State initialization.
                    if len(state) == 0:
                        state['step'] = 0
                        # Exponential moving average of the gradient values.
                        state['exp_avg'] = torch.zeros_like(p.data)
                        # Exponential moving average of the squared gradient values.
                        state['exp_avg_sq'] = torch.zeros_like(p.data)
                        if amsgrad:
                            # Keeps the maximum of all squared-gradient moving averages.
                            state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                    exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                    if amsgrad:
                        max_exp_avg_sq = state['max_exp_avg_sq']
                    beta1, beta2 = group['betas']
                    state['step'] += 1
                    bias_correction1 = 1 - beta1 ** state['step']
                    bias_correction2 = 1 - beta2 ** state['step']
                    # Decay the first and second moment running average coefficient.
                    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                    if amsgrad:
                        # Maintains the maximum of all 2nd moment running avg. till now.
                        torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                        # Use the max. for normalizing the running avg. of the gradient.
                        denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                    else:
                        denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                    step_size = group['lr'] / bias_correction1
                    p.data.addcdiv_(exp_avg, denom, value=-step_size)
            return loss
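One quick way to check the implementation is to take a single step with CustomAdam and with torch.optim.Adam on identical parameters and gradients and compare the results; under these assumptions the two updates should agree up to floating-point tolerance:

    p_ref = torch.tensor([1.0, 2.0], requires_grad=True)
    p_custom = p_ref.detach().clone().requires_grad_(True)

    for opt, p in [(torch.optim.Adam([p_ref], lr=0.1), p_ref),
                   (CustomAdam([p_custom], lr=0.1), p_custom)]:
        opt.zero_grad()
        (p ** 2).sum().backward()
        opt.step()

    print(torch.allclose(p_ref, p_custom))    # expected: True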

Comparison:

    from torch.optim import Adam
    w0 = optim(model, optim_fn=Adam, lr=0.1, weight_decay=0.5)
    w0_custom = optim(model, optim_fn=CustomAdam, lr=0.1, weight_decay=0.5)
    plot([w0, w0_custom], titles=['Adam', 'CustomAdam'])

[Figure: weight trajectories under Adam and CustomAdam]