Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

#378 Feature/sg 281 add kd notebook

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-281-add_kd_notebook
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  1. import torch.optim as optim
  2. import torch.nn as nn
  3. from torch.nn.modules.batchnorm import _BatchNorm
  4. from torch.nn.modules.conv import _ConvNd
  5. from super_gradients.common.abstractions.abstract_logger import get_logger
  6. from super_gradients.common.factories.optimizers_type_factory import OptimizersTypeFactory
  7. from super_gradients.training.params import DEFAULT_OPTIMIZER_PARAMS_SGD, DEFAULT_OPTIMIZER_PARAMS_ADAM, \
  8. DEFAULT_OPTIMIZER_PARAMS_RMSPROP, DEFAULT_OPTIMIZER_PARAMS_RMSPROPTF
  9. from super_gradients.training.utils import get_param
  10. from super_gradients.training.utils.optimizers.rmsprop_tf import RMSpropTF
  11. logger = get_logger(__name__)
  12. OPTIMIZERS_DEFAULT_PARAMS = {optim.SGD: DEFAULT_OPTIMIZER_PARAMS_SGD,
  13. optim.Adam: DEFAULT_OPTIMIZER_PARAMS_ADAM,
  14. optim.RMSprop: DEFAULT_OPTIMIZER_PARAMS_RMSPROP,
  15. RMSpropTF: DEFAULT_OPTIMIZER_PARAMS_RMSPROPTF}
  16. def separate_zero_wd_params_groups_for_optimizer(module: nn.Module, net_named_params, weight_decay: float):
  17. """
  18. separate param groups for batchnorm and biases and others with weight decay. return list of param groups in format
  19. required by torch Optimizer classes.
  20. bias + BN with weight decay=0 and the rest with the given weight decay
  21. :param module: train net module.
  22. :param net_named_params: list of params groups, output of SgModule.initialize_param_groups
  23. :param weight_decay: value to set for the non BN and bias parameters
  24. """
  25. # FIXME - replace usage of ids addresses to find batchnorm and biases params.
  26. # This solution iterate 2 times over module parameters, find a way to iterate only one time.
  27. no_decay_ids = _get_no_decay_param_ids(module)
  28. # split param groups for optimizer
  29. optimizer_param_groups = []
  30. for param_group in net_named_params:
  31. no_decay_params = []
  32. decay_params = []
  33. for name, param in param_group["named_params"]:
  34. if id(param) in no_decay_ids:
  35. no_decay_params.append(param)
  36. else:
  37. decay_params.append(param)
  38. # append two param groups from the original param group, with and without weight decay.
  39. extra_optim_params = {key: param_group[key] for key in param_group
  40. if key not in ["named_params", "weight_decay"]}
  41. optimizer_param_groups.append({"params": no_decay_params, "weight_decay": 0.0, **extra_optim_params})
  42. optimizer_param_groups.append({"params": decay_params, "weight_decay": weight_decay, **extra_optim_params})
  43. return optimizer_param_groups
  44. def _get_no_decay_param_ids(module: nn.Module):
  45. # FIXME - replace usage of ids addresses to find batchnorm and biases params.
  46. # Use other common way to identify torch parameters other than id or layer names
  47. """
  48. Iterate over module.modules() and returns params id addresses of batch-norm and biases params.
  49. NOTE - ALL MODULES WITH ATTRIBUTES NAMED BIAS AND ARE INSTANCE OF nn.Parameter WILL BE CONSIDERED A BIAS PARAM FOR
  50. ZERO WEIGHT DECAY.
  51. """
  52. batchnorm_types = (_BatchNorm,)
  53. torch_weight_with_bias_types = (_ConvNd, nn.Linear)
  54. no_decay_ids = []
  55. for name, m in module.named_modules():
  56. if isinstance(m, batchnorm_types):
  57. no_decay_ids.append(id(m.weight))
  58. no_decay_ids.append(id(m.bias))
  59. elif hasattr(m, "bias") and isinstance(m.bias, nn.Parameter):
  60. if not isinstance(m, torch_weight_with_bias_types):
  61. logger.warning(f"Module class: {m.__class__}, have a `bias` parameter attribute but is not instance of"
  62. f" torch primitive modules, this bias parameter will be part of param group with zero"
  63. f" weight decay.")
  64. no_decay_ids.append(id(m.bias))
  65. return no_decay_ids
  66. def build_optimizer(net, lr, training_params):
  67. """
  68. Wrapper function for initializing the optimizer
  69. :param net: the nn_module to build the optimizer for
  70. :param lr: initial learning rate
  71. :param training_params: training_parameters
  72. """
  73. if isinstance(training_params.optimizer, str):
  74. optimizer_cls = OptimizersTypeFactory().get(training_params.optimizer)
  75. else:
  76. optimizer_cls = training_params.optimizer
  77. default_optimizer_params = OPTIMIZERS_DEFAULT_PARAMS[optimizer_cls] if optimizer_cls in OPTIMIZERS_DEFAULT_PARAMS else {}
  78. training_params.optimizer_params = get_param(training_params, 'optimizer_params', default_optimizer_params)
  79. weight_decay = get_param(training_params.optimizer_params, 'weight_decay', 0.)
  80. # OPTIMIZER PARAM GROUPS ARE SET USING DEFAULT OR MODEL SPECIFIC INIT
  81. if hasattr(net.module, 'initialize_param_groups'):
  82. # INITIALIZE_PARAM_GROUPS MUST RETURN A LIST OF DICTS WITH 'named_params' AND OPTIMIZER's ATTRIBUTES PER GROUP
  83. net_named_params = net.module.initialize_param_groups(lr, training_params)
  84. else:
  85. net_named_params = [{'named_params': net.named_parameters()}]
  86. if training_params.zero_weight_decay_on_bias_and_bn:
  87. optimizer_training_params = separate_zero_wd_params_groups_for_optimizer(
  88. net.module, net_named_params, weight_decay
  89. )
  90. else:
  91. # Overwrite groups to include params instead of named params
  92. for ind_group, param_group in enumerate(net_named_params):
  93. param_group['params'] = [param[1] for param in list(param_group['named_params'])]
  94. del param_group['named_params']
  95. net_named_params[ind_group] = param_group
  96. optimizer_training_params = net_named_params
  97. # CREATE AN OPTIMIZER OBJECT AND INITIALIZE IT
  98. optimizer = optimizer_cls(optimizer_training_params, lr=lr, **training_params.optimizer_params)
  99. return optimizer
Discard
Tip!

Press p to see the previous file, or n to see the next file