#378 Feature/sg 281 add kd notebook

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-281-add_kd_notebook
  
    
        
          
1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

          
            import torch.optim as optim
import torch.nn as nn
from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn.modules.conv import _ConvNd
from super_gradients.common.abstractions.abstract_logger import get_logger
from super_gradients.common.factories.optimizers_type_factory import OptimizersTypeFactory
from super_gradients.training.params import DEFAULT_OPTIMIZER_PARAMS_SGD, DEFAULT_OPTIMIZER_PARAMS_ADAM, \
    DEFAULT_OPTIMIZER_PARAMS_RMSPROP, DEFAULT_OPTIMIZER_PARAMS_RMSPROPTF
from super_gradients.training.utils import get_param
from super_gradients.training.utils.optimizers.rmsprop_tf import RMSpropTF
# Module-level logger, named after this module.
logger = get_logger(__name__)

# Default optimizer hyper-parameters keyed by optimizer class. build_optimizer looks the
# optimizer class up here and passes the result to get_param as the fallback value for
# 'optimizer_params'; classes missing from this table fall back to an empty dict.
OPTIMIZERS_DEFAULT_PARAMS = {
    optim.SGD: DEFAULT_OPTIMIZER_PARAMS_SGD,
    optim.Adam: DEFAULT_OPTIMIZER_PARAMS_ADAM,
    optim.RMSprop: DEFAULT_OPTIMIZER_PARAMS_RMSPROP,
    RMSpropTF: DEFAULT_OPTIMIZER_PARAMS_RMSPROPTF,
}
def separate_zero_wd_params_groups_for_optimizer(module: nn.Module, net_named_params, weight_decay: float):
    """
    Split every param group into a zero-weight-decay group and a regular weight-decay group.

    Parameters whose id is reported by _get_no_decay_param_ids (batch-norm params and biases) are placed in a group
    with weight_decay=0.0; all remaining parameters are placed in a group with the given weight decay. Every input
    group yields exactly two output groups (either of them may be empty), in the format required by torch
    Optimizer classes.

        :param module: train net module.
        :param net_named_params: list of params groups, output of SgModule.initialize_param_groups. Each group is a
            dict with a "named_params" iterable of (name, param) pairs; any other keys are copied as-is to both
            output groups, except "weight_decay" which is ignored and replaced.
        :param weight_decay: value to set for the non BN and bias parameters
        :return: list of dicts with a "params" list and a "weight_decay" value; for each input group the no-decay
            group comes first, followed by the decay group.
    """
    # FIXME - replace usage of ids addresses to find batchnorm and biases params.
    #  This solution iterate 2 times over module parameters, find a way to iterate only one time.
    # Materialize the ids as a set: the membership test below runs once per parameter, and testing against a
    # list would make the whole split quadratic in the number of parameters.
    no_decay_ids = set(_get_no_decay_param_ids(module))

    # split param groups for optimizer
    optimizer_param_groups = []
    for param_group in net_named_params:
        no_decay_params = []
        decay_params = []
        for _, param in param_group["named_params"]:
            if id(param) in no_decay_ids:
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        # Per-group optimizer attributes (everything except the params themselves and the weight decay, which is
        # set explicitly below) are carried over to both resulting groups.
        extra_optim_params = {key: value for key, value in param_group.items()
                              if key not in ("named_params", "weight_decay")}

        # append two param groups from the original param group, with and without weight decay.
        optimizer_param_groups.append({"params": no_decay_params, "weight_decay": 0.0, **extra_optim_params})
        optimizer_param_groups.append({"params": decay_params, "weight_decay": weight_decay, **extra_optim_params})
    return optimizer_param_groups
def _get_no_decay_param_ids(module: nn.Module):
    # FIXME - replace usage of ids addresses to find batchnorm and biases params.
    #  Use other common way to identify torch parameters other than id or layer names
    """
    Iterate over module.modules() and returns params id addresses of batch-norm and biases params.
    NOTE - ALL MODULES WITH ATTRIBUTES NAMED BIAS AND ARE INSTANCE OF nn.Parameter WILL BE CONSIDERED A BIAS PARAM FOR
        ZERO WEIGHT DECAY.
    """
    batchnorm_types = (_BatchNorm,)
    torch_weight_with_bias_types = (_ConvNd, nn.Linear)
    no_decay_ids = []
    for name, m in module.named_modules():
        if isinstance(m, batchnorm_types):
            no_decay_ids.append(id(m.weight))
            no_decay_ids.append(id(m.bias))
        elif hasattr(m, "bias") and isinstance(m.bias, nn.Parameter):
            if not isinstance(m, torch_weight_with_bias_types):
                logger.warning(f"Module class: {m.__class__}, have a `bias` parameter attribute but is not instance of"
                               f" torch primitive modules, this bias parameter will be part of param group with zero"
                               f" weight decay.")
            no_decay_ids.append(id(m.bias))
    return no_decay_ids
def build_optimizer(net, lr, training_params):
    """
    Create and initialize the optimizer for the given network.

        :param net: the nn_module to build the optimizer for. Its `module` attribute is accessed, so it is
            presumably a wrapped network (e.g. a DataParallel-style wrapper) - TODO confirm against callers.
        :param lr: initial learning rate
        :param training_params: training_parameters. `optimizer` (a class or a string resolved through
            OptimizersTypeFactory) and `zero_weight_decay_on_bias_and_bn` are read; `optimizer_params` is
            (re)assigned on this object.
        :return: the instantiated optimizer.
    """
    # Resolve the optimizer class: strings go through the factory, anything else is used as given.
    optimizer_cls = training_params.optimizer
    if isinstance(optimizer_cls, str):
        optimizer_cls = OptimizersTypeFactory().get(optimizer_cls)

    # Per-class defaults act only as the fallback handed to get_param.
    fallback_params = OPTIMIZERS_DEFAULT_PARAMS.get(optimizer_cls, {})
    training_params.optimizer_params = get_param(training_params, 'optimizer_params', fallback_params)
    weight_decay = get_param(training_params.optimizer_params, 'weight_decay', 0.)

    # OPTIMIZER PARAM GROUPS ARE SET USING DEFAULT OR MODEL SPECIFIC INIT
    wrapped_net = net.module
    if hasattr(wrapped_net, 'initialize_param_groups'):
        # INITIALIZE_PARAM_GROUPS MUST RETURN A LIST OF DICTS WITH 'named_params' AND OPTIMIZER's ATTRIBUTES PER GROUP
        param_groups = wrapped_net.initialize_param_groups(lr, training_params)
    else:
        param_groups = [{'named_params': net.named_parameters()}]

    if training_params.zero_weight_decay_on_bias_and_bn:
        optimizer_groups = separate_zero_wd_params_groups_for_optimizer(wrapped_net, param_groups, weight_decay)
    else:
        # Convert each group in place: swap the (name, param) pairs for a plain list of params.
        for group in param_groups:
            group['params'] = [param for _, param in group['named_params']]
            del group['named_params']
        optimizer_groups = param_groups

    # CREATE AN OPTIMIZER OBJECT AND INITIALIZE IT
    return optimizer_cls(optimizer_groups, lr=lr, **training_params.optimizer_params)

          
        
      

  
Tip!
Press p to see the previous file, or n to see the next file
Deci-AI / super-gradients connected to https://github.com/Deci-AI/super-gradients.git

#378 Feature/sg 281 add kd notebook

Deci-AI
/
super-gradients
connected to https://github.com/Deci-AI/super-gradients.git