#378 Feature/sg 281 add kd notebook

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-281-add_kd_notebook
import math
import warnings
from copy import deepcopy
from typing import Union

import torch
from torch import nn

from super_gradients.training import utils as core_utils
from super_gradients.training.models import SgModule
from super_gradients.training.models.kd_modules.kd_module import KDModule


def copy_attr(a: nn.Module, b: nn.Module, include: Union[list, tuple] = (), exclude: Union[list, tuple] = ()):
    # Copy attributes from b to a, with options to only include [...] and to exclude [...]
    for k, v in b.__dict__.items():
        if (len(include) and k not in include) or k.startswith('_') or k in exclude:
            continue
        else:
            setattr(a, k, v)


class ModelEMA:
    """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
    Keep a moving average of everything in the model state_dict (parameters and buffers).
    This is intended to allow functionality like
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
    A smoothed version of the weights is necessary for some training schemes to perform well.
    This class is sensitive to where it is initialized in the sequence of model init,
    GPU assignment and distributed training wrappers.
    """

    def __init__(self, model, decay: float = 0.9999, beta: float = 15, exp_activation: bool = True):
        """
        Init the EMA
        :param model: Union[SgModule, nn.Module], the training model to construct the EMA model by.
                      IMPORTANT: WHEN THE APPLICATION OF EMA ONLY ON A SUBSET OF ATTRIBUTES IS DESIRED, WRAP THE NN.MODULE
                      AS SgModule AND OVERWRITE get_include_attributes() AND get_exclude_attributes() AS DESIRED.
        :param decay: the maximum decay value. As the training process advances, the decay climbs towards this value,
                      at which point EMA_t+1 = EMA_t * decay + TRAINING_MODEL * (1 - decay)
        :param beta: the exponent coefficient. The higher the beta, the sooner in the training the decay saturates to
                     its final value. With beta=15, saturation is reached at roughly 40% of the training process.
        :param exp_activation: whether to ramp the decay exponentially with training progress (True) or keep it constant (False)
        """
        # Create EMA
        self.ema = deepcopy(model)
        self.ema.eval()
        if exp_activation:
            self.decay_function = lambda x: decay * (1 - math.exp(-x * beta))  # decay exponential ramp (to help early epochs)
        else:
            self.decay_function = lambda x: decay  # always return the same decay factor

        """
        We hold a list of model attributes (not weights and biases) which we would like to include in each
        attribute update or exclude from each update. An SgModule declares these attributes using the
        get_include_attributes and get_exclude_attributes functions. For an nn.Module which is not an SgModule,
        all non-private (not starting with '_') attributes will be updated (and only them).
        """
        if isinstance(model.module, SgModule):
            self.include_attributes = model.module.get_include_attributes()
            self.exclude_attributes = model.module.get_exclude_attributes()
        else:
            warnings.warn("Warning: EMA should be used with SgModule instance. All attributes of the model will be "
                          "included in EMA")
            self.include_attributes = []
            self.exclude_attributes = []
        for p in self.ema.module.parameters():
            p.requires_grad_(False)

    def update(self, model, training_percent: float):
        """
        Update the state of the EMA model.
        :param model: current training model
        :param training_percent: the percentage of the training process [0,1], e.g. 0.4 means 40% of the training has passed
        """
        # Update EMA parameters
        with torch.no_grad():
            decay = self.decay_function(training_percent)
            for ema_v, model_v in zip(self.ema.module.state_dict().values(), model.state_dict().values()):
                if ema_v.dtype.is_floating_point:
                    ema_v.copy_(ema_v * decay + (1. - decay) * model_v.detach())

    def update_attr(self, model):
        """
        This function updates model attributes (not weights and biases) from the original model to the EMA model.
        Attributes of the original model, such as anchors and grids (of detection models), may be crucial to the
        model operation and need to be updated.
        If include_attributes and exclude_attributes lists were not defined, all non-private (not starting with '_')
        attributes will be updated (and only them).
        :param model: the source model
        """
        copy_attr(self.ema.module, model.module, self.include_attributes, self.exclude_attributes)


class KDModelEMA(ModelEMA):
    """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
    Keep a moving average of everything in the model state_dict (parameters and buffers).
    This is intended to allow functionality like
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
    A smoothed version of the weights is necessary for some training schemes to perform well.
    This class is sensitive to where it is initialized in the sequence of model init,
    GPU assignment and distributed training wrappers.
    """

    def __init__(self, kd_model: KDModule, decay: float = 0.9999, beta: float = 15, exp_activation: bool = True):
        """
        Init the EMA
        :param kd_model: KDModule, the knowledge-distillation training model to construct the EMA model by.
                         IMPORTANT: WHEN THE APPLICATION OF EMA ONLY ON A SUBSET OF ATTRIBUTES IS DESIRED, WRAP THE NN.MODULE
                         AS SgModule AND OVERWRITE get_include_attributes() AND get_exclude_attributes() AS DESIRED.
        :param decay: the maximum decay value. As the training process advances, the decay climbs towards this value,
                      at which point EMA_t+1 = EMA_t * decay + TRAINING_MODEL * (1 - decay)
        :param beta: the exponent coefficient. The higher the beta, the sooner in the training the decay saturates to
                     its final value. With beta=15, saturation is reached at roughly 40% of the training process.
        :param exp_activation: whether to ramp the decay exponentially with training progress (True) or keep it constant (False)
        """
        # Only work on the student (we don't want to update the teacher, nor keep a duplicate of it)
        super().__init__(model=core_utils.WrappedModel(kd_model.module.student),
                         decay=decay,
                         beta=beta,
                         exp_activation=exp_activation)

        # Overwrite the current ema attribute with a combination of the student EMA (the current self.ema)
        # and the already-instantiated teacher, to obtain the final KD EMA
        self.ema = core_utils.WrappedModel(KDModule(arch_params=kd_model.module.arch_params,
                                                    student=self.ema.module,
                                                    teacher=kd_model.module.teacher,
                                                    run_teacher_on_eval=kd_model.module.run_teacher_on_eval))