@@ -14,6 +14,7 @@ from super_gradients.training.exceptions.kd_model_exceptions import Architecture
 
                                 UnsupportedKDArchitectureException, InconsistentParamsException, UnsupportedKDModelArgException, \
                
 
                                 TeacherKnowledgeException, UndefinedNumClassesException
                
 
                             from super_gradients.training.utils.callbacks import KDModelMetricsUpdateCallback
                
 
                            +from super_gradients.training.utils.ema import KDModelEMA
                
 
                             logger = get_logger(__name__)
                
@@ -247,3 +248,15 @@ class KDModel(SgModel):
 
                                                                "teacher_arch_params": self.teacher_arch_params
                
 
                                                                })
                
 
                                     return hyper_param_config
                
 
                            +
                
 
                            +    def instantiate_ema_model(self, decay: float = 0.9999, beta: float = 15, exp_activation: bool = True) -> KDModelEMA:
                
 
                            +        """Instantiate KD ema model for KDModule.
                
 
                            +
                
 
                            +        If the model is of class KDModule, the instance will be adapted to work on knowledge distillation.
                
 
                            +        :param decay:           the maximum decay value. as the training process advances, the decay will climb towards
                
 
                            +                                this value until the EMA_t+1 = EMA_t * decay + TRAINING_MODEL * (1- decay)
                
 
                            +        :param beta:            the exponent coefficient. The higher the beta, the sooner in the training the decay will
                
 
                            +                                saturate to its final value. beta=15 is ~40% of the training process.
                
 
                            +        :param exp_activation:  forwarded unchanged to the KDModelEMA constructor.
                
 
                            +        """
                
 
                            +        return KDModelEMA(self.net, decay, beta, exp_activation)
                
@@ -823,7 +823,7 @@ class SgModel:
 
                                     if self.ema:
                
 
                                         ema_params = self.training_params.ema_params
                
 
                                         logger.info(f'Using EMA with params {ema_params}')
                
 
                            -            self.ema_model = ModelEMA(self.net, **ema_params)
                
 
                            +            self.ema_model = self.instantiate_ema_model(**ema_params)
                
 
                                         self.ema_model.updates = self.start_epoch * num_batches // self.batch_accumulate
                
 
                                         if self.load_checkpoint:
                
 
                                             if 'ema_net' in self.checkpoint.keys():
                
@@ -1792,3 +1792,12 @@ class SgModel:
 
                                             arch_params.num_classes = num_classes_new_head
                
 
                                     return net
                
 
                            +
                
 
                            +    def instantiate_ema_model(self, decay: float = 0.9999, beta: float = 15, exp_activation: bool = True) -> ModelEMA:
                
 
                            +        """Instantiate ema model for standard SgModule.
                
 
                            +        :param decay: the maximum decay value. as the training process advances, the decay will climb towards this value
                
 
                            +                      until the EMA_t+1 = EMA_t * decay + TRAINING_MODEL * (1- decay)
                
 
                            +        :param beta: the exponent coefficient. The higher the beta, the sooner in the training the decay will saturate to
                
 
                            +                     its final value. beta=15 is ~40% of the training process.
                
 
                            +        """
                
 
                            +        return ModelEMA(self.net, decay, beta, exp_activation)
                
@@ -6,7 +6,9 @@ from typing import Union
 
                             import torch
                
 
                             from torch import nn
                
 
                            +from super_gradients.training import utils as core_utils
                
 
                             from super_gradients.training.models import SgModule
                
 
                            +from super_gradients.training.models.kd_modules.kd_module import KDModule
                
 
                             def copy_attr(a: nn.Module, b: nn.Module, include: Union[list, tuple] = (), exclude: Union[list, tuple] = ()):
                
@@ -49,8 +51,8 @@ class ModelEMA:
 
                                         self.decay_function = lambda x: decay  # always return the same decay factor
                
 
                                     """"
                
 
                            -        we hold a list of model attributes (not wights and biases) which we would like to include in each 
                
 
                            -        attribute update or exclude from each update. a SgModule declare these attribute using 
                
 
                            +        we hold a list of model attributes (not weights and biases) which we would like to include in each
                
 
                            +        attribute update or exclude from each update. a SgModule declares these attributes using
                
 
                                     get_include_attributes and get_exclude_attributes functions. for a nn.Module which is not a SgModule
                
 
                                     all non-private (not starting with '_') attributes will be updated (and only them).
                
 
                                     """
                
@@ -89,3 +91,39 @@ class ModelEMA:
 
                                     :param model: the source model
                
 
                                     """
                
 
                                     copy_attr(self.ema.module, model.module, self.include_attributes, self.exclude_attributes)
                
 
                            +
                
 
                            +
                
 
                            +class KDModelEMA(ModelEMA):
                
 
                            +    """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
                
 
                            +    Keep a moving average of everything in the model state_dict (parameters and buffers).
                
 
                            +    This is intended to allow functionality like
                
 
                            +    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
                
 
                            +    A smoothed version of the weights is necessary for some training schemes to perform well.
                
 
                            +    This class is sensitive where it is initialized in the sequence of model init,
                
 
                            +    GPU assignment and distributed training wrappers.
                
 
                            +    """
                
 
                            +
                
 
                            +    def __init__(self, kd_model: KDModule, decay: float = 0.9999, beta: float = 15, exp_activation: bool = True):
                
 
                            +        """
                
 
                            +        Init the EMA
                
 
                            +        :param kd_model: KDModule, the training Knowledge distillation model to construct the EMA model by
                
 
                            +                    IMPORTANT: WHEN THE APPLICATION OF EMA ONLY ON A SUBSET OF ATTRIBUTES IS DESIRED, WRAP THE NN.MODULE
                
 
                            +                    AS SgModule AND OVERWRITE get_include_attributes() AND get_exclude_attributes() AS DESIRED (SEE
                
 
                            +                    YoLoV5Base IMPLEMENTATION IN super_gradients.trainer.models.yolov5.py AS AN EXAMPLE).
                
 
                            +        :param decay: the maximum decay value. as the training process advances, the decay will climb towards this value
                
 
                            +                      until the EMA_t+1 = EMA_t * decay + TRAINING_MODEL * (1- decay)
                
 
                            +        :param beta: the exponent coefficient. The higher the beta, the sooner in the training the decay will saturate to
                
 
                            +                     its final value. beta=15 is ~40% of the training process.
                
 
                            +        """
                
 
                            +        # Only work on the student (we don't want to update and to have a duplicate of the teacher)
                
 
                            +        super().__init__(model=core_utils.WrappedModel(kd_model.module.student),
                
 
                            +                         decay=decay,
                
 
                            +                         beta=beta,
                
 
                            +                         exp_activation=exp_activation)
                
 
                            +
                
 
                            +        # Overwrite current ema attribute with combination of the student model EMA (current self.ema)
                
 
                            +        # with the already instantiated teacher, to obtain the final KD EMA
                
 
                            +        self.ema = core_utils.WrappedModel(KDModule(arch_params=kd_model.module.arch_params,
                
 
                            +                                                    student=self.ema.module,
                
 
                            +                                                    teacher=kd_model.module.teacher,
                
 
                            +                                                    run_teacher_on_eval=kd_model.module.run_teacher_on_eval))
                
@@ -12,6 +12,7 @@ from tests.unit_tests.strictload_enum_test import StrictLoadEnumTest
 
                             from tests.unit_tests.train_with_intialized_param_args_test import TrainWithInitializedObjectsTest
                
 
                             from tests.unit_tests.pretrained_models_unit_test import PretrainedModelsUnitTest
                
 
                             from tests.unit_tests.lr_warmup_test import LRWarmupTest
                
 
                            +from tests.unit_tests.kd_ema_test import KDEMATest
                
 
                             from tests.unit_tests.kd_model_test import KDModelTest
                
 
                             from tests.unit_tests.dice_loss_test import DiceLossTest
                
 
                             from tests.unit_tests.vit_unit_test import TestViT
                
@@ -54,6 +55,7 @@ class CoreUnitTestSuiteRunner:
 
                                     self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(FactoriesTest))
                
 
                                     self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(DiceLossTest))
                
 
                                     self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(TestViT))
                
 
                            +        self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(KDEMATest))
                
 
                                     self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(KDModelTest))
                
 
                                     self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(TestYOLOX))
                
 
                                     self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(InitializeWithDataloadersTest))
                
 
            import unittest
from super_gradients.training.sg_model import SgModel
from super_gradients.training.kd_model.kd_model import KDModel
import torch
from super_gradients.training.utils.utils import check_models_have_same_weights
from super_gradients.training.datasets.dataset_interfaces.dataset_interface import ClassificationTestDatasetInterface
from super_gradients.training.metrics import Accuracy
from super_gradients.training.losses.kd_losses import KDLogitsLoss


class KDEMATest(unittest.TestCase):
    """Unit tests for EMA (exponential moving average) handling in ``KDModel``.

    Every test builds a knowledge-distillation model with a resnet18 student and a resnet50
    teacher initialised from "imagenet" weights, trains it with ``ema=True`` and then inspects
    ``kd_model.net`` and ``kd_model.ema_model.ema``.
    """

    def setUp(self):
        """Create the dataset interface and the KD training parameters used by every test.

        unittest invokes ``setUp`` on the test instance before each test method, so the
        fixtures are stored on ``self`` and rebuilt for every test.
        """
        # NOTE(review): no test in this class references this model; it is kept so that any
        # side effect of constructing it stays unchanged - confirm whether it can be dropped.
        self.sg_trained_teacher = SgModel("sg_trained_teacher", device='cpu')
        self.dataset_params = {"batch_size": 5}
        self.dataset = ClassificationTestDatasetInterface(dataset_params=self.dataset_params)

        self.kd_train_params = {"max_epochs": 3, "lr_updates": [1], "lr_decay_factor": 0.1, "lr_mode": "step",
                                "lr_warmup_epochs": 0, "initial_lr": 0.1,
                                "loss": KDLogitsLoss(torch.nn.CrossEntropyLoss()),
                                "optimizer": "SGD",
                                "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9},
                                "train_metrics_list": [Accuracy()], "valid_metrics_list": [Accuracy()],
                                "metric_to_watch": "Accuracy",
                                'loss_logging_items_names': ["Loss", "Task Loss", "Distillation Loss"],
                                "greater_metric_to_watch_is_better": True, "average_best_models": False,
                                "ema": True}

    def _build_kd_model(self, experiment_name: str, checkpoint_params: dict) -> "KDModel":
        """Return a CPU ``KDModel`` wired to the test dataset with the resnet18/resnet50 pair built.

        :param experiment_name:   name passed to the ``KDModel`` constructor.
        :param checkpoint_params: forwarded as-is to ``build_model``; this is the only argument
                                  that differs between the models built in this test case.
        """
        kd_model = KDModel(experiment_name, device='cpu')
        kd_model.connect_dataset_interface(self.dataset)
        kd_model.build_model(student_architecture='resnet18',
                             teacher_architecture='resnet50',
                             student_arch_params={'num_classes': 1000},
                             teacher_arch_params={'num_classes': 1000},
                             checkpoint_params=checkpoint_params,
                             run_teacher_on_eval=True)
        return kd_model

    def _train_then_reload(self, load_ema_as_net: bool):
        """Train a KD model, then rebuild it from its checkpoint and call ``train`` again.

        :param load_ema_as_net: value of the "load_ema_as_net" checkpoint parameter used on reload.
        :return: tuple ``(net, ema_model, reloaded_net, reloaded_ema_model)`` where the first two come
                 from the freshly trained model and the last two from the reloaded one.
        """
        experiment_name = "test_kd_ema_ckpt_reload"

        # Create a KD model and train it
        kd_model = self._build_kd_model(experiment_name, {'teacher_pretrained_weights': "imagenet"})
        kd_model.train(self.kd_train_params)
        ema_model = kd_model.ema_model.ema
        net = kd_model.net

        # Load the trained KD model (same experiment name, so the checkpoint written above is reused)
        kd_model = self._build_kd_model(experiment_name,
                                        {"load_checkpoint": True, "load_ema_as_net": load_ema_as_net})

        # Calling train() again exercises the resume path that restores the saved EMA model.
        # NOTE(review): presumably no additional epoch runs because the checkpoint already reached
        # "max_epochs" - confirm against SgModel.train.
        kd_model.train(self.kd_train_params)
        return net, ema_model, kd_model.net, kd_model.ema_model.ema

    def test_teacher_ema_not_duplicated(self):
        """Check that the teacher EMA is a reference to the teacher net (not a copy)."""

        kd_model = self._build_kd_model("test_teacher_ema_not_duplicated",
                                        {'teacher_pretrained_weights': "imagenet"})
        kd_model.train(self.kd_train_params)

        # The teacher must be the very same object in both wrappers, the student must not.
        self.assertIs(kd_model.ema_model.ema.module.teacher, kd_model.net.module.teacher)
        self.assertIsNot(kd_model.ema_model.ema.module.student, kd_model.net.module.student)

    def test_kd_ckpt_reload_ema(self):
        """Check that the KD model load correctly from checkpoint when "load_ema_as_net=True"."""

        net, ema_model, reloaded_net, reloaded_ema_model = self._train_then_reload(load_ema_as_net=True)

        # trained ema == loaded ema (Should always be true as long as "ema=True" in train_params)
        self.assertTrue(check_models_have_same_weights(ema_model, reloaded_ema_model))

        # loaded net != trained net (since load_ema_as_net = True)
        self.assertFalse(check_models_have_same_weights(reloaded_net, net))

        # loaded net == trained ema (since load_ema_as_net = True)
        self.assertTrue(check_models_have_same_weights(reloaded_net, ema_model))

        # loaded student ema == loaded student net (since load_ema_as_net = True)
        self.assertTrue(check_models_have_same_weights(reloaded_ema_model.module.student,
                                                       reloaded_net.module.student))

        # loaded teacher ema == loaded teacher net (the teacher is expected to be identical in both)
        self.assertTrue(check_models_have_same_weights(reloaded_ema_model.module.teacher,
                                                       reloaded_net.module.teacher))

    def test_kd_ckpt_reload_net(self):
        """Check that the KD model load correctly from checkpoint when "load_ema_as_net=False"."""

        net, ema_model, reloaded_net, reloaded_ema_model = self._train_then_reload(load_ema_as_net=False)

        # trained ema == loaded ema (Should always be true as long as "ema=True" in train_params)
        self.assertTrue(check_models_have_same_weights(ema_model, reloaded_ema_model))

        # loaded net == trained net (since load_ema_as_net = False)
        self.assertTrue(check_models_have_same_weights(reloaded_net, net))

        # loaded net != trained ema (since load_ema_as_net = False)
        self.assertFalse(check_models_have_same_weights(reloaded_net, ema_model))

        # loaded student ema != loaded student net (since load_ema_as_net = False)
        self.assertFalse(check_models_have_same_weights(reloaded_ema_model.module.student,
                                                        reloaded_net.module.student))

        # loaded teacher ema == loaded teacher net (the teacher is expected to be identical in both)
        self.assertTrue(check_models_have_same_weights(reloaded_ema_model.module.teacher,
                                                       reloaded_net.module.teacher))


# Run the tests in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()