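"""Unit tests for SgModel: end-to-end training, checkpoint saving and loading
(full state and weights only), checkpoint contents, runtime profiling, and
prediction."""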
import os
import shutil
import unittest

import torch

import super_gradients
from super_gradients import SgModel, ClassificationTestDatasetInterface
from super_gradients.training.metrics import Accuracy, Top5


class TestTrainer(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        super_gradients.init_trainer()
        # NAMES FOR THE EXPERIMENTS TO LATER DELETE
        cls.folder_names = ['test_train', 'test_save_load', 'test_load_w', 'test_load_w2',
                            'test_load_w3', 'test_checkpoint_content', 'analyze']
        cls.training_params = {"max_epochs": 1,
                               "silent_mode": True,
                               "lr_decay_factor": 0.1,
                               "initial_lr": 0.1,
                               "lr_updates": [4],
                               "lr_mode": "step",
                               "loss": "cross_entropy",
                               "train_metrics_list": [Accuracy(), Top5()],
                               "valid_metrics_list": [Accuracy(), Top5()],
                               "loss_logging_items_names": ["Loss"],
                               "metric_to_watch": "Accuracy",
                               "greater_metric_to_watch_is_better": True}

    @classmethod
    def tearDownClass(cls) -> None:
        # ERASE ALL THE FOLDERS THAT WERE CREATED DURING THIS TEST
        for folder in cls.folder_names:
            if os.path.isdir(os.path.join('checkpoints', folder)):
                shutil.rmtree(os.path.join('checkpoints', folder))

    @staticmethod
    def get_classification_trainer(name=''):
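        """Build an SgModel over the small classification test dataset, with a
        freshly initialized resnet18_cifar (no checkpoint loaded)."""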
        model = SgModel(name, model_checkpoints_location='local')
        dataset_params = {"batch_size": 4}
        dataset = ClassificationTestDatasetInterface(dataset_params=dataset_params)
        model.connect_dataset_interface(dataset)
        model.build_model("resnet18_cifar", load_checkpoint=False)
        return model

    def test_train(self):
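        """Smoke-test that a single training epoch runs end to end."""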
        model = self.get_classification_trainer(self.folder_names[0])
        model.train(training_params=self.training_params)

    def test_save_load(self):
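        """Train for one epoch, then rebuild the model from the saved checkpoint."""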
        model = self.get_classification_trainer(self.folder_names[1])
        model.train(training_params=self.training_params)
        model.build_model("resnet18_cifar", load_checkpoint=True)

    def test_load_only_weights_from_ckpt(self):
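        """Compare resuming the full training state from a checkpoint against
        loading the weights only and starting from scratch."""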
        # Create a checkpoint to load from
        model = self.get_classification_trainer(self.folder_names[2])
        params = self.training_params.copy()
        params['max_epochs'] = 3
        model.train(training_params=params)

        # Build a model that continues the training
        model = self.get_classification_trainer(self.folder_names[3])
        model.build_model('resnet18_cifar', load_checkpoint=True, source_ckpt_folder_name=self.folder_names[2],
                          load_weights_only=False)
        self.assertTrue(model.best_metric > -1)
        self.assertTrue(model.start_epoch != 0)
        # Training resumes from the checkpoint's epoch, so extend max_epochs past it
        # (on a copy, to avoid mutating the shared class-level training params)
        params = self.training_params.copy()
        params['max_epochs'] += 3
        model.train(training_params=params)

        # Build a model that loads the weights and starts from scratch
        model = self.get_classification_trainer(self.folder_names[4])
        model.build_model('resnet18_cifar', load_checkpoint=True, source_ckpt_folder_name=self.folder_names[2],
                          load_weights_only=True)
        self.assertTrue(model.best_metric == -1)
        self.assertTrue(model.start_epoch == 0)
        params = self.training_params.copy()
        params['max_epochs'] += 3
        model.train(training_params=params)

    def test_checkpoint_content(self):
        """VERIFY THAT ALL CHECKPOINTS ARE SAVED AND CONTAIN ALL THE EXPECTED KEYS"""
        model = self.get_classification_trainer(self.folder_names[5])
        params = self.training_params.copy()
        params["save_ckpt_epoch_list"] = [1]
        model.train(training_params=params)
        ckpt_filenames = ['ckpt_best.pth', 'ckpt_latest.pth', 'ckpt_epoch_1.pth']
        ckpt_paths = [os.path.join(model.checkpoints_dir_path, filename) for filename in ckpt_filenames]
        for ckpt_path in ckpt_paths:
            ckpt = torch.load(ckpt_path)
            self.assertListEqual(['net', 'acc', 'epoch', 'optimizer_state_dict', 'scaler_state_dict'],
                                 list(ckpt.keys()))
        model.save_checkpoint()
        weights_only = torch.load(os.path.join(model.checkpoints_dir_path, 'ckpt_latest_weights_only.pth'))
        self.assertListEqual(['net'], list(weights_only.keys()))

    def test_compute_model_runtime(self):
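        """Profile runtime over several batch sizes and verify the network is
        restored to its previous train/eval mode afterwards."""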
        model = self.get_classification_trainer(self.folder_names[6])
        model.compute_model_runtime()
        model.compute_model_runtime(batch_sizes=1, input_dims=(3, 224, 224), verbose=False)
        model.compute_model_runtime(batch_sizes=[1, 2, 3], verbose=True)

        # VERIFY MODEL RETURNS TO PREVIOUS TRAINING MODE
        model.net.train()
        model.compute_model_runtime(batch_sizes=1, verbose=False)
        self.assertTrue(model.net.training,
                        'MODEL WAS SET TO eval DURING compute_model_runtime, BUT DID NOT RETURN TO ITS PREVIOUS MODE')
        model.net.eval()
        model.compute_model_runtime(batch_sizes=1, verbose=False)
        self.assertFalse(model.net.training,
                         'MODEL WAS SET TO eval DURING compute_model_runtime, BUT RETURNED TO TRAINING MODE')

        # OVERSIZED BATCHES SHOULD BE HANDLED VIA THE CUDA OUT-OF-MEMORY EXCEPTION
        if torch.cuda.is_available():
            model._switch_device('cuda')
            model.compute_model_runtime(batch_sizes=10000, verbose=False, input_dims=(3, 224, 224))
            model.compute_model_runtime(batch_sizes=[10000, 10, 50, 100, 1000, 5000], verbose=True)

    def test_predict(self):
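        """Run predict on random inputs in full and half precision."""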
        model = self.get_classification_trainer(self.folder_names[6])
        inputs = torch.randn((5, 3, 32, 32))
        targets = torch.randint(0, 5, (5, 1))
        model.predict(inputs=inputs, targets=targets)
        model.predict(inputs=inputs, targets=targets, half=True)
        model.predict(inputs=inputs, targets=targets, half=False, verbose=True)

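
# The suite runs under any standard unittest runner, e.g. `python -m unittest
# discover`, or by executing this module directly via the guard below.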
if __name__ == '__main__':
    unittest.main()