Deci-AI
/
super-gradients
connected to https://github.com/Deci-AI/super-gradients.git


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
125

	
126

	
127

	
128

	
129

	
            import shutil
import unittest

from super_gradients.common.object_names import Models
from super_gradients.training import models

import super_gradients
import torch
import os
from super_gradients import Trainer
from super_gradients.training.dataloaders.dataloaders import classification_test_dataloader
from super_gradients.training.metrics import Accuracy, Top5
from super_gradients.common.environment.checkpoints_dir_utils import get_checkpoints_dir_path


class TestTrainer(unittest.TestCase):
    """Smoke tests for ``Trainer``: training, resuming and checkpoint contents.

    Every test trains a ResNet18 with 5 classes on ``classification_test_dataloader()``
    using the training parameters built in ``setUp``.
    """

    # NAMES FOR THE EXPERIMENTS TO LATER DELETE
    # Tests select their experiment by index, so only ever append to this list.
    experiment_names = [
        "test_train",  # 0
        "test_save_load",  # 1
        "test_load_w",  # 2
        "test_load_w2",  # 3
        "test_load_w3",  # 4
        "test_checkpoint_content",  # 5
        "analyze",  # 6
        "test_yaml_metrics_present",  # 7
        "test_validation_frequency_divisible",  # 8
        "test_validation_frequency_and_save_ckpt_list",  # 9
    ]

    def setUp(self):
        """Initialise super-gradients and build the training parameters before each test."""
        super_gradients.init_trainer()
        # Rebuilt for every test so that each test gets its own metric objects;
        # the tests only take shallow copies of this dict.
        self.training_params = {
            "max_epochs": 1,
            "silent_mode": True,
            "lr_decay_factor": 0.1,
            "initial_lr": 0.1,
            "lr_updates": [4],
            "lr_mode": "StepLRScheduler",
            "loss": "CrossEntropyLoss",
            "train_metrics_list": [Accuracy(), Top5()],
            "valid_metrics_list": [Accuracy(), Top5()],
            "metric_to_watch": "Accuracy",
            "greater_metric_to_watch_is_better": True,
        }

    @classmethod
    def tearDownClass(cls) -> None:
        """Delete the checkpoint directory of every experiment listed in ``experiment_names``."""
        # ERASE ALL THE EXPERIMENT FOLDERS THAT WERE CREATED DURING THIS TEST
        for experiment_name in cls.experiment_names:
            experiment_dir = get_checkpoints_dir_path(experiment_name=experiment_name)
            if os.path.isdir(experiment_dir):
                # TODO: Occasionally this method fails because log files are still open (See setup_logging() call).
                # TODO: Need to find a way to close them at the end of training, this is however tricky to achieve
                # TODO: because setup_logging() called outside of Trainer class.
                shutil.rmtree(experiment_dir, ignore_errors=True)

    @staticmethod
    def get_classification_trainer(name=""):
        """Return a ``(trainer, model)`` pair: a ``Trainer`` named ``name`` and a 5-class ResNet18."""
        trainer = Trainer(name)
        model = models.get(Models.RESNET18, num_classes=5)
        return trainer, model

    @staticmethod
    def _train(trainer, model, training_params):
        """Call ``trainer.train`` with newly created test dataloaders for training and validation."""
        trainer.train(
            model=model,
            training_params=training_params,
            train_loader=classification_test_dataloader(),
            valid_loader=classification_test_dataloader(),
        )

    def test_train(self):
        """Training with the default test parameters must complete without raising."""
        trainer, model = self.get_classification_trainer(self.experiment_names[0])
        self._train(trainer, model, self.training_params)

    def test_save_load(self):
        """Train once, then train again under the same experiment name with ``resume=True``."""
        trainer, model = self.get_classification_trainer(self.experiment_names[1])
        self._train(trainer, model, self.training_params)

        resume_training_params = self.training_params.copy()
        resume_training_params["resume"] = True
        resume_training_params["max_epochs"] = 2
        # A new Trainer and model with the same experiment name as the first run.
        trainer, model = self.get_classification_trainer(self.experiment_names[1])
        self._train(trainer, model, resume_training_params)

    def test_checkpoint_content(self):
        """VERIFY THAT ALL CHECKPOINTS ARE SAVED AND CONTAIN ALL THE EXPECTED KEYS"""
        trainer, model = self.get_classification_trainer(self.experiment_names[5])
        params = self.training_params.copy()
        params["save_ckpt_epoch_list"] = [1]
        self._train(trainer, model, params)

        expected_keys = sorted(["net", "acc", "epoch", "optimizer_state_dict", "scaler_state_dict", "metrics", "packages"])
        for ckpt_filename in ("ckpt_best.pth", "ckpt_latest.pth", "ckpt_epoch_1.pth"):
            ckpt = torch.load(os.path.join(trainer.checkpoints_dir_path, ckpt_filename))
            self.assertListEqual(expected_keys, sorted(ckpt.keys()))

        # The file loaded below is expected to exist after this call and to hold only the "net" entry.
        trainer._save_checkpoint()
        weights_only = torch.load(os.path.join(trainer.checkpoints_dir_path, "ckpt_latest_weights_only.pth"))
        self.assertListEqual(["net"], list(weights_only.keys()))

    def test_validation_frequency_divisible(self):
        """With 4 epochs and ``run_validation_freq=2``, best and latest checkpoints must store equal metrics."""
        # Uses its own experiment name rather than sharing "test_train" with other tests.
        trainer, model = self.get_classification_trainer(self.experiment_names[8])
        training_params = self.training_params.copy()
        training_params["max_epochs"] = 4
        training_params["run_validation_freq"] = 2
        self._train(trainer, model, training_params)

        best_ckpt = torch.load(os.path.join(trainer.checkpoints_dir_path, "ckpt_best.pth"))
        latest_ckpt = torch.load(os.path.join(trainer.checkpoints_dir_path, "ckpt_latest.pth"))
        self.assertEqual(best_ckpt["metrics"], latest_ckpt["metrics"])

    def test_validation_frequency_and_save_ckpt_list(self):
        """With 5 epochs, ``run_validation_freq=3`` and ``save_ckpt_epoch_list=[1]``,
        both ``ckpt_epoch_1.pth`` and ``ckpt_latest.pth`` must have a "valid" entry in their metrics."""
        # Uses its own experiment name rather than sharing "test_train" with other tests.
        trainer, model = self.get_classification_trainer(self.experiment_names[9])
        training_params = self.training_params.copy()
        training_params["max_epochs"] = 5
        training_params["run_validation_freq"] = 3
        training_params["save_ckpt_epoch_list"] = [1]
        self._train(trainer, model, training_params)

        for ckpt_filename in ("ckpt_epoch_1.pth", "ckpt_latest.pth"):
            ckpt = torch.load(os.path.join(trainer.checkpoints_dir_path, ckpt_filename))
            self.assertIn("valid", ckpt["metrics"])


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()