import os
import shutil
import unittest

import torch

import super_gradients
from super_gradients import Trainer
from super_gradients.common.environment.checkpoints_dir_utils import get_checkpoints_dir_path
from super_gradients.common.object_names import Models
from super_gradients.training import models
from super_gradients.training.dataloaders.dataloaders import classification_test_dataloader
from super_gradients.training.metrics import Accuracy, Top5


class TestTrainer(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        super_gradients.init_trainer()
        # Names of the experiments created by the tests; their folders are deleted in tearDownClass.
        cls.experiment_names = [
            "test_train",
            "test_save_load",
            "test_load_w",
            "test_load_w2",
            "test_load_w3",
            "test_checkpoint_content",
            "analyze",
            "test_yaml_metrics_present",
        ]
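        # Shared smoke-test configuration. Note that with max_epochs=1 the
        # StepLRScheduler milestone at epoch 4 (lr_updates) is never reached, so the
        # learning rate stays at initial_lr; tests copy this dict and override fields.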
        cls.training_params = {
            "max_epochs": 1,
            "silent_mode": True,
            "lr_decay_factor": 0.1,
            "initial_lr": 0.1,
            "lr_updates": [4],
            "lr_mode": "StepLRScheduler",
            "loss": "CrossEntropyLoss",
            "train_metrics_list": [Accuracy(), Top5()],
            "valid_metrics_list": [Accuracy(), Top5()],
            "metric_to_watch": "Accuracy",
            "greater_metric_to_watch_is_better": True,
        }

    @classmethod
    def tearDownClass(cls) -> None:
        # Erase all the experiment folders that were created during this test run.
        for experiment_name in cls.experiment_names:
            experiment_dir = get_checkpoints_dir_path(experiment_name=experiment_name)
            if os.path.isdir(experiment_dir):
                # TODO: Occasionally this fails because log files are still open (see the setup_logging() call).
                # TODO: Find a way to close them at the end of training; this is tricky to achieve
                # TODO: because setup_logging() is called outside the Trainer class.
                shutil.rmtree(experiment_dir, ignore_errors=True)

    @staticmethod
    def get_classification_trainer(name=""):
        trainer = Trainer(name)
        model = models.get(Models.RESNET18, num_classes=5)
        return trainer, model

    def test_train(self):
        trainer, model = self.get_classification_trainer(self.experiment_names[0])
        trainer.train(
            model=model, training_params=self.training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader()
        )

    def test_save_load(self):
        trainer, model = self.get_classification_trainer(self.experiment_names[1])
        trainer.train(
            model=model, training_params=self.training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader()
        )
        resume_training_params = self.training_params.copy()
        resume_training_params["resume"] = True
        resume_training_params["max_epochs"] = 2
        trainer, model = self.get_classification_trainer(self.experiment_names[1])
        trainer.train(
            model=model, training_params=resume_training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader()
        )

    def test_checkpoint_content(self):
        """Verify that all checkpoints are saved and contain all the expected keys."""
        trainer, model = self.get_classification_trainer(self.experiment_names[5])
        params = self.training_params.copy()
        params["save_ckpt_epoch_list"] = [1]
        trainer.train(model=model, training_params=params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader())
        ckpt_filenames = ["ckpt_best.pth", "ckpt_latest.pth", "ckpt_epoch_1.pth"]
        ckpt_paths = [os.path.join(trainer.checkpoints_dir_path, filename) for filename in ckpt_filenames]
        for ckpt_path in ckpt_paths:
            ckpt = torch.load(ckpt_path)
            self.assertListEqual(sorted(["net", "acc", "epoch", "optimizer_state_dict", "scaler_state_dict", "metrics", "packages"]), sorted(list(ckpt.keys())))
        trainer._save_checkpoint()
        weights_only = torch.load(os.path.join(trainer.checkpoints_dir_path, "ckpt_latest_weights_only.pth"))
        self.assertListEqual(["net"], list(weights_only.keys()))

    def test_validation_frequency_divisible(self):
        trainer, model = self.get_classification_trainer(self.experiment_names[0])
        training_params = self.training_params.copy()
        training_params["max_epochs"] = 4
        training_params["run_validation_freq"] = 2
        trainer.train(
            model=model, training_params=training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader()
        )
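        # max_epochs (4) is divisible by run_validation_freq (2), so the final epoch
        # runs validation; the test expects ckpt_best.pth and ckpt_latest.pth to end
        # up with the same stored metrics.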
- ckpt_filename = ["ckpt_best.pth", "ckpt_latest.pth"]
- ckpt_paths = [os.path.join(trainer.checkpoints_dir_path, suf) for suf in ckpt_filename]
- metrics = {}
- for ckpt_path in ckpt_paths:
- ckpt = torch.load(ckpt_path)
- metrics[ckpt_path] = ckpt["metrics"]
- self.assertTrue(metrics[ckpt_paths[0]] == metrics[ckpt_paths[1]])

    def test_validation_frequency_and_save_ckpt_list(self):
        trainer, model = self.get_classification_trainer(self.experiment_names[0])
        training_params = self.training_params.copy()
        training_params["max_epochs"] = 5
        training_params["run_validation_freq"] = 3
        training_params["save_ckpt_epoch_list"] = [1]
        trainer.train(
            model=model, training_params=training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader()
        )
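        # Both the epoch-1 snapshot and ckpt_latest.pth are expected to carry
        # validation metrics under the "valid" key, even with a validation frequency
        # that does not divide max_epochs.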
- ckpt_filename = ["ckpt_epoch_1.pth", "ckpt_latest.pth"]
- ckpt_paths = [os.path.join(trainer.checkpoints_dir_path, suf) for suf in ckpt_filename]
- for ckpt_path in ckpt_paths:
- ckpt = torch.load(ckpt_path)
- self.assertTrue("valid" in ckpt["metrics"].keys())
- if __name__ == "__main__":
- unittest.main()