1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
- """
- Train classification model for MNIST
- """
- import json
- import pickle
- import numpy as np
- import time
- import yaml
- import os
- # New imports
- import torch
- import torch.utils.data
- import torch.nn.functional as F
- import torch.optim as optim
- from my_torch_model import Net
- from dagshub import dagshub_logger
def relpath(path):
    """Resolve *path* relative to the directory containing this script."""
    here = os.path.dirname(__file__)
    return os.path.join(here, path)
- # New function
def train(model, device, train_loader, optimizer, epoch, logger):
    """Run one epoch of training over *train_loader*.

    The per-batch NLL loss is sent to *logger* on every step; a progress
    line is printed to stdout every `log_interval` batches.
    """
    log_interval = 100
    # Offset so the metric step counter keeps increasing across epochs.
    steps_so_far = (epoch - 1) * len(train_loader)
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(inputs)
        loss = F.nll_loss(predictions, labels)
        loss.backward()
        logger.log_metrics(loss=loss.item(), step_num=step + steps_so_far)
        optimizer.step()
        if step % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, step * len(inputs), len(train_loader.dataset),
                100. * step / len(train_loader), loss.item()))
def train_model(params: dict):
    """Train the MNIST classifier described by *params* and persist it.

    Expects the keys 'batch_size', 'epochs', 'learning_rate' and
    'momentum'; the full dict is also forwarded to the Net constructor.
    Writes per-step metrics to ../metrics/train_metrics.csv (relative to
    this script) and pickles the trained model to ./data/model.pkl.
    """
    print("Setting up Params...")
    print(params)
    device = torch.device("cpu")
    batch_size = params['batch_size']
    epochs = params['epochs']
    learning_rate = params['learning_rate']
    momentum = params['momentum']
    print("done.")

    print("Load training data...")
    raw = np.load('./data/processed_train_data.npy')
    # Column 0 holds the label; the remainder is a flattened 28x28 image.
    labels = torch.Tensor(raw[:, 0]).long()
    images = torch.Tensor(raw[:, 1:].reshape([raw.shape[0], 1, 28, 28]))
    dataset = torch.utils.data.TensorDataset(images, labels)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               shuffle=True)
    print("done.")

    print("Training model...")
    model = Net(**params).to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=learning_rate,
                          momentum=momentum)
    with dagshub_logger(relpath('../metrics/train_metrics.csv')) as logger:
        # Measure wall-clock training time.
        start_time = time.time()
        for epoch in range(1, epochs + 1):
            # Record the epoch boundary at its starting global step.
            logger.log_metrics(epoch=epoch, step_num=((epoch - 1) * len(train_loader)))
            train(model, device, train_loader, optimizer, epoch, logger)
        print("done.")

        print("Save model and training time metric...")
        end_time = time.time()
        logger.log_metrics(training_time=end_time - start_time)

        # Persist the trained network for the downstream evaluation stage.
        with open("./data/model.pkl", 'wb') as f:
            pickle.dump(model, f)

    print("done.")
if __name__ == '__main__':
    # Hyperparameters live in params.yml next to this script.
    with open(relpath('params.yml')) as config_file:
        params = yaml.safe_load(config_file)
    train_model(params)
|