Browse Source

trained SVM

vlad-winter 1 month ago
parent
commit
86aa1e2c6d
6 changed files with 150 additions and 0 deletions
  1. 43 0
      code/featurization.py
  2. 57 0
      code/train_model.py
  3. 4 0
      data/.gitignore
  4. 26 0
      featurization.dvc
  5. 1 0
      metrics/train_metric.json
  6. 19 0
      training.dvc

+ 43 - 0
code/featurization.py

@@ -0,0 +1,43 @@
+"""
+Create feature CSVs for train and test datasets
+"""
+import json
+import numpy as np
+import pandas as pd
+
+
def featurization():
    """Create normalized feature arrays for the train and test datasets.

    Reads ./data/train_data.csv and ./data/test_data.csv (no header row;
    column 0 is the label, the remaining columns are raw features),
    normalizes the feature columns using the *training* set's mean/std,
    and writes:

      - ./data/processed_train_data.npy
      - ./data/processed_test_data.npy
      - ./data/norm_params.json   (the mean/std used, kept for inference)
    """
    # Load data-sets
    print("Loading data sets...")
    train_df = pd.read_csv('./data/train_data.csv', header=None, dtype=float)
    test_df = pd.read_csv('./data/test_data.csv', header=None, dtype=float)
    print("done.")

    # Bind the ndarray ONCE and mutate that. The previous code assigned
    # through `DataFrame.values[:, 1:] -= ...`; pandas does not guarantee
    # `.values` is a writable view (it may be a copy), so the update could
    # silently never reach the data that gets saved. Saving the mutated
    # array itself is correct whether `.values` returned a view or a copy.
    train = train_df.values
    test = test_df.values

    print("Normalizing data...")
    # Skip column 0 — that is where the labels live.
    train_mean = train[:, 1:].mean()
    train_std = train[:, 1:].std()

    # Normalize train AND test with the train-set statistics so the test
    # distribution never leaks into the fitted parameters.
    train[:, 1:] = (train[:, 1:] - train_mean) / train_std
    test[:, 1:] = (test[:, 1:] - train_mean) / train_std
    print("done.")

    print("Saving processed datasets and normalization parameters...")
    # Save normalized data-sets
    np.save('./data/processed_train_data', train)
    np.save('./data/processed_test_data', test)

    # Save mean and std for future inference. float() keeps NumPy scalar
    # types out of the JSON payload.
    with open('./data/norm_params.json', 'w') as f:
        json.dump({'mean': float(train_mean), 'std': float(train_std)}, f)

    print("done.")


if __name__ == '__main__':
    featurization()

+ 57 - 0
code/train_model.py

@@ -0,0 +1,57 @@
+"""
+Train classification model for MNIST
+"""
+import json
+import pickle
+import numpy as np
+from sklearn.svm import SVC
+from sklearn.multiclass import OneVsRestClassifier
+import time
+
+
def train_model(num_samples=5000, seed=42, n_jobs=6):
    """Train a one-vs-rest linear SVM on the processed MNIST training data.

    Reads ./data/processed_train_data.npy (column 0 is the label, the rest
    are normalized features), fits the model on a random subsample, and
    writes:

      - ./data/model.pkl             pickled fitted classifier
      - ./metrics/train_metric.json  wall-clock training time in seconds

    Args:
        num_samples: size of the random training subsample, clamped to the
            dataset size. Kept small because SVM training time increases
            quadratically with the number of training samples.
        seed: NumPy RNG seed, fixed so we get the same "random" choices
            when we try to recreate the experiment.
        n_jobs: parallel workers for the one-vs-rest wrapper.
    """
    # Measure training time (this becomes the tracked metric).
    start_time = time.time()

    # Load training data
    print("Load training data...")
    train_data = np.load('./data/processed_train_data.npy')

    print("Choosing smaller sample to shorten training time...")
    np.random.seed(seed)

    # Never request more rows than exist: np.random.choice with
    # replace=False raises ValueError when the sample exceeds the population.
    num_samples = min(num_samples, train_data.shape[0])
    choice = np.random.choice(train_data.shape[0], num_samples, replace=False)
    train_data = train_data[choice, :]

    # Divide loaded data-set into data and labels (labels sit in column 0).
    labels = train_data[:, 0]
    data = train_data[:, 1:]
    print("done.")

    # Define SVM classifier and train model
    print("Training model...")
    model = OneVsRestClassifier(SVC(kernel='linear'), n_jobs=n_jobs)
    model.fit(data, labels)
    print("done.")

    # Save model as pkl
    print("Save model and training time metric...")
    with open("./data/model.pkl", 'wb') as f:
        pickle.dump(model, f)

    # End training time measurement
    end_time = time.time()

    # Create metric for model training time
    with open('./metrics/train_metric.json', 'w') as f:
        json.dump({'training_time': end_time - start_time}, f)
    print("done.")


if __name__ == '__main__':
    train_model()
+

+ 4 - 0
data/.gitignore

@@ -1,2 +1,6 @@
 /train_data.csv
 /test_data.csv
+/norm_params.json
+/processed_train_data.npy
+/processed_test_data.npy
+/model.pkl

+ 26 - 0
featurization.dvc

@@ -0,0 +1,26 @@
+md5: 629da87d4b9a1ae06b507adc0df2d808
+cmd: python code/featurization.py
+wdir: .
+deps:
+- md5: 5b49cf1b57fb9d6102b559d59d99df7c
+  path: data/train_data.csv
+- md5: c807df8d6d804ab2647fc15c3d40f543
+  path: data/test_data.csv
+- md5: 55f2ab79ee6dad39bd0a96ffff39dc64
+  path: code/featurization.py
+outs:
+- md5: e46984ac8b7097bfddfe5d9210f78ca4
+  path: data/norm_params.json
+  cache: true
+  metric: false
+  persist: false
+- md5: 9ee0468925c998fda26d197a14d1caec
+  path: data/processed_train_data.npy
+  cache: true
+  metric: false
+  persist: false
+- md5: a5257a91e73920bdd4cafd0f88105b74
+  path: data/processed_test_data.npy
+  cache: true
+  metric: false
+  persist: false

+ 1 - 0
metrics/train_metric.json

@@ -0,0 +1 @@
+{"training_time": 19.54646110534668}

+ 19 - 0
training.dvc

@@ -0,0 +1,19 @@
+md5: 30c3834e0ce1865c8665ccd9c7acab0b
+cmd: python code/train_model.py
+wdir: .
+deps:
+- md5: 9ee0468925c998fda26d197a14d1caec
+  path: data/processed_train_data.npy
+- md5: 655c3242c17b3d0213d7ce4d9f78344d
+  path: code/train_model.py
+outs:
+- md5: 2723303d5317424e48c3b404c29c387c
+  path: data/model.pkl
+  cache: true
+  metric: false
+  persist: false
+- md5: e5867e3138d0e181dbe318abd32886c3
+  path: metrics/train_metric.json
+  cache: false
+  metric: true
+  persist: false