Browse Source

Trained AI

Stav Eyal 5 months ago
parent
commit
d0d18de7ff
5 changed files with 77 additions and 0 deletions
  1. 2 0
      .gitattributes
  2. 3 0
      .vscode/settings.json
  3. 43 0
      code/featurization.py
  4. 3 0
      data/.gitignore
  5. 26 0
      featurization.dvc

+ 2 - 0
.gitattributes

@@ -0,0 +1,2 @@
+* text=auto
+*.py text

+ 3 - 0
.vscode/settings.json

@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": ".env/bin/python3.7"
+}

+ 43 - 0
code/featurization.py

@@ -0,0 +1,43 @@
+"""
+Create feature CSVs for train and test datasets
+"""
+import json
+import numpy as np
+import pandas as pd
+
+
+def featurization():
+    # Load data-sets
+    print("Loading data sets...")
+    train_data = pd.read_csv('./data/train_data.csv', header=None, dtype=float)
+    test_data = pd.read_csv('./data/test_data.csv', header=None, dtype=float)
+    print("done.")
+
+    # Normalize the train data
+    print("Normalizing data...")
+    # We choose all columns except the first, since that is where our labels are
+    train_mean = train_data.values[:, 1:].mean()
+    train_std = train_data.values[:, 1:].std()
+
+    # Normalize train and test data according to the train data distribution
+    train_data.values[:, 1:] -= train_mean
+    train_data.values[:, 1:] /= train_std
+    test_data.values[:, 1:] -= train_mean
+    test_data.values[:, 1:] /= train_std
+
+    print("done.")
+
+    print("Saving processed datasets and normalization parameters...")
+    # Save normalized data-sets
+    np.save('./data/processed_train_data', train_data)
+    np.save('./data/processed_test_data', test_data)
+
+    # Save mean and std for future inference
+    with open('./data/norm_params.json', 'w') as f:
+        json.dump({'mean': train_mean, 'std': train_std}, f)
+
+    print("done.")
+
+
+if __name__ == '__main__':  # run the featurization step only when executed as a script
+    featurization()

+ 3 - 0
data/.gitignore

@@ -1,2 +1,5 @@
 /test_data.csv
 /train_data.csv
+/norm_params.json
+/processed_train_data.npy
+/processed_test_data.npy

+ 26 - 0
featurization.dvc

@@ -0,0 +1,26 @@
+wdir: .
+cmd: python3 code/featurization.py
+outs:
+- path: data/norm_params.json
+  metric: false
+  cache: true
+  persist: false
+  md5: e46984ac8b7097bfddfe5d9210f78ca4
+- path: data/processed_train_data.npy
+  metric: false
+  cache: true
+  persist: false
+  md5: 9ee0468925c998fda26d197a14d1caec
+- path: data/processed_test_data.npy
+  metric: false
+  cache: true
+  persist: false
+  md5: a5257a91e73920bdd4cafd0f88105b74
+deps:
+- path: data/train_data.csv
+  md5: 5b49cf1b57fb9d6102b559d59d99df7c
+- path: data/test_data.csv
+  md5: c807df8d6d804ab2647fc15c3d40f543
+- path: code/featurization.py
+  md5: 55f2ab79ee6dad39bd0a96ffff39dc64
+md5: 194c670fdd1bd6c8ded61aeb7a7a2b61