1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
- import os
- from collections.abc import Iterable
- from pathlib import Path
- from typing import Union
- import matplotlib.pyplot as plt
- import pandas as pd
- import yaml
- from joblib import dump
- from sklearn.metrics import plot_confusion_matrix
- from yaml import Loader
def read_yaml(
    file: Union[str, Path], key: str = None, default: Union[str, dict] = None
) -> dict:
    """
    Read a yaml file and return its content as a `dict`.

    Args:
        file: `str` or `Path`. Yaml file path.
        key: `str`. Top-level yaml key to read. When empty or `None` the
            whole document is returned.
        default: `str` or `dict`. Either a dict of default values, or the
            name of a top-level yaml key whose mapping supplies the defaults.
            Any other value means "no defaults".

    Returns:
        The selected mapping merged on top of the defaults (values from the
        selected mapping win on conflicting keys).

    Raises:
        KeyError: if `key` (or a `str` `default`) is missing from the loaded
            mapping.
    """
    with open(file, "r", encoding="utf-8") as fp:
        # NOTE(security): the full `Loader` can construct arbitrary Python
        # objects. Switch to `yaml.SafeLoader` if this is ever pointed at
        # files that are not fully trusted.
        params = yaml.load(fp, Loader)

    # An empty document loads as None; treat it as an empty mapping so the
    # lookups and the merge below do not fail with a confusing TypeError.
    if params is None:
        params = {}

    if isinstance(default, dict):
        defaults = default
    elif isinstance(default, str):
        defaults = params[default]
    else:
        defaults = {}

    selected = params[key] if key else params

    # A section declared without a body (e.g. `train:`) also loads as None.
    if defaults is None:
        defaults = {}
    if selected is None:
        selected = {}

    return {**defaults, **selected}
def dump_yaml(
    obj: dict, file_path: Union[str, Path], key: str = None, norm: bool = True
) -> Path:
    """
    Write `obj` to a yaml file and return the file's `Path`.

    Args:
        obj: `dict` to write to the yaml file.
        file_path: `str` or `Path`. Yaml file path; overwritten if it exists.
        key: `str`. When given, only `obj[key]` is written.
        norm: `bool`. Round float values (via `normalize`) before writing.

    Returns:
        `Path` of the yaml file after writing.

    Raises:
        KeyError: if `key` is given but missing from `obj`.
    """
    import copy  # local import keeps this block self-contained

    payload = obj[key] if key else obj
    if norm:
        # Normalize a private copy: rounding is a presentation detail of the
        # dumped file and must not alter the caller's data as a side effect.
        payload = normalize(copy.deepcopy(payload))
    # Plain "w" is enough: the file is only written, never read back.
    with open(file_path, "w", encoding="utf-8") as fp:
        yaml.dump(payload, fp)
    return Path(file_path)
def normalize(obj: dict, ndigits: int = 4) -> dict:
    """
    Return a copy of `obj` with every float rounded to `ndigits` decimals.

    The structure is walked recursively. The input is never modified:
    dicts are rebuilt as new dicts and any other iterable is rebuilt as a
    list.

    Args:
        obj: value to normalize; typically a (nested) `dict`, but floats,
            strings, iterables and arbitrary scalars are accepted too.
        ndigits: number of decimal places passed to `round`.

    Returns:
        The normalized value. Floats are rounded, `str`/`bytes` and
        non-iterable values are returned unchanged, dicts map the same keys
        to normalized values, and other iterables become lists of
        normalized items.
    """
    if isinstance(obj, float):
        return round(obj, ndigits)
    # Text and binary blobs are iterable but must be kept whole; without
    # this guard `bytes` would be exploded into a list of ints.
    if isinstance(obj, (str, bytes)):
        return obj
    if isinstance(obj, dict):
        # Build a new dict instead of assigning into `obj`, so the caller's
        # data keeps its full precision.
        return {key: normalize(value, ndigits) for key, value in obj.items()}
    if isinstance(obj, Iterable):
        return [normalize(item, ndigits) for item in obj]
    return obj
def print_results(accuracy, c_matrix, model_name=""):
    """
    Print a short training report to stdout.

    Args:
        accuracy: value shown on the "Model Accuracy" line.
        c_matrix: value shown under the "Confusion Matrix" heading.
        model_name: optional name appended to the "Finished Training" header.
    """
    report = (
        f"Finished Training {model_name}:\nStats:",
        f"\tConfusion Matrix:\n{c_matrix}",
        f"\tModel Accuracy: {accuracy}",
    )
    for entry in report:
        print(entry)
def evaluate_model(model, X_test, y_test):
    """
    Score `model` on the test split and plot its confusion matrix.

    Args:
        model: fitted estimator; must provide `score` and be accepted by
            scikit-learn's confusion-matrix plotting helpers.
        X_test: test features.
        y_test: true test labels.

    Returns:
        A `(accuracy, c_matrix, figure)` tuple: the result of
        `model.score(X_test, y_test)` as a `float`, the confusion matrix as
        nested lists, and the figure holding the plot.
    """
    # Local import keeps this block self-contained.
    from sklearn.metrics import ConfusionMatrixDisplay

    # `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; `ConfusionMatrixDisplay.from_estimator` is its replacement.
    # Fall back to the old helper only on releases that predate it.
    # NOTE(review): on scikit-learn >= 1.2 the module-level import of
    # `plot_confusion_matrix` fails as well and needs the same attention.
    if hasattr(ConfusionMatrixDisplay, "from_estimator"):
        display = ConfusionMatrixDisplay.from_estimator(
            model, X_test, y_test, cmap=plt.cm.Reds
        )
    else:
        display = plot_confusion_matrix(model, X_test, y_test, cmap=plt.cm.Reds)

    c_matrix = display.confusion_matrix.tolist()
    accuracy = model.score(X_test, y_test)
    return float(accuracy), c_matrix, display.figure_
def save_results(out_path, model, fig):
    """
    Persist a trained model and, optionally, its confusion-matrix figure.

    Args:
        out_path: output directory (with or without a trailing separator);
            created if missing. An empty value means the current directory.
        model: object serialized with joblib's `dump` to `model.gz`.
        fig: figure saved as `confusion_matrix.svg`; skipped when falsy.
    """
    if out_path:
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(out_path, exist_ok=True)
    # Join path components instead of concatenating strings: with plain
    # concatenation "out" + "model.gz" lands in "outmodel.gz", next to the
    # directory that was just created rather than inside it.
    dump(model, os.path.join(out_path, "model.gz"))
    if fig:
        fig.savefig(os.path.join(out_path, "confusion_matrix.svg"), format="svg")
def read_data(
    data_path: str,
) -> "tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]":
    """
    Load the train/test CSV files and split features from the target.

    Args:
        data_path: directory containing `train.csv` and `test.csv` (with or
            without a trailing separator). Both files must have a `class`
            column, which is used as the target.

    Returns:
        `(X_train, X_test, y_train, y_test)`: the feature frames (every
        column except `class`) and the `class` series of each split.
    """
    # Join path components instead of concatenating strings, so a directory
    # given without a trailing separator still resolves to files inside it.
    train = pd.read_csv(os.path.join(data_path, "train.csv"))
    test = pd.read_csv(os.path.join(data_path, "test.csv"))
    X_train, y_train = train.drop(columns=["class"]), train["class"]
    X_test, y_test = test.drop(columns=["class"]), test["class"]
    return X_train, X_test, y_train, y_test
|