puneethp
/
dvc_session


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
            import os
from collections.abc import Iterable
from pathlib import Path
from typing import Union

import matplotlib.pyplot as plt
import pandas as pd
import yaml
from joblib import dump
from sklearn.metrics import plot_confusion_matrix
from yaml import Loader


def read_yaml(
    file: Union[str, Path], key: str = None, default: Union[str, dict] = None
) -> dict:
    """
    Read a yaml file and return its content as a `dict`.

    Args:
        file: `str` or `Path`. Yaml file path.
        key: `str`. Top-level yaml key to read. When omitted (or empty), the
            whole document is used.
        default: `str` or `dict`. Either a dict of fallback values, or the
            name of a top-level yaml key whose mapping supplies the fallback
            values. Values selected via `key` override these defaults.

    Returns:
        A new `dict`: the defaults overlaid with the selected yaml content.
        An empty document or an empty section is treated as an empty mapping.

    Raises:
        KeyError: if `key` (or a `str` `default`) is missing from the
            document's top-level mapping.
    """
    with open(file, "r") as fp:
        # NOTE(review): the full `Loader` can construct arbitrary Python
        # objects; only read trusted files (or consider `yaml.SafeLoader`).
        params = yaml.load(fp, Loader)
    # An empty yaml document loads as `None`; treat it as an empty mapping.
    if params is None:
        params = {}

    if isinstance(default, dict):
        defaults = default
    elif isinstance(default, str):
        defaults = params[default]
    else:
        defaults = {}

    result = params[key] if key else params

    # A key with no value (e.g. `train:`) loads as `None`; unpacking `None`
    # with `**` would raise a TypeError, so fall back to an empty mapping.
    if defaults is None:
        defaults = {}
    if result is None:
        result = {}
    return {**defaults, **result}


def dump_yaml(
    obj: dict, file_path: Union[str, Path], key: str = None, norm: bool = True
) -> Path:
    """
    Serialize a `dict` (or one of its entries) to a yaml file.

    Args:
        obj: `dict` holding the data to write.
        file_path: `str` or `Path`. Destination yaml file.
        key: `str`. When given, only `obj[key]` is written.
        norm: `bool`. When true, the data is passed through `normalize`
            before being written.

    Returns:
        `Path` of the yaml file that was written.
    """
    payload = obj
    if key:
        # Narrow the output down to the requested entry.
        payload = obj[key]
    if norm:
        payload = normalize(payload)

    with open(file_path, "w+") as stream:
        yaml.dump(payload, stream)

    written = Path(file_path)
    return written


def normalize(obj: dict, ndigits: int = 4) -> dict:
    """
    Round every float found in `obj` to `ndigits` decimal places.

    The structure is walked recursively. The input is never modified: dicts
    are rebuilt as new dicts and any other iterable is rebuilt as a `list`.

    Args:
        obj: usually a `dict`, but a bare float, string, bytes, iterable or
            any other object is accepted as well.
        ndigits: `int`. Number of decimal places kept for float values.

    Returns:
        A normalized copy of `obj`. Floats are rounded; strings, bytes and
        non-iterable objects are returned unchanged.
    """
    if isinstance(obj, float):
        return round(obj, ndigits)
    # Strings and bytes are iterable, but they are scalars for our purposes;
    # iterating bytes would otherwise turn them into a list of ints.
    if isinstance(obj, (str, bytes, bytearray)):
        return obj
    if isinstance(obj, dict):
        # Build a new dict so the caller's data is not modified in place.
        return {key: normalize(value, ndigits) for key, value in obj.items()}
    if isinstance(obj, Iterable):
        return [normalize(item, ndigits) for item in obj]
    return obj


def print_results(accuracy, c_matrix, model_name=""):
    """
    Print a short training report to stdout.

    Args:
        accuracy: value shown as the model accuracy.
        c_matrix: confusion matrix, printed using its string form.
        model_name: `str`. Name shown in the report header.
    """
    report = (
        f"Finished Training {model_name}:\nStats:",
        f"\tConfusion Matrix:\n{c_matrix}",
        f"\tModel Accuracy: {accuracy}",
    )
    for section in report:
        print(section)


def evaluate_model(model, X_test, y_test):
    """
    Score a model on the test data and plot its confusion matrix.

    Args:
        model: estimator exposing `score`; it is also handed to
            `plot_confusion_matrix`.
        X_test: test features.
        y_test: test labels.

    Returns:
        Tuple `(accuracy, c_matrix, figure)` where `accuracy` is
        `model.score(X_test, y_test)` as a `float`, `c_matrix` is the
        confusion matrix converted with `tolist()`, and `figure` is the
        `figure_` of the confusion-matrix display.
    """
    # NOTE(review): `plot_confusion_matrix` was deprecated in scikit-learn 1.0
    # and removed in 1.2; `ConfusionMatrixDisplay.from_estimator` replaces it.
    display = plot_confusion_matrix(model, X_test, y_test, cmap=plt.cm.Reds)
    matrix_rows = display.confusion_matrix.tolist()
    score = model.score(X_test, y_test)
    return float(score), matrix_rows, display.figure_


def save_results(out_path, model, fig):
    """
    Save a model and, optionally, its confusion-matrix figure to a directory.

    Args:
        out_path: `str` or `Path`. Output directory, created if it does not
            exist. A trailing path separator is optional.
        model: object serialized with joblib's `dump` to `model.gz`.
        fig: figure exposing `savefig`. When falsy, no figure is saved;
            otherwise it is written to `confusion_matrix.svg`.
    """
    # `exist_ok` avoids the check-then-create race of a separate isdir test.
    os.makedirs(out_path, exist_ok=True)
    # Join path components properly: plain string concatenation would write
    # next to the directory whenever `out_path` lacks a trailing separator.
    dump(model, os.path.join(out_path, "model.gz"))
    if fig:
        fig.savefig(os.path.join(out_path, "confusion_matrix.svg"), format="svg")


def read_data(
    data_path: str, target: str = "class"
) -> (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series):
    """
    Load the train/test splits and separate features from labels.

    Args:
        data_path: `str`. Directory containing `train.csv` and `test.csv`.
            A trailing path separator is optional.
        target: `str`. Name of the label column present in both files.

    Returns:
        Tuple `(X_train, X_test, y_train, y_test)`: the feature frames (all
        columns except `target`) and the label series of each split.
    """
    # Join path components properly: plain string concatenation would look
    # for e.g. `datatrain.csv` whenever `data_path` lacks a trailing separator.
    train = pd.read_csv(os.path.join(data_path, "train.csv"))
    test = pd.read_csv(os.path.join(data_path, "test.csv"))
    X_train, y_train = train.drop(columns=[target]), train[target]
    X_test, y_test = test.drop(columns=[target]), test[target]
    return X_train, X_test, y_train, y_test