Dean
/
dvc_session
forked from puneethp/dvc_session


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
            import os

import matplotlib.pyplot as plt
import pandas as pd
import plac
from dagshub import dagshub_logger
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import plot_confusion_matrix


@plac.annotations(
    data_path=("Path to source data", "option", "i", str),
    model_path=("Path to save trained Model", "option", "m", str),
    out_path=("Path to save trained Model", "option", "o", str)
)
def main(data_path='data/features/', model_path='data/models/', out_path='data/models/ensemble/'):
    train = pd.read_csv(f'{data_path}train.csv')
    test = pd.read_csv(f'{data_path}test.csv')

    X_train, y_train = train.drop(columns=['class']), train['class']
    X_test, y_test = test.drop(columns=['class']), test['class']

    cl1 = load_model(model_path, 'logistic')
    cl2 = load_model(model_path, 'svc')
    cl3 = load_model(model_path, 'r_forrest')
    estimators = [
        ('l_regression', cl1),
        ('l_svc', cl2),
        ('r_forrest', cl3)
    ]

    model = VotingClassifier(estimators)
    model.fit(X_train, y_train)

    if not os.path.isdir(out_path):
        os.makedirs(out_path)
    dump(model, f'{out_path}model.pkl')

    cmd = plot_confusion_matrix(model, X_test, y_test, cmap=plt.cm.Reds)
    cmd.figure_.savefig(f'{out_path}confusion_matrix.svg', format='svg')
    c_matrix = cmd.confusion_matrix
    accuracy = model.score(X_test, y_test)

    print(f'Finished Training Ensemble Model:\nStats:')
    print(f'\tConfusion Matrix:\n{c_matrix}')
    print(f'\tModel Accuracy: {accuracy}')
    with dagshub_logger(metrics_path=f'{out_path}theBestMetric.csv',
                        hparams_path=f'{out_path}theBestParams.yaml') as logger:
        logger.log_hyperparams(voting=model.voting)
        logger.log_metrics(accuracy=accuracy)


def load_model(model_path, model_name):
    return load(f'{model_path}{model_name}/model.pkl')


if __name__ == '__main__':
    plac.call(main)