```python
import argparse
import json
import warnings
from itertools import product

import mlflow

from model.model_training import train_random_forest_model
from model.utils import model_metrics


def grid_search_random_forest(name_experiment):
    # Run a grid search over the hyper-parameter values specified below.
    max_depth = [3, 6]
    criterion = ['gini', 'entropy']
    min_samples_leaf = [5, 10]
    n_estimators = [50, 100]
    parameters_list = list(product(max_depth, criterion, min_samples_leaf, n_estimators))
    print('Number of experiments:', len(parameters_list))

    # Hyper-parameter search
    results = []
    best_param = None
    best_f1 = 0.0
    warnings.filterwarnings('ignore')

    for i, param in enumerate(parameters_list):
        print('Running experiment number', i)
        with mlflow.start_run(run_name=name_experiment):
            # Log this run's hyper-parameters to the MLflow experiments dashboard.
            mlflow.log_param('depth', param[0])
            mlflow.log_param('criterion', param[1])
            mlflow.log_param('minsamplesleaf', param[2])
            mlflow.log_param('nestimators', param[3])
            try:
                parameters = dict(n_estimators=param[3],
                                  max_depth=param[0],
                                  criterion=param[1],
                                  min_samples_leaf=param[2])
                clf = train_random_forest_model(data_path='./data/adult_training.csv',
                                                parameters=parameters)
                metrics = model_metrics(clf, data_path='./data/adult_validation.csv')

                # Log the validation metrics for this run.
                mlflow.log_metric("precision", metrics['>50K']['precision'])
                mlflow.log_metric("F1", metrics['>50K']['f1-score'])

                # Store the full metrics report as an artifact of this run.
                with open("metrics.json", "w") as f:
                    json.dump(metrics, f)
                mlflow.log_artifact('./metrics.json')

                # Keep track of the best run so far (in terms of F1 score).
                if metrics['>50K']['f1-score'] > best_f1:
                    best_param = parameters
                    best_f1 = metrics['>50K']['f1-score']
                results.append([param, metrics['>50K']['f1-score']])
            except ValueError:
                print('bad parameter combination:', param)
                continue

    print('Best F1 was:', best_f1)
    print('Using the following parameters')
    print(best_param)
    return results, best_param


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", help="experiment name")
    args, leftovers = parser.parse_known_args()
    results, best_param = grid_search_random_forest(args.name)
    with open("best_params.json", "w") as f:
        json.dump(best_param, f)
```
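The script relies on two project-specific helpers, `train_random_forest_model` and `model_metrics`, whose implementations are not shown here. As a rough guide only, the sketch below shows one way they could look, assuming the Adult census CSVs carry an `income` target column, that features are one-hot encoded, and that `model_metrics` returns scikit-learn's `classification_report` as a dict (so that `metrics['>50K']['f1-score']` exists). The column name, encoding step, and fixed random seed are assumptions, not the project's actual code.

```python
# Hypothetical sketch of the helpers imported above; the 'income' target column,
# one-hot encoding, and random_state are assumptions, not the real implementation.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


def train_random_forest_model(data_path, parameters):
    # Load the training split and separate features from the (assumed) target column.
    df = pd.read_csv(data_path)
    X = pd.get_dummies(df.drop(columns=['income']))
    y = df['income']
    clf = RandomForestClassifier(random_state=0, **parameters)
    clf.fit(X, y)
    return clf


def model_metrics(clf, data_path):
    # Evaluate on the validation split and return per-class metrics as a dict,
    # aligning the one-hot columns with those seen during training.
    df = pd.read_csv(data_path)
    X = pd.get_dummies(df.drop(columns=['income']))
    X = X.reindex(columns=clf.feature_names_in_, fill_value=0)
    y = df['income']
    return classification_report(y, clf.predict(X), output_dict=True)
```

With helpers like these in place, running the script (for example `python grid_search.py --name adult_rf`; the file name is an assumption) and then launching `mlflow ui` shows one run per hyper-parameter combination, each with its logged parameters, precision and F1 metrics, and the `metrics.json` artifact.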