jnirschl
/
titanic_dvc
mirror of https://github.com/jnirschl/titanic_dvc.git


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
            #   -*- coding: utf-8 -*-
#  Copyright (c) 2021.  Jeffrey J. Nirschl. All rights reserved.
#
#   Licensed under the MIT license. See the LICENSE.md file in the project
#   root directory for full license information.
#
#   Time-stamp: <>
#   ======================================================================

import argparse
import os
from pathlib import Path

import yaml

from src.data import load_data, load_params, save_as_csv


def main(train_path, test_path,
         output_dir):
    """Split data into train, dev, and test"""

    output_dir = Path(output_dir).resolve()
    assert (os.path.isdir(output_dir)), NotADirectoryError

    # load data
    train_df, test_df = load_data([train_path, test_path], sep=",", header=0,
                                  index_col="PassengerId")

    # load params
    params = load_params()

    # fill nans with column mean/mode on test set
    # TODO - switch to allow for different interpolation methods (e.g., mean, median, MICE)
    if params["imputation"]["method"].lower() == "mean":
        mean_age = float(round(train_df["Age"].mean(), 4))
        mean_fare = float(round(train_df["Fare"].mean(), 4))
        train_df["Age"].fillna(value=mean_age,
                               inplace=True)
        test_df["Age"].fillna(value=mean_age,
                              inplace=True)
        test_df["Fare"].fillna(value=mean_fare,
                              inplace=True)

        # update params and save imputation scheme
        params["imputation"]["Age"] = mean_age
        params["imputation"]["Fare"] = mean_fare
    elif params["imputation"]["method"].lower() == "mice":
        # TODO MICE interpolation
        raise NotImplementedError
    else:
        raise NotImplementedError

    # update params
    new_params = yaml.safe_dump(params)

    with open("params.yaml", "w") as writer:
        writer.write(new_params)

    # save data
    save_as_csv([train_df, test_df],
                [train_path, test_path],
                output_dir,
                replace_text="_categorized.csv",
                suffix="_nan_imputed.csv",
                na_rep="nan")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-tr", "--train", dest="train_path",
                        required=True, help="Train CSV file")
    parser.add_argument("-te", "--test", dest="test_path",
                        required=True, help="Test CSV file")
    parser.add_argument("-o", "--out-dir", dest="output_dir",
                        default=Path("./data/interim").resolve(),
                        required=False, help="output directory")
    args = parser.parse_args()

    # convert categorical variables into integer codes
    main(args.train_path, args.test_path,
         args.output_dir)