jnirschl
/
mnist_dvc
mirror of https://github.com/jnirschl/mnist_dvc


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
            #   -*- coding: utf-8 -*-
#  Copyright (c)  2021. Jeffrey Nirschl. All rights reserved.
#
#  Licensed under the MIT license. See the LICENSE file in the project
#  root directory for  license information.
#
#  Time-stamp: <>
#   ======================================================================

import argparse
import os
from pathlib import Path

import cv2
import numpy as np
import pandas as pd

from src.img import transforms


def main(data_path, ext="png",
         img_shape=(28, 28, 1),
         output="./data/interim/",
         prefix="",
         na_rep="nan"):
    """Convert a CSV of flattened images into image files plus a mapfile.

    Every row of the CSV is treated as one flattened image. If a "label"
    column is present it is removed from the pixel data and used as the
    target; otherwise every target is NaN. The function writes:

    * one image per row into ``<output>/<prefix>/``,
    * ``<prefix>_mean_image.png`` in the parent of that directory,
    * ``<prefix>_mapfile.csv`` in the parent of that directory, listing
      each image filename together with its label.

    Args:
        data_path: Path to the comma-separated input file; the first row
            is read as the header.
        ext: Image file extension; any periods are removed.
        img_shape: Shape each flattened row is reshaped to.
        output: Base output directory.
        prefix: Sub-directory of ``output`` that receives the images; also
            the filename prefix of the mean image and the mapfile.
        na_rep: Text written to the mapfile for missing labels.

    Raises:
        FileNotFoundError: If ``data_path`` is not an existing file.
        FileExistsError: If the image directory path exists but is not a
            directory.
        SystemError: Propagated from ``save_images`` when an image cannot
            be written.
    """
    # validate the input first so a bad path leaves the filesystem untouched
    # (explicit raise instead of assert: asserts vanish under ``python -O``)
    if not os.path.isfile(data_path):
        raise FileNotFoundError(f"Data file not found: {data_path}")

    # create output directory, including missing parents, if needed
    output = Path(output).resolve().joinpath(prefix)
    output.mkdir(parents=True, exist_ok=True)

    # remove period from ext
    ext = ext.replace(".", "")

    # read file
    img_array = pd.read_csv(data_path, sep=",", header=0)

    # pop target column, or fall back to an all-NaN target
    if "label" in img_array.columns:
        target = pd.DataFrame(img_array.pop("label"))
    else:
        target = pd.DataFrame({"label": np.full(len(img_array), np.nan,
                                                dtype=np.float32)})

    # create mean image (computed by the project's transforms module)
    mean_image = transforms.mean_image(img_array)
    cv2.imwrite(str(output.parent.joinpath(f"{prefix}_mean_image.png")),
                mean_image)

    # save individual images
    filenames = save_images(img_array, target, img_shape, ext, output)

    # save a mapfile with the filename and label
    label_col = target.columns[0]
    mapfile = pd.DataFrame({"filenames": filenames,
                            label_col: target[label_col]},
                           index=target.index)
    mapfile.to_csv(output.parent.joinpath(f"{prefix}_mapfile.csv"),
                   na_rep=na_rep)


def save_images(img_array, target, img_shape, ext, output):
    """Reshape each flattened row of ``img_array`` and write it as an image.

    Files are named ``<row index, zero-padded to 6 digits>_<label>.<ext>``,
    where the label is the first value of the matching row in ``target``.

    Args:
        img_array: DataFrame with one flattened image per row.
        target: DataFrame whose first column holds the label for each row.
        img_shape: Shape passed to ``np.reshape`` for every row.
        ext: File extension without a leading period.
        output: ``pathlib.Path`` of the directory the images are written to.

    Returns:
        List of the written file paths as strings, in row order. Empty when
        ``img_array`` has no rows.

    Raises:
        SystemError: If ``cv2.imwrite`` reports a failed write.
    """

    # process dataframe line by line
    print("Reshaping flattened images in numpy array into 2D images...")
    filenames = []
    for idx in range(img_array.shape[0]):
        if (idx + 1) % 10000 == 0:
            print(f"\tProcessed {idx + 1} images")

        # select image and reshape
        temp_img = np.reshape(img_array.iloc[idx].to_numpy(),
                              img_shape).astype(np.float32)

        # set filename
        temp_name = f"{idx:06d}_{target.iloc[idx].to_numpy()[0]}.{ext}"
        filename = str(output.joinpath(temp_name))

        if not cv2.imwrite(filename, temp_img):
            raise SystemError(f"Failed to write image: {filename}")
        filenames.append(filename)

    # report via the list length: the loop variable is unbound on empty input
    print(f"\tProcessed {len(filenames)} images")
    return filenames


if __name__ == '__main__':
    # command-line entry point: convert both the train and the test CSV
    parser = argparse.ArgumentParser()
    parser.add_argument("-tr", "--train_data", dest="train_data",
                        required=True, help="Train CSV file")
    parser.add_argument("-te", "--test_data", dest="test_data",
                        required=True, help="Test CSV file")
    parser.add_argument("-ex", "--ext", dest="ext",
                        default=".png",
                        required=False, help="Image file extension")
    parser.add_argument("-o", "--out-dir", dest="output_dir",
                        default=Path("./data/interim").resolve(),
                        required=False, help="output directory")
    args = parser.parse_args()

    # write images, mean image, and mapfile for each split; the prefix
    # selects the sub-directory and the output filename prefix
    main(args.train_data, prefix="train", ext=args.ext, output=args.output_dir)

    main(args.test_data, prefix="test", ext=args.ext, output=args.output_dir)