finardi
/
Cassandra-Pipeline


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
            import os
import glob
import csv

class DataPrep:
    """
    A class used to prepare a csv file to be used in the ETL pipeline

    Every file found below ``filepath_in`` is read as csv, its first line is
    dropped as a header, and a fixed subset of columns is written to a single
    output file named ``filepath_out + '.csv'``.

    Methods
    -------
    _collect_files()
        collect the paths of all files in the input directory and its
        subdirectories
    _extract_data()
        create a list with the data rows of every collected file
    write_csv()
        create a smaller csv file that will be used in the Apache Cassandra tables
    """

    # Header row of the generated csv file, in output column order.
    _OUTPUT_HEADER = ('artist', 'firstName', 'gender', 'itemInSession',
                      'lastName', 'length', 'level', 'location', 'sessionId',
                      'song', 'userId')
    # Position, inside a source row, of the value written under each entry of
    # _OUTPUT_HEADER (same order, same length).
    _SOURCE_COLUMNS = (0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16)

    def __init__(self, filepath_in: str, filepath_out: str):
        """
        Parameters
        ----------
        filepath_in : str
            Root directory that is walked recursively for input csv files
        filepath_out : str
            Path of the processed csv file, without the '.csv' extension
            (the extension is appended by write_csv)
        """
        self.filepath_in = filepath_in
        self.filepath_out = filepath_out

    def _collect_files(self) -> list:
        """Collect the files of the input directory and all its subdirectories

        Hidden entries (names starting with a dot) are not matched by the
        '*' glob pattern and are therefore left out.

        Returns
        -------
        list
            paths of the files found, grouped by directory and sorted by
            name; empty when the input directory has no files or cannot be
            walked
        """
        file_path_list = []
        for root, dirs, _ in os.walk(self.filepath_in):
            # Sorting in place makes os.walk descend in a reproducible order.
            dirs.sort()
            # Escape the directory so characters such as '[' in its name are
            # matched literally instead of being read as a glob pattern.
            entries = glob.glob(os.path.join(glob.escape(root), '*'))
            # '*' also matches subdirectories, which cannot be opened as csv;
            # os.walk visits them on its own, so keep regular files only.
            file_path_list.extend(
                sorted(path for path in entries if os.path.isfile(path)))

        return file_path_list

    def _extract_data(self) -> list:
        """Extract the data rows of every file returned by _collect_files

        The first line of each file is treated as a header and discarded.

        Returns
        -------
        list
            list of rows, each row being a list of strings
        """
        full_data_rows_list = []
        for path in self._collect_files():
            with open(path, 'r', encoding='utf8', newline='') as csvfile:
                csvreader = csv.reader(csvfile)
                # The default keeps an empty file from raising StopIteration.
                next(csvreader, None)
                full_data_rows_list.extend(csvreader)

        return full_data_rows_list

    def write_csv(self) -> None:
        """Create a new, smaller csv file that will be used with Apache Cassandra

        Rows whose first column (artist) is empty, and blank lines of the
        source files, are left out. All fields are written quoted.

        Returns
        -------
        None
        """
        full_data_rows_list = self._extract_data()
        # Re-registering an existing dialect name simply replaces it, so this
        # is safe to run on every call.
        csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)
        with open(self.filepath_out + '.csv', 'w', encoding='utf8', newline='') as f:
            writer = csv.writer(f, dialect='myDialect')
            writer.writerow(self._OUTPUT_HEADER)
            for row in full_data_rows_list:
                # csv.reader yields [] for a blank line; test for it before
                # indexing so such lines are skipped instead of crashing.
                if not row or row[0] == '':
                    continue
                writer.writerow([row[index] for index in self._SOURCE_COLUMNS])