Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

dataprep.py 2.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
  1. import os
  2. import glob
  3. import csv
  4. class DataPrep():
  5. """
  6. A class used to prepare a csv file to be used in the ETL pipeline
  7. . . .
  8. Methods
  9. -------
  10. _collect_files()
  11. collect and join the file path and roots with the subdirectories
  12. _extract_data()
  13. create a list of rows that will be generated from each file
  14. write_csv()
  15. create a smaller csv file that will be used in the Apache Cassandra tables
  16. """
  17. def __init__(self, filepath_in, filepath_out):
  18. """
  19. Parameters
  20. ----------
  21. filepath_in : str
  22. The path to the original csv file
  23. filepath_out : str
  24. The path to save the processed csv file
  25. """
  26. self.filepath_in = filepath_in
  27. self.filepath_out = filepath_out
  28. def _collect_files(self):
  29. """Collect and join files in the subdirectories
  30. Returns
  31. -------
  32. list
  33. list of files
  34. """
  35. for root, dirs, files in os.walk(self.filepath_in):
  36. file_path_list = glob.glob(os.path.join(root,'*'))
  37. return file_path_list
  38. def _extract_data(self):
  39. """Extract data by row from _collect_files and append in a list
  40. Returns
  41. -------
  42. list
  43. list of files
  44. """
  45. full_data_rows_list = []
  46. file_path_list = self._collect_files()
  47. for f in file_path_list:
  48. with open(f, 'r', encoding = 'utf8', newline='') as csvfile:
  49. csvreader = csv.reader(csvfile)
  50. next(csvreader)
  51. for line in csvreader:
  52. full_data_rows_list.append(line)
  53. return full_data_rows_list
  54. def write_csv(self):
  55. """Create a new csv file smaller that will be used with Apache Cassandra
  56. Returns
  57. -------
  58. none
  59. """
  60. full_data_rows_list = self._extract_data()
  61. csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)
  62. with open(self.filepath_out + '.csv', 'w', encoding = 'utf8', newline='') as f:
  63. writer = csv.writer(f, dialect='myDialect')
  64. writer.writerow(['artist','firstName','gender','itemInSession','lastName','length',\
  65. 'level','location','sessionId','song','userId'])
  66. for row in full_data_rows_list:
  67. if (row[0] == ''):
  68. continue
  69. writer.writerow((row[0], row[2], row[3], row[4], row[5], row[6],
  70. row[7], row[8], row[12], row[13], row[16]))
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...