Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

preprocess.R 8.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
  1. library(readr)
  2. library(dplyr)
  3. library(tidyr)
  4. library(tidyselect)
  5. library(lubridate)
  6. # Reading raw data --------------------------------------------------------
  7. # Google Mobility Report (GMR) dataset
  8. raw_dataset <- read_csv(file = 'data/raw/Global_Mobility_Report.csv',
  9. col_types = paste(c(rep('c', 4),
  10. 'D',
  11. rep('d', 6)),
  12. collapse=''))
  13. colnames(raw_dataset) <- c('country_code', 'country_name', 'region_name',
  14. 'county_name', 'date', 'retail_recreation',
  15. 'grocery_pharmacy', 'parks', 'transit_stations',
  16. 'workplaces', 'residential')
  17. # COVID19 dataset from ECDC
  18. covid <- read_delim(file = 'data/raw/COVID19_worldwide_raw.csv', na = '',
  19. col_types = cols('c', 'i', 'i', 'i', 'i', 'i', 'c', 'c',
  20. 'c', 'i'),
  21. delim = ',')
  22. colnames(covid) <- c('date', 'day', 'month', 'year', 'new_cases', 'new_deaths',
  23. 'country_name', 'country_id', 'territory_id',
  24. 'pop_data_2018')
  25. # Country details from UN Data
  26. # There were ~ in 0 or 0.0 numbers in the raw data file. I had to manually
  27. # replace ~0 and ~0.0 by 0 and 0.0, otherwise R wouldn't understand this is
  28. # a number.
  29. country_details <- read_delim(file = 'data/raw/UN_dataset.tsv', delim = '\t',
  30. col_types = paste(c('c',
  31. rep('d', 173),
  32. 'c'),
  33. collapse=''))
  34. # Preprocessing step ---------------------------------------------------------
  35. ####
  36. #
  37. # Working on GMR dataset
  38. #
  39. ####
  40. # Create a long table from the wide original version
  41. preprocessed_dataset <- pivot_longer(raw_dataset, cols=6:11, names_to = 'plot_name',
  42. values_to = 'variation')
  43. rm(raw_dataset)
  44. ####
  45. #
  46. # Working on COVID19 dataset
  47. #
  48. ####
  49. covid %>%
  50. # These columns are useless
  51. select(-c('day', 'month', 'year', 'country_id', 'territory_id')) %>%
  52. # Convert date to Date
  53. mutate(date, date = dmy(covid$date)) %>%
  54. # Replace _ by space in country names
  55. mutate(country_name,
  56. country_name = gsub('_',
  57. ' ',
  58. country_name)) -> covid
  59. # Some countries have different names between the datasets
  60. covid %>%
  61. mutate(country_name, country_name = case_when(
  62. country_name == 'United States of America' ~ 'United States',
  63. country_name == 'United Republic of Tanzania' ~ 'Tanzania',
  64. country_name == 'Guinea Bissau' ~ 'Guinea-Bissau',
  65. country_name == 'Bahamas' ~ 'The Bahamas',
  66. country_name == 'Myanmar' ~ 'Myanmar (Burma)',
  67. country_name == 'Cote dIvoire' ~ 'Côte d\'Ivoire',
  68. TRUE ~ country_name)
  69. ) -> covid
  70. # Some rows are missing like March 2nd and 3rd for Brazil. Apparently,
  71. # this happens when there is no case/death. Complete with these rows.
  72. covid %>%
  73. complete(date = seq.Date(min(date), max(date), by='day'), country_name) %>%
  74. mutate(new_cases = ifelse(is.na(new_cases), 0, new_cases)) %>%
  75. mutate(new_deaths = ifelse(is.na(new_deaths), 0, new_deaths)) %>%
  76. group_by(country_name) %>%
  77. fill(pop_data_2018, .direction = 'updown') -> covid
  78. ####
  79. #
  80. # Merge World COVID19 and GMR datasets
  81. #
  82. ####
  83. preprocessed_dataset <- left_join(preprocessed_dataset, covid,
  84. by = c('date', 'country_name'))
  85. rm(covid)
  86. ####
  87. #
  88. # Working on dataset after 1st merge
  89. #
  90. ####
  91. # Create column with accumulate cases/death
  92. preprocessed_dataset %>%
  93. group_by(country_name, plot_name, region_name) %>%
  94. arrange(date) %>%
  95. dplyr::mutate(acc_cases = cumsum(new_cases)) %>%
  96. dplyr::mutate(acc_deaths = cumsum(new_deaths)) %>%
  97. ungroup -> preprocessed_dataset
  98. # Set new_cases and new_deaths to NA for regions, because our COVID19 dataset
  99. # only has data for countries.
  100. preprocessed_dataset %>%
  101. mutate(new_cases = ifelse(!is.na(region_name),
  102. NA,
  103. new_cases)) %>%
  104. mutate(new_deaths = ifelse(!is.na(region_name),
  105. NA,
  106. new_deaths)) -> preprocessed_dataset
  107. ####
  108. #
  109. # Working on country details dataset (UN)
  110. #
  111. ####
  112. # Before merging to get more info about the countries, we must make sure all
  113. # country names are the same.
  114. # unique(preprocessed_dataset$country_name)[which(
  115. # unique(preprocessed_dataset$country_name) %in%
  116. # unique(country_details$country) == FALSE
  117. # )
  118. # ]
  119. country_details %>%
  120. mutate(region_name = case_when(
  121. region_name == 'Bolivia (Plurinational State of)' ~ 'Bolivia',
  122. region_name == 'Bahamas' ~ 'The Bahamas',
  123. region_name == 'Republic of Korea' ~ 'South Korea',
  124. region_name == 'Cabo Verde' ~ 'Cape Verde',
  125. region_name == 'China, Hong Kong SAR' ~ 'Hong Kong',
  126. region_name == 'Republic of Moldova' ~ 'Moldova',
  127. region_name == 'Lao People\'s Democratic Republic' ~ 'Laos',
  128. region_name == 'Myanmar' ~ 'Myanmar (Burma)',
  129. region_name == 'United Rep. of Tanzania' ~ 'Tanzania',
  130. region_name == 'United States of America' ~ 'United States',
  131. region_name == 'Venezuela (Boliv. Rep. of)' ~ 'Venezuela',
  132. region_name == 'Viet Nam' ~ 'Vietnam',
  133. TRUE ~ region_name)
  134. ) -> country_details
  135. ####
  136. #
  137. # Merge country details and preprocessed_dataset
  138. #
  139. ####
  140. preprocessed_dataset <- left_join(preprocessed_dataset, country_details,
  141. by = c('country_name' = 'region_name'))
  142. rm(country_details)
  143. # Remove rows that are related to regions
  144. preprocessed_dataset %>%
  145. filter(is.na(region_name)) %>%
  146. select(-c('region_name', 'county_name')) -> preprocessed_dataset
  147. # There are two columns for population (2018, World Bank and 2019, UN).
  148. # Keep the most updated one.
  149. preprocessed_dataset %>%
  150. select(-c(pop_data_2018)) -> preprocessed_dataset
  151. colnames(preprocessed_dataset)[10:182] %>%
  152. # Make them all lowercase
  153. tolower %>%
  154. # Replace space by underscore
  155. gsub(' ', '_', .) %>%
  156. gsub('-', '_', .) %>%
  157. gsub('_+', '_', .) %>%
  158. gsub(',', '', .) -> colnames(preprocessed_dataset)[10:182]
  159. # Create columns for lethality rate
  160. preprocessed_dataset %>%
  161. group_by(country_name) %>%
  162. arrange(date) %>%
  163. mutate(lethality_rate_percent = (acc_deaths[n()]/acc_cases[n()])*100) %>%
  164. ungroup() -> preprocessed_dataset
  165. # Bring plot_names from row to column
  166. preprocessed_dataset %>%
  167. pivot_wider(names_from = plot_name, values_from = variation) -> preprocessed_dataset
  168. # Add epidemiological week to column
  169. preprocessed_dataset %>%
  170. group_by(country_name) %>%
  171. mutate(first_case_date = min(date[acc_cases > 0])) %>%
  172. mutate(n_days_since_1st_case =
  173. if_else(acc_cases > 0,
  174. as.numeric(date - min(date[acc_cases > 0])+1),
  175. 0)) %>%
  176. ungroup() -> preprocessed_dataset
  177. # Set manually first case for countries whose first case happened before Feb 15
  178. # Wikipedia contributors, "2019–20 coronavirus pandemic", Wikipedia, The Free
  179. # Encyclopedia,
  180. # https://en.wikipedia.org/w/index.php?title=2019%E2%80%9320_coronavirus_pandemic&oldid=952187370
  181. # (accessed April 21, 2020).
  182. preprocessed_dataset %>%
  183. mutate(first_case_date = case_when(
  184. # country_name == 'China' ~ dmy('01-12-2019'),
  185. country_name == 'Thailand' ~ dmy('13-01-2020'),
  186. country_name == 'Japan' ~ dmy('16-01-2020'),
  187. country_name == 'South Korea' ~ dmy('20-01-2020'),
  188. country_name == 'United States' ~ dmy('20-01-2020'),
  189. country_name == 'Taiwan' ~ dmy('21-01-2020'),
  190. country_name == 'Hong Kong' ~ dmy('22-01-2020'),
  191. country_name == 'Singapore' ~ dmy('23-01-2020'),
  192. country_name == 'Vietnam' ~ dmy('23-01-2020'),
  193. country_name == 'France' ~ dmy('24-01-2020'),
  194. country_name == 'Nepal' ~ dmy('24-01-2020'),
  195. country_name == 'Australia' ~ dmy('25-01-2020'),
  196. country_name == 'Canada' ~ dmy('25-01-2020'),
  197. country_name == 'Malaysia' ~ dmy('25-01-2020'),
  198. country_name == 'Cambodia' ~ dmy('27-01-2020'),
  199. country_name == 'Germany' ~ dmy('27-01-2020'),
  200. country_name == 'Sri Lanka' ~ dmy('27-01-2020'),
  201. country_name == 'Finalnd' ~ dmy('29-01-2020'),
  202. country_name == 'United Arab Emirates' ~ dmy('29-01-2020'),
  203. country_name == 'India' ~ dmy('30-01-2020'),
  204. country_name == 'Italy' ~ dmy('30-01-2020'),
  205. country_name == 'Philippines' ~ dmy('30-01-2020'),
  206. country_name == 'Spain' ~ dmy('31-01-2020'),
  207. country_name == 'Sweden' ~ dmy('31-01-2020'),
  208. country_name == 'United Kingdom' ~ dmy('31-01-2020'),
  209. country_name == 'Belgium' ~ dmy('04-02-2020'),
  210. country_name == 'Egypt' ~ dmy('14-01-2020'),
  211. TRUE ~ first_case_date
  212. )
  213. ) -> preprocessed_dataset
  214. # Saving final preprocessed dataset ---------------------------------------
  215. # Save full dataset
  216. write_tsv(x = preprocessed_dataset, path = 'data/preprocessed/DIB_dataset.tsv', quote_escape = FALSE)
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...