Updates preprocess pipeline with several changes

1. Fixes the number of days since first case column for countries that
   had their first case before Feb 11
2. Adds number of days since first death column (from Feb 11)
3. Adds info about DVC remote (once again donated by DAGsHub)
4. Fixes duplicated rows for US from different UN datasets
parent
commit
3613d3cd6f
3 changed files with 59 additions and 12 deletions
  1. 4
    0
      .dvc/config
  2. 3
    3
      preprocess.dvc
  3. 52
    9
      scripts/preprocess.R
@@ -0,0 +1,4 @@
+[core]
+    remote = s3remote
+['remote "s3remote"']
+    url = s3://dagshub-covid19-gmr-public
@@ -1,4 +1,4 @@
-md5: f071416f134f289e486986d12ab29390
+md5: 022a56239fda37856a8f555b46ca01e7
 cmd: Rscript scripts/preprocess.R
 cmd: Rscript scripts/preprocess.R
 deps:
 deps:
 - md5: 6c379663426e8135df92af20075e33b4
 - md5: 6c379663426e8135df92af20075e33b4
@@ -7,10 +7,10 @@ deps:
   path: data/raw/Global_Mobility_Report.csv
   path: data/raw/Global_Mobility_Report.csv
 - md5: ac537d89c981b02641203d606123c78a
 - md5: ac537d89c981b02641203d606123c78a
   path: data/raw/UN_dataset.tsv
   path: data/raw/UN_dataset.tsv
-- md5: a82b39358ca5f7a37941b60a604c745c
+- md5: c0c64a88cb4311be38196a365e927b44
   path: scripts/preprocess.R
   path: scripts/preprocess.R
 outs:
 outs:
-- md5: bcc995433b8502d4c0c1dd761f313786
+- md5: c97cc30cf2e11c445e64a5bfc4a146ee
   path: data/preprocessed/DIB_dataset.tsv
   path: data/preprocessed/DIB_dataset.tsv
   cache: true
   cache: true
   metric: false
   metric: false
@@ -32,9 +32,6 @@ colnames(covid) <- c('date', 'day', 'month', 'year', 'new_cases', 'new_deaths',
                      'pop_data_2018')
                      'pop_data_2018')
 
 
 # Country details from UN Data
 # Country details from UN Data
-# There were ~ in 0 or 0.0 numbers in the raw data file. I had to manually
-# replace ~0 and ~0.0 by 0 and 0.0, otherwise R wouldn't understand this is
-# a number.
 country_details <- read_delim(file = 'data/raw/UN_dataset.tsv', delim = '\t',
 country_details <- read_delim(file = 'data/raw/UN_dataset.tsv', delim = '\t',
                             col_types = paste(c('c',
                             col_types = paste(c('c',
                                                 rep('d', 173),
                                                 rep('d', 173),
@@ -134,11 +131,11 @@ preprocessed_dataset %>%
 
 
 # Before merging to get more info about the countries, we must make sure all
 # Before merging to get more info about the countries, we must make sure all
 # country names are the same.
 # country names are the same.
-# unique(preprocessed_dataset$country_name)[which(
-#      unique(preprocessed_dataset$country_name) %in%
-#        unique(country_details$country) == FALSE
-#    )
-#  ]
+#unique(preprocessed_dataset$country_name)[which(
+#     unique(preprocessed_dataset$country_name) %in%
+#       unique(country_details$region_name) == FALSE
+#   )
+# ]
 
 
 country_details %>%
 country_details %>%
   mutate(region_name = case_when(
   mutate(region_name = case_when(
@@ -157,6 +154,26 @@ country_details %>%
     TRUE ~ region_name)
     TRUE ~ region_name)
   ) -> country_details
   ) -> country_details
 
 
+# In some datasets US appears as United States, and in others as United States
+# of America. The naming was fixed earlier, but we have two rows for US. Fix.
+# ids <- which(country_details$region_name == 'United States')
+
+country_details[88,][,17:42] <- country_details[217,][,17:42]
+country_details[88,][,45:51] <- country_details[217,][,45:51]
+country_details[88,][,53:55] <- country_details[217,][,53:55]
+country_details[88,][,63:72] <- country_details[217,][,63:72]
+country_details[88,][,77:80] <- country_details[217,][,77:80]
+country_details[88,][,90:93] <- country_details[217,][,90:93]
+country_details[88,][,97:104] <- country_details[217,][,97:104]
+country_details[88,][,112:114] <- country_details[217,][,112:114]
+country_details[88,][,116:118] <- country_details[217,][,116:118]
+country_details[88,][,128:133] <- country_details[217,][,128:133]
+country_details[88,][,143:148] <- country_details[217,][,143:148]
+country_details[88,][,150] <- country_details[217,][,150]
+country_details[88,][,152:157] <- country_details[217,][,152:157]
+country_details[88,][,159:175] <- country_details[217,][,159:175]
+country_details <- country_details[-217, ]
+
 ####
 ####
 #
 #
 # Merge country details and preprocessed_dataset
 # Merge country details and preprocessed_dataset
@@ -198,7 +215,7 @@ preprocessed_dataset %>%
 preprocessed_dataset %>%
 preprocessed_dataset %>%
   pivot_wider(names_from = plot_name, values_from = variation) -> preprocessed_dataset
   pivot_wider(names_from = plot_name, values_from = variation) -> preprocessed_dataset
 
 
-# Add epidemiological week to column
+# Add n_days_since_1st_case column
 preprocessed_dataset %>%
 preprocessed_dataset %>%
   group_by(country_name) %>%
   group_by(country_name) %>%
   mutate(first_case_date = min(date[acc_cases > 0])) %>%
   mutate(first_case_date = min(date[acc_cases > 0])) %>%
@@ -208,6 +225,16 @@ preprocessed_dataset %>%
                    0)) %>%
                    0)) %>%
   ungroup() -> preprocessed_dataset
   ungroup() -> preprocessed_dataset
 
 
+# Add n_days_since_1st_death column
+preprocessed_dataset %>%
+  group_by(country_name) %>%
+  mutate(first_death_date = min(date[acc_deaths > 0])) %>%
+  mutate(n_days_since_1st_death =
+           if_else(acc_deaths > 0,
+                   as.numeric(date - min(date[acc_deaths > 0])+1),
+                   0)) %>%
+  ungroup() -> preprocessed_dataset
+
 # Set manually first case for countries whose first case happened before Feb 15
 # Set manually first case for countries whose first case happened before Feb 15
 # Wikipedia contributors, "2019–20 coronavirus pandemic", Wikipedia, The Free
 # Wikipedia contributors, "2019–20 coronavirus pandemic", Wikipedia, The Free
 # Encyclopedia,
 # Encyclopedia,
@@ -246,6 +273,22 @@ preprocessed_dataset %>%
     )
     )
   ) -> preprocessed_dataset
   ) -> preprocessed_dataset
 
 
+# Fix n_days since 1st case for countries that had 1st case before Feb 11
+countries <- c('Thailand', 'Japan', 'South Korea', 'United States', 'Taiwan',
+               'Hong Kong', 'Singapore', 'Vietnam', 'France', 'Nepal',
+               'Australia', 'Canada', 'Malaysia', 'Cambodia', 'Germany',
+               'Sri Lanka', 'Finland', 'United Arab Emirates', 'India',
+               'Italy', 'Philippines', 'Spain', 'Sweden', 'United Kingdom',
+               'Belgium', 'Egypt')
+preprocessed_dataset %>%
+  group_by(country_name) %>%
+  mutate(n_days_since_1st_case =
+           if_else(country_name %in% countries,
+                   as.numeric(date - first_case_date)+1,
+                   n_days_since_1st_case)) %>%
+  ungroup() -> preprocessed_dataset
+rm(countries)
+
 # Saving final preprocessed dataset ---------------------------------------
 # Saving final preprocessed dataset ---------------------------------------
 
 
 # Save full dataset
 # Save full dataset