Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

check_update.R 8.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
  1. library(readr)
  2. library(dplyr)
  3. library(lubridate)
  4. library(tidyr)
  # Set to TRUE further down whenever one of the raw datasets is rewritten.
  repro <- FALSE
  # Checking Google Community Mobility Reports data. ------------------------
  # Compare the most recent date in the local copy with the most recent date
  # in the freshly downloaded report, and overwrite the local copy when the
  # remote one is newer.
  GMR_path <- 'data/raw/Global_Mobility_Report.csv'
  # Only the `date` column is needed from the local copy, so skip parsing the
  # other columns. `na.rm = TRUE` keeps an unparsable date from turning the
  # comparison below into `NA` (which would abort the `if`).
  last_date <- read_csv(file = GMR_path,
                        col_types = cols_only(date = 'D')) %>%
    pull(date) %>%
    max(na.rm = TRUE)
  GMR_url <- paste0('https://www.gstatic.com/covid19/mobility/',
                    'Global_Mobility_Report.csv')
  latest_GMR_dataset <- read_csv(
    file = GMR_url,
    col_types = cols_only(
      country_region_code = 'c',
      country_region = 'c',
      sub_region_1 = 'c',
      sub_region_2 = 'c',
      metro_area = 'c',
      date = 'D',
      retail_and_recreation_percent_change_from_baseline = 'd',
      grocery_and_pharmacy_percent_change_from_baseline = 'd',
      parks_percent_change_from_baseline = 'd',
      transit_stations_percent_change_from_baseline = 'd',
      workplaces_percent_change_from_baseline = 'd',
      residential_percent_change_from_baseline = 'd'
    )
  )
  if (max(latest_GMR_dataset$date, na.rm = TRUE) > last_date) {
    print(paste0('There is more data available from Google Community Mobility ',
                 'Reports dataset. Starting to update...'))
    # Overwrite the local raw file with the downloaded columns.
    write.table(latest_GMR_dataset, GMR_path,
                row.names = FALSE, quote = TRUE, sep = ',', na = '')
    print('Google Community Mobility Reports raw dataset updated.')
    repro <- TRUE
  } else {
    print('Google Community Mobility Reports dataset is up to date.')
  }
  # Checking ECDC data ------------------------------------------------------
  # Compare the most recent report date in the local copy with the most
  # recent one in the freshly downloaded ECDC file, and overwrite the local
  # copy when the remote one is newer.
  ECDC_path <- 'data/raw/COVID19_worldwide_raw.csv'
  # Only `dateRep` is needed from the local copy. Reading just that column
  # also avoids asking for a population column (`popData2018`) that is no
  # longer present once the file has been rewritten from the remote dataset,
  # which carries `popData2019`.
  last_date <- read_delim(file = ECDC_path, na = '',
                          col_types = cols_only(dateRep = 'c'),
                          delim = ',') %>%
    pull(dateRep) %>%
    dmy() %>%
    max(na.rm = TRUE)
  ECDC_url <- 'https://opendata.ecdc.europa.eu/covid19/casedistribution/csv'
  latest_ECDC_dataset <- read_delim(
    file = ECDC_url, na = '',
    col_types = cols_only(
      dateRep = 'c', day = 'i', month = 'i',
      year = 'i', cases = 'i', deaths = 'i',
      countriesAndTerritories = 'c', geoId = 'c',
      countryterritoryCode = 'c',
      # The fact it's 2019 now does not matter,
      # after all, we do not use this variable
      popData2019 = 'i', continentExp = 'c'
    ),
    delim = ','
  )
  # `dateRep` is kept as text in the dataset; parse it as day/month/year only
  # for the comparison.
  if (max(dmy(latest_ECDC_dataset$dateRep), na.rm = TRUE) > last_date) {
    print(paste0('There is more data available from the European Centre for ',
                 'Disease Prevention and Control COVID-19 dataset. Starting to ',
                 'update...'))
    write.table(latest_ECDC_dataset, ECDC_path,
                row.names = FALSE, quote = TRUE, sep = ',', na = '')
    print('ECDC COVID-19 raw dataset updated.')
    repro <- TRUE
  } else {
    print('ECDC COVID-19 dataset is up to date.')
  }
  # Checking JHU data ----------------------------------------------------
  # Compare the most recent date in the local Hong Kong / Reunion file with
  # the most recent date in the JHU time series, and overwrite the local file
  # when the remote series is newer.
  JHU_path <- 'data/raw/hk-reunion-covid-19.csv'
  # Parse the stored dates as Date so the comparison below is Date-to-Date
  # rather than relying on implicit character coercion.
  last_date <- read_delim(file = JHU_path, na = '',
                          col_types = cols_only(date = 'D'),
                          delim = ',') %>%
    pull(date) %>%
    max(na.rm = TRUE)

  # Download one JHU wide-format time series and return it in long format.
  #
  # url:        location of the CSV (one column per day, named month/day/year).
  # value_name: name given to the column holding the daily values.
  # localities: `Province/State` values to keep.
  #
  # Returns a tibble with columns `Province/State`, `date` (Date) and
  # `value_name`.
  read_jhu_series <- function(url, value_name,
                              localities = c('Hong Kong', 'Reunion')) {
    read_delim(file = url, na = '',
               col_types = cols(.default = 'd',
                                `Province/State` = 'c',
                                `Country/Region` = 'c'),
               delim = ',') %>%
      # Filtering before reshaping yields the same rows with less work.
      filter(`Province/State` %in% localities) %>%
      select(-all_of(c('Lat', 'Long', 'Country/Region'))) %>%
      pivot_longer(cols = -all_of('Province/State'),
                   names_to = 'date',
                   values_to = value_name) %>%
      mutate(date = mdy(date))
  }

  # Get datasets and join
  cases_url <- paste0('https://raw.githubusercontent.com/CSSEGISandData/COVID-1',
                      '9/master/csse_covid_19_data/csse_covid_19_time_series/tim',
                      'e_series_covid19_confirmed_global.csv')
  deaths_url <- paste0('https://raw.githubusercontent.com/CSSEGISandData/COVID-1',
                       '9/master/csse_covid_19_data/csse_covid_19_time_series/ti',
                       'me_series_covid19_deaths_global.csv')
  cases_dataset <- read_jhu_series(cases_url, 'new_cases')
  deaths_dataset <- read_jhu_series(deaths_url, 'new_deaths')

  latest_JHU_dataset <- left_join(cases_dataset, deaths_dataset,
                                  by = c('date', 'Province/State')) %>%
    group_by(`Province/State`) %>%
    arrange(date) %>%
    # Turn each locality's series into day-over-day differences; the first
    # row of each locality becomes 0 because the lag defaults to the first
    # value.
    mutate(
      new_cases = new_cases - lag(new_cases, default = first(new_cases)),
      new_deaths = new_deaths - lag(new_deaths, default = first(new_deaths))
    ) %>%
    ungroup() %>%
    rename(locality_name = `Province/State`)

  # Check if it's new
  if (max(latest_JHU_dataset$date, na.rm = TRUE) > last_date) {
    print(paste0('There is more data available from the John Hopkins University ',
                 'dataset. Starting to update...'))
    write.table(latest_JHU_dataset, JHU_path,
                row.names = FALSE, quote = FALSE, sep = ',', na = '')
    print('JHU COVID-19 raw dataset updated.')
    repro <- TRUE
  } else {
    print('JHU COVID-19 dataset is up to date.')
  }
  # Update remote / git repo ------------------------------------------------
  # NOTE: this step is currently disabled (everything below is commented out),
  # so the `repro` flag set above is not acted upon. When enabled, it re-runs
  # the DVC `preprocess.dvc` stage and then adds, commits and pushes the
  # updated `.dvc` files whenever `repro` is TRUE.
  #if (repro == TRUE) {
  # system('dvc repro preprocess.dvc')
  # system(paste0('git add data/raw/COVID19_worldwide_raw.csv.dvc preprocess.dvc',
  # ' data/raw/hk-reunion-covid-19.csv.dvc data/raw/Global_Mobilit',
  # 'y_Report.csv.dvc'))
  # commit_msg <- paste0('Updates raw datasets \'', today(), '\'')
  # system(paste0('git commit -m \"', commit_msg, '\"'))
  # system('git push')
  #} else {
  # print('Everything is up to date. Nothing else to do.')
  #}
  151. ##
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...