drivendata_waterpump.py
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.6.0
#   kernelspec:
#     display_name: Python 3
#     name: python3
# ---
  14. # %% [markdown] id="view-in-github" colab_type="text"
  15. # <a href="https://colab.research.google.com/github/mspoorendonk/drivendata/blob/marc/drivendata_waterpump.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  16. # %% [markdown] id="pC9Hngyu5E6R"
  17. # # Analysis of condition of water points in Tanzania
  18. #
  19. # Problem statement:
  20. # predict the operating condition of a waterpoint for each record in the dataset: functioning, functioning but needs repair, not functioning
  21. #
  22. #
  23. # Approach
  24. # 1. Download datasets
  25. # 1. Explore data and understand which features are relevant for the prediction.
  26. # 1. Clean data [Bart]
  27. # 1. Engineer some derived features
  28. # 1. decide on a method for predicting (trees or neuralnets or knn or ...)
  29. # 1. perform a train / test / validate split on the data
  30. # 1. Train model on training values and labels
  31. # 1. Predict training labels that correspond to training values
  32. # 1. Report the accuracy
  33. # 1. Tune hyperparameters with gridsearch
  34. # 1. Predict the test labels
  35. # 1. Submit CSV [Marc]
  36. #
  37. #
  38. # TODO:
  39. # here: check xgboost, pandas, bokeh (interactief)
  40. # somewhere else: how to deploy a model in production. What software and frameworks etc.
  41. #
  42. # %% [markdown] id="hwqftAGR-k2J"
  43. # # Dependencies
  44. # %% id="PuCGSSOKu8-h" outputId="4e1ba0b1-c2e7-415d-aa7f-0f9f9b61db51" colab={"base_uri": "https://localhost:8080/", "height": 728}
  45. # installations
  46. # !pip install gmaps
  47. # %% id="CJlE0N65YUFd" outputId="d185adbf-f040-41dc-fac3-18b6b92f5566" colab={"base_uri": "https://localhost:8080/", "height": 52}
  48. # imports
  49. from datetime import datetime
  50. import pandas as pd
  51. import random
  52. import numpy as np
  53. import gmaps
  54. import IPython
  55. from sklearn import tree # to create a decision tree
  56. from sklearn.ensemble import RandomForestClassifier
  57. from sklearn import metrics # to compute accuracy
  58. from sklearn.neighbors import KNeighborsClassifier
  59. from sklearn.model_selection import train_test_split # Import train_test_split function
  60. from sklearn import preprocessing # for normalizing data for knn
  61. from sklearn.metrics import accuracy_score
  62. from sklearn.preprocessing import MinMaxScaler
  63. import pydotplus # To create our Decision Tree Graph
  64. from IPython.display import Image # To Display a image of our graph
  65. from IPython.display import display
  66. from ipywidgets.embed import embed_minimal_html
  67. # Seaborn visualization library
  68. import seaborn as sns # for pairplots
  69. import matplotlib.pyplot as plt
  70. # %% [markdown] id="mOSkSXlq-gzD"
  71. # # Download datasets
  72. # %% id="tGa2jMrp23Sm" outputId="7c82da7a-96fa-4ae3-9b26-f360fb799bd1" colab={"base_uri": "https://localhost:8080/", "height": 357}
  73. # download datasets from driven-data.org. Urls copied from data download section on website.
  74. # They expire after 2 days or so. Then you need to copy/paste them again.
  75. # testvalues
  76. # !wget "https://drivendata-prod.s3.amazonaws.com/data/7/public/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200927%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200927T185304Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=f2b7c554cb780a1facf849dc85cd18a0ce5110100690a748eaa1df42f43a12da" -O test_values.csv
  77. # training labels
  78. # !wget "https://drivendata-prod.s3.amazonaws.com/data/7/public/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200927%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200927T185304Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=f91daa03811de5cb244f5f2d8446fb46a99eb37bedf7bd0c609d8b076bebfbe2" -O training_labels.csv
  79. # training values
  80. # !wget "https://drivendata-prod.s3.amazonaws.com/data/7/public/4910797b-ee55-40a7-8668-10efd5c1b960.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200927%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200927T185304Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=b38e27dab8fac51df99d1ec837ffd2f4a3c3e1ffd48494951a846a144f88434f" -O training_values.csv
  81. # %% id="s5JdImi9qyql"
  82. # Boundary coordinates of Tanzania
  83. # Source: https://en.wikipedia.org/wiki/List_of_countries_by_northernmost_point (and similar)
  84. tanzania_lat = [-11.750-0.1, -0.983+0.1]
  85. tanzania_lon = [29.167-0.1, 40.250+0.1]
  86. # %% pycharm={"name": "#%%\n"}
  87. # Data location
  88. data_path = ''
  89. # %% id="TlOL83NOOp9n" outputId="f1473e4e-a572-42ed-9413-b57797dcdc6d" colab={"base_uri": "https://localhost:8080/", "height": 369}
  90. # Load training values
  91. na_values = {
  92. 'longitude': 0.0,
  93. 'latitude':-2.e-8,
  94. 'gps_height': 0,
  95. # 'wpt_name': 'none',
  96. # 'construction_year': 0,
  97. # 'population': 0,
  98. # 'district_code': 0,
  99. }
  100. # na_values = {}
  101. training_values = pd.read_csv(data_path + 'training_values.csv',
  102. parse_dates=['date_recorded'],
  103. index_col='id',
  104. na_values=na_values)
  105. # Drop column(s) without information
  106. training_values.drop(columns=['num_private'], inplace=True)
  107. print('Shape: ', training_values.shape)
  108. # Show example
  109. display(training_values.iloc[:5, 0:20])
  110. display(training_values.iloc[:5, 20:])
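# %%
# Quick check (a sketch, not part of the original pipeline): how many sentinel
# values did the na_values mapping above actually turn into NaN per column?
print(training_values[list(na_values)].isna().sum())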
# %% pycharm={"name": "#%%\n"}
# Load training labels
training_labels = pd.read_csv(data_path + 'training_labels.csv',
                              index_col='id')
training_labels.head()
# %% pycharm={"name": "#%%\n"}
# Merge training values and labels
training_all = pd.merge(training_values, training_labels, on='id')
training_all.head()
# %% pycharm={"name": "#%%\n"}
# Load submission values
test_values = pd.read_csv(data_path + 'test_values.csv',
                          parse_dates=['date_recorded'],
                          index_col='id',
                          na_values=na_values)
# %% id="e1LWiBzgPUyv"
# column_names = ''
# for n in training_values.columns:
#     column_names = column_names + "'" + n + "', "
# print(column_names)  # print a string from which we can copy/paste the following lists
columns_time = ['date_recorded']
columns_numerical = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                     'construction_year', 'region_code', 'district_code']
columns_categorical = ['funder', 'installer', 'wpt_name', 'basin', 'subvillage',
                       'region', 'lga', 'ward', 'public_meeting', 'recorded_by',
                       'scheme_management', 'scheme_name', 'permit', 'extraction_type',
                       'extraction_type_group', 'extraction_type_class', 'management',
                       'management_group', 'payment', 'payment_type', 'water_quality',
                       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
                       'source_class', 'waterpoint_type', 'waterpoint_type_group']
columns_location = ['latitude', 'longitude', 'gps_height', 'wpt_name', 'basin', 'subvillage',
                    'region', 'region_code', 'district_code', 'lga', 'ward']
print('Time: ', len(columns_time))
print('Numerical: ', len(columns_numerical))
print('Categorical: ', len(columns_categorical))
# %% [markdown]
# # Exploratory Data Analysis
#
# %% pycharm={"name": "#%%\n"}
# Show main training data characteristics
training_values.info()
training_values.describe()
# %% pycharm={"name": "#%%\n"}
# Investigate duplicate rows
headers = list(training_values)
duplicate_full = training_values.duplicated(subset=headers, keep=False)
duplicate_full_all = training_all.duplicated(subset=(headers + ['status_group']), keep=False)
print('Number of fully duplicate rows: ', duplicate_full.sum())
print('Number of fully duplicate rows (incl label): ', duplicate_full_all.sum())
print('Examples of duplicate rows:')
display(training_values[duplicate_full].sort_values(by=headers).head(10))
# Find the rows that are duplicate apart from the label
id_diff = set(training_values[duplicate_full].index).difference(training_all[duplicate_full_all].index)
print('Duplicate apart from label:')
display(training_all.loc[id_diff].head(10))
# Mark duplicates for removal
flag_droprows = training_values.sort_index().duplicated(subset=headers, keep='first')
# Also flag both copies of rows whose values match but whose labels differ
flag_droprows[id_diff] = True
print('Marked for removal: {}'.format(flag_droprows.sum()))
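# %%
# Hedged sketch (the flagged rows are not actually dropped anywhere below): one
# way the flags could be applied while keeping values and labels in sync. The
# *_dedup names are illustrative and are not used by the rest of the notebook.
training_values_dedup = training_values.loc[~flag_droprows]
training_labels_dedup = training_labels.loc[training_values_dedup.index]
print('Rows before/after dropping flagged duplicates: {} / {}'.format(
    len(training_values), len(training_values_dedup)))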
# %% pycharm={"name": "#%%\n"}
# Investigate Total static head (amount_tsh)
# Note that the units are unknown
fig, ax = plt.subplots()
training_values['amount_tsh'].plot.hist(ax=ax, log=True, bins=20)
n_zero_tsh = (training_values['amount_tsh'] == 0).sum()
n_total = training_values.shape[0]
print('Rows where amount_tsh == 0.0: {} ({:3d}%)'
      .format(n_zero_tsh, int(round(100*n_zero_tsh/n_total, 2))))
# Conclusion: amount_tsh looks okay; some zeros might actually be missing
# values, but they cannot be distinguished from measured zeros.
# %% pycharm={"name": "#%%\n"}
# Investigate date_recorded
fig, ax = plt.subplots()
training_values['date_recorded'].dt.year.plot.hist(ax=ax)
display(training_values['date_recorded'].dt.year.value_counts(sort=False))
ax.set_title('Histogram of date_recorded')
year2011 = (training_values['date_recorded'].dt.year == 2011)
year2012 = (training_values['date_recorded'].dt.year == 2012)
year2013 = (training_values['date_recorded'].dt.year == 2013)
fig, ax2 = plt.subplots()
training_values[year2011]['date_recorded'].dt.month.plot.hist(ax=ax2, bins=12,
                                                              range=(1, 12), log=True)
ax2.set_title('2011, records per month')
fig, ax3 = plt.subplots()
training_values[year2012]['date_recorded'].dt.month.plot.hist(ax=ax3, bins=12,
                                                              range=(1, 12), log=True)
ax3.set_title('2012, records per month')
fig, ax4 = plt.subplots()
training_values[year2013]['date_recorded'].dt.month.plot.hist(ax=ax4, bins=12,
                                                              range=(1, 12), log=True)
ax4.set_title('2013, records per month')
# Conclusion: dates look OK.
# %% pycharm={"name": "#%%\n"}
# Investigate GPS height
# Note that GPS height is inaccurate; deviations of 120 m are not uncommon.
height_neg = training_values['gps_height'] < 0
height_pos = training_values['gps_height'] > 0
height_zero = training_values['gps_height'] == 0
fig, ax = plt.subplots()
training_values['gps_height'].plot.hist(ax=ax)
ax.set_title('Histogram of GPS Height')
fig, ax = plt.subplots()
training_values[height_neg]['gps_height'].plot.hist(ax=ax)
ax.set_title('Histogram of GPS Height (strictly negative)')
print('Rows with zero height: {}'.format(height_zero.sum()))
print('Rows with negative height: {}'.format(height_neg.sum()))
fig, ax = plt.subplots()
training_values['gps_height'].plot.hist(ax=ax, range=(-20, 20))
print(training_values[~height_zero]['gps_height'].median())
# Conclusion: GPS height looks OK. Negative values can be explained by
# measurement inaccuracy. Zeros are most likely missing values,
# since they occur much more frequently than other values.
# %% pycharm={"name": "#%%\n"}
# Investigate region
# Note: Tanzania has 31 regions, 169 districts (2012)
# https://en.wikipedia.org/wiki/Districts_of_Tanzania
region_counts = training_values[['region_code', 'region']]\
    .sort_values(by='region').value_counts(sort=False)
display(region_counts)
display(training_values.query('region_code==5 & region=="Tanga"')
        .loc[:, columns_location].head(10))
# Conclusion: some region codes (5, 11, 14, 17, 18) seem to refer to multiple
# regions. However, if the region with the highest count is correct, this
# affects only 123 rows. Some regions (Arusha, Lindi, Mtwara, Mwanza, Pwani,
# Shinyanga, Tanga) have multiple region codes associated with them.
# Assuming that the most common mapping is correct, this affects around 2500 rows.
# Solutions:
# - Use only latitude and longitude as location data. However, the region,
#   district, lga, ward might contain information about governance.
# - Remove dubious location data and/or mark it as missing.
# - Use an external source (e.g. GeoNames.org) to verify which values are
#   most likely incorrect and replace those.
# %% pycharm={"name": "#%%\n"}
# Investigate district
# Note: according to Wikipedia, regions have up to 10 districts
display(training_values.sort_values(by='district_code')
        .value_counts(subset='district_code', sort=False))
display(training_values.sort_values(by='district_code')
        .value_counts(subset=['region', 'district_code'], sort=False))
# Investigate an example of district_code > 10
display(training_values.query('district_code==80')
        .loc[:, ['region', 'region_code', 'district_code', 'subvillage']].head(12))
# Conclusion: all district codes larger than 10 are most probably duplicates of
# other codes. This affects some 4200 rows. The tuple (region_code,
# district_code) should uniquely identify a district, so we can derive a new
# feature based on these codes.
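# %%
# Hedged sketch of the derived feature suggested above: combine region_code and
# district_code into one categorical key. The name 'region_district' is
# illustrative only; the feature is not used further down in this notebook.
region_district = (training_values['region_code'].astype(str) + '_'
                   + training_values['district_code'].astype(str))
print('Unique (region_code, district_code) combinations:', region_district.nunique())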
# %% pycharm={"name": "#%%\n"}
# Investigate location (latitude, longitude)
# Check if latitude, longitude is actually inside Tanzania (or NaN)
lon_isna = training_values['longitude'].isna()
lat_isna = training_values['latitude'].isna()
lon_in_range = (tanzania_lon[0] <= training_values['longitude']) & \
               (training_values['longitude'] <= tanzania_lon[1])
lat_in_range = (tanzania_lat[0] <= training_values['latitude']) & \
               (training_values['latitude'] <= tanzania_lat[1])
pos_isna = lon_isna | lat_isna
pos_in_range = lon_in_range & lat_in_range
print('Number of missing (n/a) coordinates: ', pos_isna.sum())
print('Number of invalid coordinates: ', (~pos_in_range & ~pos_isna).sum())
# Investigate duplicate locations
duplicate_location = training_values.duplicated(
    subset=['longitude', 'latitude'], keep=False)
print('Number of rows with duplicate locations: ',
      (duplicate_location & ~pos_isna & ~duplicate_full).sum())
training_values[duplicate_location & ~pos_isna & ~duplicate_full]\
    .sort_values(['latitude', 'longitude']).head(10)
# %% id="XXko4e2zOyim"
training_values.describe()
# %% id="u637XYufRh8f"
training_labels
# %% id="wyL52BBIFcsH"
# Create the default pairplot
# sns.pairplot(training_all[columns_numerical + ['status_group']], hue='status_group')
# %% [markdown] id="kPHHIaJI-tVR"
# # Engineer features
# %% id="XscRcgEZRYuQ"
# engineer some features
# maybe days since reporting a functional pump?
# lifetime: date_recorded - construction_year
# %% [markdown] id="vI-9_Bsg-zaK"
# # Clean data
# %%
for column in ['latitude', 'longitude', 'gps_height']:
    for data in [training_values, test_values]:
        # Impute, stepwise from specific to general (ward > lga > region > column)
        data[column].fillna(data.groupby(['region', 'lga', 'ward'])[column].transform('mean'), inplace=True)
        data[column].fillna(data.groupby(['region', 'lga'])[column].transform('mean'), inplace=True)
        data[column].fillna(data.groupby(['region'])[column].transform('mean'), inplace=True)
        data[column].fillna(data[column].mean(), inplace=True)
display(training_values[['latitude', 'longitude', 'gps_height']].isna().sum())
display(test_values[['latitude', 'longitude', 'gps_height']].isna().sum())
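# %%
# Quick sanity check on the imputation above (a sketch; reuses the Tanzania
# bounding box defined earlier): the filled-in coordinates should still fall
# inside the box.
in_box = (training_values['longitude'].between(*tanzania_lon)
          & training_values['latitude'].between(*tanzania_lat))
print('Coordinates outside the Tanzania bounding box after imputation:', (~in_box).sum())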
  303. # %% id="uLzANEKyRuS9"
  304. # plot n pumps on a map. Everything above 200 gets slow
  305. n = 200
  306. gmaps.configure(api_key="AIzaSyCDAaxun4CXAyEmLzzJbYkqXii-sbVhVNc") # This is my personal API key, please don't abuse.
  307. colors = []
  308. labels = []
  309. sampled_pumps = training_values.sample(n)
  310. for i in range(len(sampled_pumps)):
  311. id = sampled_pumps.iloc[i]['id']
  312. #print(id)
  313. state = training_labels[training_labels['id']==id]['status_group'].iloc[0]
  314. if state=='functional':
  315. colors.append('green')
  316. elif state=='non functional':
  317. colors.append('red')
  318. else:
  319. colors.append('yellow') # needs repair
  320. labels.append('source %s' % sampled_pumps[sampled_pumps['id']==id].iloc[0]['source'])
  321. pump_locations = sampled_pumps[['latitude' , 'longitude']]
  322. info_box_template = """
  323. <dl>
  324. <td>Name</td><dd>{scheme_name}</dd>
  325. </dl>
  326. """
  327. pump_info = training_values['scheme_name'][:2]
  328. #marker_layer = gmaps.marker_layer(pump_locations, hover_text=pump_info, info_box_content=pump_info)
  329. marker_layer = gmaps.symbol_layer(pump_locations, fill_color=colors, stroke_color=colors, scale=3, hover_text=labels)
  330. figure_layout = {
  331. 'width': '1400px',
  332. 'height': '1200px',
  333. 'border': '1px solid black',
  334. 'padding': '1px'
  335. }
  336. fig = gmaps.figure(layout=figure_layout)
  337. fig.add_layer(marker_layer)
  338. #fig
  339. embed_minimal_html('export.html', views=[fig])
  340. IPython.display.HTML(filename='export.html')
  341. # %% id="2oP1BmwhaoD9"
  342. training_values[['longitude', 'latitude']].head()
  343. # %% [markdown] id="eFoRLkhd-5Ca"
  344. # # Prepare for training
  345. # %% id="rd8zedFQX7JF"
  346. # set n to low number for faster runs and to len(training_values) for max accuracy
  347. # n = 5000
  348. n = len(training_values)
  349. # select the describing variables
  350. # columns_select = [
  351. # 'id',
  352. # 'date_recorded',
  353. # 'amount_tsh',
  354. # 'gps_height',
  355. # 'longitude',
  356. # 'latitude',
  357. # 'region_code',
  358. # 'district_code',
  359. # 'population',
  360. # 'construction_year',
  361. # 'source',
  362. # 'quality_group',
  363. # 'quantity_group',
  364. # 'extraction_type_group',
  365. # ]
  366. columns_select = [
  367. 'date_recorded',
  368. 'amount_tsh',
  369. 'gps_height',
  370. 'longitude',
  371. 'latitude',
  372. 'region_code',
  373. 'district_code',
  374. 'population',
  375. 'construction_year',
  376. 'source',
  377. 'source_class',
  378. 'management_group',
  379. 'payment_type',
  380. 'extraction_type_group',
  381. 'waterpoint_type_group',
  382. 'quality_group',
  383. 'quantity_group',
  384. 'extraction_type_group',
  385. ]
  386. X = pd.get_dummies(training_values[columns_select][:n])
  387. #X = pd.get_dummies(training_values[:n])
  388. # X=X.drop(X[X['construction_year']< 1900].index) # drop all lines with missing construction year (but thenalso drop the y!!)
  389. X['lifetime']=pd.DatetimeIndex(X['date_recorded']).year-X['construction_year'] # engineer a feature but don't do it for rows where construction_year is empty
  390. X.loc[X['lifetime']> 1000, 'lifetime']=-1
  391. X['date_recorded']=X['date_recorded'].apply(datetime.toordinal) # otherwise dates get ignored in the correlation and the tree
  392. enc = preprocessing.LabelEncoder()
  393. Y = enc.fit_transform(np.ravel(training_labels[['status_group']][:n]))
  394. X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
  395. scaler = MinMaxScaler()
  396. scaler.fit(X)
  397. X_train_normalized = scaler.transform(X_train)
  398. X_test_normalized = scaler.transform(X_test)
  399. # Load and transform verification/submission data
  400. X_submission = pd.get_dummies(test_values[columns_select][:n])
  401. X_submission['lifetime']=pd.DatetimeIndex(X_submission['date_recorded']).year-X_submission['construction_year']
  402. X_submission.loc[X_submission['lifetime']> 1000, 'lifetime']=-1
  403. X_submission['date_recorded']=X_submission['date_recorded'].apply(datetime.toordinal) # otherwise dates get ignored in the correlation and the tree
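# %%
# Hedged sketch (not used below): pd.get_dummies can produce different column
# sets for X and X_submission when a category value is absent from one of the
# frames. Reindexing the submission frame to X's columns (absent dummies become
# 0, extra ones are dropped) would guarantee that the final predict step sees
# the same column layout the model was trained on. X_submission_aligned is an
# illustrative name only.
X_submission_aligned = X_submission.reindex(columns=X.columns, fill_value=0)
print('Submission columns match training columns:',
      list(X_submission_aligned.columns) == list(X.columns))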
  404. # %% id="Ux1_vI5d2jbe"
  405. # %% id="wu8gomyui65T"
  406. Y_train
  407. # %% id="qPhexPmsgZ0-"
  408. np.array(X_train['lifetime'][:100])
  409. # %% id="KqlNFS1OUrAN"
  410. # figure out which variables correlate with Y
  411. sns.set(rc={'figure.facecolor':'#a0a0a0'})
  412. XY=pd.concat([X, Y], axis=1) # get them side by side
  413. corrMatrix = XY.corr()
  414. plt.figure(figsize=(60,25))
  415. # for tips on formatting the heatmap:
  416. # https://heartbeat.fritz.ai/seaborn-heatmaps-13-ways-to-customize-correlation-matrix-visualizations-f1c49c816f07
  417. sns.heatmap(corrMatrix, annot=True, fmt='.2f', vmin=-1, vmax=1, center= 0, cmap= 'coolwarm')
  418. plt.show()
  419. # %% [markdown] id="FnfLpQ-IRJuI"
  420. # #Forecast
  421. # %% id="21pkGct572-T"
  422. def calc_accuracy(y_pred, Y_test):
  423. correct = 0
  424. for i in range(len(y_pred)):
  425. y_vals = Y_test.iloc[i].values
  426. y_pred_vals = y_pred[i]
  427. #print(y_vals, y_pred_vals)
  428. if (y_vals == y_pred_vals).all():
  429. #print("correct")
  430. correct += 1
  431. #else:
  432. #print('incorrect')
  433. #if correct>10: break
  434. return correct/len(y_pred), correct
  435. results = {}
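# %%
# Quick self-test of calc_accuracy on toy arrays (a sketch): for 1-D integer
# labels its first return value should agree with sklearn's accuracy_score,
# which was already imported above.
_toy_true = np.array([0, 1, 2, 1])
_toy_pred = np.array([0, 2, 2, 1])
print(calc_accuracy(_toy_pred, _toy_true)[0], accuracy_score(_toy_true, _toy_pred))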
  436. # %% [markdown] id="SyNH4rTsRlO0"
  437. # ##Decision tree
  438. # %% id="OZDyxJP1lXAp"
  439. print("Train on %d samples. Test on %d samples." % (len(X_train), len(X_test)))
  440. results['tree'] = 0
  441. for d in [1, 5, 10, 15, 20, 25, 5]: # end with 5 so it can be plotted in next cell
  442. model = tree.DecisionTreeClassifier(criterion='gini',max_depth=d)
  443. model = model.fit(X_train, Y_train)
  444. #Predict the response for test dataset
  445. y_pred = model.predict(X_test)
  446. accuracy, correct=calc_accuracy(y_pred, Y_test)
  447. print("Max depth: %d Accuracy on test set: %.2f #correct: %d" % (d, accuracy, correct))
  448. if accuracy > results['tree']: results['tree']=accuracy
  449. # %% id="nQp0frzjlYo_"
  450. # Export/Print a decision tree in DOT format. Only do this when max_depth is small (<=6) otherwise it gets too slow.
  451. #print(tree.export_graphviz(clf, None))
  452. if d < 6:
  453. print('extracting dot')
  454. #Create Dot Data
  455. dot_data = tree.export_graphviz(model, out_file=None, feature_names=list(X_train.columns.values),
  456. class_names=['func', 'repair', 'nonfunc'], rounded=True, filled=True) #Gini decides which attribute/feature should be placed at the root node, which features will act as internal nodes or leaf nodes
  457. #print(dot_data)
  458. print('Create graph image from DOT data')
  459. graph = pydotplus.graph_from_dot_data(dot_data)
  460. print('Show graph')
  461. Image(graph.create_png())
  462. else:
  463. print('graph to deep to fit in image')
  464. # %% [markdown] id="TitZ_1pRbHIu"
  465. # ##Random forest
  466. # %% id="cwKWxHMmasjX"
  467. print("Train on %d samples, %d variables. Test on %d samples." % (X_train.shape[0], X_train.shape[1], len(X_test)))
  468. d=25
  469. model = RandomForestClassifier(n_jobs=-1, random_state=27, verbose=0, max_depth=d, criterion='gini')
  470. model = model.fit(X_train, np.ravel(Y_train))
  471. #Predict the response for test dataset
  472. y_pred = model.predict(X_test)
  473. # accuracy, correct=calc_accuracy(y_pred, Y_test)
  474. # print("Max depth: %d Accuracy on test set: %.2f #correct: %d" % (d, accuracy, correct))
  475. accuracy = accuracy_score(Y_test, y_pred)
  476. print("Max depth: %d Accuracy on test set: %.3f" % (d, accuracy))
  477. # results['forest']=accuracy
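# %%
# Hedged sketch of the hyperparameter tuning step mentioned in the approach
# (grid search). The grid below is deliberately tiny and purely illustrative;
# widen it (and expect long runtimes) for a real search.
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [20, 25], 'n_estimators': [100]}
search = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=27),
                      param_grid, cv=3, scoring='accuracy')
search.fit(X_train, Y_train)
print('Best parameters:', search.best_params_)
print('Best cross-validated accuracy: %.3f' % search.best_score_)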
  478. # %% id="9FnQqh1FABWE" outputId="1f001ef9-2481-489d-d94f-e8071f23bebe" colab={"base_uri": "https://localhost:8080/", "height": 595}
  479. # feature importances
  480. #inspiration: https://github.com/ernestng11/touchpoint-prediction/blob/master/model-building.ipynb
  481. print(len(model.feature_importances_))
  482. combined = zip(model.feature_importances_, X_train.columns)
  483. combined = sorted(combined, reverse=True)[:50]
  484. #print(combined)
  485. #for i in len(combined):
  486. # print('%s\t%.3f' % (combined[i][1], combined[i][0]))
  487. importance, features = list(zip(*combined))
  488. f, ax = plt.subplots(figsize=(35,5))
  489. plot = sns.barplot(x=np.array(features), y=np.array(importance))
  490. ax.set_title('Feature Importance')
  491. plot.set_xticklabels(plot.get_xticklabels(),rotation='vertical')
  492. plt.show()
  493. # %%
  494. # Permutation importance
  495. # To prevent biased towards high-cardinality features and
  496. from sklearn.inspection import permutation_importance
  497. result = permutation_importance(model, X_test, Y_test,
  498. n_repeats=10,
  499. scoring='accuracy',
  500. random_state=0)
  501. for i in result.importances_mean.argsort()[::-1]:
  502. if result.importances_mean[i] - 2 * result.importances_std[i] > 0:
  503. print(f"{X_test.columns[i]:<8}"
  504. f"{result.importances_mean[i]:.3f}"
  505. f" +/- {result.importances_std[i]:.3f}")
  506. sorted_idx = result.importances_mean.argsort()
  507. fig, ax = plt.subplots()
  508. ax.boxplot(result.importances[sorted_idx[-20:]].T,
  509. vert=False, labels=X_test.columns[sorted_idx[-20:]])
  510. ax.set_title("Permutation Importances (test set)")
  511. fig.tight_layout()
  512. plt.show()
  513. # %% [markdown] id="aZ0aE1t8RRTO"
  514. # ##KNN
  515. # %% id="hBkoB37GCayu" outputId="337b4e6e-c637-42c3-b1cb-4c1c962e4c04" colab={"base_uri": "https://localhost:8080/", "height": 177}
  516. print("Train on %d samples. Test on %d samples." % (len(X_train), len(X_test)))
  517. results['knn']=-1
  518. for d in [1, 2, 3, 5, 10, 15, 20, 30]:
  519. model = KNeighborsClassifier(n_neighbors=d)
  520. model = model.fit(X_train_normalized, Y_train)
  521. #Predict the response for test dataset
  522. y_pred = model.predict(X_test_normalized)
  523. accuracy, correct=calc_accuracy(y_pred, Y_test)
  524. print("n_neighbors: %d Accuracy on test set: %.2f #correct: %d" % (d, accuracy, correct))
  525. if accuracy > results['knn']: results['knn']=accuracy
  526. # %% id="L-cFwS5CHzMB" outputId="7c71f624-fdb3-4ebf-fc2e-975889b2a920" colab={"base_uri": "https://localhost:8080/", "height": 411}
  527. pd.DataFrame( Y_train)
  528. # %% [markdown] id="1gM_orokRXhO"
  529. # ##Neuralnet
  530. # %% id="KPk1W5P-E4iv" outputId="a505866d-3612-4af7-b94b-fb537c7bca23" colab={"base_uri": "https://localhost:8080/", "height": 1000}
  531. print("Train on %d samples. Test on %d samples." % (len(X_train), len(X_test)))
  532. import tensorflow as tf
  533. from tensorflow import keras
  534. from tensorflow.keras import layers
  535. model = keras.Sequential()
  536. #model.add(layers.Dense(2, activation="relu"))
  537. model.add(layers.Dense(20, activation="relu", input_shape = (X_test_normalized.shape[1],)))
  538. model.add(layers.Dense(10, activation="relu"))
  539. model.add(layers.Dense(5, activation="relu"))
  540. model.add(layers.Dense(3, activation='sigmoid'))
  541. model.compile('adam', "binary_crossentropy", metrics=["accuracy"])
  542. model.fit(x=X_train_normalized, y=Y_train, epochs=35)
  543. model.summary()
  544. y_pred = model.predict(X_test_normalized)
  545. print(len(y_pred))
  546. y_pred = (y_pred > 0.5).astype("int32")
  547. accuracy, correct=calc_accuracy(y_pred, Y_test)
  548. print("Accuracy on test set: %.2f #correct: %d" % (accuracy, correct))
  549. results['neural net']=accuracy
  550. # %% [markdown] id="GSrcI0Gae9cE"
  551. # ##XGBoost
  552. # %% id="YK24eHPVfFQw" outputId="22dba33c-d6ce-49c8-a341-556ae4bbb141" colab={"base_uri": "https://localhost:8080/", "height": 35}
  553. # inspiration: https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn
  554. from sklearn.multiclass import OneVsRestClassifier
  555. from xgboost import XGBClassifier
  556. from sklearn.preprocessing import MultiLabelBinarizer
  557. #for d in range(1,35):
  558. results['xgboost']=-1
  559. #for d in [2, 15, 30, 50]:
  560. for d in [30]:
  561. model = OneVsRestClassifier(XGBClassifier(n_jobs=-1, max_depth=d, objective="multi:softprob", num_class=3))
  562. model = model.fit(X_train_normalized, Y_train)
  563. #Predict the response for test dataset
  564. y_pred = model.predict(X_test_normalized)
  565. accuracy, correct=calc_accuracy(y_pred, Y_test)
  566. print("XGBoost: max_depth: %d Accuracy on test set: %.2f #correct: %d" % (d, accuracy, correct))
  567. if accuracy>results['xgboost']: results['xgboost']=accuracy
  568. # %% id="wtt6Yw9AZMoc"
  569. #print(confusion_matrix(Y_test, y_pred))
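# %%
# Hedged sketch expanding on the commented-out line above: a confusion matrix of
# the most recent test-set predictions, with the original class names recovered
# from the label encoder. Assumes y_pred still holds the XGBoost predictions
# from the previous cell.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)
display(pd.DataFrame(cm, index=enc.classes_, columns=enc.classes_))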
  570. # %% [markdown] id="OKXg_c92EjZZ"
  571. # #Evaluation
  572. # - randomforest: .72
  573. # - tree: .70
  574. # - xgboost: .70
  575. # - nn: .65
  576. # - knn: .48
  577. # %% id="eiN5HwCD9cEC"
  578. for k in results.keys():
  579. print('%s: %.2f' % (k.capitalize(), results[k]))
  580. # %% id="q6LPzRASj4M5"
  581. import requests
  582. gcloud_token = !gcloud auth print-access-token
  583. gcloud_tokeninfo = requests.get('https://www.googleapis.com/oauth2/v3/tokeninfo?access_token=' + gcloud_token[0]).json()
  584. gcloud_tokeninfo
  585. # %% [markdown] id="wHrNq7Z1PuQt"
  586. # #Submit result
  587. # %% id="G0ag_iZe-aE3"
  588. print('train model')
  589. model = RandomForestClassifier(n_jobs=None,random_state=27,verbose=0, max_depth=25, criterion='gini')
  590. # re-train on the full training set
  591. model = model.fit(X, np.ravel(Y))
  592. print('predict')
  593. #Predict the response for test dataset
  594. y_pred = model.predict(X_submission)
  595. print('create submission')
  596. # create a dataframe for submission
  597. submission = pd.DataFrame(columns=['id', 'status_group'])
  598. status = enc.inverse_transform(y_pred)
  599. submission['status_group'] = status
  600. submission['id'] = test_values.index
  601. display(submission.head())
  602. submission['status_group'].value_counts().plot(kind='bar')
  603. # save as csv
  604. submission.to_csv('submission.csv', index=False)
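# %%
# Hedged sanity check (an addition, not part of the original pipeline): the
# submission should contain one prediction per row of test_values and only the
# three status_group labels known to the encoder.
assert len(submission) == len(test_values)
assert set(submission['status_group']) <= set(enc.classes_)
print('Submission rows:', len(submission))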
  605. # %% id="-Qv6DQ-SS5Gi"
  606. test_values
  607. # %% id="5Vdfr1-xWIJD"
  608. # %% [markdown] id="Ww0xXVHe201e"
  609. # # Graveyard
  610. # Snippets that are incomplete but interesting nonetheless
  611. # %% id="KwCo_6lJCtV1"
  612. # inspired by: https://medium.com/@gabrielziegler3/multiclass-multilabel-classification-with-xgboost-66195e4d9f2d
  613. from xgboost import XGBClassifier
  614. from sklearn.datasets import load_iris
  615. from sklearn.metrics import confusion_matrix
  616. from sklearn.model_selection import train_test_split
  617. from sklearn.model_selection import cross_val_score, KFold
  618. model = XGBClassifier(max_depth=5, objective='multi:softprob', n_estimators=1000,
  619. num_classes=3)
  620. model = model.fit(X_train_normalized, Y_train)
  621. #Predict the response for test dataset
  622. y_pred = model.predict(X_test_normalized)
  623. accuracy=calc_accuracy(y_pred, Y_test)
  624. print("n_neighbors: %d Accuracy on test set: %.2f #correct: %d" % (d, accuracy, correct))
  625. accuracy_xgboost=accuracy