Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

Airline_Capstone Project_R Studio.R 5.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
  # Packages ----
  # Install only the packages that are not yet available, so re-running the
  # script does not download and reinstall everything on each run.
  required_packages <- c(
    "tidyverse", "dplyr", "psych", "corrplot", "janitor", "umx", "devtools"
  )
  is_installed <- vapply(
    required_packages,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1)
  )
  if (!all(is_installed)) {
    install.packages(required_packages[!is_installed])
  }

  # Show the working directory; files written with relative names land here.
  getwd()

  # Attach the packages the analysis relies on.
  library(readxl)
  library(dplyr)
  library(umx)
  library(psych)
  library(corrplot)
  library(janitor)
  # Upload the dataset ----
  # NOTE(review): machine-specific absolute path; adjust before running
  # elsewhere.
  train_path <- "C:/Users/pansy.dwe/Desktop/AIRLINE/train.xlsx"
  train <- read_excel(train_path)
  names(train)

  # Data transformation: text categories -> numeric codes ----
  # All five columns are recoded against one shared lookup. With numeric
  # replacements on character input, recode() turns any value that is not in
  # the lookup into NA, and such rows are then dropped by na.omit() below.
  coded_columns <- c(
    "Gender", "Customer Type", "Type of Travel", "satisfaction", "Class"
  )
  train_transform1 <- train %>%
    mutate(
      across(
        all_of(coded_columns),
        ~ recode(
          .x,
          "Business" = 1, "Eco" = 2, "Eco Plus" = 3,
          "neutral or dissatisfied" = 1, "satisfied" = 2,
          "Female" = 1, "Male" = 2,
          "Loyal Customer" = 1, "disloyal Customer" = 2,
          "Personal Travel" = 1, "Business travel" = 2
        )
      )
    )

  # Remove null values ----
  # Count the missing cells first, then keep only complete rows.
  sum(is.na(train_transform1))
  train_without_null <- na.omit(train_transform1)
  train_without_null

  # Outlier detection ----
  # Every plot and summary reads from the cleaned data frame explicitly; the
  # script no longer attach()es the raw data, so a bare `Age` cannot silently
  # refer to the uncleaned column.
  boxplot(
    train_without_null$Age,
    ylab = "Age", col = "turquoise", main = "Population distribution"
  )
  summary(train_without_null$Age)
  boxplot(
    train_without_null$`Flight Distance`,
    ylab = "Flight Distance (km)", col = "orange",
    main = "Flight distance distribution"
  )
  boxplot(
    train_without_null$`Arrival Delay in Minutes`,
    ylab = "Arrival Delay in minutes", col = "orange",
    main = "Arrival delay in minutes"
  )
  boxplot(
    train_without_null$`Departure Delay in Minutes`,
    ylab = "Departure Delay in minutes", col = "orange",
    main = "Departure delay in minutes"
  )
  # Note: outliers detected in Flight Distance, Arrival Delay in Minutes and
  # Departure Delay in Minutes, but no outliers detected in Age.
  # Correlation coefficient matrix ----
  # Compute the matrix once and reuse it for both the plot and the rounded
  # table.
  full_cor <- cor(train_without_null)
  corrplot(full_cor, method = "circle")
  mcor <- round(full_cor, 2)
  print(mcor)

  # Pearson's correlation ----
  # Test satisfaction against each selected variable. Results are printed
  # explicitly so they also appear when the script is run via source(); the
  # header line names the pair because the test's own "data:" line only shows
  # the expressions passed in.
  satisfaction_predictors <- c(
    "Gate location",
    "Online boarding",
    "Inflight wifi service",
    "Ease of Online booking",
    "Baggage handling",
    "Inflight service",
    "Departure/Arrival time convenient",
    "Age"
  )
  for (predictor in satisfaction_predictors) {
    cat("\nsatisfaction vs ", predictor, "\n", sep = "")
    print(
      cor.test(
        train_without_null$satisfaction,
        train_without_null[[predictor]]
      )
    )
  }

  # Test each delay column against the time-convenience rating.
  delay_columns <- c("Arrival Delay in Minutes", "Departure Delay in Minutes")
  for (delay_column in delay_columns) {
    cat(
      "\n", delay_column, " vs Departure/Arrival time convenient\n",
      sep = ""
    )
    print(
      cor.test(
        train_without_null[[delay_column]],
        train_without_null$`Departure/Arrival time convenient`
      )
    )
  }
  # Reliability test (Cronbach's alpha) ----
  # umx is already installed and attached at the top of the script, so it is
  # not reinstalled here. The call is namespace-qualified because psych is
  # attached after umx and, in recent versions, appears to export its own
  # reliability(), which would mask umx's -- TODO confirm against the installed
  # psych version.
  train_matrix <- data.matrix(train_without_null)
  umx::reliability(cov(train_matrix))
  # > reliability(cov(train_without_null))
  # Results: Alpha reliability = 0.0064 and Standardized alpha = 0.6615.
  # Cronbach's alpha should be at least about 0.70 - 0.80 for a good-quality /
  # reliable dataset. Therefore, we will remove columns which are not relevant
  # for the analysis (serial number, id), columns in which outliers were
  # detected and which have very low / no correlation with satisfaction
  # (Arrival Delay in Minutes and Departure Delay in Minutes), and group some
  # variables which are important for our analysis (Age and Flight Distance).
  # Grouping ----
  # Bin Age into decade bands. `right = FALSE` makes every interval closed on
  # the left, e.g. [20, 30) is labelled "20 to 29". The argument must be spelled
  # `labels` (lower case): `Labels` is not an argument of cut(), so it was
  # swallowed by `...` and the factor kept the default interval labels.
  train_without_null$Agegroup <- cut(
    train_without_null$Age,
    breaks = c(-Inf, 20, 30, 40, 50, 60, 70, Inf),
    labels = c(
      "under 20", "20 to 29", "30 to 39", "40 to 49",
      "50 to 59", "60 to 69", "70 and above"
    ),
    right = FALSE
  )

  # Bin Flight Distance into three bands. The first break is 500 (not 499) so
  # that, with left-closed intervals, [-Inf, 500) really is "less than 500 km".
  # A distance of exactly 2000 falls into the last band, [2000, Inf).
  train_without_null$flight_distance_group <- cut(
    train_without_null$`Flight Distance`,
    breaks = c(-Inf, 500, 2000, Inf),
    labels = c("less than 500 km", "between 500 & 2000", "above 2000"),
    right = FALSE
  )
  # Remove columns ----
  # Drop the row index and id, the raw Age / Flight Distance columns (replaced
  # by their grouped versions), and the two delay columns.
  dropped_columns <- c(
    "...1", "id", "Age", "Flight Distance",
    "Arrival Delay in Minutes", "Departure Delay in Minutes"
  )
  keep_column <- !(names(train_without_null) %in% dropped_columns)
  train_without_null_remove_columns <- train_without_null[, keep_column]
  head(train_without_null_remove_columns)

  # Reliability test after grouping and removing unnecessary data
  # (Cronbach's alpha) ----
  # data.matrix() converts the two grouped factor columns to their integer
  # codes so that cov() receives an all-numeric matrix. umx is not reinstalled
  # here, and the call is namespace-qualified because psych (attached after
  # umx) appears to export a reliability() of its own in recent versions --
  # TODO confirm.
  train_matrix2 <- data.matrix(train_without_null_remove_columns)
  umx::reliability(cov(train_matrix2))
  # Correlation matrix with the final dataset for 14 categories ----
  # Set aside the demographic / grouping columns and keep the remaining ones.
  excluded_columns <- c(
    "Gender", "Customer Type", "Type of Travel", "Class",
    "Agegroup", "flight_distance_group"
  )
  is_excluded <- names(train_without_null_remove_columns) %in% excluded_columns
  train_cat_corr <- train_without_null_remove_columns[, !is_excluded]
  head(train_cat_corr)

  library(corrplot)
  # Plot the coefficients as numbers, then keep a two-decimal copy in `mcor`.
  final_cor <- cor(train_cat_corr)
  corrplot(final_cor, method = "number")
  mcor <- round(final_cor, 2)
  mcor
  # Export ----
  # write.table(sep = ",") with default row names writes a header that is one
  # field shorter than the data rows, so the columns are shifted when the file
  # is opened as CSV. write.csv() produces a well-formed file: the data export
  # drops the meaningless row numbers, while the correlation matrix keeps its
  # row names (variable names) under a blank header cell.
  write.csv(train_without_null, file = "ExportfromR.csv", row.names = FALSE)
  write.csv(mcor, file = "correlation.csv")

  # Satisfaction vs Age; repeats the same test run earlier in the script.
  cor.test(train_without_null$satisfaction, train_without_null$Age)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...