survey
In order to examine this dataset, the following libraries are required and loaded. This dataset is not available in base-R; therefore, we need to read the dataset first as follows:
## Rows: 886
## Columns: 32
## $ History <int> 1, 1, 1, 4, 3, 5, 3, 5, 3, 3, 3, 2, 4, 2, 2,...
## $ Psychology <int> 5, 3, 2, 4, 2, 3, 3, 2, 2, 2, 3, 2, 4, 2, 5,...
## $ Politics <int> 1, 4, 1, 5, 3, 4, 1, 3, 1, 3, 3, 5, 4, 1, 1,...
## $ Mathematics <int> 3, 5, 5, 4, 2, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1,...
## $ Physics <int> 3, 2, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Internet <int> 5, 4, 4, 3, 2, 4, 2, 5, 1, 5, 4, 5, 3, 3, 4,...
## $ PC <int> 3, 4, 2, 1, 2, 4, 1, 4, 1, 1, 5, 4, 2, 3, 2,...
## $ Economy.Management <int> 5, 5, 4, 2, 2, 1, 3, 1, 1, 4, 3, 1, 1, 3, 3,...
## $ Biology <int> 3, 1, 1, 3, 3, 4, 5, 2, 3, 2, 2, 1, 5, 1, 2,...
## $ Chemistry <int> 3, 1, 1, 3, 3, 4, 5, 2, 1, 1, 1, 1, 5, 1, 1,...
## $ Reading <int> 3, 4, 5, 5, 5, 3, 3, 2, 5, 4, 3, 3, 5, 4, 4,...
## $ Geography <int> 3, 4, 2, 4, 2, 3, 3, 3, 1, 4, 3, 5, 3, 1, 1,...
## $ Foreign.languages <int> 5, 5, 5, 4, 3, 4, 4, 4, 1, 5, 5, 2, 5, 5, 3,...
## $ Medicine <int> 3, 1, 2, 2, 3, 4, 5, 1, 1, 1, 2, 1, 5, 1, 1,...
## $ Law <int> 1, 2, 3, 5, 2, 3, 3, 2, 1, 1, 4, 3, 2, 1, 1,...
## $ Cars <int> 1, 2, 1, 1, 3, 5, 4, 1, 1, 1, 2, 1, 3, 1, 1,...
## $ Art.exhibitions <int> 1, 2, 5, 5, 1, 2, 1, 1, 1, 4, 2, 5, 1, 3, 4,...
## $ Religion <int> 1, 1, 5, 4, 4, 2, 1, 2, 2, 4, 2, 1, 1, 1, 2,...
## $ Countryside..outdoors <int> 5, 1, 5, 1, 4, 5, 4, 2, 4, 4, 4, 5, 5, 5, 3,...
## $ Dancing <int> 3, 1, 5, 1, 1, 1, 3, 1, 1, 5, 1, 1, 3, 3, 1,...
## $ Musical.instruments <int> 3, 1, 5, 1, 3, 5, 2, 1, 2, 3, 1, 1, 4, 3, 1,...
## $ Writing <int> 2, 1, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5,...
## $ Passive.sport <int> 1, 1, 5, 1, 3, 5, 5, 4, 4, 4, 5, 5, 5, 3, 3,...
## $ Active.sport <int> 5, 1, 2, 1, 1, 4, 3, 5, 1, 4, 1, 3, 3, 3, 1,...
## $ Gardening <int> 5, 1, 1, 1, 4, 2, 3, 1, 1, 1, 3, 1, 4, 1, 5,...
## $ Celebrities <int> 1, 2, 1, 2, 3, 1, 1, 3, 5, 2, 2, 2, 3, 5, 5,...
## $ Shopping <int> 4, 3, 4, 4, 3, 2, 3, 3, 2, 4, 5, 3, 2, 5, 5,...
## $ Science.and.technology <int> 4, 3, 2, 3, 3, 3, 4, 2, 1, 3, 4, 3, 3, 2, 2,...
## $ Theatre <int> 2, 2, 5, 1, 2, 1, 3, 2, 5, 5, 2, 1, 2, 3, 4,...
## $ Fun.with.friends <int> 5, 4, 5, 2, 4, 3, 5, 4, 4, 5, 4, 3, 4, 5, 5,...
## $ Adrenaline.sports <int> 4, 2, 5, 1, 2, 3, 1, 2, 1, 2, 1, 1, 1, 4, 1,...
## $ Pets <int> 4, 5, 5, 1, 1, 2, 5, 5, 1, 2, 5, 1, 2, 5, 5,...
## History Psychology Politics Mathematics Physics
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:1.000
## Median :3.000 Median :3.000 Median :2.000 Median :2.00 Median :2.000
## Mean :3.207 Mean :3.117 Mean :2.611 Mean :2.37 Mean :2.082
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:3.00 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
## Internet PC Economy.Management Biology
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:2.00 1st Qu.:1.000 1st Qu.:1.000
## Median :4.000 Median :3.00 Median :2.000 Median :2.000
## Mean :4.176 Mean :3.13 Mean :2.648 Mean :2.649
## 3rd Qu.:5.000 3rd Qu.:4.00 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.00 Max. :5.000 Max. :5.000
## Chemistry Reading Geography Foreign.languages
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000
## Median :2.000 Median :3.000 Median :3.000 Median :4.000
## Mean :2.148 Mean :3.163 Mean :3.105 Mean :3.788
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Medicine Law Cars Art.exhibitions
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :2.000 Median :2.000 Median :2.000
## Mean :2.509 Mean :2.254 Mean :2.658 Mean :2.595
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Religion Countryside..outdoors Dancing Musical.instruments
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :4.000 Median :2.000 Median :2.000
## Mean :2.262 Mean :3.675 Mean :2.438 Mean :2.318
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Writing Passive.sport Active.sport Gardening
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000
## Median :1.000 Median :4.000 Median :3.000 Median :1.000
## Mean :1.886 Mean :3.412 Mean :3.275 Mean :1.894
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:2.750
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Celebrities Shopping Science.and.technology Theatre
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :3.000 Median :3.000 Median :3.000
## Mean :2.345 Mean :3.263 Mean :3.251 Mean :3.026
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Fun.with.friends Adrenaline.sports Pets
## Min. :2.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:2.000
## Median :5.000 Median :3.000 Median :4.000
## Mean :4.554 Mean :2.927 Mean :3.326
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000
survey
#Do PCA with princomp function and use correlation matrix to create components
survey_pca_result <- princomp(survey, cor=TRUE)
summary(survey_pca_result, loadings = TRUE)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.0433958 1.8069970 1.60438109 1.46391156 1.26463164
## Proportion of Variance 0.1304833 0.1020387 0.08043871 0.06696991 0.04997791
## Cumulative Proportion 0.1304833 0.2325220 0.31296072 0.37993063 0.42990854
## Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## Standard deviation 1.18461175 1.06955650 1.05602251 1.04363010 1.00633062
## Proportion of Variance 0.04385328 0.03574847 0.03484949 0.03403637 0.03164692
## Cumulative Proportion 0.47376182 0.50951029 0.54435978 0.57839615 0.61004306
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## Standard deviation 0.95951254 0.93789360 0.93168719 0.89503805 0.87262765
## Proportion of Variance 0.02877076 0.02748889 0.02712628 0.02503416 0.02379622
## Cumulative Proportion 0.63881382 0.66630271 0.69342899 0.71846315 0.74225937
## Comp.16 Comp.17 Comp.18 Comp.19 Comp.20
## Standard deviation 0.85510068 0.8329419 0.80296355 0.78632469 0.76747805
## Proportion of Variance 0.02284991 0.0216810 0.02014845 0.01932208 0.01840695
## Cumulative Proportion 0.76510928 0.7867903 0.80693874 0.82626082 0.84466777
## Comp.21 Comp.22 Comp.23 Comp.24 Comp.25
## Standard deviation 0.74821762 0.72513227 0.71054805 0.70905599 0.69087699
## Proportion of Variance 0.01749468 0.01643178 0.01577745 0.01571126 0.01491597
## Cumulative Proportion 0.86216245 0.87859422 0.89437168 0.91008294 0.92499891
## Comp.26 Comp.27 Comp.28 Comp.29 Comp.30
## Standard deviation 0.65750430 0.64296368 0.60888823 0.5954011 0.551814415
## Proportion of Variance 0.01350975 0.01291882 0.01158578 0.0110782 0.009515598
## Cumulative Proportion 0.93850866 0.95142748 0.96301326 0.9740915 0.983607058
## Comp.31 Comp.32
## Standard deviation 0.539017589 0.483770803
## Proportion of Variance 0.009079374 0.007313568
## Cumulative Proportion 0.992686432 1.000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## History 0.196 0.184 0.243 0.130 0.243 0.142 0.194
## Psychology 0.244 0.119 0.112 -0.102
## Politics 0.131 0.202 0.255 0.204 0.270 0.153 -0.108
## Mathematics 0.295 -0.126 0.133 -0.309 -0.220 -0.273
## Physics 0.344 -0.241 0.136 -0.155 -0.151 -0.151
## Internet 0.237 0.127 -0.400 0.370
## PC 0.377 -0.134 -0.291 0.137 0.144
## Economy.Management 0.174 0.306 0.214 -0.182 -0.166 -0.288
## Biology 0.269 -0.377 -0.151 0.221
## Chemistry 0.199 -0.421 0.230
## Reading 0.282 -0.188 0.176 0.173
## Geography 0.160 0.145 0.151 0.218 0.226 0.243
## Foreign.languages 0.208 0.217 -0.117 -0.165 0.227
## Medicine 0.269 -0.317 -0.118 0.266
## Law 0.143 0.147 0.277 0.363 0.116 -0.142
## Cars 0.350 -0.168
## Art.exhibitions 0.313 -0.201
## Religion 0.230 0.166 0.128
## Countryside..outdoors 0.194 -0.102 -0.334 0.167 0.119
## Dancing 0.241 -0.230 -0.107 -0.326
## Musical.instruments 0.210 -0.361 -0.158
## Writing 0.231 0.161 -0.194 -0.114 0.214 -0.171
## Passive.sport 0.162 -0.222 -0.135 0.116
## Active.sport 0.200 -0.241 -0.138 0.340 -0.286
## Gardening 0.192 -0.178 0.445 -0.200
## Celebrities 0.183 -0.351 0.181 -0.289 0.234
## Shopping -0.136 0.178 -0.399 0.153 -0.241
## Science.and.technology 0.348 0.195
## Theatre 0.302 -0.128 -0.144 -0.256 0.115
## Fun.with.friends 0.135 -0.260 -0.154 -0.498 0.208
## Adrenaline.sports 0.254 -0.221 -0.200 0.313 -0.179
## Pets -0.256 0.138 0.132
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## History 0.162 0.252 0.233
## Psychology 0.214 -0.181 0.232 0.272 0.536 0.153 -0.266
## Politics 0.139 -0.187 0.146
## Mathematics 0.360 -0.129 0.147
## Physics 0.127 0.160 0.130
## Internet -0.107 0.147 0.264 0.174 0.157
## PC -0.183 0.100
## Economy.Management -0.163 0.189 -0.190 -0.148
## Biology
## Chemistry -0.142 0.101
## Reading 0.116 -0.130 -0.150 0.177
## Geography -0.335 0.105 -0.349 -0.228 0.113 0.155
## Foreign.languages -0.364 -0.122 -0.187 0.114 -0.408
## Medicine -0.135 -0.178 -0.101
## Law 0.135 -0.179 -0.158
## Cars 0.101 0.125 -0.331 -0.100 -0.174
## Art.exhibitions 0.166 -0.156 0.114 -0.343
## Religion -0.220 0.370 0.456 -0.146 0.147 0.103
## Countryside..outdoors -0.240 0.371 0.412
## Dancing -0.204 -0.108 0.114 0.102 0.145 0.200
## Musical.instruments -0.106 -0.226 0.270 -0.141 -0.110 -0.428 0.197
## Writing 0.212 -0.319 -0.309 0.149
## Passive.sport -0.150 0.259 0.560 -0.466 0.125 -0.296
## Active.sport -0.204 -0.214 0.153 0.175
## Gardening 0.108 0.220 -0.109 0.146
## Celebrities 0.185
## Shopping -0.108 0.105
## Science.and.technology 0.194 -0.169 -0.186 0.121 -0.194 -0.182
## Theatre 0.190 0.145 -0.258 -0.252
## Fun.with.friends 0.173 -0.147 0.130 0.471
## Adrenaline.sports 0.103 -0.120 0.141
## Pets 0.467 0.336 0.108 0.306 -0.465 0.209
## Comp.16 Comp.17 Comp.18 Comp.19 Comp.20 Comp.21 Comp.22
## History 0.141 0.130 0.369
## Psychology -0.125 0.116 0.242 0.163
## Politics 0.149 0.140 -0.154
## Mathematics 0.183 -0.171 0.169
## Physics 0.150 -0.125 0.126 0.162
## Internet 0.195 -0.216 -0.225 -0.113
## PC 0.226 -0.103 -0.249
## Economy.Management -0.333 -0.159 -0.224
## Biology -0.141
## Chemistry -0.102
## Reading -0.411 -0.271 0.131
## Geography -0.236 0.302 -0.223 0.285
## Foreign.languages -0.329 0.154 0.303 0.202 0.111
## Medicine -0.165
## Law 0.187 -0.301 -0.262 0.183
## Cars 0.164 -0.145 0.380 0.464
## Art.exhibitions 0.122 -0.313 0.286 -0.199
## Religion 0.125 -0.435 -0.322 -0.259
## Countryside..outdoors 0.192 0.189 -0.397 0.125
## Dancing 0.345 0.202 0.211 -0.311 0.108 0.310
## Musical.instruments 0.200 0.303 0.213 -0.230
## Writing -0.169 -0.250 -0.224 -0.178 0.358
## Passive.sport -0.128 0.274
## Active.sport 0.220 0.172 -0.432 0.108 -0.301
## Gardening -0.564 -0.193 0.189 0.236
## Celebrities 0.248 -0.252 0.131 0.254 -0.187 -0.201
## Shopping 0.101 -0.178 0.282 0.122 0.118
## Science.and.technology -0.189 0.554 -0.102 -0.148
## Theatre 0.185 -0.147 -0.153
## Fun.with.friends -0.304 -0.160 0.126
## Adrenaline.sports -0.111 -0.365 -0.278 -0.278
## Pets 0.240 -0.263 0.152
## Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28 Comp.29
## History 0.129 0.487 0.247 0.164
## Psychology -0.228 -0.162 -0.152 -0.144 -0.131
## Politics 0.119 -0.337 0.355 0.335 -0.385 0.120
## Mathematics -0.195
## Physics -0.172 -0.129 0.263
## Internet 0.219 -0.206 0.359 -0.223
## PC 0.189 -0.442 0.499
## Economy.Management 0.180 0.443 -0.235 -0.101
## Biology 0.102
## Chemistry 0.221 0.135 0.165
## Reading 0.303 -0.144 -0.245 -0.303 -0.348 -0.174
## Geography -0.175 -0.272
## Foreign.languages -0.174 0.149 0.178 0.190 0.137 0.118
## Medicine -0.110 0.155
## Law -0.438 -0.128 -0.253 0.319
## Cars 0.262 -0.220 -0.261
## Art.exhibitions -0.126 0.242 0.332 0.122 -0.262 -0.311
## Religion 0.142
## Countryside..outdoors -0.135 -0.142 0.280 -0.132
## Dancing 0.274 -0.102 0.142 0.260
## Musical.instruments 0.103 -0.257 -0.171 -0.179
## Writing -0.110 0.165 -0.197 0.289 0.140
## Passive.sport 0.112 -0.129
## Active.sport -0.258 -0.139 0.125 -0.204 0.126
## Gardening -0.112 0.157 -0.193 0.135
## Celebrities 0.318 -0.237 -0.228 0.169 0.194
## Shopping -0.342 0.322 0.255 -0.186 -0.295 -0.215 0.163
## Science.and.technology -0.107 0.175 -0.435
## Theatre -0.202 0.486 0.372
## Fun.with.friends -0.126 -0.305 0.136
## Adrenaline.sports 0.290 0.419 -0.245
## Pets
## Comp.30 Comp.31 Comp.32
## History 0.179
## Psychology -0.195
## Politics 0.113
## Mathematics 0.575
## Physics 0.123 -0.650
## Internet
## PC 0.162
## Economy.Management -0.239
## Biology -0.792
## Chemistry -0.623 0.390
## Reading -0.155
## Geography
## Foreign.languages
## Medicine 0.632 0.397
## Law
## Cars
## Art.exhibitions -0.102
## Religion -0.111
## Countryside..outdoors
## Dancing
## Musical.instruments
## Writing 0.150
## Passive.sport
## Active.sport
## Gardening
## Celebrities
## Shopping 0.134
## Science.and.technology
## Theatre -0.175 0.161
## Fun.with.friends
## Adrenaline.sports
## Pets
ggplot(data = data.frame(PC = 1:length(survey_pca_result$sdev), var_exp = cumsum(survey_pca_result$sdev^2 / sum(survey_pca_result$sdev^2))), aes(x = PC, y = var_exp)) +
geom_line() +
geom_point() +
geom_hline(yintercept = 1, color = "black") +
geom_hline(yintercept = 0.9, color = "blue2") +
geom_vline(xintercept = 24, color = "blue2") +
geom_hline(yintercept = 0.8, color = "green2") +
geom_vline(xintercept = 18, color = "green2") +
scale_y_continuous(labels = scales::percent,breaks=seq(0,1,length.out=11)) +
scale_x_continuous(breaks=seq(0,32,by=2)) +
labs(title = "Trade-off Graph between Explanatory Power and number of Explanatory Variables", y = "Cumulative Explained Variance in %")
#Correlation is a similarity measure between -1 and 1
#Get the negative of it as a distance measure and add 1 to make sure distances start from 0
survey_dist <- 1 - cor(survey)
#Apply MDS into 2 dimensional scale
survey_mds <- cmdscale(survey_dist, k=2)
#Provide column names
colnames(survey_mds) <- c("x","y")
print(survey_mds)
## x y
## History 0.06372563 0.012956654
## Psychology 0.23826552 -0.024422880
## Politics -0.14749496 0.107881476
## Mathematics -0.32345626 -0.297702691
## Physics -0.32464296 -0.499240678
## Internet -0.43673303 0.187118496
## PC -0.58516404 -0.107666761
## Economy.Management -0.30422083 0.383264449
## Biology 0.25483318 -0.395803325
## Chemistry 0.12745384 -0.486209735
## Reading 0.54412403 -0.002498553
## Geography -0.04741594 0.042633809
## Foreign.languages 0.20488850 0.223464512
## Medicine 0.21225026 -0.352629731
## Law -0.07751035 0.252871501
## Cars -0.58387376 0.046912232
## Art.exhibitions 0.37143545 0.009747048
## Religion 0.19597972 -0.174122589
## Countryside..outdoors 0.14091867 -0.071530114
## Dancing 0.27743365 0.136486519
## Musical.instruments 0.16521053 -0.134880364
## Writing 0.30297822 -0.032446839
## Passive.sport -0.30096850 0.108795592
## Active.sport -0.24240576 0.047405197
## Gardening 0.19439442 -0.077481786
## Celebrities 0.06494596 0.467218540
## Shopping 0.20807212 0.463704403
## Science.and.technology -0.36445481 -0.278443856
## Theatre 0.44547786 0.054248884
## Fun.with.friends -0.05316373 0.253721685
## Adrenaline.sports -0.30291870 0.041430796
## Pets 0.08203608 0.095218109
#Plot
ggplot(data.frame(survey_mds), aes(x = x, y = y)) +
geom_text(label = rownames(survey_mds), angle = 45, size = 2.7) +
labs(title = "Multidimensional Scaling Output", subtitle = "Correlation taken as the measure of similarity")
Young People Survey
MDS output.set.seed(100)
sum_of_squares <- c()
for(i in 2:15){
fit <- kmeans(survey_mds, centers = i)
sum_of_squares[i] <- 100 * round(1-(fit$tot.withinss/fit$totss),digits = 3)
}
plot(x=1:15, y=sum_of_squares, xlab = "Number of clusters", ylab = "Between_SS / Total_SS")
lines(1:15, sum_of_squares)
abline(h=80)
abline(h=90)
abline(v=6)
#Set the seed since K-Means algorithm uses a search based method
set.seed(100)
#Apply K-Means
hobby_cluster <- kmeans(survey_mds, centers = 6) #k=6
#Get the clusters
mds_clusters <- data.frame(hobby = names(hobby_cluster$cluster), cluster_mds = hobby_cluster$cluster) %>%
arrange(cluster_mds, hobby)
mds_clusters
## hobby cluster_mds
## Biology Biology 1
## Chemistry Chemistry 1
## Medicine Medicine 1
## Active.sport Active.sport 2
## Adrenaline.sports Adrenaline.sports 2
## Economy.Management Economy.Management 2
## Fun.with.friends Fun.with.friends 2
## Geography Geography 2
## Law Law 2
## Passive.sport Passive.sport 2
## Politics Politics 2
## Cars Cars 3
## Internet Internet 3
## PC PC 3
## Mathematics Mathematics 4
## Physics Physics 4
## Science.and.technology Science.and.technology 4
## Celebrities Celebrities 5
## Foreign.languages Foreign.languages 5
## Shopping Shopping 5
## Art.exhibitions Art.exhibitions 6
## Countryside..outdoors Countryside..outdoors 6
## Dancing Dancing 6
## Gardening Gardening 6
## History History 6
## Musical.instruments Musical.instruments 6
## Pets Pets 6
## Psychology Psychology 6
## Reading Reading 6
## Religion Religion 6
## Theatre Theatre 6
## Writing Writing 6
#Plot the output
data.frame(survey_mds) %>%
mutate(clusters = as.factor(hobby_cluster$cluster), hobby = rownames(survey_mds)) %>%
ggplot(., aes(x = x, y = y)) +
geom_text(aes(label = hobby, color = clusters), angle=45, size=2.7) +
geom_point(data = as.data.frame(hobby_cluster$centers), aes(x=x,y=y)) +
labs(title = "K-Means Method Applied into MDS Output", color = "Clusters")
#Set the seed because K-Means algorithm uses a search based method
set.seed(100)
#Apply K-Means to the raw data
hobby_cluster_raw <- kmeans(t(survey), centers = 6)
#Build the comparison data
compare_kmeans <- left_join(mds_clusters, data.frame(hobby = names(hobby_cluster_raw$cluster), cluster_raw = hobby_cluster_raw$cluster), by="hobby")
print(compare_kmeans)
## hobby cluster_mds cluster_raw
## 1 Biology 1 1
## 2 Chemistry 1 1
## 3 Medicine 1 1
## 4 Active.sport 2 2
## 5 Adrenaline.sports 2 2
## 6 Economy.Management 2 4
## 7 Fun.with.friends 2 3
## 8 Geography 2 5
## 9 Law 2 4
## 10 Passive.sport 2 2
## 11 Politics 2 4
## 12 Cars 3 4
## 13 Internet 3 3
## 14 PC 3 4
## 15 Mathematics 4 1
## 16 Physics 4 1
## 17 Science.and.technology 4 4
## 18 Celebrities 5 6
## 19 Foreign.languages 5 3
## 20 Shopping 5 5
## 21 Art.exhibitions 6 6
## 22 Countryside..outdoors 6 3
## 23 Dancing 6 6
## 24 Gardening 6 6
## 25 History 6 5
## 26 Musical.instruments 6 6
## 27 Pets 6 5
## 28 Psychology 6 5
## 29 Reading 6 5
## 30 Religion 6 6
## 31 Theatre 6 5
## 32 Writing 6 6
#Set the seed because K-Means algorithm uses a search based method
set.seed(100)
#Prepare data (get mean scores) and apply K-Means
survey_means_data <- survey %>%
summarise_each(funs(mean)) %>%
t()
survey_means_data
## [,1]
## History 3.206546
## Psychology 3.117381
## Politics 2.610609
## Mathematics 2.370203
## Physics 2.082393
## Internet 4.176072
## PC 3.129797
## Economy.Management 2.647856
## Biology 2.648984
## Chemistry 2.147856
## Reading 3.162528
## Geography 3.104966
## Foreign.languages 3.787810
## Medicine 2.509029
## Law 2.253950
## Cars 2.658014
## Art.exhibitions 2.594808
## Religion 2.261851
## Countryside..outdoors 3.674944
## Dancing 2.437923
## Musical.instruments 2.318284
## Writing 1.886005
## Passive.sport 3.411964
## Active.sport 3.275395
## Gardening 1.893905
## Celebrities 2.345372
## Shopping 3.262980
## Science.and.technology 3.250564
## Theatre 3.025959
## Fun.with.friends 4.554176
## Adrenaline.sports 2.926637
## Pets 3.326185
means_kmeans <- kmeans(survey_means_data, centers = 6)
#Add to compare kmeans
compare_kmeans <- left_join(compare_kmeans, data.frame(hobby = names(means_kmeans$cluster), cluster_mean = means_kmeans$cluster), by="hobby")
print(compare_kmeans)
## hobby cluster_mds cluster_raw cluster_mean
## 1 Biology 1 1 4
## 2 Chemistry 1 1 6
## 3 Medicine 1 1 4
## 4 Active.sport 2 2 2
## 5 Adrenaline.sports 2 2 2
## 6 Economy.Management 2 4 4
## 7 Fun.with.friends 2 3 3
## 8 Geography 2 5 2
## 9 Law 2 4 1
## 10 Passive.sport 2 2 2
## 11 Politics 2 4 4
## 12 Cars 3 4 4
## 13 Internet 3 3 3
## 14 PC 3 4 2
## 15 Mathematics 4 1 1
## 16 Physics 4 1 6
## 17 Science.and.technology 4 4 2
## 18 Celebrities 5 6 1
## 19 Foreign.languages 5 3 5
## 20 Shopping 5 5 2
## 21 Art.exhibitions 6 6 4
## 22 Countryside..outdoors 6 3 5
## 23 Dancing 6 6 1
## 24 Gardening 6 6 6
## 25 History 6 5 2
## 26 Musical.instruments 6 6 1
## 27 Pets 6 5 2
## 28 Psychology 6 5 2
## 29 Reading 6 5 2
## 30 Religion 6 6 1
## 31 Theatre 6 5 2
## 32 Writing 6 6 6
### Single link (MIN) method
survey_hc_min <- hclust(as.dist(survey_dist), method="single")
plot(survey_hc_min, hang=-1)
### Complete link (MAX) method
survey_hc_max <- hclust(as.dist(survey_dist), method="complete")
plot(survey_hc_max, hang=-1)
### Average method
survey_hc_avg <- hclust(as.dist(survey_dist), method="average")
plot(survey_hc_avg, hang=-1)
#Run a simple linear regression
ggplot(data=survey) +
geom_jitter(aes(x=Biology, y=Medicine, color=Biology)) +
geom_smooth(method='lm',aes(x=Biology,y=Medicine),se=FALSE)
##
## Call:
## lm(formula = Biology ~ Medicine, data = survey)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4494 -0.5583 -0.2811 0.5506 3.4417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.83547 0.06780 12.32 <2e-16 ***
## Medicine 0.72280 0.02375 30.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9632 on 884 degrees of freedom
## Multiple R-squared: 0.5117, Adjusted R-squared: 0.5112
## F-statistic: 926.5 on 1 and 884 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = Biology ~ ., data = survey)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5615 -0.4854 -0.0821 0.4531 3.0555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.069068 0.258427 -0.267 0.78933
## History -0.010261 0.026593 -0.386 0.69970
## Psychology 0.101361 0.025590 3.961 8.09e-05 ***
## Politics -0.080921 0.026817 -3.018 0.00262 **
## Mathematics -0.007042 0.027649 -0.255 0.79902
## Physics 0.026856 0.032853 0.817 0.41391
## Internet -0.048857 0.036370 -1.343 0.17951
## PC -0.004014 0.028582 -0.140 0.88835
## Economy.Management -0.020069 0.024706 -0.812 0.41683
## Chemistry 0.394377 0.027990 14.090 < 2e-16 ***
## Reading -0.008875 0.024308 -0.365 0.71511
## Geography 0.052637 0.024747 2.127 0.03370 *
## Foreign.languages -0.010099 0.028463 -0.355 0.72282
## Medicine 0.410047 0.027886 14.705 < 2e-16 ***
## Law -0.021906 0.027282 -0.803 0.42222
## Cars -0.060565 0.023939 -2.530 0.01159 *
## Art.exhibitions 0.038568 0.027250 1.415 0.15733
## Religion 0.059759 0.023517 2.541 0.01123 *
## Countryside..outdoors 0.027817 0.026175 1.063 0.28821
## Dancing -0.012575 0.022962 -0.548 0.58407
## Musical.instruments -0.008970 0.021070 -0.426 0.67042
## Writing -0.044092 0.025543 -1.726 0.08468 .
## Passive.sport 0.027892 0.021128 1.320 0.18713
## Active.sport -0.002701 0.021028 -0.128 0.89783
## Gardening 0.106469 0.026850 3.965 7.95e-05 ***
## Celebrities 0.026536 0.025667 1.034 0.30150
## Shopping 0.039514 0.027008 1.463 0.14382
## Science.and.technology 0.046472 0.026900 1.728 0.08442 .
## Theatre -0.010621 0.028141 -0.377 0.70597
## Fun.with.friends 0.046378 0.041194 1.126 0.26055
## Adrenaline.sports -0.014664 0.023326 -0.629 0.52975
## Pets 0.043556 0.018900 2.305 0.02143 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8176 on 854 degrees of freedom
## Multiple R-squared: 0.6602, Adjusted R-squared: 0.6479
## F-statistic: 53.52 on 31 and 854 DF, p-value: < 2.2e-16
Click here in order to see the first part of the assignment where I reviewed another dataset esoph