#Load library
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
#load data
load("atp_tennis_data_2017.RData")
#analysis
champ_flags_df <- tourney_df %>% left_join(.,player_df,by=c("singles_winner_player_id"= "player_id")) %>% count(flag_code,sort=TRUE)
nonchamp_players <-
player_df %>% select(player_id,flag_code) %>% anti_join(.,champ_flags_df)
## Joining, by = "flag_code"
nonchamp_players %>% left_join(.,score_df,by=c("player_id"="winner_player_id")) %>% group_by(flag_code) %>% summarise(total_won=sum(winner_games_won,na.rm=TRUE)) %>% arrange(desc(total_won))
## # A tibble: 93 x 2
## flag_code total_won
## <chr> <dbl>
## 1 AUS 1989
## 2 CZE 1209
## 3 CAN 1190
## 4 SVK 889
## 5 BRA 873
## 6 POR 621
## 7 RSA 566
## 8 KAZ 495
## 9 KOR 438
## 10 GEO 377
## # … with 83 more rows
#analysis of what are the most surface using in the country
ss <- left_join(player_df, tourney_df, by = c('player_id'='singles_winner_player_id'))
ss %>% group_by(flag_code,tourney_surface) %>% filter(tourney_surface != "NA") %>% count(tourney_surface) %>% arrange(desc(n))
## # A tibble: 33 x 3
## # Groups: flag_code, tourney_surface [33]
## flag_code tourney_surface n
## <chr> <chr> <int>
## 1 USA Hard 7
## 2 ESP Clay 6
## 3 SUI Hard 5
## 4 BUL Hard 4
## 5 ESP Hard 4
## 6 FRA Hard 4
## 7 GER Hard 4
## 8 GER Clay 3
## 9 BEL Hard 2
## 10 BIH Hard 2
## # … with 23 more rows
#analysis of which player take most rate according to match duration and total points
abc <- inner_join(stats_df, score_df,by=c("match_id"))
abc %>% select(winner_player_id.x, match_duration, winner_total_points_total,loser_total_points_total) %>% filter(match_duration > 250)%>% mutate(rate=match_duration/(winner_total_points_total+loser_total_points_total))%>% arrange(desc(rate))
## # A tibble: 14 x 5
## winner_player_id… match_duration winner_total_po… loser_total_poi… rate
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 d646 252 180 180 0.7
## 2 n409 296 365 365 0.405
## 3 w367 274 339 339 0.404
## 4 i165 289 379 379 0.381
## 5 sc77 272 365 365 0.373
## 6 ma30 288 389 389 0.370
## 7 gb88 260 359 359 0.362
## 8 e224 260 360 360 0.361
## 9 sc73 251 354 354 0.355
## 10 r772 295 418 418 0.353
## 11 k336 315 450 450 0.35
## 12 f401 267 396 396 0.337
## 13 cd85 257 382 382 0.336
## 14 bh09 265 399 399 0.332