url<-url("https://github.com/pjournal/mef03-karaahmetlid/blob/master/atp_tennis_data_2017.RData?raw=true")
load(url)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(viridis) 
library(knitr)

Task1: Rank countries by the singles champions.

player_tourney_df <- left_join(tourney_df, player_df, by = c("singles_winner_player_id" = "player_id"))
player_tourney_df %>% select(flag_code) %>% group_by(flag_code) %>% summarise(Count=n()) %>% arrange(desc(Count))
## # A tibble: 21 x 2
##    flag_code Count
##    <chr>     <int>
##  1 ESP          11
##  2 USA           9
##  3 SUI           8
##  4 FRA           7
##  5 GER           7
##  6 BUL           4
##  7 ARG           2
##  8 BEL           2
##  9 BIH           2
## 10 CRO           2
## # ... with 11 more rows

Task 2: Rank countries which did not get any singles championships by the games won when they win the match.

non_ch_pl <- player_df %>% select(player_id, flag_code) %>% anti_join(player_tourney_df)
country_non_ch_pl <- non_ch_pl %>% left_join(score_df, by = c("player_id" = "winner_player_id")) %>% 
  group_by(flag_code) %>% summarise(totalWon = sum(winner_games_won, na.rm = TRUE)) %>% arrange(desc(totalWon))
country_non_ch_pl
## # A tibble: 93 x 2
##    flag_code totalWon
##    <chr>        <dbl>
##  1 AUS           1989
##  2 CZE           1209
##  3 CAN           1190
##  4 SVK            889
##  5 BRA            873
##  6 POR            621
##  7 RSA            566
##  8 KAZ            495
##  9 KOR            438
## 10 GEO            377
## # ... with 83 more rows
country_non_ch_pl %>% top_n(20) %>%
  ggplot(aes(x = flag_code, y = totalWon, fill = flag_code)) +
  geom_col()

Time spent on court of the players who had reached at least Quarter Finals in Grand Slam Tournements

match_dur_df <- left_join(score_df, stats_df) %>% 
  select("match_id", "tourney_id", "winner_player_id", "loser_player_id", "tourney_round_name", "match_duration")
match_dur_df$tourney_id <- as.numeric(match_dur_df$tourney_id)
match_dur_df
## # A tibble: 3,834 x 6
##    match_id tourney_id winner_player_id loser_player_id tourney_round_n~
##    <chr>         <dbl> <chr>            <chr>           <chr>           
##  1 2017-33~        339 d875             n552            Finals          
##  2 2017-33~        339 d875             r975            Semi-Finals     
##  3 2017-33~        339 n552             w367            Semi-Finals     
##  4 2017-33~        339 r975             n409            Quarter-Finals  
##  5 2017-33~        339 w367             e831            Quarter-Finals  
##  6 2017-33~        339 n552             tc61            Quarter-Finals  
##  7 2017-33~        339 d875             tb69            Quarter-Finals  
##  8 2017-33~        339 r975             sm37            Round of 16     
##  9 2017-33~        339 w367             t840            Round of 16     
## 10 2017-33~        339 n552             dc58            Round of 16     
## # ... with 3,824 more rows, and 1 more variable: match_duration <dbl>
tourney_GSlam <- tourney_df %>% 
  filter(tourney_singles_draw == 128) %>% select("tourney_id", "tourney_name")
tourney_GSlam
## # A tibble: 4 x 2
##   tourney_id tourney_name   
##        <dbl> <chr>          
## 1        580 Australian Open
## 2        520 Roland Garros  
## 3        540 Wimbledon      
## 4        560 US Open
GSlam_Mdur <- left_join(tourney_GSlam, match_dur_df)
kable(head(GSlam_Mdur, n = 20L))
tourney_id tourney_name match_id winner_player_id loser_player_id tourney_round_name match_duration
580 Australian Open 2017-580-f324-n409 f324 n409 Finals 218
580 Australian Open 2017-580-f324-w367 f324 w367 Semi-Finals 185
580 Australian Open 2017-580-n409-d875 n409 d875 Semi-Finals 296
580 Australian Open 2017-580-n409-r975 n409 r975 Quarter-Finals 164
580 Australian Open 2017-580-w367-t786 w367 t786 Quarter-Finals 135
580 Australian Open 2017-580-d875-gb88 d875 gb88 Quarter-Finals 133
580 Australian Open 2017-580-f324-z168 f324 z168 Quarter-Finals 92
580 Australian Open 2017-580-z168-mc10 z168 mc10 Round of 16 214
580 Australian Open 2017-580-r975-bd06 r975 bd06 Round of 16 169
580 Australian Open 2017-580-w367-sa93 w367 sa93 Round of 16 164
580 Australian Open 2017-580-f324-n552 f324 n552 Round of 16 204
580 Australian Open 2017-580-n409-mc65 n409 mc65 Round of 16 176
580 Australian Open 2017-580-gb88-tb69 gb88 tb69 Round of 16 164
580 Australian Open 2017-580-t786-e687 t786 e687 Round of 16 173
580 Australian Open 2017-580-d875-i165 d875 i165 Round of 16 145
580 Australian Open 2017-580-mc10-q927 mc10 q927 Round of 32 119
580 Australian Open 2017-580-r975-sd32 r975 sd32 Round of 32 155
580 Australian Open 2017-580-w367-t840 w367 t840 Round of 32 152
580 Australian Open 2017-580-n552-l797 n552 l797 Round of 32 131
580 Australian Open 2017-580-mc65-k435 mc65 k435 Round of 32 116
GSlam_QF_Players <- GSlam_Mdur %>% filter(tourney_round_name == "Round of 16") %>% 
  select("winner_player_id")  %>% distinct(winner_player_id) %>% 
  left_join(player_df,  by = c("winner_player_id" = "player_id")) %>%
  select(1, 2, 3, 4)
kable(GSlam_QF_Players)
winner_player_id player_slug first_name last_name
z168 mischa-zverev Mischa Zverev
r975 milos-raonic Milos Raonic
w367 stan-wawrinka Stan Wawrinka
f324 roger-federer Roger Federer
n409 rafael-nadal Rafael Nadal
gb88 david-goffin David Goffin
t786 jo-wilfried-tsonga Jo-Wilfried Tsonga
d875 grigor-dimitrov Grigor Dimitrov
mc10 andy-murray Andy Murray
d643 novak-djokovic Novak Djokovic
cd85 pablo-carreno-busta Pablo Carreno Busta
tb69 dominic-thiem Dominic Thiem
c977 marin-cilic Marin Cilic
n552 kei-nishikori Kei Nishikori
ma30 gilles-muller Gilles Muller
ba47 tomas-berdych Tomas Berdych
q927 sam-querrey Sam Querrey
d683 juan-martin-del-potro Juan Martin del Potro
re44 andrey-rublev Andrey Rublev
sm37 diego-schwartzman Diego Schwartzman
a678 kevin-anderson Kevin Anderson
min_on_court <- GSlam_Mdur %>% 
  group_by(winner_player_id) %>% summarise(player_on_court = sum(match_duration))
min_on_court <- left_join(GSlam_QF_Players, min_on_court) %>% select(4, 5) %>% na.omit() %>%
  arrange(desc(player_on_court))
kable(min_on_court)
last_name player_on_court
Nadal 3235
Federer 2295
Thiem 1885
Anderson 1856
Rublev 1762
Murray 1706
Wawrinka 1686
Cilic 1675
Carreno Busta 1530
Zverev 1520
Raonic 1440
Nishikori 1422
Dimitrov 1302
Goffin 1270
del Potro 1182
Djokovic 1162
Berdych 1085
Muller 1055
Schwartzman 1020
Tsonga 944
min_on_court %>%
  ggplot(aes(x = last_name, y = player_on_court, fill = last_name)) +
  geom_col() + 
  scale_fill_viridis(discrete=TRUE) +
  theme(legend.position = "bottom")

Point win/loss ratios of the players ranked top 5 by year end 2017

top5_players <- rank_df %>% filter(week_title == max(week_title)) %>% top_n(5, ranking_points) %>%
  select(2, 3) %>% left_join(select(player_df, player_id, first_name, last_name))
kable(top5_players)
player_id rank_number first_name last_name
n409 1 Rafael Nadal
f324 2 Roger Federer
d875 3 Grigor Dimitrov
z355 4 Alexander Zverev
tb69 5 Dominic Thiem
w_pts_win_loss <- stats_df %>% 
  select("winner_player_id", "winner_total_points_won", "loser_total_points_won") %>% na.omit() %>%
  group_by(winner_player_id) %>% 
  summarise(w_tot_pts_won = sum(winner_total_points_won), w_tot_pts_lost = sum(loser_total_points_won)) 

l_pts_win_loss <- stats_df %>% 
  select("loser_player_id", "loser_total_points_won", "winner_total_points_won") %>% na.omit() %>%
  group_by(loser_player_id) %>% 
  summarise(l_tot_pts_won = sum(loser_total_points_won), l_tot_pts_lost = sum(winner_total_points_won))

winloss_ratio <- left_join(w_pts_win_loss, l_pts_win_loss, by = c("winner_player_id" = "loser_player_id")) %>%
  rename(player_id = winner_player_id) %>%
  mutate(w_l_ratio = (w_tot_pts_won + l_tot_pts_won) / (w_tot_pts_lost + l_tot_pts_lost))
kable(head(winloss_ratio, n = 12L))
player_id w_tot_pts_won w_tot_pts_lost l_tot_pts_won l_tot_pts_lost w_l_ratio
a479 1015 869 712 842 1.0093513
a622 193 160 199 252 0.9514563
a678 3210 2602 1590 1754 1.1019284
a829 1657 1461 1805 2126 0.9651519
a853 62 30 334 436 0.8497854
a887 538 460 366 427 1.0191657
ae14 402 361 309 404 0.9294118
af32 70 64 49 67 0.9083969
b747 1746 1467 1283 1490 1.0243490
b837 1450 1204 1026 1284 0.9951768
b884 1275 1052 1333 1620 0.9760479
b896 514 441 800 906 0.9755011
top5_wlR <- left_join(top5_players, winloss_ratio)
kable(top5_wlR)
player_id rank_number first_name last_name w_tot_pts_won w_tot_pts_lost l_tot_pts_won l_tot_pts_lost w_l_ratio
n409 1 Rafael Nadal 5572 4158 947 1066 1.247894
f324 2 Roger Federer 4686 3754 467 476 1.218203
d875 3 Grigor Dimitrov 4110 3277 1685 1862 1.127651
z355 4 Alexander Zverev 4418 3701 1677 1939 1.080674
tb69 5 Dominic Thiem 4053 3286 2119 2468 1.072645