library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)
library(viridis) 
## Loading required package: viridisLite
library(knitr)
url<-url("https://github.com/pjournal/mef03-Ozgunkrt/blob/master/atp_tennis_data_2017.rdata?raw=true")
load(url)
head(player_df)
## # A tibble: 6 x 13
##   player_id player_slug first_name last_name flag_code residence
##   <chr>     <chr>       <chr>      <chr>     <chr>     <chr>    
## 1 a002      ricardo-ac… Ricardo    Acuna     CHI       Jupiter,…
## 2 a001      sadiq-abdu… Sadiq      Abdullahi NGR       <NA>     
## 3 a005      nelson-aer… Nelson     Aerts     BRA       <NA>     
## 4 a004      egan-adams  Egan       Adams     USA       Palmetto…
## 5 a006      ronald-age… Ronald     Agenor    USA       Beverly …
## 6 a007      juan-aguil… Juan       Aguilera  ESP       <NA>     
## # … with 7 more variables: birth_place <chr>, birth_date <date>,
## #   turned_pro <dbl>, weight_kg <dbl>, height_cm <dbl>, handedness <chr>,
## #   backhand <chr>
head(rank_df)
## # A tibble: 6 x 6
##   week_title player_id rank_number ranking_points tourneys_played
##   <date>     <chr>           <dbl>          <dbl>           <dbl>
## 1 2017-11-20 n409                1          10645              18
## 2 2017-11-20 f324                2           9605              17
## 3 2017-11-20 d875                3           5150              23
## 4 2017-11-20 z355                4           4610              25
## 5 2017-11-20 tb69                5           4015              27
## 6 2017-11-20 c977                6           3805              22
## # … with 1 more variable: player_age <dbl>
head(score_df)
## # A tibble: 6 x 14
##   match_id tourney_id tourney_round_n… winner_player_id loser_player_id
##   <chr>    <chr>      <chr>            <chr>            <chr>          
## 1 2017-33… 339        Finals           d875             n552           
## 2 2017-33… 339        Semi-Finals      d875             r975           
## 3 2017-33… 339        Semi-Finals      n552             w367           
## 4 2017-33… 339        Quarter-Finals   r975             n409           
## 5 2017-33… 339        Quarter-Finals   w367             e831           
## 6 2017-33… 339        Quarter-Finals   n552             tc61           
## # … with 9 more variables: winner_seed <chr>, loser_seed <chr>,
## #   match_score_tiebreaks <chr>, winner_sets_won <dbl>,
## #   loser_sets_won <dbl>, winner_games_won <dbl>, loser_games_won <dbl>,
## #   winner_tiebreaks_won <dbl>, loser_tiebreaks_won <dbl>
head(stats_df)
## # A tibble: 6 x 54
##   match_id match_time match_duration winner_aces winner_double_f…
##   <chr>    <time>              <dbl>       <dbl>            <dbl>
## 1 2017-33… 02:21                 141          23                3
## 2 2017-33… 01:49                 109           7                2
## 3 2017-33… 01:28                  88           4                1
## 4 2017-33… 01:42                 102           1                1
## 5 2017-33… 02:05                 125           3                3
## 6 2017-33… 02:36                 156          11                3
## # … with 49 more variables: winner_first_serves_in <dbl>,
## #   winner_first_serves_total <dbl>, winner_first_serve_points_won <dbl>,
## #   winner_first_serve_points_total <dbl>,
## #   winner_second_serve_points_won <dbl>,
## #   winner_second_serve_points_total <dbl>,
## #   winner_break_points_saved <dbl>,
## #   winner_break_points_serve_total <dbl>,
## #   winner_service_points_won <dbl>, winner_service_points_total <dbl>,
## #   winner_first_serve_return_won <dbl>,
## #   winner_first_serve_return_total <dbl>,
## #   winner_second_serve_return_won <dbl>,
## #   winner_second_serve_return_total <dbl>,
## #   winner_break_points_converted <dbl>,
## #   winner_break_points_return_total <dbl>,
## #   winner_service_games_played <dbl>, winner_return_games_played <dbl>,
## #   winner_return_points_won <dbl>, winner_return_points_total <dbl>,
## #   winner_total_points_won <dbl>, winner_total_points_total <dbl>,
## #   loser_aces <dbl>, loser_double_faults <dbl>,
## #   loser_first_serves_in <dbl>, loser_first_serves_total <dbl>,
## #   loser_first_serve_points_won <dbl>,
## #   loser_first_serve_points_total <dbl>,
## #   loser_second_serve_points_won <dbl>,
## #   loser_second_serve_points_total <dbl>, loser_break_points_saved <dbl>,
## #   loser_break_points_serve_total <dbl>, loser_service_points_won <dbl>,
## #   loser_service_points_total <dbl>, loser_first_serve_return_won <dbl>,
## #   loser_first_serve_return_total <dbl>,
## #   loser_second_serve_return_won <dbl>,
## #   loser_second_serve_return_total <dbl>,
## #   loser_break_points_converted <dbl>,
## #   loser_break_points_return_total <dbl>,
## #   loser_service_games_played <dbl>, loser_return_games_played <dbl>,
## #   loser_return_points_won <dbl>, loser_return_points_total <dbl>,
## #   loser_total_points_won <dbl>, loser_total_points_total <dbl>,
## #   tourney_id <chr>, winner_player_id <chr>, loser_player_id <chr>
head(tourney_df)
## # A tibble: 6 x 12
##   tourney_id tourney_name tourney_location tourney_date tourney_singles…
##        <dbl> <chr>        <chr>            <date>                  <dbl>
## 1        339 Brisbane In… Brisbane, Austr… 2017-01-01                 28
## 2        891 Aircel Chen… Chennai, India   2017-01-02                 28
## 3        451 Qatar Exxon… Doha, Qatar      2017-01-02                 32
## 4        338 Sydney Inte… Sydney, Austral… 2017-01-08                 28
## 5        301 ASB Classic  Auckland, New Z… 2017-01-09                 28
## 6        580 Australian … Melbourne, Aust… 2017-01-16                128
## # … with 7 more variables: tourney_doubles_draw <dbl>,
## #   tourney_conditions <chr>, tourney_surface <chr>,
## #   tourney_fin_commit <chr>, singles_winner_player_id <chr>,
## #   doubles_winner_1_player_id <chr>, doubles_winner_2_player_id <chr>
#1. Rank countries(flag codes) by the singles champions
champ_flags_df <- tourney_df %>%  left_join(.,player_df, by=c("singles_winner_player_id"="player_id")) %>% count(flag_code, sort=TRUE) %>% print(champ_flags_df)
## # A tibble: 21 x 2
##    flag_code     n
##    <chr>     <int>
##  1 ESP          11
##  2 USA           9
##  3 SUI           8
##  4 FRA           7
##  5 GER           7
##  6 BUL           4
##  7 ARG           2
##  8 BEL           2
##  9 BIH           2
## 10 CRO           2
## # … with 11 more rows
#2 - Rank countries which did not get any singles championships by the games won when they win the match

winner_countries <- left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>% 
  drop_na() %>%
  select(flag_code) %>% distinct()

loser_countries <- anti_join(player_df, winner_countries) %>% select(flag_code) %>% distinct()
## Joining, by = "flag_code"
inner_join(player_df, loser_countries) %>% select(player_id, flag_code) %>% distinct() %>%
  inner_join(score_df, by = c("player_id" = "winner_player_id")) %>% select(flag_code, winner_games_won) %>%
  group_by(flag_code) %>% transmute(winner_games_won = sum(winner_games_won)) %>%distinct() %>%
  arrange(desc(winner_games_won))
## Joining, by = "flag_code"
## # A tibble: 39 x 2
## # Groups:   flag_code [39]
##    flag_code winner_games_won
##    <chr>                <dbl>
##  1 AUS                   1989
##  2 CZE                   1209
##  3 CAN                   1190
##  4 SVK                    889
##  5 BRA                    873
##  6 POR                    621
##  7 RSA                    566
##  8 KAZ                    495
##  9 KOR                    438
## 10 GEO                    377
## # … with 29 more rows
#3 - Players which won tourneys more than one and their number of championship
single_won <- left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>% 
  drop_na() %>%
  transmute(player_id,flag_code,tourney_name,first_name,last_name,winner_count = 1) %>% 
  group_by(player_id) %>% 
  transmute(winner_count = sum(winner_count),flag_code,first_name,last_name) %>% 
  arrange(desc(winner_count)) %>%
  distinct() %>%
  ungroup() %>% 
  select(first_name,last_name,winner_count) %>%
  slice(1:10)

print(single_won)
## # A tibble: 10 x 3
##    first_name  last_name     winner_count
##    <chr>       <chr>                <dbl>
##  1 Roger       Federer                  7
##  2 Rafael      Nadal                    6
##  3 Alexander   Zverev                   5
##  4 Grigor      Dimitrov                 4
##  5 Jo-Wilfried Tsonga                   4
##  6 Lucas       Pouille                  3
##  7 Jack        Sock                     3
##  8 Roberto     Bautista Agut            2
##  9 Novak       Djokovic                 2
## 10 Damir       Dzumhur                  2
ggplot(single_won, aes(x=first_name,y=winner_count,fill=first_name)) +
 geom_col() 

match_df <-left_join(stats_df,score_df, by=c("match_id" = "match_id")) %>%  
  select(match_id,match_duration,winner_player_id.x)

winners_match <-  left_join(player_df,match_df,by=c("player_id" = "winner_player_id.x"))

  left_join(tourney_df,winners_match,by=c("singles_winner_player_id" = "player_id")) %>% 
  drop_na() %>%
  select(singles_winner_player_id,first_name,last_name,match_duration) %>% 
  group_by(singles_winner_player_id) %>% 
  transmute(first_name,last_name,max_match_duration = max(match_duration)) %>% distinct() %>%
  arrange(max_match_duration)
## # A tibble: 35 x 4
## # Groups:   singles_winner_player_id [35]
##    singles_winner_player_id first_name last_name     max_match_duration
##    <chr>                    <chr>      <chr>                      <dbl>
##  1 g967                     Peter      Gojowczyk                    146
##  2 se73                     Yuichi     Sugita                       147
##  3 d875                     Grigor     Dimitrov                     156
##  4 k435                     Philipp    Kohlschreiber                157
##  5 d923                     Damir      Dzumhur                      157
##  6 sm25                     Jack       Sock                         163
##  7 h940                     Ryan       Harrison                     166
##  8 i186                     John       Isner                        166
##  9 f510                     Fabio      Fognini                      171
## 10 z355                     Alexander  Zverev                       174
## # … with 25 more rows