Library Import

library(tidyverse)

Exploring Data Frames

atp_data <- load("atp_tennis_data_2017.RData")

str(tourney_df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 67 obs. of  12 variables:
##  $ tourney_id                : num  339 891 451 338 301 ...
##  $ tourney_name              : chr  "Brisbane International presented by Suncorp" "Aircel Chennai Open" "Qatar ExxonMobil Open" "Sydney International" ...
##  $ tourney_location          : chr  "Brisbane, Australia" "Chennai, India" "Doha, Qatar" "Sydney, Australia" ...
##  $ tourney_date              : Date, format: "2017-01-01" "2017-01-02" ...
##  $ tourney_singles_draw      : num  28 28 32 28 28 128 28 28 28 28 ...
##  $ tourney_doubles_draw      : num  28 28 32 28 28 128 28 28 28 28 ...
##  $ tourney_conditions        : chr  "Outdoor" "Outdoor" "Outdoor" "Outdoor" ...
##  $ tourney_surface           : chr  "Hard" "Hard" "Hard" "Hard" ...
##  $ tourney_fin_commit        : chr  "$495,630" "$505,730" "$1,334,270" "$495,630" ...
##  $ singles_winner_player_id  : chr  "d875" "bd06" "d643" "ma30" ...
##  $ doubles_winner_1_player_id: chr  "kd46" "b757" "ca12" "kc41" ...
##  $ doubles_winner_2_player_id: chr  "tc61" "n480" "me04" "mb88" ...
str(player_df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10912 obs. of  13 variables:
##  $ player_id  : chr  "a002" "a001" "a005" "a004" ...
##  $ player_slug: chr  "ricardo-acuna" "sadiq-abdullahi" "nelson-aerts" "egan-adams" ...
##  $ first_name : chr  "Ricardo" "Sadiq" "Nelson" "Egan" ...
##  $ last_name  : chr  "Acuna" "Abdullahi" "Aerts" "Adams" ...
##  $ flag_code  : chr  "CHI" "NGR" "BRA" "USA" ...
##  $ residence  : chr  "Jupiter, FL, USA" NA NA "Palmetto, FL, USA" ...
##  $ birth_place: chr  "Santiago, Chile" NA "Cachoeira Do Sul, Brazil" "Miami Beach, FL, USA" ...
##  $ birth_date : Date, format: "1958-01-13" "1960-02-02" ...
##  $ turned_pro : num  0 0 0 0 1983 ...
##  $ weight_kg  : num  68 0 75 73 82 68 0 0 0 82 ...
##  $ height_cm  : num  175 0 188 178 180 183 0 0 0 191 ...
##  $ handedness : chr  NA NA NA NA ...
##  $ backhand   : chr  NA NA NA NA ...
str(rank_df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 87740 obs. of  6 variables:
##  $ week_title     : Date, format: "2017-11-20" "2017-11-20" ...
##  $ player_id      : chr  "n409" "f324" "d875" "z355" ...
##  $ rank_number    : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ ranking_points : num  10645 9605 5150 4610 4015 ...
##  $ tourneys_played: num  18 17 23 25 27 22 26 22 15 25 ...
##  $ player_age     : num  31 36 26 20 24 29 26 25 32 26 ...
str(score_df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3830 obs. of  14 variables:
##  $ match_id             : chr  "2017-339-d875-n552" "2017-339-d875-r975" "2017-339-n552-w367" "2017-339-r975-n409" ...
##  $ tourney_id           : chr  "339" "339" "339" "339" ...
##  $ tourney_round_name   : chr  "Finals" "Semi-Finals" "Semi-Finals" "Quarter-Finals" ...
##  $ winner_player_id     : chr  "d875" "d875" "n552" "r975" ...
##  $ loser_player_id      : chr  "n552" "r975" "w367" "n409" ...
##  $ winner_seed          : chr  "7" "7" "3" "1" ...
##  $ loser_seed           : chr  "3" "1" "2" "5" ...
##  $ match_score_tiebreaks: chr  "62 26 63" "76(7) 62" "76(3) 63" "46 63 64" ...
##  $ winner_sets_won      : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ loser_sets_won       : num  1 0 0 1 1 0 1 0 0 1 ...
##  $ winner_games_won     : num  14 13 13 16 18 12 16 12 13 16 ...
##  $ loser_games_won      : num  11 8 9 13 15 2 12 5 10 13 ...
##  $ winner_tiebreaks_won : num  0 1 1 0 0 0 0 0 1 0 ...
##  $ loser_tiebreaks_won  : num  0 0 0 0 1 0 0 0 0 0 ...
str(stats_df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    3811 obs. of  54 variables:
##  $ match_id                        : chr  "2017-339-r975-n409" "2017-339-d875-n552" "2017-339-d875-r975" "2017-339-n552-w367" ...
##  $ match_time                      : 'hms' num  02:21:00 01:49:00 01:28:00 01:42:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ match_duration                  : num  141 109 88 102 125 156 62 69 89 90 ...
##  $ winner_aces                     : num  23 7 4 1 3 11 3 12 11 7 ...
##  $ winner_double_faults            : num  3 2 1 1 3 3 0 1 1 2 ...
##  $ winner_first_serves_in          : num  62 52 36 56 52 67 19 40 44 41 ...
##  $ winner_first_serves_total       : num  97 77 58 77 94 119 34 53 65 64 ...
##  $ winner_first_serve_points_won   : num  50 41 27 37 42 47 18 30 36 33 ...
##  $ winner_first_serve_points_total : num  62 52 36 56 52 67 19 40 44 41 ...
##  $ winner_second_serve_points_won  : num  16 12 18 14 23 28 10 7 15 15 ...
##  $ winner_second_serve_points_total: num  35 25 22 21 42 52 15 13 21 23 ...
##  $ winner_break_points_saved       : num  6 5 0 4 13 11 0 2 4 0 ...
##  $ winner_break_points_serve_total : num  7 7 0 5 14 13 0 3 4 1 ...
##  $ winner_service_points_won       : num  66 53 45 51 65 75 28 37 51 48 ...
##  $ winner_service_points_total     : num  97 77 58 77 94 119 34 53 65 64 ...
##  $ winner_first_serve_return_won   : num  22 13 4 10 8 13 13 12 1 7 ...
##  $ winner_first_serve_return_total : num  61 49 28 37 37 65 28 29 35 46 ...
##  $ winner_second_serve_return_won  : num  9 11 17 14 21 21 14 9 14 17 ...
##  $ winner_second_serve_return_total: num  23 20 33 24 45 32 19 15 23 25 ...
##  $ winner_break_points_converted   : num  2 3 2 2 3 4 5 4 1 2 ...
##  $ winner_break_points_return_total: num  4 5 4 2 7 10 8 6 4 5 ...
##  $ winner_service_games_played     : num  15 13 10 11 14 16 7 9 11 11 ...
##  $ winner_return_games_played      : num  14 12 10 10 14 16 7 8 10 11 ...
##  $ winner_return_points_won        : num  31 24 21 24 29 34 27 21 15 24 ...
##  $ winner_return_points_total      : num  84 69 61 61 82 97 47 44 58 71 ...
##  $ winner_total_points_won         : num  97 77 66 75 94 109 55 58 66 72 ...
##  $ winner_total_points_total       : num  181 146 119 138 176 216 81 97 123 135 ...
##  $ loser_aces                      : num  4 4 4 9 6 2 1 0 11 5 ...
##  $ loser_double_faults             : num  0 0 3 2 5 2 2 1 1 7 ...
##  $ loser_first_serves_in           : num  61 49 28 37 37 65 28 29 35 46 ...
##  $ loser_first_serves_total        : num  84 69 61 61 82 97 47 44 58 71 ...
##  $ loser_first_serve_points_won    : num  39 36 24 27 29 52 15 17 34 39 ...
##  $ loser_first_serve_points_total  : num  61 49 28 37 37 65 28 29 35 46 ...
##  $ loser_second_serve_points_won   : num  14 9 16 10 24 11 5 6 9 8 ...
##  $ loser_second_serve_points_total : num  23 20 33 24 45 32 19 15 23 25 ...
##  $ loser_break_points_saved        : num  2 2 2 0 4 6 3 2 3 3 ...
##  $ loser_break_points_serve_total  : num  4 5 4 2 7 10 8 6 4 5 ...
##  $ loser_service_points_won        : num  53 45 40 37 53 63 20 23 43 47 ...
##  $ loser_service_points_total      : num  84 69 61 61 82 97 47 44 58 71 ...
##  $ loser_first_serve_return_won    : num  12 11 9 19 10 20 1 10 8 8 ...
##  $ loser_first_serve_return_total  : num  62 52 36 56 52 67 19 40 44 41 ...
##  $ loser_second_serve_return_won   : num  19 13 4 7 19 24 5 6 6 8 ...
##  $ loser_second_serve_return_total : num  35 25 22 21 42 52 15 13 21 23 ...
##  $ loser_break_points_converted    : num  1 2 0 1 1 2 0 1 0 1 ...
##  $ loser_break_points_return_total : num  7 7 0 5 14 13 0 3 4 1 ...
##  $ loser_service_games_played      : num  14 12 10 10 14 16 7 8 10 11 ...
##  $ loser_return_games_played       : num  15 13 10 11 14 16 7 9 11 11 ...
##  $ loser_return_points_won         : num  31 24 13 26 29 44 6 16 14 16 ...
##  $ loser_return_points_total       : num  97 77 58 77 94 119 34 53 65 64 ...
##  $ loser_total_points_won          : num  84 69 53 63 82 107 26 39 57 63 ...
##  $ loser_total_points_total        : num  181 146 119 138 176 216 81 97 123 135 ...
##  $ tourney_id                      : chr  "339" "339" "339" "339" ...
##  $ winner_player_id                : chr  "r975" "d875" "d875" "n552" ...
##  $ loser_player_id                 : chr  "n409" "n552" "r975" "w367" ...

1. Rank Countries (Flag Codes) by the Singles Champions

champs_flags <- tourney_df %>% left_join(player_df, by = c("singles_winner_player_id" = "player_id")) %>% group_by(flag_code) %>% count(flag_code, sort = TRUE)

champs_flags
## # A tibble: 21 x 2
## # Groups:   flag_code [21]
##    flag_code     n
##    <chr>     <int>
##  1 ESP          11
##  2 USA           9
##  3 SUI           8
##  4 FRA           7
##  5 GER           7
##  6 BUL           4
##  7 ARG           2
##  8 BEL           2
##  9 BIH           2
## 10 CRO           2
## # ... with 11 more rows

2. Rank countries which did not get any singles champs by the games won when they win the match

player_df %>% select(player_id, flag_code) %>% anti_join(.,champs_flags) %>% 
  left_join(score_df, by = c("player_id" = "winner_player_id")) %>% 
  group_by(flag_code) %>% summarise(total_wins = sum(winner_games_won, na.rm = TRUE)) %>% arrange(desc(total_wins))
## Joining, by = "flag_code"
## # A tibble: 93 x 2
##    flag_code total_wins
##    <chr>          <dbl>
##  1 AUS             1989
##  2 CZE             1209
##  3 CAN             1190
##  4 SVK              889
##  5 BRA              873
##  6 POR              621
##  7 RSA              566
##  8 KAZ              495
##  9 KOR              438
## 10 GEO              377
## # ... with 83 more rows

3. Rank Countries (Flag Codes) by the Double Champs

double_winners <- tourney_df %>% 
  left_join(player_df, by = c("doubles_winner_1_player_id" = "player_id", "doubles_winner_2_player_id" = "player_id")) %>%
  select(tourney_name, flag_code, doubles_winner_1_player_id, doubles_winner_2_player_id)

double_winners
## # A tibble: 67 x 4
##    tourney_name              flag_code doubles_winner_1_pl~ doubles_winner_2_pl~
##    <chr>                     <chr>     <chr>                <chr>               
##  1 Brisbane International p~ AUS       kd46                 tc61                
##  2 Aircel Chennai Open       IND       b757                 n480                
##  3 Qatar ExxonMobil Open     FRA       ca12                 me04                
##  4 Sydney International      NED       kc41                 mb88                
##  5 ASB Classic               POL       m844                 q019                
##  6 Australian Open           FIN       ka80                 pc96                
##  7 Open Sud de France        GER       z168                 z355                
##  8 Ecuador Open              USA       c922                 o305                
##  9 Garanti Koza Sofia Open   SRB       t840                 z072                
## 10 Argentina Open            COL       c834                 f525                
## # ... with 57 more rows

3.1. Checking All Double Pairs are from the same countries

double_winners %>% gather("Winner_Player_IDs", "IDs", doubles_winner_1_player_id:doubles_winner_2_player_id)
## # A tibble: 134 x 4
##    tourney_name                          flag_code Winner_Player_IDs       IDs  
##    <chr>                                 <chr>     <chr>                   <chr>
##  1 Brisbane International presented by ~ AUS       doubles_winner_1_playe~ kd46 
##  2 Aircel Chennai Open                   IND       doubles_winner_1_playe~ b757 
##  3 Qatar ExxonMobil Open                 FRA       doubles_winner_1_playe~ ca12 
##  4 Sydney International                  NED       doubles_winner_1_playe~ kc41 
##  5 ASB Classic                           POL       doubles_winner_1_playe~ m844 
##  6 Australian Open                       FIN       doubles_winner_1_playe~ ka80 
##  7 Open Sud de France                    GER       doubles_winner_1_playe~ z168 
##  8 Ecuador Open                          USA       doubles_winner_1_playe~ c922 
##  9 Garanti Koza Sofia Open               SRB       doubles_winner_1_playe~ t840 
## 10 Argentina Open                        COL       doubles_winner_1_playe~ c834 
## # ... with 124 more rows
double_winners %>% group_by(flag_code) %>% count(flag_code, sort = TRUE)
## # A tibble: 25 x 2
## # Groups:   flag_code [25]
##    flag_code     n
##    <chr>     <int>
##  1 USA           8
##  2 POL           7
##  3 FRA           6
##  4 FIN           5
##  5 NED           5
##  6 AUT           4
##  7 GBR           4
##  8 COL           3
##  9 CRO           3
## 10 IND           3
## # ... with 15 more rows

4. Total Aces by Each Player in 2017

score_df$tourney_id <- as.numeric(score_df$tourney_id)
stats_df$tourney_id <- as.numeric(stats_df$tourney_id)

4.1. Total Aces of Winners

winner_aces_total <- stats_df %>% 
  left_join(player_df, by = c("winner_player_id" = "player_id")) %>%
  select(winner_player_id, first_name, last_name, winner_aces) %>%
  mutate(full_name = paste(first_name, last_name)) %>%
  group_by(full_name) %>%
  summarise(winner_total_aces = sum(winner_aces)) %>% select(full_name, winner_total_aces) %>% arrange(desc(winner_total_aces))

winner_aces_total
## # A tibble: 313 x 2
##    full_name          winner_total_aces
##    <chr>                          <dbl>
##  1 John Isner                       746
##  2 Gilles Muller                    526
##  3 Sam Querrey                      518
##  4 Marin Cilic                      497
##  5 Kevin Anderson                   491
##  6 Roger Federer                    489
##  7 Milos Raonic                     454
##  8 Alexander Zverev                 424
##  9 Grigor Dimitrov                  382
## 10 Jo-Wilfried Tsonga               374
## # ... with 303 more rows

4.2. Total Aces of Losers

loser_aces_total <- stats_df %>% 
  left_join(player_df, by = c("loser_player_id" = "player_id")) %>%
  select(loser_player_id, first_name, last_name, loser_aces) %>%
  mutate(full_name = paste(first_name, last_name)) %>%
  group_by(full_name) %>%
  summarise(loser_total_aces = sum(loser_aces)) %>% select(full_name, loser_total_aces) %>% arrange(desc(loser_total_aces))

loser_aces_total
## # A tibble: 463 x 2
##    full_name       loser_total_aces
##    <chr>                      <dbl>
##  1 Ivo Karlovic                 420
##  2 John Isner                   377
##  3 NA NA                        343
##  4 Kevin Anderson               261
##  5 Reilly Opelka                255
##  6 Marin Cilic                  227
##  7 Sam Querrey                  224
##  8 Benoit Paire                 218
##  9 Robin Haase                  218
## 10 Karen Khachanov              211
## # ... with 453 more rows

4.3. Total Number of Aces by Players

total_aces_full <- winner_aces_total %>% left_join(loser_aces_total, by = "full_name") %>% mutate(aces_total = winner_total_aces + loser_total_aces) %>% arrange(desc(aces_total))

#Removing the row that is shown with full name "NA NA"
total_aces_full <- subset(total_aces_full, full_name != "NA NA")

head(total_aces_full, 10)
## # A tibble: 10 x 4
##    full_name        winner_total_aces loser_total_aces aces_total
##    <chr>                        <dbl>            <dbl>      <dbl>
##  1 John Isner                     746              377       1123
##  2 Kevin Anderson                 491              261        752
##  3 Sam Querrey                    518              224        742
##  4 Gilles Muller                  526              209        735
##  5 Ivo Karlovic                   310              420        730
##  6 Marin Cilic                    497              227        724
##  7 Alexander Zverev               424              187        611
##  8 Milos Raonic                   454              112        566
##  9 Grigor Dimitrov                382              181        563
## 10 Roger Federer                  489               61        550