Library Import
library(tidyverse)
Exploring Data Frames
atp_data <- load("atp_tennis_data_2017.RData")
str(tourney_df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 67 obs. of 12 variables:
## $ tourney_id : num 339 891 451 338 301 ...
## $ tourney_name : chr "Brisbane International presented by Suncorp" "Aircel Chennai Open" "Qatar ExxonMobil Open" "Sydney International" ...
## $ tourney_location : chr "Brisbane, Australia" "Chennai, India" "Doha, Qatar" "Sydney, Australia" ...
## $ tourney_date : Date, format: "2017-01-01" "2017-01-02" ...
## $ tourney_singles_draw : num 28 28 32 28 28 128 28 28 28 28 ...
## $ tourney_doubles_draw : num 28 28 32 28 28 128 28 28 28 28 ...
## $ tourney_conditions : chr "Outdoor" "Outdoor" "Outdoor" "Outdoor" ...
## $ tourney_surface : chr "Hard" "Hard" "Hard" "Hard" ...
## $ tourney_fin_commit : chr "$495,630" "$505,730" "$1,334,270" "$495,630" ...
## $ singles_winner_player_id : chr "d875" "bd06" "d643" "ma30" ...
## $ doubles_winner_1_player_id: chr "kd46" "b757" "ca12" "kc41" ...
## $ doubles_winner_2_player_id: chr "tc61" "n480" "me04" "mb88" ...
str(player_df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10912 obs. of 13 variables:
## $ player_id : chr "a002" "a001" "a005" "a004" ...
## $ player_slug: chr "ricardo-acuna" "sadiq-abdullahi" "nelson-aerts" "egan-adams" ...
## $ first_name : chr "Ricardo" "Sadiq" "Nelson" "Egan" ...
## $ last_name : chr "Acuna" "Abdullahi" "Aerts" "Adams" ...
## $ flag_code : chr "CHI" "NGR" "BRA" "USA" ...
## $ residence : chr "Jupiter, FL, USA" NA NA "Palmetto, FL, USA" ...
## $ birth_place: chr "Santiago, Chile" NA "Cachoeira Do Sul, Brazil" "Miami Beach, FL, USA" ...
## $ birth_date : Date, format: "1958-01-13" "1960-02-02" ...
## $ turned_pro : num 0 0 0 0 1983 ...
## $ weight_kg : num 68 0 75 73 82 68 0 0 0 82 ...
## $ height_cm : num 175 0 188 178 180 183 0 0 0 191 ...
## $ handedness : chr NA NA NA NA ...
## $ backhand : chr NA NA NA NA ...
str(rank_df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 87740 obs. of 6 variables:
## $ week_title : Date, format: "2017-11-20" "2017-11-20" ...
## $ player_id : chr "n409" "f324" "d875" "z355" ...
## $ rank_number : num 1 2 3 4 5 6 7 8 9 10 ...
## $ ranking_points : num 10645 9605 5150 4610 4015 ...
## $ tourneys_played: num 18 17 23 25 27 22 26 22 15 25 ...
## $ player_age : num 31 36 26 20 24 29 26 25 32 26 ...
str(score_df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3830 obs. of 14 variables:
## $ match_id : chr "2017-339-d875-n552" "2017-339-d875-r975" "2017-339-n552-w367" "2017-339-r975-n409" ...
## $ tourney_id : chr "339" "339" "339" "339" ...
## $ tourney_round_name : chr "Finals" "Semi-Finals" "Semi-Finals" "Quarter-Finals" ...
## $ winner_player_id : chr "d875" "d875" "n552" "r975" ...
## $ loser_player_id : chr "n552" "r975" "w367" "n409" ...
## $ winner_seed : chr "7" "7" "3" "1" ...
## $ loser_seed : chr "3" "1" "2" "5" ...
## $ match_score_tiebreaks: chr "62 26 63" "76(7) 62" "76(3) 63" "46 63 64" ...
## $ winner_sets_won : num 2 2 2 2 2 2 2 2 2 2 ...
## $ loser_sets_won : num 1 0 0 1 1 0 1 0 0 1 ...
## $ winner_games_won : num 14 13 13 16 18 12 16 12 13 16 ...
## $ loser_games_won : num 11 8 9 13 15 2 12 5 10 13 ...
## $ winner_tiebreaks_won : num 0 1 1 0 0 0 0 0 1 0 ...
## $ loser_tiebreaks_won : num 0 0 0 0 1 0 0 0 0 0 ...
str(stats_df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3811 obs. of 54 variables:
## $ match_id : chr "2017-339-r975-n409" "2017-339-d875-n552" "2017-339-d875-r975" "2017-339-n552-w367" ...
## $ match_time : 'hms' num 02:21:00 01:49:00 01:28:00 01:42:00 ...
## ..- attr(*, "units")= chr "secs"
## $ match_duration : num 141 109 88 102 125 156 62 69 89 90 ...
## $ winner_aces : num 23 7 4 1 3 11 3 12 11 7 ...
## $ winner_double_faults : num 3 2 1 1 3 3 0 1 1 2 ...
## $ winner_first_serves_in : num 62 52 36 56 52 67 19 40 44 41 ...
## $ winner_first_serves_total : num 97 77 58 77 94 119 34 53 65 64 ...
## $ winner_first_serve_points_won : num 50 41 27 37 42 47 18 30 36 33 ...
## $ winner_first_serve_points_total : num 62 52 36 56 52 67 19 40 44 41 ...
## $ winner_second_serve_points_won : num 16 12 18 14 23 28 10 7 15 15 ...
## $ winner_second_serve_points_total: num 35 25 22 21 42 52 15 13 21 23 ...
## $ winner_break_points_saved : num 6 5 0 4 13 11 0 2 4 0 ...
## $ winner_break_points_serve_total : num 7 7 0 5 14 13 0 3 4 1 ...
## $ winner_service_points_won : num 66 53 45 51 65 75 28 37 51 48 ...
## $ winner_service_points_total : num 97 77 58 77 94 119 34 53 65 64 ...
## $ winner_first_serve_return_won : num 22 13 4 10 8 13 13 12 1 7 ...
## $ winner_first_serve_return_total : num 61 49 28 37 37 65 28 29 35 46 ...
## $ winner_second_serve_return_won : num 9 11 17 14 21 21 14 9 14 17 ...
## $ winner_second_serve_return_total: num 23 20 33 24 45 32 19 15 23 25 ...
## $ winner_break_points_converted : num 2 3 2 2 3 4 5 4 1 2 ...
## $ winner_break_points_return_total: num 4 5 4 2 7 10 8 6 4 5 ...
## $ winner_service_games_played : num 15 13 10 11 14 16 7 9 11 11 ...
## $ winner_return_games_played : num 14 12 10 10 14 16 7 8 10 11 ...
## $ winner_return_points_won : num 31 24 21 24 29 34 27 21 15 24 ...
## $ winner_return_points_total : num 84 69 61 61 82 97 47 44 58 71 ...
## $ winner_total_points_won : num 97 77 66 75 94 109 55 58 66 72 ...
## $ winner_total_points_total : num 181 146 119 138 176 216 81 97 123 135 ...
## $ loser_aces : num 4 4 4 9 6 2 1 0 11 5 ...
## $ loser_double_faults : num 0 0 3 2 5 2 2 1 1 7 ...
## $ loser_first_serves_in : num 61 49 28 37 37 65 28 29 35 46 ...
## $ loser_first_serves_total : num 84 69 61 61 82 97 47 44 58 71 ...
## $ loser_first_serve_points_won : num 39 36 24 27 29 52 15 17 34 39 ...
## $ loser_first_serve_points_total : num 61 49 28 37 37 65 28 29 35 46 ...
## $ loser_second_serve_points_won : num 14 9 16 10 24 11 5 6 9 8 ...
## $ loser_second_serve_points_total : num 23 20 33 24 45 32 19 15 23 25 ...
## $ loser_break_points_saved : num 2 2 2 0 4 6 3 2 3 3 ...
## $ loser_break_points_serve_total : num 4 5 4 2 7 10 8 6 4 5 ...
## $ loser_service_points_won : num 53 45 40 37 53 63 20 23 43 47 ...
## $ loser_service_points_total : num 84 69 61 61 82 97 47 44 58 71 ...
## $ loser_first_serve_return_won : num 12 11 9 19 10 20 1 10 8 8 ...
## $ loser_first_serve_return_total : num 62 52 36 56 52 67 19 40 44 41 ...
## $ loser_second_serve_return_won : num 19 13 4 7 19 24 5 6 6 8 ...
## $ loser_second_serve_return_total : num 35 25 22 21 42 52 15 13 21 23 ...
## $ loser_break_points_converted : num 1 2 0 1 1 2 0 1 0 1 ...
## $ loser_break_points_return_total : num 7 7 0 5 14 13 0 3 4 1 ...
## $ loser_service_games_played : num 14 12 10 10 14 16 7 8 10 11 ...
## $ loser_return_games_played : num 15 13 10 11 14 16 7 9 11 11 ...
## $ loser_return_points_won : num 31 24 13 26 29 44 6 16 14 16 ...
## $ loser_return_points_total : num 97 77 58 77 94 119 34 53 65 64 ...
## $ loser_total_points_won : num 84 69 53 63 82 107 26 39 57 63 ...
## $ loser_total_points_total : num 181 146 119 138 176 216 81 97 123 135 ...
## $ tourney_id : chr "339" "339" "339" "339" ...
## $ winner_player_id : chr "r975" "d875" "d875" "n552" ...
## $ loser_player_id : chr "n409" "n552" "r975" "w367" ...
1. Rank Countries (Flag Codes) by the Singles Champions
champs_flags <- tourney_df %>% left_join(player_df, by = c("singles_winner_player_id" = "player_id")) %>% group_by(flag_code) %>% count(flag_code, sort = TRUE)
champs_flags
## # A tibble: 21 x 2
## # Groups: flag_code [21]
## flag_code n
## <chr> <int>
## 1 ESP 11
## 2 USA 9
## 3 SUI 8
## 4 FRA 7
## 5 GER 7
## 6 BUL 4
## 7 ARG 2
## 8 BEL 2
## 9 BIH 2
## 10 CRO 2
## # ... with 11 more rows
2. Rank countries which did not get any singles champs by the games won when they win the match
player_df %>% select(player_id, flag_code) %>% anti_join(.,champs_flags) %>%
left_join(score_df, by = c("player_id" = "winner_player_id")) %>%
group_by(flag_code) %>% summarise(total_wins = sum(winner_games_won, na.rm = TRUE)) %>% arrange(desc(total_wins))
## Joining, by = "flag_code"
## # A tibble: 93 x 2
## flag_code total_wins
## <chr> <dbl>
## 1 AUS 1989
## 2 CZE 1209
## 3 CAN 1190
## 4 SVK 889
## 5 BRA 873
## 6 POR 621
## 7 RSA 566
## 8 KAZ 495
## 9 KOR 438
## 10 GEO 377
## # ... with 83 more rows
3. Rank Countries (Flag Codes) by the Double Champs
double_winners <- tourney_df %>%
left_join(player_df, by = c("doubles_winner_1_player_id" = "player_id", "doubles_winner_2_player_id" = "player_id")) %>%
select(tourney_name, flag_code, doubles_winner_1_player_id, doubles_winner_2_player_id)
double_winners
## # A tibble: 67 x 4
## tourney_name flag_code doubles_winner_1_pl~ doubles_winner_2_pl~
## <chr> <chr> <chr> <chr>
## 1 Brisbane International p~ AUS kd46 tc61
## 2 Aircel Chennai Open IND b757 n480
## 3 Qatar ExxonMobil Open FRA ca12 me04
## 4 Sydney International NED kc41 mb88
## 5 ASB Classic POL m844 q019
## 6 Australian Open FIN ka80 pc96
## 7 Open Sud de France GER z168 z355
## 8 Ecuador Open USA c922 o305
## 9 Garanti Koza Sofia Open SRB t840 z072
## 10 Argentina Open COL c834 f525
## # ... with 57 more rows
3.1. Checking All Double Pairs are from the same countries
double_winners %>% gather("Winner_Player_IDs", "IDs", doubles_winner_1_player_id:doubles_winner_2_player_id)
## # A tibble: 134 x 4
## tourney_name flag_code Winner_Player_IDs IDs
## <chr> <chr> <chr> <chr>
## 1 Brisbane International presented by ~ AUS doubles_winner_1_playe~ kd46
## 2 Aircel Chennai Open IND doubles_winner_1_playe~ b757
## 3 Qatar ExxonMobil Open FRA doubles_winner_1_playe~ ca12
## 4 Sydney International NED doubles_winner_1_playe~ kc41
## 5 ASB Classic POL doubles_winner_1_playe~ m844
## 6 Australian Open FIN doubles_winner_1_playe~ ka80
## 7 Open Sud de France GER doubles_winner_1_playe~ z168
## 8 Ecuador Open USA doubles_winner_1_playe~ c922
## 9 Garanti Koza Sofia Open SRB doubles_winner_1_playe~ t840
## 10 Argentina Open COL doubles_winner_1_playe~ c834
## # ... with 124 more rows
double_winners %>% group_by(flag_code) %>% count(flag_code, sort = TRUE)
## # A tibble: 25 x 2
## # Groups: flag_code [25]
## flag_code n
## <chr> <int>
## 1 USA 8
## 2 POL 7
## 3 FRA 6
## 4 FIN 5
## 5 NED 5
## 6 AUT 4
## 7 GBR 4
## 8 COL 3
## 9 CRO 3
## 10 IND 3
## # ... with 15 more rows
4. Total Aces by Each Player in 2017
score_df$tourney_id <- as.numeric(score_df$tourney_id)
stats_df$tourney_id <- as.numeric(stats_df$tourney_id)
4.1. Total Aces of Winners
winner_aces_total <- stats_df %>%
left_join(player_df, by = c("winner_player_id" = "player_id")) %>%
select(winner_player_id, first_name, last_name, winner_aces) %>%
mutate(full_name = paste(first_name, last_name)) %>%
group_by(full_name) %>%
summarise(winner_total_aces = sum(winner_aces)) %>% select(full_name, winner_total_aces) %>% arrange(desc(winner_total_aces))
winner_aces_total
## # A tibble: 313 x 2
## full_name winner_total_aces
## <chr> <dbl>
## 1 John Isner 746
## 2 Gilles Muller 526
## 3 Sam Querrey 518
## 4 Marin Cilic 497
## 5 Kevin Anderson 491
## 6 Roger Federer 489
## 7 Milos Raonic 454
## 8 Alexander Zverev 424
## 9 Grigor Dimitrov 382
## 10 Jo-Wilfried Tsonga 374
## # ... with 303 more rows
4.2. Total Aces of Losers
loser_aces_total <- stats_df %>%
left_join(player_df, by = c("loser_player_id" = "player_id")) %>%
select(loser_player_id, first_name, last_name, loser_aces) %>%
mutate(full_name = paste(first_name, last_name)) %>%
group_by(full_name) %>%
summarise(loser_total_aces = sum(loser_aces)) %>% select(full_name, loser_total_aces) %>% arrange(desc(loser_total_aces))
loser_aces_total
## # A tibble: 463 x 2
## full_name loser_total_aces
## <chr> <dbl>
## 1 Ivo Karlovic 420
## 2 John Isner 377
## 3 NA NA 343
## 4 Kevin Anderson 261
## 5 Reilly Opelka 255
## 6 Marin Cilic 227
## 7 Sam Querrey 224
## 8 Benoit Paire 218
## 9 Robin Haase 218
## 10 Karen Khachanov 211
## # ... with 453 more rows
4.3. Total Number of Aces by Players
total_aces_full <- winner_aces_total %>% left_join(loser_aces_total, by = "full_name") %>% mutate(aces_total = winner_total_aces + loser_total_aces) %>% arrange(desc(aces_total))
#Removing the row that is shown with full name "NA NA"
total_aces_full <- subset(total_aces_full, full_name != "NA NA")
head(total_aces_full, 10)
## # A tibble: 10 x 4
## full_name winner_total_aces loser_total_aces aces_total
## <chr> <dbl> <dbl> <dbl>
## 1 John Isner 746 377 1123
## 2 Kevin Anderson 491 261 752
## 3 Sam Querrey 518 224 742
## 4 Gilles Muller 526 209 735
## 5 Ivo Karlovic 310 420 730
## 6 Marin Cilic 497 227 724
## 7 Alexander Zverev 424 187 611
## 8 Milos Raonic 454 112 566
## 9 Grigor Dimitrov 382 181 563
## 10 Roger Federer 489 61 550