Library

library(tidyverse)

Exploring Data Frames

url<-url("https://github.com/pjournal/mef03-cakarb/blob/master/atp_tennis_data_2017.RData?raw=true")
atp_tennis<- load(url)

glimpse(rank_df)
## Observations: 87,740
## Variables: 6
## $ week_title      <date> 2017-11-20, 2017-11-20, 2017-11-20, 2017-11-2...
## $ player_id       <chr> "n409", "f324", "d875", "z355", "tb69", "c977"...
## $ rank_number     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ ranking_points  <dbl> 10645, 9605, 5150, 4610, 4015, 3805, 3775, 316...
## $ tourneys_played <dbl> 18, 17, 23, 25, 27, 22, 26, 22, 15, 25, 19, 16...
## $ player_age      <dbl> 31, 36, 26, 20, 24, 29, 26, 25, 32, 26, 29, 30...
glimpse(score_df)
## Observations: 3,830
## Variables: 14
## $ match_id              <chr> "2017-339-d875-n552", "2017-339-d875-r97...
## $ tourney_id            <chr> "339", "339", "339", "339", "339", "339"...
## $ tourney_round_name    <chr> "Finals", "Semi-Finals", "Semi-Finals", ...
## $ winner_player_id      <chr> "d875", "d875", "n552", "r975", "w367", ...
## $ loser_player_id       <chr> "n552", "r975", "w367", "n409", "e831", ...
## $ winner_seed           <chr> "7", "7", "3", "1", "2", "3", "7", "1", ...
## $ loser_seed            <chr> "3", "1", "2", "5", NA, "WC", "4", NA, N...
## $ match_score_tiebreaks <chr> "62 26 63", "76(7) 62", "76(3) 63", "46 ...
## $ winner_sets_won       <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ loser_sets_won        <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0...
## $ winner_games_won      <dbl> 14, 13, 13, 16, 18, 12, 16, 12, 13, 16, ...
## $ loser_games_won       <dbl> 11, 8, 9, 13, 15, 2, 12, 5, 10, 13, 9, 2...
## $ winner_tiebreaks_won  <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0...
## $ loser_tiebreaks_won   <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...
glimpse(tourney_df)
## Observations: 67
## Variables: 12
## $ tourney_id                 <dbl> 339, 891, 451, 338, 301, 580, 375, ...
## $ tourney_name               <chr> "Brisbane International presented b...
## $ tourney_location           <chr> "Brisbane, Australia", "Chennai, In...
## $ tourney_date               <date> 2017-01-01, 2017-01-02, 2017-01-02...
## $ tourney_singles_draw       <dbl> 28, 28, 32, 28, 28, 128, 28, 28, 28...
## $ tourney_doubles_draw       <dbl> 28, 28, 32, 28, 28, 128, 28, 28, 28...
## $ tourney_conditions         <chr> "Outdoor", "Outdoor", "Outdoor", "O...
## $ tourney_surface            <chr> "Hard", "Hard", "Hard", "Hard", "Ha...
## $ tourney_fin_commit         <chr> "$495,630", "$505,730", "$1,334,270...
## $ singles_winner_player_id   <chr> "d875", "bd06", "d643", "ma30", "sm...
## $ doubles_winner_1_player_id <chr> "kd46", "b757", "ca12", "kc41", "m8...
## $ doubles_winner_2_player_id <chr> "tc61", "n480", "me04", "mb88", "q0...
glimpse(stats_df)
## Observations: 3,811
## Variables: 54
## $ match_id                         <chr> "2017-339-r975-n409", "2017-3...
## $ match_time                       <time> 02:21:00, 01:49:00, 01:28:00...
## $ match_duration                   <dbl> 141, 109, 88, 102, 125, 156, ...
## $ winner_aces                      <dbl> 23, 7, 4, 1, 3, 11, 3, 12, 11...
## $ winner_double_faults             <dbl> 3, 2, 1, 1, 3, 3, 0, 1, 1, 2,...
## $ winner_first_serves_in           <dbl> 62, 52, 36, 56, 52, 67, 19, 4...
## $ winner_first_serves_total        <dbl> 97, 77, 58, 77, 94, 119, 34, ...
## $ winner_first_serve_points_won    <dbl> 50, 41, 27, 37, 42, 47, 18, 3...
## $ winner_first_serve_points_total  <dbl> 62, 52, 36, 56, 52, 67, 19, 4...
## $ winner_second_serve_points_won   <dbl> 16, 12, 18, 14, 23, 28, 10, 7...
## $ winner_second_serve_points_total <dbl> 35, 25, 22, 21, 42, 52, 15, 1...
## $ winner_break_points_saved        <dbl> 6, 5, 0, 4, 13, 11, 0, 2, 4, ...
## $ winner_break_points_serve_total  <dbl> 7, 7, 0, 5, 14, 13, 0, 3, 4, ...
## $ winner_service_points_won        <dbl> 66, 53, 45, 51, 65, 75, 28, 3...
## $ winner_service_points_total      <dbl> 97, 77, 58, 77, 94, 119, 34, ...
## $ winner_first_serve_return_won    <dbl> 22, 13, 4, 10, 8, 13, 13, 12,...
## $ winner_first_serve_return_total  <dbl> 61, 49, 28, 37, 37, 65, 28, 2...
## $ winner_second_serve_return_won   <dbl> 9, 11, 17, 14, 21, 21, 14, 9,...
## $ winner_second_serve_return_total <dbl> 23, 20, 33, 24, 45, 32, 19, 1...
## $ winner_break_points_converted    <dbl> 2, 3, 2, 2, 3, 4, 5, 4, 1, 2,...
## $ winner_break_points_return_total <dbl> 4, 5, 4, 2, 7, 10, 8, 6, 4, 5...
## $ winner_service_games_played      <dbl> 15, 13, 10, 11, 14, 16, 7, 9,...
## $ winner_return_games_played       <dbl> 14, 12, 10, 10, 14, 16, 7, 8,...
## $ winner_return_points_won         <dbl> 31, 24, 21, 24, 29, 34, 27, 2...
## $ winner_return_points_total       <dbl> 84, 69, 61, 61, 82, 97, 47, 4...
## $ winner_total_points_won          <dbl> 97, 77, 66, 75, 94, 109, 55, ...
## $ winner_total_points_total        <dbl> 181, 146, 119, 138, 176, 216,...
## $ loser_aces                       <dbl> 4, 4, 4, 9, 6, 2, 1, 0, 11, 5...
## $ loser_double_faults              <dbl> 0, 0, 3, 2, 5, 2, 2, 1, 1, 7,...
## $ loser_first_serves_in            <dbl> 61, 49, 28, 37, 37, 65, 28, 2...
## $ loser_first_serves_total         <dbl> 84, 69, 61, 61, 82, 97, 47, 4...
## $ loser_first_serve_points_won     <dbl> 39, 36, 24, 27, 29, 52, 15, 1...
## $ loser_first_serve_points_total   <dbl> 61, 49, 28, 37, 37, 65, 28, 2...
## $ loser_second_serve_points_won    <dbl> 14, 9, 16, 10, 24, 11, 5, 6, ...
## $ loser_second_serve_points_total  <dbl> 23, 20, 33, 24, 45, 32, 19, 1...
## $ loser_break_points_saved         <dbl> 2, 2, 2, 0, 4, 6, 3, 2, 3, 3,...
## $ loser_break_points_serve_total   <dbl> 4, 5, 4, 2, 7, 10, 8, 6, 4, 5...
## $ loser_service_points_won         <dbl> 53, 45, 40, 37, 53, 63, 20, 2...
## $ loser_service_points_total       <dbl> 84, 69, 61, 61, 82, 97, 47, 4...
## $ loser_first_serve_return_won     <dbl> 12, 11, 9, 19, 10, 20, 1, 10,...
## $ loser_first_serve_return_total   <dbl> 62, 52, 36, 56, 52, 67, 19, 4...
## $ loser_second_serve_return_won    <dbl> 19, 13, 4, 7, 19, 24, 5, 6, 6...
## $ loser_second_serve_return_total  <dbl> 35, 25, 22, 21, 42, 52, 15, 1...
## $ loser_break_points_converted     <dbl> 1, 2, 0, 1, 1, 2, 0, 1, 0, 1,...
## $ loser_break_points_return_total  <dbl> 7, 7, 0, 5, 14, 13, 0, 3, 4, ...
## $ loser_service_games_played       <dbl> 14, 12, 10, 10, 14, 16, 7, 8,...
## $ loser_return_games_played        <dbl> 15, 13, 10, 11, 14, 16, 7, 9,...
## $ loser_return_points_won          <dbl> 31, 24, 13, 26, 29, 44, 6, 16...
## $ loser_return_points_total        <dbl> 97, 77, 58, 77, 94, 119, 34, ...
## $ loser_total_points_won           <dbl> 84, 69, 53, 63, 82, 107, 26, ...
## $ loser_total_points_total         <dbl> 181, 146, 119, 138, 176, 216,...
## $ tourney_id                       <chr> "339", "339", "339", "339", "...
## $ winner_player_id                 <chr> "r975", "d875", "d875", "n552...
## $ loser_player_id                  <chr> "n409", "n552", "r975", "w367...
glimpse(player_df)
## Observations: 10,912
## Variables: 13
## $ player_id   <chr> "a002", "a001", "a005", "a004", "a006", "a007", "a...
## $ player_slug <chr> "ricardo-acuna", "sadiq-abdullahi", "nelson-aerts"...
## $ first_name  <chr> "Ricardo", "Sadiq", "Nelson", "Egan", "Ronald", "J...
## $ last_name   <chr> "Acuna", "Abdullahi", "Aerts", "Adams", "Agenor", ...
## $ flag_code   <chr> "CHI", "NGR", "BRA", "USA", "USA", "ESP", "NED", "...
## $ residence   <chr> "Jupiter, FL, USA", NA, NA, "Palmetto, FL, USA", "...
## $ birth_place <chr> "Santiago, Chile", NA, "Cachoeira Do Sul, Brazil",...
## $ birth_date  <date> 1958-01-13, 1960-02-02, 1963-04-25, 1959-06-15, 1...
## $ turned_pro  <dbl> 0, 0, 0, 0, 1983, 0, 0, 0, 0, 0, 0, 0, 0, 1983, 0,...
## $ weight_kg   <dbl> 68, 0, 75, 73, 82, 68, 0, 0, 0, 82, 73, 77, 0, 77,...
## $ height_cm   <dbl> 175, 0, 188, 178, 180, 183, 0, 0, 0, 191, 180, 178...
## $ handedness  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ backhand    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...

1. Countries Based on Their Championship (Rank countries (flag codes) by the number of singles champions)

left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>% drop_na() %>% 
  select(player_id,flag_code) %>% mutate(winner_count = 1) %>% select(flag_code, winner_count) %>% group_by(flag_code) %>% transmute(winner_count = sum(winner_count)) %>% arrange(desc(winner_count)) %>% distinct()
## # A tibble: 21 x 2
## # Groups:   flag_code [21]
##    flag_code winner_count
##    <chr>            <dbl>
##  1 ESP                 11
##  2 USA                  9
##  3 SUI                  8
##  4 GER                  7
##  5 FRA                  7
##  6 BUL                  4
##  7 CRO                  2
##  8 SRB                  2
##  9 ARG                  2
## 10 BIH                  2
## # ... with 11 more rows

2. Rank countries which did not get any singles champs by the games won when they win the match

winner_countries <- left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>% drop_na() %>%
  select(flag_code) %>% distinct()

loser_countries <- anti_join(player_df, winner_countries) %>% select(flag_code) %>% distinct() 

inner_join(player_df, loser_countries) %>% select(player_id, flag_code) %>% distinct() %>% 
  inner_join(score_df, by = c("player_id" = "winner_player_id")) %>% select(flag_code, winner_games_won) %>%
  group_by(flag_code) %>% transmute(winner_games_won = sum(winner_games_won)) %>%distinct() %>% 
  arrange(desc(winner_games_won))
## # A tibble: 39 x 2
## # Groups:   flag_code [39]
##    flag_code winner_games_won
##    <chr>                <dbl>
##  1 AUS                   1989
##  2 CZE                   1209
##  3 CAN                   1190
##  4 SVK                    889
##  5 BRA                    873
##  6 POR                    621
##  7 RSA                    566
##  8 KAZ                    495
##  9 KOR                    438
## 10 GEO                    377
## # ... with 29 more rows

3. Rank Total Match Duration of Winner Player

inner_join(stats_df, score_df) %>% select(match_id, match_time, match_duration, winner_player_id) %>% 
  inner_join(player_df, by=c("winner_player_id" = "player_id")) %>% mutate(full_name = paste(first_name," ",last_name)) %>% select(winner_player_id, full_name, match_id, match_duration) %>% group_by(winner_player_id) %>% 
  transmute(full_name, total_match_duration = sum(match_duration)) %>% distinct() %>% ungroup() %>% 
  select(full_name, total_match_duration) %>% arrange(desc(total_match_duration))
## # A tibble: 312 x 2
##    full_name               total_match_duration
##    <chr>                                  <dbl>
##  1 Rafael   Nadal                          7445
##  2 David   Goffin                          5483
##  3 Alexander   Zverev                      5218
##  4 Roger   Federer                         4910
##  5 Dominic   Thiem                         4827
##  6 Roberto   Bautista Agut                 4808
##  7 Grigor   Dimitrov                       4501
##  8 Albert   Ramos-Vinolas                  4501
##  9 Marin   Cilic                           4371
## 10 Adrian   Mannarino                      4339
## # ... with 302 more rows

4. Total Ranking Points by Countries (Flag Codes) in week_title 2017-11-20

rank_df %>% filter(week_title == "2017-11-20") %>% inner_join(player_df) %>% select(flag_code, ranking_points) %>%
  group_by(flag_code) %>% transmute(ranking_points = sum(ranking_points)) %>% arrange(desc(ranking_points)) %>%distinct()         
## # A tibble: 78 x 2
## # Groups:   flag_code [78]
##    flag_code ranking_points
##    <chr>              <dbl>
##  1 ESP                27030
##  2 USA                20712
##  3 FRA                18352
##  4 GER                15018
##  5 SUI                13582
##  6 ARG                11121
##  7 ITA                 8782
##  8 SRB                 7718
##  9 AUS                 7595
## 10 RUS                 6665
## # ... with 68 more rows