Introduction

An SQL join clause - corresponding to a join operation in relational algebra - combines columns from one or more tables in a relational database. It creates a set that can be saved as a table or used as it is. A JOIN is a means for combining columns from one (self-join) or more tables by using values common to each. Relational databases are usually normalized to eliminate duplication of information such as when entity types have one-to-many relationships.

Overview of Datasets

The following datasets consists of all necessary tennis information about ATP World Tour 2017.

player_df %>% glimpse()
## Rows: 10,912
## Columns: 13
## $ player_id   <chr> "a002", "a001", "a005", "a004", "a006", "a007", "a012",...
## $ player_slug <chr> "ricardo-acuna", "sadiq-abdullahi", "nelson-aerts", "eg...
## $ first_name  <chr> "Ricardo", "Sadiq", "Nelson", "Egan", "Ronald", "Juan",...
## $ last_name   <chr> "Acuna", "Abdullahi", "Aerts", "Adams", "Agenor", "Agui...
## $ flag_code   <chr> "CHI", "NGR", "BRA", "USA", "USA", "ESP", "NED", "ITA",...
## $ residence   <chr> "Jupiter, FL, USA", NA, NA, "Palmetto, FL, USA", "Bever...
## $ birth_place <chr> "Santiago, Chile", NA, "Cachoeira Do Sul, Brazil", "Mia...
## $ birth_date  <date> 1958-01-13, 1960-02-02, 1963-04-25, 1959-06-15, 1964-1...
## $ turned_pro  <dbl> 0, 0, 0, 0, 1983, 0, 0, 0, 0, 0, 0, 0, 0, 1983, 0, 0, 0...
## $ weight_kg   <dbl> 68, 0, 75, 73, 82, 68, 0, 0, 0, 82, 73, 77, 0, 77, 77, ...
## $ height_cm   <dbl> 175, 0, 188, 178, 180, 183, 0, 0, 0, 191, 180, 178, 0, ...
## $ handedness  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ backhand    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
rank_df %>% glimpse()
## Rows: 87,740
## Columns: 6
## $ week_title      <date> 2017-11-20, 2017-11-20, 2017-11-20, 2017-11-20, 20...
## $ player_id       <chr> "n409", "f324", "d875", "z355", "tb69", "c977", "gb...
## $ rank_number     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ ranking_points  <dbl> 10645, 9605, 5150, 4610, 4015, 3805, 3775, 3165, 31...
## $ tourneys_played <dbl> 18, 17, 23, 25, 27, 22, 26, 22, 15, 25, 19, 16, 23,...
## $ player_age      <dbl> 31, 36, 26, 20, 24, 29, 26, 25, 32, 26, 29, 30, 30,...
score_df %>% glimpse()
## Rows: 3,830
## Columns: 14
## $ match_id              <chr> "2017-339-d875-n552", "2017-339-d875-r975", "...
## $ tourney_id            <chr> "339", "339", "339", "339", "339", "339", "33...
## $ tourney_round_name    <chr> "Finals", "Semi-Finals", "Semi-Finals", "Quar...
## $ winner_player_id      <chr> "d875", "d875", "n552", "r975", "w367", "n552...
## $ loser_player_id       <chr> "n552", "r975", "w367", "n409", "e831", "tc61...
## $ winner_seed           <chr> "7", "7", "3", "1", "2", "3", "7", "1", "2", ...
## $ loser_seed            <chr> "3", "1", "2", "5", NA, "WC", "4", NA, NA, "Q...
## $ match_score_tiebreaks <chr> "62 26 63", "76(7) 62", "76(3) 63", "46 63 64...
## $ winner_sets_won       <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ loser_sets_won        <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, ...
## $ winner_games_won      <dbl> 14, 13, 13, 16, 18, 12, 16, 12, 13, 16, 13, 1...
## $ loser_games_won       <dbl> 11, 8, 9, 13, 15, 2, 12, 5, 10, 13, 9, 2, 4, ...
## $ winner_tiebreaks_won  <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
## $ loser_tiebreaks_won   <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
stats_df %>% glimpse()
## Rows: 3,811
## Columns: 54
## $ match_id                         <chr> "2017-339-r975-n409", "2017-339-d8...
## $ match_time                       <time> 02:21:00, 01:49:00, 01:28:00, 01:...
## $ match_duration                   <dbl> 141, 109, 88, 102, 125, 156, 62, 6...
## $ winner_aces                      <dbl> 23, 7, 4, 1, 3, 11, 3, 12, 11, 7, ...
## $ winner_double_faults             <dbl> 3, 2, 1, 1, 3, 3, 0, 1, 1, 2, 2, 0...
## $ winner_first_serves_in           <dbl> 62, 52, 36, 56, 52, 67, 19, 40, 44...
## $ winner_first_serves_total        <dbl> 97, 77, 58, 77, 94, 119, 34, 53, 6...
## $ winner_first_serve_points_won    <dbl> 50, 41, 27, 37, 42, 47, 18, 30, 36...
## $ winner_first_serve_points_total  <dbl> 62, 52, 36, 56, 52, 67, 19, 40, 44...
## $ winner_second_serve_points_won   <dbl> 16, 12, 18, 14, 23, 28, 10, 7, 15,...
## $ winner_second_serve_points_total <dbl> 35, 25, 22, 21, 42, 52, 15, 13, 21...
## $ winner_break_points_saved        <dbl> 6, 5, 0, 4, 13, 11, 0, 2, 4, 0, 2,...
## $ winner_break_points_serve_total  <dbl> 7, 7, 0, 5, 14, 13, 0, 3, 4, 1, 2,...
## $ winner_service_points_won        <dbl> 66, 53, 45, 51, 65, 75, 28, 37, 51...
## $ winner_service_points_total      <dbl> 97, 77, 58, 77, 94, 119, 34, 53, 6...
## $ winner_first_serve_return_won    <dbl> 22, 13, 4, 10, 8, 13, 13, 12, 1, 7...
## $ winner_first_serve_return_total  <dbl> 61, 49, 28, 37, 37, 65, 28, 29, 35...
## $ winner_second_serve_return_won   <dbl> 9, 11, 17, 14, 21, 21, 14, 9, 14, ...
## $ winner_second_serve_return_total <dbl> 23, 20, 33, 24, 45, 32, 19, 15, 23...
## $ winner_break_points_converted    <dbl> 2, 3, 2, 2, 3, 4, 5, 4, 1, 2, 5, 3...
## $ winner_break_points_return_total <dbl> 4, 5, 4, 2, 7, 10, 8, 6, 4, 5, 8, ...
## $ winner_service_games_played      <dbl> 15, 13, 10, 11, 14, 16, 7, 9, 11, ...
## $ winner_return_games_played       <dbl> 14, 12, 10, 10, 14, 16, 7, 8, 10, ...
## $ winner_return_points_won         <dbl> 31, 24, 21, 24, 29, 34, 27, 21, 15...
## $ winner_return_points_total       <dbl> 84, 69, 61, 61, 82, 97, 47, 44, 58...
## $ winner_total_points_won          <dbl> 97, 77, 66, 75, 94, 109, 55, 58, 6...
## $ winner_total_points_total        <dbl> 181, 146, 119, 138, 176, 216, 81, ...
## $ loser_aces                       <dbl> 4, 4, 4, 9, 6, 2, 1, 0, 11, 5, 2, ...
## $ loser_double_faults              <dbl> 0, 0, 3, 2, 5, 2, 2, 1, 1, 7, 2, 3...
## $ loser_first_serves_in            <dbl> 61, 49, 28, 37, 37, 65, 28, 29, 35...
## $ loser_first_serves_total         <dbl> 84, 69, 61, 61, 82, 97, 47, 44, 58...
## $ loser_first_serve_points_won     <dbl> 39, 36, 24, 27, 29, 52, 15, 17, 34...
## $ loser_first_serve_points_total   <dbl> 61, 49, 28, 37, 37, 65, 28, 29, 35...
## $ loser_second_serve_points_won    <dbl> 14, 9, 16, 10, 24, 11, 5, 6, 9, 8,...
## $ loser_second_serve_points_total  <dbl> 23, 20, 33, 24, 45, 32, 19, 15, 23...
## $ loser_break_points_saved         <dbl> 2, 2, 2, 0, 4, 6, 3, 2, 3, 3, 3, 0...
## $ loser_break_points_serve_total   <dbl> 4, 5, 4, 2, 7, 10, 8, 6, 4, 5, 8, ...
## $ loser_service_points_won         <dbl> 53, 45, 40, 37, 53, 63, 20, 23, 43...
## $ loser_service_points_total       <dbl> 84, 69, 61, 61, 82, 97, 47, 44, 58...
## $ loser_first_serve_return_won     <dbl> 12, 11, 9, 19, 10, 20, 1, 10, 8, 8...
## $ loser_first_serve_return_total   <dbl> 62, 52, 36, 56, 52, 67, 19, 40, 44...
## $ loser_second_serve_return_won    <dbl> 19, 13, 4, 7, 19, 24, 5, 6, 6, 8, ...
## $ loser_second_serve_return_total  <dbl> 35, 25, 22, 21, 42, 52, 15, 13, 21...
## $ loser_break_points_converted     <dbl> 1, 2, 0, 1, 1, 2, 0, 1, 0, 1, 0, 0...
## $ loser_break_points_return_total  <dbl> 7, 7, 0, 5, 14, 13, 0, 3, 4, 1, 2,...
## $ loser_service_games_played       <dbl> 14, 12, 10, 10, 14, 16, 7, 8, 10, ...
## $ loser_return_games_played        <dbl> 15, 13, 10, 11, 14, 16, 7, 9, 11, ...
## $ loser_return_points_won          <dbl> 31, 24, 13, 26, 29, 44, 6, 16, 14,...
## $ loser_return_points_total        <dbl> 97, 77, 58, 77, 94, 119, 34, 53, 6...
## $ loser_total_points_won           <dbl> 84, 69, 53, 63, 82, 107, 26, 39, 5...
## $ loser_total_points_total         <dbl> 181, 146, 119, 138, 176, 216, 81, ...
## $ tourney_id                       <chr> "339", "339", "339", "339", "339",...
## $ winner_player_id                 <chr> "r975", "d875", "d875", "n552", "d...
## $ loser_player_id                  <chr> "n409", "n552", "r975", "w367", "t...
tourney_df %>% glimpse()
## Rows: 67
## Columns: 12
## $ tourney_id                 <dbl> 339, 891, 451, 338, 301, 580, 375, 7161,...
## $ tourney_name               <chr> "Brisbane International presented by Sun...
## $ tourney_location           <chr> "Brisbane, Australia", "Chennai, India",...
## $ tourney_date               <date> 2017-01-01, 2017-01-02, 2017-01-02, 201...
## $ tourney_singles_draw       <dbl> 28, 28, 32, 28, 28, 128, 28, 28, 28, 28,...
## $ tourney_doubles_draw       <dbl> 28, 28, 32, 28, 28, 128, 28, 28, 28, 28,...
## $ tourney_conditions         <chr> "Outdoor", "Outdoor", "Outdoor", "Outdoo...
## $ tourney_surface            <chr> "Hard", "Hard", "Hard", "Hard", "Hard", ...
## $ tourney_fin_commit         <chr> "$495,630", "$505,730", "$1,334,270", "$...
## $ singles_winner_player_id   <chr> "d875", "bd06", "d643", "ma30", "sm25", ...
## $ doubles_winner_1_player_id <chr> "kd46", "b757", "ca12", "kc41", "m844", ...
## $ doubles_winner_2_player_id <chr> "tc61", "n480", "me04", "mb88", "q019", ...

Some Analysis

Top 10 players with the number matches (top winners by quantity)

an1 <- score_df %>% 
  count(winner_player_id, sort = TRUE) %>%
  slice(1:10) %>%
  left_join(player_df, by=c("winner_player_id" = "player_id")) %>%
  select(first_name, last_name, n) %>%
  rename(., number_of_win = n)
kable(an1, col.names = c("First Name", "Last Name", "Number of Win"))
First Name Last Name Number of Win
Rafael Nadal 67
Alexander Zverev 54
Roger Federer 53
David Goffin 53
Grigor Dimitrov 49
Dominic Thiem 47
Roberto Bautista Agut 46
Marin Cilic 43
Adrian Mannarino 39
Juan Martin del Potro 38

Top 10 countries by the singles champions

an2 <- tourney_df %>%
  inner_join(player_df, by=c("singles_winner_player_id" = "player_id")) %>%
  count(flag_code, sort = TRUE) %>%
  slice(1:10)
kable(an2, col.names = c("Country", "Number of Single Champions"))
Country Number of Single Champions
ESP 11
USA 9
SUI 8
FRA 7
GER 7
BUL 4
ARG 2
BEL 2
BIH 2
CRO 2

Rank countries which did not get any singles championships by the games won when they win the match

winner_countries <- left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>% 
  drop_na() %>%
  select(flag_code) %>% distinct()

loser_countries <- anti_join(player_df, winner_countries) %>% select(flag_code) %>% distinct()

inner_join(player_df, loser_countries) %>% select(player_id, flag_code) %>% distinct() %>%
  inner_join(score_df, by = c("player_id" = "winner_player_id")) %>% select(flag_code, winner_games_won) %>%
  group_by(flag_code) %>% transmute(winner_games_won = sum(winner_games_won)) %>%distinct() %>%
  arrange(desc(winner_games_won))
## # A tibble: 39 x 2
## # Groups:   flag_code [39]
##    flag_code winner_games_won
##    <chr>                <dbl>
##  1 AUS                   1989
##  2 CZE                   1209
##  3 CAN                   1190
##  4 SVK                    889
##  5 BRA                    873
##  6 POR                    621
##  7 RSA                    566
##  8 KAZ                    495
##  9 KOR                    438
## 10 GEO                    377
## 11 GRE                    359
## 12 NED                    344
## 13 COL                    308
## 14 IND                    273
## 15 MDA                    272
## 16 CYP                    257
## 17 ROU                    247
## 18 HUN                    239
## 19 ISR                    238
## 20 TUN                    237
## 21 TPE                    186
## 22 LAT                    165
## 23 POL                    165
## 24 CHI                    154
## 25 BAR                    152
## 26 NOR                    131
## 27 CHN                    103
## 28 SWE                     99
## 29 EGY                     95
## 30 LTU                     90
## 31 IRL                     70
## 32 BLR                     64
## 33 ECU                     57
## 34 SLO                     41
## 35 NZL                     36
## 36 EST                     30
## 37 MAR                     20
## 38 ESA                     12
## 39 TUR                      0

Players which won tourneys more than one and their number of championship

left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>% 
  drop_na() %>%
  transmute(player_id,flag_code,tourney_name,first_name,last_name,winner_count = 1) %>% 
  group_by(player_id) %>% 
  transmute(winner_count = sum(winner_count),flag_code,first_name,last_name) %>% 
  arrange(desc(winner_count)) %>% 
  distinct() %>%
  ungroup() %>% 
  select(first_name,last_name,winner_count)
## # A tibble: 35 x 3
##    first_name  last_name       winner_count
##    <chr>       <chr>                  <dbl>
##  1 Roger       Federer                    7
##  2 Rafael      Nadal                      6
##  3 Alexander   Zverev                     5
##  4 Grigor      Dimitrov                   4
##  5 Jo-Wilfried Tsonga                     4
##  6 Lucas       Pouille                    3
##  7 Jack        Sock                       3
##  8 Roberto     Bautista Agut              2
##  9 Novak       Djokovic                   2
## 10 Damir       Dzumhur                    2
## 11 David       Goffin                     2
## 12 John        Isner                      2
## 13 Gilles      Muller                     2
## 14 Sam         Querrey                    2
## 15 Pablo       Cuevas                     1
## 16 Marin       Cilic                      1
## 17 Pablo       Carreno Busta              1
## 18 Borna       Coric                      1
## 19 Juan Martin del Potro                  1
## 20 Alexandr    Dolgopolov                 1
## 21 Victor      Estrella Burgos            1
## 22 David       Ferrer                     1
## 23 Fabio       Fognini                    1
## 24 Peter       Gojowczyk                  1
## 25 Ryan        Harrison                   1
## 26 Denis       Istomin                    1
## 27 Steve       Johnson                    1
## 28 Philipp     Kohlschreiber              1
## 29 Feliciano   Lopez                      1
## 30 Andy        Murray                     1
## 31 Leonardo    Mayer                      1
## 32 Andrey      Rublev                     1
## 33 Yuichi      Sugita                     1
## 34 Dominic     Thiem                      1
## 35 Stan        Wawrinka                   1

Players which won the tourney at minimum duration

match_df <-left_join(stats_df,score_df, by=c("match_id" = "match_id")) %>%  
  select(match_id,match_duration,winner_player_id.x)

winners_match <-  left_join(player_df,match_df,by=c("player_id" = "winner_player_id.x"))

  left_join(tourney_df,winners_match,by=c("singles_winner_player_id" = "player_id")) %>% 
  drop_na() %>%
  select(singles_winner_player_id,first_name,last_name,match_duration) %>% 
  group_by(singles_winner_player_id) %>% 
  transmute(first_name,last_name,min_match_duration = min(match_duration)) %>% distinct() %>%
  arrange(min_match_duration)
## # A tibble: 35 x 4
## # Groups:   singles_winner_player_id [35]
##    singles_winner_player_id first_name  last_name       min_match_duration
##    <chr>                    <chr>       <chr>                        <dbl>
##  1 se73                     Yuichi      Sugita                           8
##  2 pf39                     Lucas       Pouille                         11
##  3 z355                     Alexander   Zverev                          13
##  4 f401                     David       Ferrer                          18
##  5 n409                     Rafael      Nadal                           24
##  6 i165                     Denis       Istomin                         24
##  7 g967                     Peter       Gojowczyk                       29
##  8 d923                     Damir       Dzumhur                         30
##  9 c977                     Marin       Cilic                           31
## 10 sm25                     Jack        Sock                            32
## 11 k435                     Philipp     Kohlschreiber                   35
## 12 w367                     Stan        Wawrinka                        37
## 13 d643                     Novak       Djokovic                        40
## 14 f324                     Roger       Federer                         43
## 15 re44                     Andrey      Rublev                          44
## 16 t786                     Jo-Wilfried Tsonga                          45
## 17 ma30                     Gilles      Muller                          46
## 18 j386                     Steve       Johnson                         48
## 19 q927                     Sam         Querrey                         49
## 20 bd06                     Roberto     Bautista Agut                   51
## 21 h940                     Ryan        Harrison                        52
## 22 tb69                     Dominic     Thiem                           52
## 23 d875                     Grigor      Dimitrov                        53
## 24 i186                     John        Isner                           53
## 25 gb88                     David       Goffin                          54
## 26 cg80                     Borna       Coric                           58
## 27 c882                     Pablo       Cuevas                          59
## 28 f510                     Fabio       Fognini                         59
## 29 d801                     Alexandr    Dolgopolov                      61
## 30 l397                     Feliciano   Lopez                           61
## 31 cd85                     Pablo       Carreno Busta                   62
## 32 d683                     Juan Martin del Potro                       63
## 33 md56                     Leonardo    Mayer                           64
## 34 mc10                     Andy        Murray                          73
## 35 e224                     Victor      Estrella Burgos                 79

Rank countries of players who have made no fault

no_fault <- stats_df %>% select(match_id, winner_double_faults) %>% filter(winner_double_faults == 0)
player_score <- score_df %>% select(match_id, winner_player_id)
no_fault_players <- no_fault %>% inner_join(., player_score, by = c("match_id"))
no_fault_countries <- no_fault_players %>% left_join(., player_df, by = c("winner_player_id" = "player_id")) %>%
  count(flag_code, sort = TRUE)
kable(no_fault_countries)
flag_code n
ESP 64
USA 52
FRA 42
ARG 40
GER 28
ITA 25
BEL 19
GBR 19
AUS 18
JPN 18
SUI 17
AUT 15
SRB 15
CRO 13
RUS 13
SVK 13
CAN 10
CZE 10
KAZ 9
UKR 9
UZB 7
BUL 6
LUX 6
BRA 5
GRE 5
KOR 5
POR 5
NA 5
ISR 4
NED 4
NOR 4
BIH 3
CHI 3
COL 3
HUN 3
ROU 3
RSA 3
TUN 3
BAR 2
BLR 2
CHN 2
GEO 2
IND 2
URU 2
ESA 1
MDA 1
NZL 1
TPE 1

Average duration of games in general, won games and lost games grouped by countries

stga1 <- stats_df %>% select(match_id,match_duration)
stga1a <- left_join(score_df,stga1,by="match_id")%>%select(match_duration,winner_player_id,loser_player_id)
stga2 <- inner_join(player_df,stga1a,by=c("player_id"="winner_player_id"))%>%mutate(win_duration=match_duration)
stga2a <- inner_join(player_df,stga1a,by=c("player_id"="loser_player_id"))%>%mutate(lose_duration=match_duration)
stga2b <- full_join(stga2,stga2a,by = c("player_id", "player_slug", "first_name", "last_name", "flag_code", "residence", "birth_place", "birth_date", "turned_pro", "weight_kg", "height_cm", "handedness", "backhand", "match_duration"))%>%select(flag_code,win_duration,lose_duration,match_duration)
stga2c <- stga2b%>%group_by(flag_code)%>%mutate(duration=mean(match_duration,na.rm=T),win_duration=mean(win_duration,na.rm=T),lose_duration=mean(lose_duration,na.rm=T))
stgmain <- stga2c[!duplicated(stga2c$flag_code), ]
stgap2 <- stgmain%>%arrange(desc(win_duration))%>%head(12)
stgap3 <- stgap2 %>%gather(key,value,duration,win_duration,lose_duration)
ggplot(stgap3, aes(fill=key, y=value, x=key)) + 
  geom_bar(position="dodge", stat="identity") +
  ggtitle("Countries' Game Durations") +
  facet_wrap(~flag_code) +
  theme(legend.position="none") +
  theme(axis.text.x = element_text(angle = 25))+
  xlab("")

Rafael Nadal - Ranking Points

plot_nadal <- rank_df %>% 
  filter(player_id=="n409") %>% 
  arrange(week_title)
plot_nadal <- plot_nadal[!duplicated(plot_nadal$ranking_points),]

ggplot(plot_nadal, aes(week_title,ranking_points)) + 
  geom_point() +
  geom_smooth() +  
  labs(title = "Rafael Nadal's Ranking Points ", x = "Months", y = "Points") +
  theme(title = element_text(size = 16, face = "bold"), plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(size = 14, face = "bold"), 
        axis.title.y = element_text(size = 14, face = "bold"), legend.title = element_blank()) 

Roger Federer Ranking Points

plot_federer <- rank_df %>% 
  filter(player_id=="f324") %>% 
  arrange(week_title)
plot_federer <- plot_federer[!duplicated(plot_federer$ranking_points),]

ggplot(plot_federer, aes(week_title,ranking_points)) + 
  geom_point() +
  geom_smooth() +  
  labs(title = "Roger Federer's Ranking Points ", x = "Months", y = "Points") +
  theme(title = element_text(size = 16, face = "bold"), plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(size = 14, face = "bold"), 
        axis.title.y = element_text(size = 14, face = "bold"), legend.title = element_blank()) 

References