An SQL join clause - corresponding to a join operation in relational algebra - combines columns from one or more tables in a relational database. It creates a set that can be saved as a table or used as it is. A JOIN is a means for combining columns from one (self-join) or more tables by using values common to each. Relational databases are usually normalized to eliminate duplication of information such as when entity types have one-to-many relationships.
The following datasets consists of all necessary tennis information about ATP World Tour 2017.
player_df %>% glimpse()
## Rows: 10,912
## Columns: 13
## $ player_id <chr> "a002", "a001", "a005", "a004", "a006", "a007", "a012",...
## $ player_slug <chr> "ricardo-acuna", "sadiq-abdullahi", "nelson-aerts", "eg...
## $ first_name <chr> "Ricardo", "Sadiq", "Nelson", "Egan", "Ronald", "Juan",...
## $ last_name <chr> "Acuna", "Abdullahi", "Aerts", "Adams", "Agenor", "Agui...
## $ flag_code <chr> "CHI", "NGR", "BRA", "USA", "USA", "ESP", "NED", "ITA",...
## $ residence <chr> "Jupiter, FL, USA", NA, NA, "Palmetto, FL, USA", "Bever...
## $ birth_place <chr> "Santiago, Chile", NA, "Cachoeira Do Sul, Brazil", "Mia...
## $ birth_date <date> 1958-01-13, 1960-02-02, 1963-04-25, 1959-06-15, 1964-1...
## $ turned_pro <dbl> 0, 0, 0, 0, 1983, 0, 0, 0, 0, 0, 0, 0, 0, 1983, 0, 0, 0...
## $ weight_kg <dbl> 68, 0, 75, 73, 82, 68, 0, 0, 0, 82, 73, 77, 0, 77, 77, ...
## $ height_cm <dbl> 175, 0, 188, 178, 180, 183, 0, 0, 0, 191, 180, 178, 0, ...
## $ handedness <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ backhand <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
rank_df %>% glimpse()
## Rows: 87,740
## Columns: 6
## $ week_title <date> 2017-11-20, 2017-11-20, 2017-11-20, 2017-11-20, 20...
## $ player_id <chr> "n409", "f324", "d875", "z355", "tb69", "c977", "gb...
## $ rank_number <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ ranking_points <dbl> 10645, 9605, 5150, 4610, 4015, 3805, 3775, 3165, 31...
## $ tourneys_played <dbl> 18, 17, 23, 25, 27, 22, 26, 22, 15, 25, 19, 16, 23,...
## $ player_age <dbl> 31, 36, 26, 20, 24, 29, 26, 25, 32, 26, 29, 30, 30,...
score_df %>% glimpse()
## Rows: 3,830
## Columns: 14
## $ match_id <chr> "2017-339-d875-n552", "2017-339-d875-r975", "...
## $ tourney_id <chr> "339", "339", "339", "339", "339", "339", "33...
## $ tourney_round_name <chr> "Finals", "Semi-Finals", "Semi-Finals", "Quar...
## $ winner_player_id <chr> "d875", "d875", "n552", "r975", "w367", "n552...
## $ loser_player_id <chr> "n552", "r975", "w367", "n409", "e831", "tc61...
## $ winner_seed <chr> "7", "7", "3", "1", "2", "3", "7", "1", "2", ...
## $ loser_seed <chr> "3", "1", "2", "5", NA, "WC", "4", NA, NA, "Q...
## $ match_score_tiebreaks <chr> "62 26 63", "76(7) 62", "76(3) 63", "46 63 64...
## $ winner_sets_won <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ loser_sets_won <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, ...
## $ winner_games_won <dbl> 14, 13, 13, 16, 18, 12, 16, 12, 13, 16, 13, 1...
## $ loser_games_won <dbl> 11, 8, 9, 13, 15, 2, 12, 5, 10, 13, 9, 2, 4, ...
## $ winner_tiebreaks_won <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
## $ loser_tiebreaks_won <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
stats_df %>% glimpse()
## Rows: 3,811
## Columns: 54
## $ match_id <chr> "2017-339-r975-n409", "2017-339-d8...
## $ match_time <time> 02:21:00, 01:49:00, 01:28:00, 01:...
## $ match_duration <dbl> 141, 109, 88, 102, 125, 156, 62, 6...
## $ winner_aces <dbl> 23, 7, 4, 1, 3, 11, 3, 12, 11, 7, ...
## $ winner_double_faults <dbl> 3, 2, 1, 1, 3, 3, 0, 1, 1, 2, 2, 0...
## $ winner_first_serves_in <dbl> 62, 52, 36, 56, 52, 67, 19, 40, 44...
## $ winner_first_serves_total <dbl> 97, 77, 58, 77, 94, 119, 34, 53, 6...
## $ winner_first_serve_points_won <dbl> 50, 41, 27, 37, 42, 47, 18, 30, 36...
## $ winner_first_serve_points_total <dbl> 62, 52, 36, 56, 52, 67, 19, 40, 44...
## $ winner_second_serve_points_won <dbl> 16, 12, 18, 14, 23, 28, 10, 7, 15,...
## $ winner_second_serve_points_total <dbl> 35, 25, 22, 21, 42, 52, 15, 13, 21...
## $ winner_break_points_saved <dbl> 6, 5, 0, 4, 13, 11, 0, 2, 4, 0, 2,...
## $ winner_break_points_serve_total <dbl> 7, 7, 0, 5, 14, 13, 0, 3, 4, 1, 2,...
## $ winner_service_points_won <dbl> 66, 53, 45, 51, 65, 75, 28, 37, 51...
## $ winner_service_points_total <dbl> 97, 77, 58, 77, 94, 119, 34, 53, 6...
## $ winner_first_serve_return_won <dbl> 22, 13, 4, 10, 8, 13, 13, 12, 1, 7...
## $ winner_first_serve_return_total <dbl> 61, 49, 28, 37, 37, 65, 28, 29, 35...
## $ winner_second_serve_return_won <dbl> 9, 11, 17, 14, 21, 21, 14, 9, 14, ...
## $ winner_second_serve_return_total <dbl> 23, 20, 33, 24, 45, 32, 19, 15, 23...
## $ winner_break_points_converted <dbl> 2, 3, 2, 2, 3, 4, 5, 4, 1, 2, 5, 3...
## $ winner_break_points_return_total <dbl> 4, 5, 4, 2, 7, 10, 8, 6, 4, 5, 8, ...
## $ winner_service_games_played <dbl> 15, 13, 10, 11, 14, 16, 7, 9, 11, ...
## $ winner_return_games_played <dbl> 14, 12, 10, 10, 14, 16, 7, 8, 10, ...
## $ winner_return_points_won <dbl> 31, 24, 21, 24, 29, 34, 27, 21, 15...
## $ winner_return_points_total <dbl> 84, 69, 61, 61, 82, 97, 47, 44, 58...
## $ winner_total_points_won <dbl> 97, 77, 66, 75, 94, 109, 55, 58, 6...
## $ winner_total_points_total <dbl> 181, 146, 119, 138, 176, 216, 81, ...
## $ loser_aces <dbl> 4, 4, 4, 9, 6, 2, 1, 0, 11, 5, 2, ...
## $ loser_double_faults <dbl> 0, 0, 3, 2, 5, 2, 2, 1, 1, 7, 2, 3...
## $ loser_first_serves_in <dbl> 61, 49, 28, 37, 37, 65, 28, 29, 35...
## $ loser_first_serves_total <dbl> 84, 69, 61, 61, 82, 97, 47, 44, 58...
## $ loser_first_serve_points_won <dbl> 39, 36, 24, 27, 29, 52, 15, 17, 34...
## $ loser_first_serve_points_total <dbl> 61, 49, 28, 37, 37, 65, 28, 29, 35...
## $ loser_second_serve_points_won <dbl> 14, 9, 16, 10, 24, 11, 5, 6, 9, 8,...
## $ loser_second_serve_points_total <dbl> 23, 20, 33, 24, 45, 32, 19, 15, 23...
## $ loser_break_points_saved <dbl> 2, 2, 2, 0, 4, 6, 3, 2, 3, 3, 3, 0...
## $ loser_break_points_serve_total <dbl> 4, 5, 4, 2, 7, 10, 8, 6, 4, 5, 8, ...
## $ loser_service_points_won <dbl> 53, 45, 40, 37, 53, 63, 20, 23, 43...
## $ loser_service_points_total <dbl> 84, 69, 61, 61, 82, 97, 47, 44, 58...
## $ loser_first_serve_return_won <dbl> 12, 11, 9, 19, 10, 20, 1, 10, 8, 8...
## $ loser_first_serve_return_total <dbl> 62, 52, 36, 56, 52, 67, 19, 40, 44...
## $ loser_second_serve_return_won <dbl> 19, 13, 4, 7, 19, 24, 5, 6, 6, 8, ...
## $ loser_second_serve_return_total <dbl> 35, 25, 22, 21, 42, 52, 15, 13, 21...
## $ loser_break_points_converted <dbl> 1, 2, 0, 1, 1, 2, 0, 1, 0, 1, 0, 0...
## $ loser_break_points_return_total <dbl> 7, 7, 0, 5, 14, 13, 0, 3, 4, 1, 2,...
## $ loser_service_games_played <dbl> 14, 12, 10, 10, 14, 16, 7, 8, 10, ...
## $ loser_return_games_played <dbl> 15, 13, 10, 11, 14, 16, 7, 9, 11, ...
## $ loser_return_points_won <dbl> 31, 24, 13, 26, 29, 44, 6, 16, 14,...
## $ loser_return_points_total <dbl> 97, 77, 58, 77, 94, 119, 34, 53, 6...
## $ loser_total_points_won <dbl> 84, 69, 53, 63, 82, 107, 26, 39, 5...
## $ loser_total_points_total <dbl> 181, 146, 119, 138, 176, 216, 81, ...
## $ tourney_id <chr> "339", "339", "339", "339", "339",...
## $ winner_player_id <chr> "r975", "d875", "d875", "n552", "d...
## $ loser_player_id <chr> "n409", "n552", "r975", "w367", "t...
tourney_df %>% glimpse()
## Rows: 67
## Columns: 12
## $ tourney_id <dbl> 339, 891, 451, 338, 301, 580, 375, 7161,...
## $ tourney_name <chr> "Brisbane International presented by Sun...
## $ tourney_location <chr> "Brisbane, Australia", "Chennai, India",...
## $ tourney_date <date> 2017-01-01, 2017-01-02, 2017-01-02, 201...
## $ tourney_singles_draw <dbl> 28, 28, 32, 28, 28, 128, 28, 28, 28, 28,...
## $ tourney_doubles_draw <dbl> 28, 28, 32, 28, 28, 128, 28, 28, 28, 28,...
## $ tourney_conditions <chr> "Outdoor", "Outdoor", "Outdoor", "Outdoo...
## $ tourney_surface <chr> "Hard", "Hard", "Hard", "Hard", "Hard", ...
## $ tourney_fin_commit <chr> "$495,630", "$505,730", "$1,334,270", "$...
## $ singles_winner_player_id <chr> "d875", "bd06", "d643", "ma30", "sm25", ...
## $ doubles_winner_1_player_id <chr> "kd46", "b757", "ca12", "kc41", "m844", ...
## $ doubles_winner_2_player_id <chr> "tc61", "n480", "me04", "mb88", "q019", ...
an1 <- score_df %>%
count(winner_player_id, sort = TRUE) %>%
slice(1:10) %>%
left_join(player_df, by=c("winner_player_id" = "player_id")) %>%
select(first_name, last_name, n) %>%
rename(., number_of_win = n)
kable(an1, col.names = c("First Name", "Last Name", "Number of Win"))
First Name | Last Name | Number of Win |
---|---|---|
Rafael | Nadal | 67 |
Alexander | Zverev | 54 |
Roger | Federer | 53 |
David | Goffin | 53 |
Grigor | Dimitrov | 49 |
Dominic | Thiem | 47 |
Roberto | Bautista Agut | 46 |
Marin | Cilic | 43 |
Adrian | Mannarino | 39 |
Juan Martin | del Potro | 38 |
an2 <- tourney_df %>%
inner_join(player_df, by=c("singles_winner_player_id" = "player_id")) %>%
count(flag_code, sort = TRUE) %>%
slice(1:10)
kable(an2, col.names = c("Country", "Number of Single Champions"))
Country | Number of Single Champions |
---|---|
ESP | 11 |
USA | 9 |
SUI | 8 |
FRA | 7 |
GER | 7 |
BUL | 4 |
ARG | 2 |
BEL | 2 |
BIH | 2 |
CRO | 2 |
winner_countries <- left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>%
drop_na() %>%
select(flag_code) %>% distinct()
loser_countries <- anti_join(player_df, winner_countries) %>% select(flag_code) %>% distinct()
inner_join(player_df, loser_countries) %>% select(player_id, flag_code) %>% distinct() %>%
inner_join(score_df, by = c("player_id" = "winner_player_id")) %>% select(flag_code, winner_games_won) %>%
group_by(flag_code) %>% transmute(winner_games_won = sum(winner_games_won)) %>%distinct() %>%
arrange(desc(winner_games_won))
## # A tibble: 39 x 2
## # Groups: flag_code [39]
## flag_code winner_games_won
## <chr> <dbl>
## 1 AUS 1989
## 2 CZE 1209
## 3 CAN 1190
## 4 SVK 889
## 5 BRA 873
## 6 POR 621
## 7 RSA 566
## 8 KAZ 495
## 9 KOR 438
## 10 GEO 377
## 11 GRE 359
## 12 NED 344
## 13 COL 308
## 14 IND 273
## 15 MDA 272
## 16 CYP 257
## 17 ROU 247
## 18 HUN 239
## 19 ISR 238
## 20 TUN 237
## 21 TPE 186
## 22 LAT 165
## 23 POL 165
## 24 CHI 154
## 25 BAR 152
## 26 NOR 131
## 27 CHN 103
## 28 SWE 99
## 29 EGY 95
## 30 LTU 90
## 31 IRL 70
## 32 BLR 64
## 33 ECU 57
## 34 SLO 41
## 35 NZL 36
## 36 EST 30
## 37 MAR 20
## 38 ESA 12
## 39 TUR 0
left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>%
drop_na() %>%
transmute(player_id,flag_code,tourney_name,first_name,last_name,winner_count = 1) %>%
group_by(player_id) %>%
transmute(winner_count = sum(winner_count),flag_code,first_name,last_name) %>%
arrange(desc(winner_count)) %>%
distinct() %>%
ungroup() %>%
select(first_name,last_name,winner_count)
## # A tibble: 35 x 3
## first_name last_name winner_count
## <chr> <chr> <dbl>
## 1 Roger Federer 7
## 2 Rafael Nadal 6
## 3 Alexander Zverev 5
## 4 Grigor Dimitrov 4
## 5 Jo-Wilfried Tsonga 4
## 6 Lucas Pouille 3
## 7 Jack Sock 3
## 8 Roberto Bautista Agut 2
## 9 Novak Djokovic 2
## 10 Damir Dzumhur 2
## 11 David Goffin 2
## 12 John Isner 2
## 13 Gilles Muller 2
## 14 Sam Querrey 2
## 15 Pablo Cuevas 1
## 16 Marin Cilic 1
## 17 Pablo Carreno Busta 1
## 18 Borna Coric 1
## 19 Juan Martin del Potro 1
## 20 Alexandr Dolgopolov 1
## 21 Victor Estrella Burgos 1
## 22 David Ferrer 1
## 23 Fabio Fognini 1
## 24 Peter Gojowczyk 1
## 25 Ryan Harrison 1
## 26 Denis Istomin 1
## 27 Steve Johnson 1
## 28 Philipp Kohlschreiber 1
## 29 Feliciano Lopez 1
## 30 Andy Murray 1
## 31 Leonardo Mayer 1
## 32 Andrey Rublev 1
## 33 Yuichi Sugita 1
## 34 Dominic Thiem 1
## 35 Stan Wawrinka 1
match_df <-left_join(stats_df,score_df, by=c("match_id" = "match_id")) %>%
select(match_id,match_duration,winner_player_id.x)
winners_match <- left_join(player_df,match_df,by=c("player_id" = "winner_player_id.x"))
left_join(tourney_df,winners_match,by=c("singles_winner_player_id" = "player_id")) %>%
drop_na() %>%
select(singles_winner_player_id,first_name,last_name,match_duration) %>%
group_by(singles_winner_player_id) %>%
transmute(first_name,last_name,min_match_duration = min(match_duration)) %>% distinct() %>%
arrange(min_match_duration)
## # A tibble: 35 x 4
## # Groups: singles_winner_player_id [35]
## singles_winner_player_id first_name last_name min_match_duration
## <chr> <chr> <chr> <dbl>
## 1 se73 Yuichi Sugita 8
## 2 pf39 Lucas Pouille 11
## 3 z355 Alexander Zverev 13
## 4 f401 David Ferrer 18
## 5 n409 Rafael Nadal 24
## 6 i165 Denis Istomin 24
## 7 g967 Peter Gojowczyk 29
## 8 d923 Damir Dzumhur 30
## 9 c977 Marin Cilic 31
## 10 sm25 Jack Sock 32
## 11 k435 Philipp Kohlschreiber 35
## 12 w367 Stan Wawrinka 37
## 13 d643 Novak Djokovic 40
## 14 f324 Roger Federer 43
## 15 re44 Andrey Rublev 44
## 16 t786 Jo-Wilfried Tsonga 45
## 17 ma30 Gilles Muller 46
## 18 j386 Steve Johnson 48
## 19 q927 Sam Querrey 49
## 20 bd06 Roberto Bautista Agut 51
## 21 h940 Ryan Harrison 52
## 22 tb69 Dominic Thiem 52
## 23 d875 Grigor Dimitrov 53
## 24 i186 John Isner 53
## 25 gb88 David Goffin 54
## 26 cg80 Borna Coric 58
## 27 c882 Pablo Cuevas 59
## 28 f510 Fabio Fognini 59
## 29 d801 Alexandr Dolgopolov 61
## 30 l397 Feliciano Lopez 61
## 31 cd85 Pablo Carreno Busta 62
## 32 d683 Juan Martin del Potro 63
## 33 md56 Leonardo Mayer 64
## 34 mc10 Andy Murray 73
## 35 e224 Victor Estrella Burgos 79
no_fault <- stats_df %>% select(match_id, winner_double_faults) %>% filter(winner_double_faults == 0)
player_score <- score_df %>% select(match_id, winner_player_id)
no_fault_players <- no_fault %>% inner_join(., player_score, by = c("match_id"))
no_fault_countries <- no_fault_players %>% left_join(., player_df, by = c("winner_player_id" = "player_id")) %>%
count(flag_code, sort = TRUE)
kable(no_fault_countries)
flag_code | n |
---|---|
ESP | 64 |
USA | 52 |
FRA | 42 |
ARG | 40 |
GER | 28 |
ITA | 25 |
BEL | 19 |
GBR | 19 |
AUS | 18 |
JPN | 18 |
SUI | 17 |
AUT | 15 |
SRB | 15 |
CRO | 13 |
RUS | 13 |
SVK | 13 |
CAN | 10 |
CZE | 10 |
KAZ | 9 |
UKR | 9 |
UZB | 7 |
BUL | 6 |
LUX | 6 |
BRA | 5 |
GRE | 5 |
KOR | 5 |
POR | 5 |
NA | 5 |
ISR | 4 |
NED | 4 |
NOR | 4 |
BIH | 3 |
CHI | 3 |
COL | 3 |
HUN | 3 |
ROU | 3 |
RSA | 3 |
TUN | 3 |
BAR | 2 |
BLR | 2 |
CHN | 2 |
GEO | 2 |
IND | 2 |
URU | 2 |
ESA | 1 |
MDA | 1 |
NZL | 1 |
TPE | 1 |
stga1 <- stats_df %>% select(match_id,match_duration)
stga1a <- left_join(score_df,stga1,by="match_id")%>%select(match_duration,winner_player_id,loser_player_id)
stga2 <- inner_join(player_df,stga1a,by=c("player_id"="winner_player_id"))%>%mutate(win_duration=match_duration)
stga2a <- inner_join(player_df,stga1a,by=c("player_id"="loser_player_id"))%>%mutate(lose_duration=match_duration)
stga2b <- full_join(stga2,stga2a,by = c("player_id", "player_slug", "first_name", "last_name", "flag_code", "residence", "birth_place", "birth_date", "turned_pro", "weight_kg", "height_cm", "handedness", "backhand", "match_duration"))%>%select(flag_code,win_duration,lose_duration,match_duration)
stga2c <- stga2b%>%group_by(flag_code)%>%mutate(duration=mean(match_duration,na.rm=T),win_duration=mean(win_duration,na.rm=T),lose_duration=mean(lose_duration,na.rm=T))
stgmain <- stga2c[!duplicated(stga2c$flag_code), ]
stgap2 <- stgmain%>%arrange(desc(win_duration))%>%head(12)
stgap3 <- stgap2 %>%gather(key,value,duration,win_duration,lose_duration)
ggplot(stgap3, aes(fill=key, y=value, x=key)) +
geom_bar(position="dodge", stat="identity") +
ggtitle("Countries' Game Durations") +
facet_wrap(~flag_code) +
theme(legend.position="none") +
theme(axis.text.x = element_text(angle = 25))+
xlab("")
plot_nadal <- rank_df %>%
filter(player_id=="n409") %>%
arrange(week_title)
plot_nadal <- plot_nadal[!duplicated(plot_nadal$ranking_points),]
ggplot(plot_nadal, aes(week_title,ranking_points)) +
geom_point() +
geom_smooth() +
labs(title = "Rafael Nadal's Ranking Points ", x = "Months", y = "Points") +
theme(title = element_text(size = 16, face = "bold"), plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(size = 14, face = "bold"),
axis.title.y = element_text(size = 14, face = "bold"), legend.title = element_blank())
plot_federer <- rank_df %>%
filter(player_id=="f324") %>%
arrange(week_title)
plot_federer <- plot_federer[!duplicated(plot_federer$ranking_points),]
ggplot(plot_federer, aes(week_title,ranking_points)) +
geom_point() +
geom_smooth() +
labs(title = "Roger Federer's Ranking Points ", x = "Months", y = "Points") +
theme(title = element_text(size = 16, face = "bold"), plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(size = 14, face = "bold"),
axis.title.y = element_text(size = 14, face = "bold"), legend.title = element_blank())