library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
library(knitr)
url<-url("https://github.com/pjournal/mef03-Ozgunkrt/blob/master/atp_tennis_data_2017.rdata?raw=true")
load(url)
head(player_df)
## # A tibble: 6 x 13
## player_id player_slug first_name last_name flag_code residence
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 a002 ricardo-ac… Ricardo Acuna CHI Jupiter,…
## 2 a001 sadiq-abdu… Sadiq Abdullahi NGR <NA>
## 3 a005 nelson-aer… Nelson Aerts BRA <NA>
## 4 a004 egan-adams Egan Adams USA Palmetto…
## 5 a006 ronald-age… Ronald Agenor USA Beverly …
## 6 a007 juan-aguil… Juan Aguilera ESP <NA>
## # … with 7 more variables: birth_place <chr>, birth_date <date>,
## # turned_pro <dbl>, weight_kg <dbl>, height_cm <dbl>, handedness <chr>,
## # backhand <chr>
head(rank_df)
## # A tibble: 6 x 6
## week_title player_id rank_number ranking_points tourneys_played
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2017-11-20 n409 1 10645 18
## 2 2017-11-20 f324 2 9605 17
## 3 2017-11-20 d875 3 5150 23
## 4 2017-11-20 z355 4 4610 25
## 5 2017-11-20 tb69 5 4015 27
## 6 2017-11-20 c977 6 3805 22
## # … with 1 more variable: player_age <dbl>
head(score_df)
## # A tibble: 6 x 14
## match_id tourney_id tourney_round_n… winner_player_id loser_player_id
## <chr> <chr> <chr> <chr> <chr>
## 1 2017-33… 339 Finals d875 n552
## 2 2017-33… 339 Semi-Finals d875 r975
## 3 2017-33… 339 Semi-Finals n552 w367
## 4 2017-33… 339 Quarter-Finals r975 n409
## 5 2017-33… 339 Quarter-Finals w367 e831
## 6 2017-33… 339 Quarter-Finals n552 tc61
## # … with 9 more variables: winner_seed <chr>, loser_seed <chr>,
## # match_score_tiebreaks <chr>, winner_sets_won <dbl>,
## # loser_sets_won <dbl>, winner_games_won <dbl>, loser_games_won <dbl>,
## # winner_tiebreaks_won <dbl>, loser_tiebreaks_won <dbl>
head(stats_df)
## # A tibble: 6 x 54
## match_id match_time match_duration winner_aces winner_double_f…
## <chr> <time> <dbl> <dbl> <dbl>
## 1 2017-33… 02:21 141 23 3
## 2 2017-33… 01:49 109 7 2
## 3 2017-33… 01:28 88 4 1
## 4 2017-33… 01:42 102 1 1
## 5 2017-33… 02:05 125 3 3
## 6 2017-33… 02:36 156 11 3
## # … with 49 more variables: winner_first_serves_in <dbl>,
## # winner_first_serves_total <dbl>, winner_first_serve_points_won <dbl>,
## # winner_first_serve_points_total <dbl>,
## # winner_second_serve_points_won <dbl>,
## # winner_second_serve_points_total <dbl>,
## # winner_break_points_saved <dbl>,
## # winner_break_points_serve_total <dbl>,
## # winner_service_points_won <dbl>, winner_service_points_total <dbl>,
## # winner_first_serve_return_won <dbl>,
## # winner_first_serve_return_total <dbl>,
## # winner_second_serve_return_won <dbl>,
## # winner_second_serve_return_total <dbl>,
## # winner_break_points_converted <dbl>,
## # winner_break_points_return_total <dbl>,
## # winner_service_games_played <dbl>, winner_return_games_played <dbl>,
## # winner_return_points_won <dbl>, winner_return_points_total <dbl>,
## # winner_total_points_won <dbl>, winner_total_points_total <dbl>,
## # loser_aces <dbl>, loser_double_faults <dbl>,
## # loser_first_serves_in <dbl>, loser_first_serves_total <dbl>,
## # loser_first_serve_points_won <dbl>,
## # loser_first_serve_points_total <dbl>,
## # loser_second_serve_points_won <dbl>,
## # loser_second_serve_points_total <dbl>, loser_break_points_saved <dbl>,
## # loser_break_points_serve_total <dbl>, loser_service_points_won <dbl>,
## # loser_service_points_total <dbl>, loser_first_serve_return_won <dbl>,
## # loser_first_serve_return_total <dbl>,
## # loser_second_serve_return_won <dbl>,
## # loser_second_serve_return_total <dbl>,
## # loser_break_points_converted <dbl>,
## # loser_break_points_return_total <dbl>,
## # loser_service_games_played <dbl>, loser_return_games_played <dbl>,
## # loser_return_points_won <dbl>, loser_return_points_total <dbl>,
## # loser_total_points_won <dbl>, loser_total_points_total <dbl>,
## # tourney_id <chr>, winner_player_id <chr>, loser_player_id <chr>
head(tourney_df)
## # A tibble: 6 x 12
## tourney_id tourney_name tourney_location tourney_date tourney_singles…
## <dbl> <chr> <chr> <date> <dbl>
## 1 339 Brisbane In… Brisbane, Austr… 2017-01-01 28
## 2 891 Aircel Chen… Chennai, India 2017-01-02 28
## 3 451 Qatar Exxon… Doha, Qatar 2017-01-02 32
## 4 338 Sydney Inte… Sydney, Austral… 2017-01-08 28
## 5 301 ASB Classic Auckland, New Z… 2017-01-09 28
## 6 580 Australian … Melbourne, Aust… 2017-01-16 128
## # … with 7 more variables: tourney_doubles_draw <dbl>,
## # tourney_conditions <chr>, tourney_surface <chr>,
## # tourney_fin_commit <chr>, singles_winner_player_id <chr>,
## # doubles_winner_1_player_id <chr>, doubles_winner_2_player_id <chr>
#1. Rank countries(flag codes) by the singles champions
champ_flags_df <- tourney_df %>% left_join(.,player_df, by=c("singles_winner_player_id"="player_id")) %>% count(flag_code, sort=TRUE) %>% print(champ_flags_df)
## # A tibble: 21 x 2
## flag_code n
## <chr> <int>
## 1 ESP 11
## 2 USA 9
## 3 SUI 8
## 4 FRA 7
## 5 GER 7
## 6 BUL 4
## 7 ARG 2
## 8 BEL 2
## 9 BIH 2
## 10 CRO 2
## # … with 11 more rows
#2 - Rank countries which did not get any singles championships by the games won when they win the match
winner_countries <- left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>%
drop_na() %>%
select(flag_code) %>% distinct()
loser_countries <- anti_join(player_df, winner_countries) %>% select(flag_code) %>% distinct()
## Joining, by = "flag_code"
inner_join(player_df, loser_countries) %>% select(player_id, flag_code) %>% distinct() %>%
inner_join(score_df, by = c("player_id" = "winner_player_id")) %>% select(flag_code, winner_games_won) %>%
group_by(flag_code) %>% transmute(winner_games_won = sum(winner_games_won)) %>%distinct() %>%
arrange(desc(winner_games_won))
## Joining, by = "flag_code"
## # A tibble: 39 x 2
## # Groups: flag_code [39]
## flag_code winner_games_won
## <chr> <dbl>
## 1 AUS 1989
## 2 CZE 1209
## 3 CAN 1190
## 4 SVK 889
## 5 BRA 873
## 6 POR 621
## 7 RSA 566
## 8 KAZ 495
## 9 KOR 438
## 10 GEO 377
## # … with 29 more rows
#3 - Players which won tourneys more than one and their number of championship
single_won <- left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>%
drop_na() %>%
transmute(player_id,flag_code,tourney_name,first_name,last_name,winner_count = 1) %>%
group_by(player_id) %>%
transmute(winner_count = sum(winner_count),flag_code,first_name,last_name) %>%
arrange(desc(winner_count)) %>%
distinct() %>%
ungroup() %>%
select(first_name,last_name,winner_count) %>%
slice(1:10)
print(single_won)
## # A tibble: 10 x 3
## first_name last_name winner_count
## <chr> <chr> <dbl>
## 1 Roger Federer 7
## 2 Rafael Nadal 6
## 3 Alexander Zverev 5
## 4 Grigor Dimitrov 4
## 5 Jo-Wilfried Tsonga 4
## 6 Lucas Pouille 3
## 7 Jack Sock 3
## 8 Roberto Bautista Agut 2
## 9 Novak Djokovic 2
## 10 Damir Dzumhur 2
ggplot(single_won, aes(x=first_name,y=winner_count,fill=first_name)) +
geom_col()
match_df <-left_join(stats_df,score_df, by=c("match_id" = "match_id")) %>%
select(match_id,match_duration,winner_player_id.x)
winners_match <- left_join(player_df,match_df,by=c("player_id" = "winner_player_id.x"))
left_join(tourney_df,winners_match,by=c("singles_winner_player_id" = "player_id")) %>%
drop_na() %>%
select(singles_winner_player_id,first_name,last_name,match_duration) %>%
group_by(singles_winner_player_id) %>%
transmute(first_name,last_name,max_match_duration = max(match_duration)) %>% distinct() %>%
arrange(max_match_duration)
## # A tibble: 35 x 4
## # Groups: singles_winner_player_id [35]
## singles_winner_player_id first_name last_name max_match_duration
## <chr> <chr> <chr> <dbl>
## 1 g967 Peter Gojowczyk 146
## 2 se73 Yuichi Sugita 147
## 3 d875 Grigor Dimitrov 156
## 4 k435 Philipp Kohlschreiber 157
## 5 d923 Damir Dzumhur 157
## 6 sm25 Jack Sock 163
## 7 h940 Ryan Harrison 166
## 8 i186 John Isner 166
## 9 f510 Fabio Fognini 171
## 10 z355 Alexander Zverev 174
## # … with 25 more rows