library(dplyr)
library(tidyverse)
url<-url("https://github.com/pjournal/mef03-balkanburcu/blob/master/atp_tennis_data_2017.RData?raw=true")
atp_tennis<- load(url)
head(tourney_df)
## # A tibble: 6 x 12
## tourney_id tourney_name tourney_location tourney_date tourney_singles~
## <dbl> <chr> <chr> <date> <dbl>
## 1 339 Brisbane In~ Brisbane, Austr~ 2017-01-01 28
## 2 891 Aircel Chen~ Chennai, India 2017-01-02 28
## 3 451 Qatar Exxon~ Doha, Qatar 2017-01-02 32
## 4 338 Sydney Inte~ Sydney, Austral~ 2017-01-08 28
## 5 301 ASB Classic Auckland, New Z~ 2017-01-09 28
## 6 580 Australian ~ Melbourne, Aust~ 2017-01-16 128
## # ... with 7 more variables: tourney_doubles_draw <dbl>,
## # tourney_conditions <chr>, tourney_surface <chr>,
## # tourney_fin_commit <chr>, singles_winner_player_id <chr>,
## # doubles_winner_1_player_id <chr>, doubles_winner_2_player_id <chr>
head(player_df)
## # A tibble: 6 x 13
## player_id player_slug first_name last_name flag_code residence
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 a002 ricardo-ac~ Ricardo Acuna CHI Jupiter,~
## 2 a001 sadiq-abdu~ Sadiq Abdullahi NGR <NA>
## 3 a005 nelson-aer~ Nelson Aerts BRA <NA>
## 4 a004 egan-adams Egan Adams USA Palmetto~
## 5 a006 ronald-age~ Ronald Agenor USA Beverly ~
## 6 a007 juan-aguil~ Juan Aguilera ESP <NA>
## # ... with 7 more variables: birth_place <chr>, birth_date <date>,
## # turned_pro <dbl>, weight_kg <dbl>, height_cm <dbl>, handedness <chr>,
## # backhand <chr>
head(stats_df)
## # A tibble: 6 x 54
## match_id match_time match_duration winner_aces winner_double_f~
## <chr> <time> <dbl> <dbl> <dbl>
## 1 2017-33~ 02:21 141 23 3
## 2 2017-33~ 01:49 109 7 2
## 3 2017-33~ 01:28 88 4 1
## 4 2017-33~ 01:42 102 1 1
## 5 2017-33~ 02:05 125 3 3
## 6 2017-33~ 02:36 156 11 3
## # ... with 49 more variables: winner_first_serves_in <dbl>,
## # winner_first_serves_total <dbl>, winner_first_serve_points_won <dbl>,
## # winner_first_serve_points_total <dbl>,
## # winner_second_serve_points_won <dbl>,
## # winner_second_serve_points_total <dbl>,
## # winner_break_points_saved <dbl>,
## # winner_break_points_serve_total <dbl>,
## # winner_service_points_won <dbl>, winner_service_points_total <dbl>,
## # winner_first_serve_return_won <dbl>,
## # winner_first_serve_return_total <dbl>,
## # winner_second_serve_return_won <dbl>,
## # winner_second_serve_return_total <dbl>,
## # winner_break_points_converted <dbl>,
## # winner_break_points_return_total <dbl>,
## # winner_service_games_played <dbl>, winner_return_games_played <dbl>,
## # winner_return_points_won <dbl>, winner_return_points_total <dbl>,
## # winner_total_points_won <dbl>, winner_total_points_total <dbl>,
## # loser_aces <dbl>, loser_double_faults <dbl>,
## # loser_first_serves_in <dbl>, loser_first_serves_total <dbl>,
## # loser_first_serve_points_won <dbl>,
## # loser_first_serve_points_total <dbl>,
## # loser_second_serve_points_won <dbl>,
## # loser_second_serve_points_total <dbl>, loser_break_points_saved <dbl>,
## # loser_break_points_serve_total <dbl>, loser_service_points_won <dbl>,
## # loser_service_points_total <dbl>, loser_first_serve_return_won <dbl>,
## # loser_first_serve_return_total <dbl>,
## # loser_second_serve_return_won <dbl>,
## # loser_second_serve_return_total <dbl>,
## # loser_break_points_converted <dbl>,
## # loser_break_points_return_total <dbl>,
## # loser_service_games_played <dbl>, loser_return_games_played <dbl>,
## # loser_return_points_won <dbl>, loser_return_points_total <dbl>,
## # loser_total_points_won <dbl>, loser_total_points_total <dbl>,
## # tourney_id <chr>, winner_player_id <chr>, loser_player_id <chr>
head(rank_df)
## # A tibble: 6 x 6
## week_title player_id rank_number ranking_points tourneys_played
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2017-11-20 n409 1 10645 18
## 2 2017-11-20 f324 2 9605 17
## 3 2017-11-20 d875 3 5150 23
## 4 2017-11-20 z355 4 4610 25
## 5 2017-11-20 tb69 5 4015 27
## 6 2017-11-20 c977 6 3805 22
## # ... with 1 more variable: player_age <dbl>
head(score_df)
## # A tibble: 6 x 14
## match_id tourney_id tourney_round_n~ winner_player_id loser_player_id
## <chr> <chr> <chr> <chr> <chr>
## 1 2017-33~ 339 Finals d875 n552
## 2 2017-33~ 339 Semi-Finals d875 r975
## 3 2017-33~ 339 Semi-Finals n552 w367
## 4 2017-33~ 339 Quarter-Finals r975 n409
## 5 2017-33~ 339 Quarter-Finals w367 e831
## 6 2017-33~ 339 Quarter-Finals n552 tc61
## # ... with 9 more variables: winner_seed <chr>, loser_seed <chr>,
## # match_score_tiebreaks <chr>, winner_sets_won <dbl>,
## # loser_sets_won <dbl>, winner_games_won <dbl>, loser_games_won <dbl>,
## # winner_tiebreaks_won <dbl>, loser_tiebreaks_won <dbl>
1 - Rank countries (flag codes) by the number of singles champions)
left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>%
drop_na() %>%
select(player_id,flag_code) %>%
mutate(winner_count = 1) %>% select(flag_code, winner_count) %>%
group_by(flag_code) %>%
transmute(winner_count = sum(winner_count)) %>% arrange(desc(winner_count)) %>%
distinct()
## # A tibble: 21 x 2
## # Groups: flag_code [21]
## flag_code winner_count
## <chr> <dbl>
## 1 ESP 11
## 2 USA 9
## 3 SUI 8
## 4 GER 7
## 5 FRA 7
## 6 BUL 4
## 7 CRO 2
## 8 SRB 2
## 9 ARG 2
## 10 BIH 2
## # ... with 11 more rows
2 - Rank countries which did not get any singles championships by the games won when they win the match
winner_countries <- left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>%
drop_na() %>%
select(flag_code) %>% distinct()
loser_countries <- anti_join(player_df, winner_countries) %>% select(flag_code) %>% distinct()
## Joining, by = "flag_code"
inner_join(player_df, loser_countries) %>% select(player_id, flag_code) %>% distinct() %>%
inner_join(score_df, by = c("player_id" = "winner_player_id")) %>% select(flag_code, winner_games_won) %>%
group_by(flag_code) %>% transmute(winner_games_won = sum(winner_games_won)) %>%distinct() %>%
arrange(desc(winner_games_won))
## Joining, by = "flag_code"
## # A tibble: 39 x 2
## # Groups: flag_code [39]
## flag_code winner_games_won
## <chr> <dbl>
## 1 AUS 1989
## 2 CZE 1209
## 3 CAN 1190
## 4 SVK 889
## 5 BRA 873
## 6 POR 621
## 7 RSA 566
## 8 KAZ 495
## 9 KOR 438
## 10 GEO 377
## # ... with 29 more rows
3 - Players which won tourneys more than one and their number of championship
left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>%
drop_na() %>%
transmute(player_id,flag_code,tourney_name,first_name,last_name,winner_count = 1) %>%
group_by(player_id) %>%
transmute(winner_count = sum(winner_count),flag_code,first_name,last_name) %>%
arrange(desc(winner_count)) %>%
distinct() %>%
ungroup() %>%
select(first_name,last_name,winner_count)
## # A tibble: 35 x 3
## first_name last_name winner_count
## <chr> <chr> <dbl>
## 1 Roger Federer 7
## 2 Rafael Nadal 6
## 3 Alexander Zverev 5
## 4 Grigor Dimitrov 4
## 5 Jo-Wilfried Tsonga 4
## 6 Lucas Pouille 3
## 7 Jack Sock 3
## 8 Roberto Bautista Agut 2
## 9 Novak Djokovic 2
## 10 Damir Dzumhur 2
## # ... with 25 more rows
4 - Players which won the tourney at minimum duration
match_df <-left_join(stats_df,score_df, by=c("match_id" = "match_id")) %>%
select(match_id,match_duration,winner_player_id.x)
winners_match <- left_join(player_df,match_df,by=c("player_id" = "winner_player_id.x"))
left_join(tourney_df,winners_match,by=c("singles_winner_player_id" = "player_id")) %>%
drop_na() %>%
select(singles_winner_player_id,first_name,last_name,match_duration) %>%
group_by(singles_winner_player_id) %>%
transmute(first_name,last_name,min_match_duration = min(match_duration)) %>% distinct() %>%
arrange(min_match_duration)
## # A tibble: 35 x 4
## # Groups: singles_winner_player_id [35]
## singles_winner_player_id first_name last_name min_match_duration
## <chr> <chr> <chr> <dbl>
## 1 se73 Yuichi Sugita 8
## 2 pf39 Lucas Pouille 11
## 3 z355 Alexander Zverev 13
## 4 f401 David Ferrer 18
## 5 n409 Rafael Nadal 24
## 6 i165 Denis Istomin 24
## 7 g967 Peter Gojowczyk 29
## 8 d923 Damir Dzumhur 30
## 9 c977 Marin Cilic 31
## 10 sm25 Jack Sock 32
## # ... with 25 more rows