raw_df <- load(url("https://github.com/pjournal/mef04-nerigrsy/blob/gh-pages/atp_tennis_data_2017.RData?raw=true"))
glimpse(raw_df)
## chr [1:6] "tourney_df" "player_df" "stats_df" "score_df" "rank_df" ...
head(raw_df)
## [1] "tourney_df" "player_df" "stats_df" "score_df"
## [5] "rank_df" "data_source_url"
tourney_df %>% print(n=10)
## # A tibble: 67 x 12
## tourney_id tourney_name tourney_location tourney_date tourney_singles~
## <dbl> <chr> <chr> <date> <dbl>
## 1 339 Brisbane In~ Brisbane, Austr~ 2017-01-01 28
## 2 891 Aircel Chen~ Chennai, India 2017-01-02 28
## 3 451 Qatar Exxon~ Doha, Qatar 2017-01-02 32
## 4 338 Sydney Inte~ Sydney, Austral~ 2017-01-08 28
## 5 301 ASB Classic Auckland, New Z~ 2017-01-09 28
## 6 580 Australian ~ Melbourne, Aust~ 2017-01-16 128
## 7 375 Open Sud de~ Montpellier, Fr~ 2017-02-06 28
## 8 7161 Ecuador Open Quito, Ecuador 2017-02-06 28
## 9 7434 Garanti Koz~ Sofia, Bulgaria 2017-02-06 28
## 10 506 Argentina O~ Buenos Aires, A~ 2017-02-13 28
## # ... with 57 more rows, and 7 more variables: tourney_doubles_draw <dbl>,
## # tourney_conditions <chr>, tourney_surface <chr>, tourney_fin_commit <chr>,
## # singles_winner_player_id <chr>, doubles_winner_1_player_id <chr>,
## # doubles_winner_2_player_id <chr>
player_df %>% print(n=5)
## # A tibble: 10,912 x 13
## player_id player_slug first_name last_name flag_code residence birth_place
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 a002 ricardo-ac~ Ricardo Acuna CHI Jupiter,~ Santiago, ~
## 2 a001 sadiq-abdu~ Sadiq Abdullahi NGR <NA> <NA>
## 3 a005 nelson-aer~ Nelson Aerts BRA <NA> Cachoeira ~
## 4 a004 egan-adams Egan Adams USA Palmetto~ Miami Beac~
## 5 a006 ronald-age~ Ronald Agenor USA Beverly ~ Rabat, Mor~
## # ... with 10,907 more rows, and 6 more variables: birth_date <date>,
## # turned_pro <dbl>, weight_kg <dbl>, height_cm <dbl>, handedness <chr>,
## # backhand <chr>
score_df %>% print(n=10)
## # A tibble: 3,830 x 14
## match_id tourney_id tourney_round_n~ winner_player_id loser_player_id
## <chr> <chr> <chr> <chr> <chr>
## 1 2017-33~ 339 Finals d875 n552
## 2 2017-33~ 339 Semi-Finals d875 r975
## 3 2017-33~ 339 Semi-Finals n552 w367
## 4 2017-33~ 339 Quarter-Finals r975 n409
## 5 2017-33~ 339 Quarter-Finals w367 e831
## 6 2017-33~ 339 Quarter-Finals n552 tc61
## 7 2017-33~ 339 Quarter-Finals d875 tb69
## 8 2017-33~ 339 Round of 16 r975 sm37
## 9 2017-33~ 339 Round of 16 w367 t840
## 10 2017-33~ 339 Round of 16 n552 dc58
## # ... with 3,820 more rows, and 9 more variables: winner_seed <chr>,
## # loser_seed <chr>, match_score_tiebreaks <chr>, winner_sets_won <dbl>,
## # loser_sets_won <dbl>, winner_games_won <dbl>, loser_games_won <dbl>,
## # winner_tiebreaks_won <dbl>, loser_tiebreaks_won <dbl>
rank_df %>% print(n=5)
## # A tibble: 87,740 x 6
## week_title player_id rank_number ranking_points tourneys_played player_age
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2017-11-20 n409 1 10645 18 31
## 2 2017-11-20 f324 2 9605 17 36
## 3 2017-11-20 d875 3 5150 23 26
## 4 2017-11-20 z355 4 4610 25 20
## 5 2017-11-20 tb69 5 4015 27 24
## # ... with 87,735 more rows
stats_df %>% print(n=5)
## # A tibble: 3,811 x 54
## match_id match_time match_duration winner_aces winner_double_f~
## <chr> <time> <dbl> <dbl> <dbl>
## 1 2017-33~ 02:21 141 23 3
## 2 2017-33~ 01:49 109 7 2
## 3 2017-33~ 01:28 88 4 1
## 4 2017-33~ 01:42 102 1 1
## 5 2017-33~ 02:05 125 3 3
## # ... with 3,806 more rows, and 49 more variables:
## # winner_first_serves_in <dbl>, winner_first_serves_total <dbl>,
## # winner_first_serve_points_won <dbl>, winner_first_serve_points_total <dbl>,
## # winner_second_serve_points_won <dbl>,
## # winner_second_serve_points_total <dbl>, winner_break_points_saved <dbl>,
## # winner_break_points_serve_total <dbl>, winner_service_points_won <dbl>,
## # winner_service_points_total <dbl>, winner_first_serve_return_won <dbl>,
## # winner_first_serve_return_total <dbl>,
## # winner_second_serve_return_won <dbl>,
## # winner_second_serve_return_total <dbl>,
## # winner_break_points_converted <dbl>,
## # winner_break_points_return_total <dbl>, winner_service_games_played <dbl>,
## # winner_return_games_played <dbl>, winner_return_points_won <dbl>,
## # winner_return_points_total <dbl>, winner_total_points_won <dbl>,
## # winner_total_points_total <dbl>, loser_aces <dbl>,
## # loser_double_faults <dbl>, loser_first_serves_in <dbl>,
## # loser_first_serves_total <dbl>, loser_first_serve_points_won <dbl>,
## # loser_first_serve_points_total <dbl>, loser_second_serve_points_won <dbl>,
## # loser_second_serve_points_total <dbl>, loser_break_points_saved <dbl>,
## # loser_break_points_serve_total <dbl>, loser_service_points_won <dbl>,
## # loser_service_points_total <dbl>, loser_first_serve_return_won <dbl>,
## # loser_first_serve_return_total <dbl>, loser_second_serve_return_won <dbl>,
## # loser_second_serve_return_total <dbl>, loser_break_points_converted <dbl>,
## # loser_break_points_return_total <dbl>, loser_service_games_played <dbl>,
## # loser_return_games_played <dbl>, loser_return_points_won <dbl>,
## # loser_return_points_total <dbl>, loser_total_points_won <dbl>,
## # loser_total_points_total <dbl>, tourney_id <chr>, winner_player_id <chr>,
## # loser_player_id <chr>
Top 20 players with the number matches (top winners by quantity)
top_20 <- score_df %>%
count(winner_player_id, sort = TRUE) %>%
slice(1:20) %>%
inner_join(player_df, by=c("winner_player_id" = "player_id")) %>%
select(first_name, last_name, n) %>%
rename(., number_of_win = n)
kable(top_20, col.names = c("First Name", "Last Name", "Number of Win"))
First Name
|
Last Name
|
Number of Win
|
Rafael
|
Nadal
|
67
|
Alexander
|
Zverev
|
54
|
Roger
|
Federer
|
53
|
David
|
Goffin
|
53
|
Grigor
|
Dimitrov
|
49
|
Dominic
|
Thiem
|
47
|
Roberto
|
Bautista Agut
|
46
|
Marin
|
Cilic
|
43
|
Adrian
|
Mannarino
|
39
|
Juan Martin
|
del Potro
|
38
|
Jack
|
Sock
|
38
|
Diego
|
Schwartzman
|
38
|
Damir
|
Dzumhur
|
37
|
Pablo
|
Carreno Busta
|
36
|
John
|
Isner
|
36
|
Tomas
|
Berdych
|
35
|
Fabio
|
Fognini
|
35
|
Sam
|
Querrey
|
35
|
Jo-Wilfried
|
Tsonga
|
35
|
Kevin
|
Anderson
|
34
|
Players which won tourneys more than one and their number of championship
left_join(player_df, tourney_df, by=c("player_id" = "singles_winner_player_id")) %>%
drop_na() %>%
transmute(player_id,flag_code,tourney_name,first_name,last_name,winner_count = 1) %>%
group_by(player_id) %>%
transmute(winner_count = sum(winner_count),flag_code,first_name,last_name) %>%
arrange(desc(winner_count)) %>%
distinct() %>%
ungroup() %>%
select(first_name,last_name,winner_count) %>%
#head(10)
kable()
first_name
|
last_name
|
winner_count
|
Roger
|
Federer
|
7
|
Rafael
|
Nadal
|
6
|
Alexander
|
Zverev
|
5
|
Grigor
|
Dimitrov
|
4
|
Jo-Wilfried
|
Tsonga
|
4
|
Lucas
|
Pouille
|
3
|
Jack
|
Sock
|
3
|
Roberto
|
Bautista Agut
|
2
|
Novak
|
Djokovic
|
2
|
Damir
|
Dzumhur
|
2
|
David
|
Goffin
|
2
|
John
|
Isner
|
2
|
Gilles
|
Muller
|
2
|
Sam
|
Querrey
|
2
|
Pablo
|
Cuevas
|
1
|
Marin
|
Cilic
|
1
|
Pablo
|
Carreno Busta
|
1
|
Borna
|
Coric
|
1
|
Juan Martin
|
del Potro
|
1
|
Alexandr
|
Dolgopolov
|
1
|
Victor
|
Estrella Burgos
|
1
|
David
|
Ferrer
|
1
|
Fabio
|
Fognini
|
1
|
Peter
|
Gojowczyk
|
1
|
Ryan
|
Harrison
|
1
|
Denis
|
Istomin
|
1
|
Steve
|
Johnson
|
1
|
Philipp
|
Kohlschreiber
|
1
|
Feliciano
|
Lopez
|
1
|
Andy
|
Murray
|
1
|
Leonardo
|
Mayer
|
1
|
Andrey
|
Rublev
|
1
|
Yuichi
|
Sugita
|
1
|
Dominic
|
Thiem
|
1
|
Stan
|
Wawrinka
|
1
|