library(tidyverse)
library(ggplot2)
url<-url("https://github.com/pjournal/mef03-Nilgun/blob/master/atp_tennis_data_2017.RData?raw=TRUE")
atp_tennis<-load(url)
Rank countries (flag codes) by the number of singles champions
singles_winners<-left_join(tourney_df,player_df,by=c("singles_winner_player_id"="player_id"))
champ_flags_df<-singles_winners%>%select(singles_winner_player_id,flag_code)%>%count(flag_code,sort=T)
champ_flags_df
## # A tibble: 21 x 2
## flag_code n
## <chr> <int>
## 1 ESP 11
## 2 USA 9
## 3 SUI 8
## 4 FRA 7
## 5 GER 7
## 6 BUL 4
## 7 ARG 2
## 8 BEL 2
## 9 BIH 2
## 10 CRO 2
## # ... with 11 more rows
Rank countries which did not get any singles championships by the games won when they win the match
nonchamp_players<- player_df %>%select(player_id, flag_code) %>%anti_join(., champ_flags_df,by="flag_code")
nonchamp_players %>% left_join(.,score_df, by= c("player_id"="winner_player_id")) %>%group_by(flag_code) %>%summarise(total_won= sum(winner_games_won, na.rm=TRUE)) %>%arrange(desc(total_won))
## # A tibble: 93 x 2
## flag_code total_won
## <chr> <dbl>
## 1 AUS 1989
## 2 CZE 1209
## 3 CAN 1190
## 4 SVK 889
## 5 BRA 873
## 6 POR 621
## 7 RSA 566
## 8 KAZ 495
## 9 KOR 438
## 10 GEO 377
## # ... with 83 more rows
rank_table<-inner_join(rank_df,player_df,by="player_id")%>%select(rank_number,first_name,last_name,flag_code,player_age)%>%group_by(rank_number)
head(rank_table,10)
## # A tibble: 10 x 5
## # Groups: rank_number [10]
## rank_number first_name last_name flag_code player_age
## <dbl> <chr> <chr> <chr> <dbl>
## 1 1 Rafael Nadal ESP 31
## 2 2 Roger Federer SUI 36
## 3 3 Grigor Dimitrov BUL 26
## 4 4 Alexander Zverev GER 20
## 5 5 Dominic Thiem AUT 24
## 6 6 Marin Cilic CRO 29
## 7 7 David Goffin BEL 26
## 8 8 Jack Sock USA 25
## 9 9 Stan Wawrinka SUI 32
## 10 10 Pablo Carreno Busta ESP 26
champions<-head(rank_table,3)
champions%>%ggplot(aes(x=flag_code,y=(1/(rank_number)),fill=flag_code))+geom_col()+labs(x="Winner Countries",y="Medals",title="Winner Podium (: ",fill="")+scale_fill_manual(values=c("#CC6633", "#FFCC00", "#CCC999"))+theme_classic()
tourney_df%>%mutate(tourney_idc=as.character(tourney_id))%>%inner_join(score_df,by=c("tourney_idc"="tourney_id"))%>%select(tourney_name,winner_games_won,loser_games_won)%>%group_by(tourney_name)%>%summarise(winner_won=sum(winner_games_won,na.rm=TRUE),loser_won=sum(loser_games_won,na.rm=TRUE),ratio=winner_won/loser_won*100)%>%arrange(ratio)
## # A tibble: 67 x 4
## tourney_name winner_won loser_won ratio
## <chr> <dbl> <dbl> <dbl>
## 1 Fayez Sarofim & Co. US Men’s Clay Court Cham~ 547 415 132.
## 2 Mercedes Cup 560 422 133.
## 3 Ecuador Open 556 418 133.
## 4 Nitto ATP Finals 230 169 136.
## 5 Sydney International 516 374 138.
## 6 Mutua Madrid Open 1027 742 138.
## 7 European Open 542 391 139.
## 8 Generali Open 544 391 139.
## 9 Aegon Championships 623 441 141.
## 10 Banque Eric Sturdza Geneva Open 509 360 141.
## # ... with 57 more rows