#Load library

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

#load data

load("atp_tennis_data_2017.RData")

#analysis

champ_flags_df <- tourney_df %>% left_join(.,player_df,by=c("singles_winner_player_id"= "player_id")) %>% count(flag_code,sort=TRUE)
nonchamp_players <- 
player_df %>% select(player_id,flag_code) %>% anti_join(.,champ_flags_df)
## Joining, by = "flag_code"
nonchamp_players %>% left_join(.,score_df,by=c("player_id"="winner_player_id")) %>% group_by(flag_code) %>% summarise(total_won=sum(winner_games_won,na.rm=TRUE)) %>% arrange(desc(total_won))
## # A tibble: 93 x 2
##    flag_code total_won
##    <chr>         <dbl>
##  1 AUS            1989
##  2 CZE            1209
##  3 CAN            1190
##  4 SVK             889
##  5 BRA             873
##  6 POR             621
##  7 RSA             566
##  8 KAZ             495
##  9 KOR             438
## 10 GEO             377
## # … with 83 more rows

#analysis of what are the most surface using in the country

ss <- left_join(player_df, tourney_df, by = c('player_id'='singles_winner_player_id'))

ss %>% group_by(flag_code,tourney_surface) %>% filter(tourney_surface != "NA") %>% count(tourney_surface) %>% arrange(desc(n))
## # A tibble: 33 x 3
## # Groups:   flag_code, tourney_surface [33]
##    flag_code tourney_surface     n
##    <chr>     <chr>           <int>
##  1 USA       Hard                7
##  2 ESP       Clay                6
##  3 SUI       Hard                5
##  4 BUL       Hard                4
##  5 ESP       Hard                4
##  6 FRA       Hard                4
##  7 GER       Hard                4
##  8 GER       Clay                3
##  9 BEL       Hard                2
## 10 BIH       Hard                2
## # … with 23 more rows

#analysis of which player take most rate according to match duration and total points

abc <- inner_join(stats_df, score_df,by=c("match_id"))
abc %>% select(winner_player_id.x, match_duration, winner_total_points_total,loser_total_points_total) %>% filter(match_duration > 250)%>% mutate(rate=match_duration/(winner_total_points_total+loser_total_points_total))%>% arrange(desc(rate))
## # A tibble: 14 x 5
##    winner_player_id… match_duration winner_total_po… loser_total_poi…  rate
##    <chr>                      <dbl>            <dbl>            <dbl> <dbl>
##  1 d646                         252              180              180 0.7  
##  2 n409                         296              365              365 0.405
##  3 w367                         274              339              339 0.404
##  4 i165                         289              379              379 0.381
##  5 sc77                         272              365              365 0.373
##  6 ma30                         288              389              389 0.370
##  7 gb88                         260              359              359 0.362
##  8 e224                         260              360              360 0.361
##  9 sc73                         251              354              354 0.355
## 10 r772                         295              418              418 0.353
## 11 k336                         315              450              450 0.35 
## 12 f401                         267              396              396 0.337
## 13 cd85                         257              382              382 0.336
## 14 bh09                         265              399              399 0.332