library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
load("/Users/bilgehan.kiran/Documents/BilgeDersler/BigDataAnalytics/week11/atp_tennis_data_2017.RData")
champ_flags_df <- tourney_df %>% left_join(.,player_df, by =c("singles_winner_player_id"="player_id")) %>% count(flag_code, sort = TRUE)
champ_flags_df
## # A tibble: 21 x 2
## flag_code n
## <chr> <int>
## 1 ESP 11
## 2 USA 9
## 3 SUI 8
## 4 FRA 7
## 5 GER 7
## 6 BUL 4
## 7 ARG 2
## 8 BEL 2
## 9 BIH 2
## 10 CRO 2
## # … with 11 more rows
nonchamp_players <- player_df %>% select(player_id, flag_code) %>% anti_join(.,champ_flags_df)
## Joining, by = "flag_code"
nochamp_winn_player <- nonchamp_players %>% left_join(.,score_df, by=c("player_id"="winner_player_id"))
nochamp_winn_player %>% group_by(flag_code) %>% summarise(total_wins=sum(winner_games_won,na.rm = T)) %>% arrange(desc(total_wins))
## # A tibble: 93 x 2
## flag_code total_wins
## <chr> <dbl>
## 1 AUS 1989
## 2 CZE 1209
## 3 CAN 1190
## 4 SVK 889
## 5 BRA 873
## 6 POR 621
## 7 RSA 566
## 8 KAZ 495
## 9 KOR 438
## 10 GEO 377
## # … with 83 more rows
nochamp_match_id <- inner_join(nonchamp_players,score_df, by=c("player_id"="loser_player_id")) %>% select(player_id,match_id )
player_better_in_service <- inner_join(nochamp_match_id , stats_df) %>% select(player_id,winner_service_points_won, loser_service_points_won) %>% filter(winner_service_points_won < loser_service_points_won)
## Joining, by = "match_id"
player_info <- inner_join(player_better_in_service, player_df) %>% distinct(player_id,first_name, last_name, flag_code,birth_place)
## Joining, by = "player_id"
player_info
## # A tibble: 82 x 5
## player_id first_name last_name flag_code birth_place
## <chr> <chr> <chr> <chr> <chr>
## 1 a678 Kevin Anderson RSA Johannesburg, South Africa
## 2 a829 Radu Albot MDA Chisinau, Moldova
## 3 a853 Marcelo Arevalo ESA Sonsonate, El Salvador
## 4 b837 Marcos Baghdatis CYP Limassol, Cyprus
## 5 ba47 Tomas Berdych CZE Valasske Mezirici, CZE
## 6 bc68 Philip Bester CAN Sonthoffen, Germany
## 7 bd20 Thomaz Bellucci BRA Tiete, Brazil
## 8 bd80 Attila Balazs HUN Budapest, Hungary
## 9 bg23 Nikoloz Basilashvili GEO Tbilisi, Georgia
## 10 bh98 Matthew Barton AUS Sydney, Australia
## # … with 72 more rows
We can see the information about 82 players who had better service points in the matches they had.
champ_winner_players <- score_df %>% left_join(.,player_df, by=c("winner_player_id"="player_id"))
champ_winner_players %>% group_by(flag_code) %>% count() %>% arrange(n)
## # A tibble: 61 x 2
## # Groups: flag_code [61]
## flag_code n
## <chr> <int>
## 1 ESA 1
## 2 TUR 1
## 3 EST 2
## 4 MAR 2
## 5 NZL 3
## 6 SLO 3
## 7 ECU 4
## 8 BLR 5
## 9 IRL 5
## 10 EGY 7
## # … with 51 more rows
As we can see sadly TR is one of the countries who had least match winner.