In Class 1

Author

Sezgi Ayhan

Published

January 10, 2024

Preparation

# Attach the core tidyverse packages (dplyr, readr, tibble, ggplot2, ...)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# lubridate and dplyr are already attached by tidyverse 2.0.0; the explicit
# calls below are kept and are harmless no-ops.
library(lubridate)
library(dplyr)

# Load the athlete events CSV from the working directory as a base data.frame
raw_data <- read.csv(file = "./athlete_events.csv")

Fundamentals

# Preview the raw data as a tibble (compact print: first rows + column types)
as_tibble(raw_data)
# A tibble: 271,116 × 15
      ID Name     Sex     Age Height Weight Team  NOC   Games  Year Season City 
   <int> <chr>    <chr> <int>  <int>  <dbl> <chr> <chr> <chr> <int> <chr>  <chr>
 1     1 A Dijia… M        24    180     80 China CHN   1992…  1992 Summer Barc…
 2     2 A Lamusi M        23    170     60 China CHN   2012…  2012 Summer Lond…
 3     3 Gunnar … M        24     NA     NA Denm… DEN   1920…  1920 Summer Antw…
 4     4 Edgar L… M        34     NA     NA Denm… DEN   1900…  1900 Summer Paris
 5     5 Christi… F        21    185     82 Neth… NED   1988…  1988 Winter Calg…
 6     5 Christi… F        21    185     82 Neth… NED   1988…  1988 Winter Calg…
 7     5 Christi… F        25    185     82 Neth… NED   1992…  1992 Winter Albe…
 8     5 Christi… F        25    185     82 Neth… NED   1992…  1992 Winter Albe…
 9     5 Christi… F        27    185     82 Neth… NED   1994…  1994 Winter Lill…
10     5 Christi… F        27    185     82 Neth… NED   1994…  1994 Winter Lill…
# ℹ 271,106 more rows
# ℹ 3 more variables: Sport <chr>, Event <chr>, Medal <chr>
# Games and host city for the first 10 rows of the data
raw_data %>%
  select(Games, City) %>%
  slice_head(n = 10)
         Games        City
1  1992 Summer   Barcelona
2  2012 Summer      London
3  1920 Summer   Antwerpen
4  1900 Summer       Paris
5  1988 Winter     Calgary
6  1988 Winter     Calgary
7  1992 Winter Albertville
8  1992 Winter Albertville
9  1994 Winter Lillehammer
10 1994 Winter Lillehammer
# List of basketball gold medalists for the Games held from 1972 to 1980
# (inclusive). Year is an integer column, so it is compared against numbers
# rather than strings to avoid an implicit character comparison.
gold_winner <- raw_data %>%
  filter(
    Year >= 1972,
    Year <= 1980,
    Sport == "Basketball",
    Medal == "Gold" # rows with a missing Medal are dropped by filter()
  )

# as_tibble() replaces the deprecated as.tibble()
as_tibble(gold_winner)
# A tibble: 60 × 15
      ID Name     Sex     Age Height Weight Team  NOC   Games  Year Season City 
   <int> <chr>    <chr> <int>  <int>  <dbl> <chr> <chr> <chr> <int> <chr>  <chr>
 1  5173 "Michel… M        20    190     77 Unit… USA   1976…  1976 Summer Mont…
 2  8384 "Olga F… F        21    168     67 Sovi… URS   1976…  1976 Summer Mont…
 3  8384 "Olga F… F        25    168     67 Sovi… URS   1980…  1980 Summer Mosk…
 4  9779 "Aleksa… M        20    200    100 Sovi… URS   1972…  1972 Summer Muni…
 5  9783 "Sergey… M        28    190     82 Sovi… URS   1972…  1972 Summer Muni…
 6 10964 "Vida B… F        23    190     91 Sovi… URS   1980…  1980 Summer Mosk…
 7 13017 "Aleksa… M        25    205    105 Sovi… URS   1972…  1972 Summer Muni…
 8 16109 "Willia… M        21    190     92 Unit… USA   1976…  1976 Summer Mont…
 9 18483 "Kennet… M        20    200    102 Unit… USA   1976…  1976 Summer Mont…
10 23251 "Kreimi… M        31    209     94 Yugo… YUG   1980…  1980 Summer Mosk…
# ℹ 50 more rows
# ℹ 3 more variables: Sport <chr>, Event <chr>, Medal <chr>
# Age summary per year, ordered from the lowest to the highest mean age.
# Missing ages are ignored in the mean and standard deviation.
# Note: ID holds n(), the number of rows in each year; an athlete entered in
# several events appears on several rows, so this is not a count of distinct
# athletes.
raw_data %>%
  group_by(Year) %>%
  summarise(
    mean_age = mean(Age, na.rm = TRUE),
    sd_age = sd(Age, na.rm = TRUE),
    ID = n()
  ) %>%
  arrange(mean_age)
# A tibble: 35 × 4
    Year mean_age sd_age    ID
   <int>    <dbl>  <dbl> <int>
 1  1896     23.6   4.69   380
 2  1980     23.7   5.08  8937
 3  1976     23.8   5.55 10502
 4  1984     23.9   5.25 11588
 5  1988     24.1   5.20 14676
 6  1968     24.2   5.76 10479
 7  1972     24.3   5.81 11959
 8  1992     24.3   5.17 16413
 9  1994     24.4   4.20  3160
10  1996     24.9   5.50 13780
# ℹ 25 more rows