Introduction to dplyr 1.0.0 : Exercises

Mine Kara

October 17, 2021

Dataset

raw_df <- readRDS("H:\\My Drive\\R\\Exercise Files\\rp_201801_202005_df.rds")

str(raw_df)
## tibble [21,168 x 17] (S3: tbl_df/tbl/data.frame)
##  $ dt             : POSIXct[1:21168], format: "2020-05-31 23:00:00" "2020-05-31 22:00:00" ...
##  $ wind_lic       : num [1:21168] 1434 1577 1858 1933 2032 ...
##  $ geothermal_lic : num [1:21168] 913 908 901 888 865 ...
##  $ biogas_lic     : num [1:21168] 75.8 75.6 75.4 76.8 77 ...
##  $ canalType_lic  : num [1:21168] 2585 2631 2585 2542 2594 ...
##  $ riverType_lic  : num [1:21168] 317 317 357 350 378 ...
##  $ biomass_lic    : num [1:21168] 262 253 247 250 248 ...
##  $ landfillGas_lic: num [1:21168] 100 101 100 101 102 ...
##  $ sun_lic        : num [1:21168] 0 0 0 0 2.06 ...
##  $ reservoir_lic  : num [1:21168] 2306 2296 2279 2309 2792 ...
##  $ others_lic     : num [1:21168] 48.4 48.4 48.4 48.4 48.5 ...
##  $ wind_ul        : num [1:21168] 3.78 4.84 7.67 11.81 13.11 ...
##  $ biogas_ul      : num [1:21168] 16.9 16.9 16.9 16.8 16.3 ...
##  $ canalType_ul   : num [1:21168] 4.17 4.42 4.47 4.6 4.62 ...
##  $ biomass_ul     : num [1:21168] 15.5 15.4 16 15.2 15.6 ...
##  $ sun_ul         : num [1:21168] 0.0582 0.032 0.0335 1.3121 103.3267 ...
##  $ others_ul      : num [1:21168] 0.061 0.0395 0.4136 0.5508 0.7106 ...

Exercise 1

Find the mean and standard deviation of licensed geothermal productions in all years. (Tip: Use lubridate::year to get years from date data.)

raw_df %>%
  select(datetime = dt, geothermal_lic) %>%
  group_by(year = lubridate::year(datetime)) %>%
  summarize(mean_geo = mean(geothermal_lic), 
            sd_geo = sd(geothermal_lic))
## # A tibble: 3 x 3
##    year mean_geo sd_geo
##   <dbl>    <dbl>  <dbl>
## 1  2018     681.   65.2
## 2  2019     799.   74.2
## 3  2020     935.   59.0

Exercise 2

Find the hourly average unlicensed solar (sun_ul) production levels for May 2020.

raw_df %>%
  mutate(hour = lubridate::hour(dt), month = lubridate::month(dt), year = lubridate::year(dt)) %>%
  filter(month == 5, year == 2020) %>%
  group_by(hour) %>%
  summarize(hourly_avg = mean(sun_ul)) %>%
  select(hour, avg_prod = hourly_avg)
## # A tibble: 24 x 2
##     hour avg_prod
##    <int>    <dbl>
##  1     0    0.166
##  2     1    0.375
##  3     2    0.701
##  4     3    0.910
##  5     4    1.26 
##  6     5   22.7  
##  7     6  305.   
##  8     7 1156.   
##  9     8 2316.   
## 10     9 3330.   
## # ... with 14 more rows

Exercise 3

Find the average daily percentage change of licensed biomass (biomass_lic) in 2019. (e.g. Suppose daily production is 50 in day 1 and 53 in day 2, then the change should be (53-50)/50 -1 = 0.06) (Tip: Use lubridate::as_date to convert date time to date. Use lag and lead functions to offset values.)

raw_df %>%
  mutate(date = lubridate::as_date(dt), 
         year = lubridate::year(dt)) %>%
  filter(year == 2019) %>%
  group_by(date) %>%
  summarize(sum_bio = sum(biomass_lic)) %>%
  transmute(date, sum_bio,
            daily_sum_next = lead(sum_bio,1), percent_daily = (daily_sum_next - sum_bio) / sum_bio) %>%
  summarize(avg_change = mean(percent_daily, na.rm=TRUE))
## # A tibble: 1 x 1
##   avg_change
##        <dbl>
## 1    0.00246

Exercise 4

Find the yearly total production levels in TWh (Current values are in MWh. 1 GWh is 1000 MWh and 1 TWh is 1000 GWh). (Tip: In order to avoid a lengthy summation you can use tidyr::pivot_longer to get a long format.)

raw_df %>%
  mutate(year=lubridate::year(dt)) %>%
  select(-dt) %>%
  pivot_longer(-year) %>%
  group_by(year) %>%
  summarise(total_production = sum(value)/10^6)
## # A tibble: 3 x 2
##    year total_production
##   <dbl>            <dbl>
## 1  2018             62.6
## 2  2019             76.7
## 3  2020             37.3