Dataset
raw_df <- readRDS("H:\\My Drive\\R\\Exercise Files\\rp_201801_202005_df.rds")
str(raw_df)
## tibble [21,168 x 17] (S3: tbl_df/tbl/data.frame)
## $ dt : POSIXct[1:21168], format: "2020-05-31 23:00:00" "2020-05-31 22:00:00" ...
## $ wind_lic : num [1:21168] 1434 1577 1858 1933 2032 ...
## $ geothermal_lic : num [1:21168] 913 908 901 888 865 ...
## $ biogas_lic : num [1:21168] 75.8 75.6 75.4 76.8 77 ...
## $ canalType_lic : num [1:21168] 2585 2631 2585 2542 2594 ...
## $ riverType_lic : num [1:21168] 317 317 357 350 378 ...
## $ biomass_lic : num [1:21168] 262 253 247 250 248 ...
## $ landfillGas_lic: num [1:21168] 100 101 100 101 102 ...
## $ sun_lic : num [1:21168] 0 0 0 0 2.06 ...
## $ reservoir_lic : num [1:21168] 2306 2296 2279 2309 2792 ...
## $ others_lic : num [1:21168] 48.4 48.4 48.4 48.4 48.5 ...
## $ wind_ul : num [1:21168] 3.78 4.84 7.67 11.81 13.11 ...
## $ biogas_ul : num [1:21168] 16.9 16.9 16.9 16.8 16.3 ...
## $ canalType_ul : num [1:21168] 4.17 4.42 4.47 4.6 4.62 ...
## $ biomass_ul : num [1:21168] 15.5 15.4 16 15.2 15.6 ...
## $ sun_ul : num [1:21168] 0.0582 0.032 0.0335 1.3121 103.3267 ...
## $ others_ul : num [1:21168] 0.061 0.0395 0.4136 0.5508 0.7106 ...
Exercise 1
Find the mean and standard deviation of licensed geothermal productions in all years. (Tip: Use lubridate::year to get years from date data.)
raw_df %>%
select(datetime = dt, geothermal_lic) %>%
group_by(year = lubridate::year(datetime)) %>%
summarize(mean_geo = mean(geothermal_lic),
sd_geo = sd(geothermal_lic))
## # A tibble: 3 x 3
## year mean_geo sd_geo
## <dbl> <dbl> <dbl>
## 1 2018 681. 65.2
## 2 2019 799. 74.2
## 3 2020 935. 59.0
Exercise 2
Find the hourly average unlicensed solar (sun_ul) production levels for May 2020.
raw_df %>%
mutate(hour = lubridate::hour(dt), month = lubridate::month(dt), year = lubridate::year(dt)) %>%
filter(month == 5, year == 2020) %>%
group_by(hour) %>%
summarize(hourly_avg = mean(sun_ul)) %>%
select(hour, avg_prod = hourly_avg)
## # A tibble: 24 x 2
## hour avg_prod
## <int> <dbl>
## 1 0 0.166
## 2 1 0.375
## 3 2 0.701
## 4 3 0.910
## 5 4 1.26
## 6 5 22.7
## 7 6 305.
## 8 7 1156.
## 9 8 2316.
## 10 9 3330.
## # ... with 14 more rows
Exercise 3
Find the average daily percentage change of licensed biomass (biomass_lic) in 2019. (e.g. Suppose daily production is 50 in day 1 and 53 in day 2, then the change should be (53-50)/50 -1 = 0.06) (Tip: Use lubridate::as_date to convert date time to date. Use lag and lead functions to offset values.)
raw_df %>%
mutate(date = lubridate::as_date(dt),
year = lubridate::year(dt)) %>%
filter(year == 2019) %>%
group_by(date) %>%
summarize(sum_bio = sum(biomass_lic)) %>%
transmute(date, sum_bio,
daily_sum_next = lead(sum_bio,1), percent_daily = (daily_sum_next - sum_bio) / sum_bio) %>%
summarize(avg_change = mean(percent_daily, na.rm=TRUE))
## # A tibble: 1 x 1
## avg_change
## <dbl>
## 1 0.00246
Exercise 4
Find the yearly total production levels in TWh (Current values are in MWh. 1 GWh is 1000 MWh and 1 TWh is 1000 GWh). (Tip: In order to avoid a lengthy summation you can use tidyr::pivot_longer to get a long format.)
raw_df %>%
mutate(year=lubridate::year(dt)) %>%
select(-dt) %>%
pivot_longer(-year) %>%
group_by(year) %>%
summarise(total_production = sum(value)/10^6)
## # A tibble: 3 x 2
## year total_production
## <dbl> <dbl>
## 1 2018 62.6
## 2 2019 76.7
## 3 2020 37.3