raw_df <- readRDS("~/Desktop/Data Analytics/R/Week_2/rp_201801_202005_df.rds")
raw_df %>% glimpse()
## Rows: 21,168
## Columns: 17
## $ dt <dttm> 2020-05-31 23:00:00, 2020-05-31 22:00:00, 2020-05-31…
## $ wind_lic <dbl> 1433.8132, 1577.1419, 1857.5492, 1933.0142, 2031.7862…
## $ geothermal_lic <dbl> 912.7882, 907.9303, 900.5844, 888.4561, 864.5402, 847…
## $ biogas_lic <dbl> 75.8047, 75.6163, 75.3906, 76.7873, 76.9707, 77.5750,…
## $ canalType_lic <dbl> 2584.930, 2630.602, 2585.038, 2542.381, 2594.459, 262…
## $ riverType_lic <dbl> 316.5538, 316.6800, 356.7637, 350.1544, 377.5312, 379…
## $ biomass_lic <dbl> 262.4994, 253.0814, 246.9268, 249.9152, 248.2336, 246…
## $ landfillGas_lic <dbl> 100.3971, 101.1378, 100.4442, 100.7307, 102.2474, 102…
## $ sun_lic <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 2.0594, 14.2800, 48.0…
## $ reservoir_lic <dbl> 2306.303, 2296.045, 2279.266, 2308.918, 2792.313, 318…
## $ others_lic <dbl> 48.3833, 48.4011, 48.4041, 48.4199, 48.4653, 48.5485,…
## $ wind_ul <dbl> 3.7751, 4.8375, 7.6659, 11.8121, 13.1070, 13.1830, 10…
## $ biogas_ul <dbl> 16.9293, 16.9227, 16.9052, 16.7517, 16.2928, 16.5989,…
## $ canalType_ul <dbl> 4.1749, 4.4221, 4.4658, 4.6020, 4.6195, 4.5146, 4.661…
## $ biomass_ul <dbl> 15.4698, 15.3609, 16.0483, 15.2271, 15.5563, 15.5007,…
## $ sun_ul <dbl> 0.0582, 0.0320, 0.0335, 1.3121, 103.3267, 555.5787, 1…
## $ others_ul <dbl> 0.0610, 0.0395, 0.4136, 0.5508, 0.7106, 1.3775, 2.746…
Solve the following exercises. Outputs are given below, you are expected write code to match the outputs.
## # A tibble: 3 x 3
## year mean_geo sd_geo
## <dbl> <dbl> <dbl>
## 1 2018 681. 65.2
## 2 2019 799. 74.2
## 3 2020 935. 59.0
Let’s take the time and licensed geothermal production column.
raw_df %>% select(date_time=dt,geothermal_lic)
## # A tibble: 21,168 x 2
## date_time geothermal_lic
## <dttm> <dbl>
## 1 2020-05-31 23:00:00 913.
## 2 2020-05-31 22:00:00 908.
## 3 2020-05-31 21:00:00 901.
## 4 2020-05-31 20:00:00 888.
## 5 2020-05-31 19:00:00 865.
## 6 2020-05-31 18:00:00 848.
## 7 2020-05-31 17:00:00 848.
## 8 2020-05-31 16:00:00 843.
## 9 2020-05-31 15:00:00 845.
## 10 2020-05-31 14:00:00 853.
## # … with 21,158 more rows
Then, add a year column with lubridate function and pivot the data on year. Calculate the mean and sd with the summarise function.
raw_df %>%
group_by(year = lubridate::year(dt)) %>%
summarise(mean_geo = round(mean(geothermal_lic)), sd_geo = round(sd(geothermal_lic),1))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 3
## year mean_geo sd_geo
## <dbl> <dbl> <dbl>
## 1 2018 681 65.2
## 2 2019 799 74.2
## 3 2020 935 59
## # A tibble: 24 x 2
## hour avg_prod
## <int> <dbl>
## 1 0 0.17
## 2 1 0.37
## 3 2 0.7
## # … with 21 more rows
raw_df %>%
mutate(hour = lubridate::hour(dt), year=lubridate::year(dt), month=lubridate::month(dt)) %>%
filter(year== 2020, month== 05) %>%
group_by(hour) %>%
summarise(avg_prod = round(mean(sun_ul),2)) %>%
select(hour, avg_prod)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 24 x 2
## hour avg_prod
## <int> <dbl>
## 1 0 0.17
## 2 1 0.37
## 3 2 0.7
## 4 3 0.91
## 5 4 1.26
## 6 5 22.7
## 7 6 305.
## 8 7 1156.
## 9 8 2316.
## 10 9 3330.
## # … with 14 more rows
## # A tibble: 1 x 1
## average_change
## <dbl>
## 1 0.00282
raw_df %>%
mutate(date = lubridate::date(dt), year=lubridate::year(dt)) %>%
filter(year == 2019) %>%
select(date,year,biomass_lic) %>%
arrange(date) %>%
group_by(date) %>%
summarise(sum_bl=sum(biomass_lic)) %>%
transmute(date, sum_bl, biomass_lic_next = lag(sum_bl,1)) %>%
summarise(sum_bl, biomass_lic_next, p = ((sum_bl-biomass_lic_next)/biomass_lic_next) -1) %>%
summarise(average_change = mean(c(NA, diff(p)),na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 1
## average_change
## <dbl>
## 1 0.000176
## # A tibble: 3 x 2
## year total_production
## <dbl> <dbl>
## 1 2018 62.6
## 2 2019 76.7
## 3 2020 37.3
raw_df %>%
group_by(year=lubridate::year(dt)) %>%
rowwise() %>%
summarise(total_prod = sum(c_across(where(is.numeric)))) %>%
summarise(tp = sum(total_prod)/1000000) %>%
transmute(year, total_production = round(tp,digits=1))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## year total_production
## <dbl> <dbl>
## 1 2018 62.6
## 2 2019 76.7
## 3 2020 37.3