The data consists of hourly renewable electricity production of YEKDEM plants from different origins and license types.
print(raw_df,n=3)
## # A tibble: 21,168 x 17
## dt wind_lic geothermal_lic biogas_lic canalType_lic
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2020-05-31 23:00:00 1434. 913. 75.8 2585.
## 2 2020-05-31 22:00:00 1577. 908. 75.6 2631.
## 3 2020-05-31 21:00:00 1858. 901. 75.4 2585.
## # ... with 21,165 more rows, and 12 more variables: riverType_lic <dbl>,
## # biomass_lic <dbl>, landfillGas_lic <dbl>, sun_lic <dbl>,
## # reservoir_lic <dbl>, others_lic <dbl>, wind_ul <dbl>, biogas_ul <dbl>,
## # canalType_ul <dbl>, biomass_ul <dbl>, sun_ul <dbl>, others_ul <dbl>
raw_df %>% glimpse()
## Rows: 21,168
## Columns: 17
## $ dt <dttm> 2020-05-31 23:00:00, 2020-05-31 22:00:00, 2020-05-...
## $ wind_lic <dbl> 1433.8132, 1577.1419, 1857.5492, 1933.0142, 2031.78...
## $ geothermal_lic <dbl> 912.7882, 907.9303, 900.5844, 888.4561, 864.5402, 8...
## $ biogas_lic <dbl> 75.8047, 75.6163, 75.3906, 76.7873, 76.9707, 77.575...
## $ canalType_lic <dbl> 2584.930, 2630.602, 2585.038, 2542.381, 2594.459, 2...
## $ riverType_lic <dbl> 316.5538, 316.6800, 356.7637, 350.1544, 377.5312, 3...
## $ biomass_lic <dbl> 262.4994, 253.0814, 246.9268, 249.9152, 248.2336, 2...
## $ landfillGas_lic <dbl> 100.3971, 101.1378, 100.4442, 100.7307, 102.2474, 1...
## $ sun_lic <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 2.0594, 14.2800, 48...
## $ reservoir_lic <dbl> 2306.303, 2296.045, 2279.266, 2308.918, 2792.313, 3...
## $ others_lic <dbl> 48.3833, 48.4011, 48.4041, 48.4199, 48.4653, 48.548...
## $ wind_ul <dbl> 3.7751, 4.8375, 7.6659, 11.8121, 13.1070, 13.1830, ...
## $ biogas_ul <dbl> 16.9293, 16.9227, 16.9052, 16.7517, 16.2928, 16.598...
## $ canalType_ul <dbl> 4.1749, 4.4221, 4.4658, 4.6020, 4.6195, 4.5146, 4.6...
## $ biomass_ul <dbl> 15.4698, 15.3609, 16.0483, 15.2271, 15.5563, 15.500...
## $ sun_ul <dbl> 0.0582, 0.0320, 0.0335, 1.3121, 103.3267, 555.5787,...
## $ others_ul <dbl> 0.0610, 0.0395, 0.4136, 0.5508, 0.7106, 1.3775, 2.7...
Calculate the daily total production and plot the hourly change of production for ‘wind_lic’ and 2020-04-02.
return_daily_production <- function(the_date,the_column){
raw_df %>%
filter(as_date(dt) == the_date) %>%
arrange(dt) %>%
select(dt, the_column)
}
# The_date and the_column are already defined in the beginning under 'params'.
daily_prod_df <- return_daily_production(params$the_date, params$the_column)
knitr::kable(daily_prod_df)
dt | wind_lic |
---|---|
2020-04-02 00:00:00 | 1376.317 |
2020-04-02 01:00:00 | 1484.757 |
2020-04-02 02:00:00 | 1643.236 |
2020-04-02 03:00:00 | 1863.580 |
2020-04-02 04:00:00 | 2076.822 |
2020-04-02 05:00:00 | 2367.872 |
2020-04-02 06:00:00 | 2481.355 |
2020-04-02 07:00:00 | 2323.889 |
2020-04-02 08:00:00 | 2017.700 |
2020-04-02 09:00:00 | 1966.403 |
2020-04-02 10:00:00 | 1989.430 |
2020-04-02 11:00:00 | 2168.680 |
2020-04-02 12:00:00 | 2433.737 |
2020-04-02 13:00:00 | 2701.929 |
2020-04-02 14:00:00 | 2844.760 |
2020-04-02 15:00:00 | 3019.050 |
2020-04-02 16:00:00 | 3040.508 |
2020-04-02 17:00:00 | 3189.575 |
2020-04-02 18:00:00 | 3281.175 |
2020-04-02 19:00:00 | 3483.915 |
2020-04-02 20:00:00 | 3420.224 |
2020-04-02 21:00:00 | 3102.883 |
2020-04-02 22:00:00 | 2917.997 |
2020-04-02 23:00:00 | 2803.036 |
Plotting the hourly change of production:
# Start with the canvas ggplot function
ggplot(daily_prod_df) +
# Define aesthetics and plot type
geom_line(aes(x=lubridate::hour(dt), y=!!as.symbol(params$the_column))) +
# Get predefined theme: theme minimal
theme_minimal() +
# Change labels and make title parametric
labs(x="Hour", y="Production (in MWh)",
title = paste0("Production of '", params$the_column, "' for ", params$the_date),
subtitle = "Data Source: seffaflik.epias.com.tr") +
# Change x axis breaks so each hour is displayed individually
scale_x_continuous(breaks = 0:23)
Below you can see which months have the highest hydro production and changes in terms of percentage.
raw_df %>%
# select date-time and hydro related columns
select(dt, canalType_lic, riverType_lic, reservoir_lic) %>%
# calculate total hydro production
mutate(total_hydro = canalType_lic + riverType_lic + reservoir_lic) %>%
# group by year-month using lubridate
group_by(year = lubridate::year(dt), month = lubridate::month(dt)) %>%
# get monthly totals
summarise(total_canal = sum(canalType_lic), total_river = sum(riverType_lic), total_reservoir = sum(reservoir_lic), total = sum(total_hydro)) %>%
# ungroup
ungroup() %>%
# sort by total hydro production descending to get the highest production months
arrange(desc(total)) %>%
# rearrange columns for convenience
relocate(total,.after=month) %>%
# offset production by one row above so we can calculate production changes
mutate(lead_total = lead(total,1),.after=total) %>%
# calculate percentage increases between ranks
mutate(perc_change = total/lead_total - 1) %>%
# return just relevant columns from year to lead_total and percentage change
select(year:lead_total, perc_change)
## # A tibble: 29 x 5
## year month total lead_total perc_change
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019 5 6489784. 5675201. 0.144
## 2 2019 4 5675201. 5087625. 0.115
## 3 2020 5 5087625. 4809446. 0.0578
## 4 2020 4 4809446. 4662190. 0.0316
## 5 2020 3 4662190. 4009868. 0.163
## 6 2019 6 4009868. 4000380. 0.00237
## 7 2019 3 4000380. 3780435. 0.0582
## 8 2018 3 3780435. 3668319. 0.0306
## 9 2019 1 3668319. 3638288. 0.00825
## 10 2018 5 3638288. 3316165. 0.0971
## 11 2018 4 3316165. 3223725. 0.0287
## 12 2019 2 3223725. 3129490. 0.0301
## 13 2018 6 3129490. 2814036. 0.112
## 14 2020 2 2814036. 2696472. 0.0436
## 15 2018 12 2696472. 2288993. 0.178
## 16 2019 7 2288993. 2067763. 0.107
## 17 2018 1 2067763. 1994672. 0.0366
## 18 2018 2 1994672. 1870478. 0.0664
## 19 2018 7 1870478. 1819788. 0.0279
## 20 2019 8 1819788. 1693711. 0.0744
## 21 2020 1 1693711. 1628169. 0.0403
## 22 2018 8 1628169. 1619452. 0.00538
## 23 2019 9 1619452. 1502812. 0.0776
## 24 2019 12 1502812. 1459946. 0.0294
## 25 2019 10 1459946. 1137300. 0.284
## 26 2018 11 1137300. 1077448. 0.0555
## 27 2019 11 1077448. 1069941. 0.00702
## 28 2018 9 1069941. 1054169. 0.0150
## 29 2018 10 1054169. NA NA
Let’s create a scatterplot of licensed wind versus unlicensed solar production in May, 2020 and between hours 10 and 17.
plot_df1 <- raw_df %>% filter(dt >= "2020-05-01" & dt < "2020-06-01" & lubridate::hour(dt) >= 10 & lubridate::hour(dt) <= 17) %>% transmute(hour_of_day = lubridate::hour(dt),wind_lic,sun_ul)
We use ‘geom_point’ for scatter plot. In ggplot2 there are some ready to use themes. One of them is ‘theme_minimal’. The plot with some label changes using ‘labs’. Finally let’s touch the axes a little bit (thousands separator and angle and position adjustment of x-axis) and move the legend to top.
ggplot(plot_df1) + geom_point(aes(x = wind_lic, y = sun_ul, color=as.character(hour_of_day))) + theme_minimal() + labs(x="Licensed Wind Production (MWh)", y="Unlicensed Solar Production (MWh)", color="Hour of Day", title = "Licensed Wind vs Unlicensed Solar", subtitle = "Renewable production in May 2020, between 10:00-17:00 each day") + theme(legend.position="top",axis.text.x = element_text(angle=45,hjust=1,vjust=1)) + scale_y_continuous(labels=function(x) format(x, big.mark = ".", decimal.mark = ",", scientific = FALSE)) + scale_x_continuous(labels=function(x) format(x, big.mark = ".", decimal.mark = ",", scientific = FALSE))
We could have defined the aesthetics in the first object. ggplot2 is quite flexible about it.
Let’s create a line plot of licensed wind versus unlicensed solar production in May, 2020.
plot_df2 <- raw_df %>% filter(dt >= "2020-05-01" & dt < "2020-06-01") %>% select(dt,wind_lic,sun_ul)
Time series can be beautifully represented with a line plot. We are going to use pivot_longer function to convert the data from wide to long format.
plot_df2 %>% pivot_longer(.,-dt) %>% ggplot(.,aes(x=dt,y=value,color=name)) + geom_line()
Let’s create a data set where May 2020’s production is used and production type is differentiated as Licensed and Unlicensed in a different column.
plot_df3 <- raw_df %>% filter(dt >= "2020-05-01" & dt < "2020-06-01") %>% summarise(across(-dt,sum)) %>% pivot_longer(.,everything()) %>% mutate(type = ifelse( grepl("_lic+$",name),"Licensed","Unlicensed"))
We use geom_bar for bar chart and use fill instead of color for understandable reasons (use fill to see the change). We also ordered the columns according to their value with reorder.
ggplot(plot_df3,aes(x=reorder(name,-value),y=value)) + geom_bar(stat="identity",aes(fill=type)) + theme(axis.text.x = element_text(angle=45))
Pie chart is actually a bar chart with an extra (coord_polar).
ggplot(plot_df3 %>% filter(type=="Licensed"),aes(x="",y=value,fill=name)) + geom_bar(stat="identity",width=1) + coord_polar("y")
Find the mean and standard deviation of licensed geothermal ‘geothermal_lic’ productions in all years. (Tip: Use lubridate::year to get years from date data.)
raw_df %>%
# select date-time and licensed geothermal production
select(dt, geothermal_lic) %>%
# group by year using lubridate
group_by(year = lubridate::year(dt)) %>%
# get mean and standard deviation
summarise(mean_geo = mean(geothermal_lic), sd_geo = sd(geothermal_lic)) %>%
# ungroup
ungroup() %>%
# sort by year
arrange(year) %>%
# return just relevant columns
select(year, mean_geo, sd_geo)
## # A tibble: 3 x 3
## year mean_geo sd_geo
## <dbl> <dbl> <dbl>
## 1 2018 681. 65.2
## 2 2019 799. 74.2
## 3 2020 935. 59.0
Find the hourly average unlicensed solar ‘sun_ul’ production levels for May 2020.
raw_df %>%
# filter the year and month
filter(lubridate::year(dt) == 2020 & lubridate::month(dt) == 05) %>%
# select date-time and unlicensed solar production
select(dt, sun_ul) %>%
# group by year using lubridate
group_by(hour = lubridate::hour(dt)) %>%
# get the average of production
summarise(avg_prod = round(mean(sun_ul),2)) %>%
# ungroup
ungroup() %>%
# sort by hour
arrange(hour) %>%
# return just relevant columns
select(hour, avg_prod)
## # A tibble: 24 x 2
## hour avg_prod
## <int> <dbl>
## 1 0 0.17
## 2 1 0.37
## 3 2 0.7
## 4 3 0.91
## 5 4 1.26
## 6 5 22.7
## 7 6 305.
## 8 7 1156.
## 9 8 2316.
## 10 9 3330.
## 11 10 3995.
## 12 11 4298.
## 13 12 4332.
## 14 13 4230.
## 15 14 3947.
## 16 15 3357.
## 17 16 2510.
## 18 17 1476.
## 19 18 540.
## 20 19 80.3
## 21 20 0.48
## 22 21 0.06
## 23 22 0.03
## 24 23 0.06
Find the average daily percentage change of licensed biomass ‘biomass_lic’ in 2019. (e.g. Suppose daily production is 50 in day 1 and 53 in day 2, then the change should be (53-50)/50 -1 = 0.06) (Tip: Use lubridate::as_date to convert date time to date. Use lag and lead functions to offset values.)
raw_df %>%
# filter the year
filter(lubridate::year(dt) == 2019) %>%
# select date-time and biomass_lic production
select(dt, biomass_lic) %>%
# group by day using lubridate
group_by(date = lubridate::as_date(dt)) %>%
# get daily total
summarise(daily_total = sum(biomass_lic)) %>%
# ungroup
ungroup() %>%
# sort by date
arrange(date) %>%
# offset production by one row above so we can calculate production changes
mutate(lag_daily_total = lag(daily_total,1)) %>%
# calculate percentage changes
mutate(perc_change = (daily_total - lag_daily_total)/lag_daily_total - 1) %>%
# get the average change
summarise(average_change = mean(perc_change, na.rm = TRUE)) %>%
# return just relevant column
select(average_change)
## # A tibble: 1 x 1
## average_change
## <dbl>
## 1 -0.998
Find the yearly total production levels in TWh (Current values are in MWh. 1 GWh is 1000 MWh and 1 TWh is 1000 GWh). (Tip: In order to avoid a lengthy summation you can use tidyr::pivot_longer to get a long format.)
raw_df %>%
# select all columns
select(dt:others_ul) %>%
# group by year using lubridate
group_by(year = lubridate::year(dt)) %>%
# get total_production by summing up
summarise(total_production = sum(wind_lic+geothermal_lic+biogas_lic+canalType_lic+riverType_lic+biomass_lic+landfillGas_lic+sun_lic+reservoir_lic+others_lic+wind_ul+biogas_ul+canalType_ul+biomass_ul+sun_ul+others_ul)/(10^6)) %>%
# ungroup
ungroup() %>%
# sort by year
arrange(year) %>%
# return just relevant columns
select(year, total_production)
## # A tibble: 3 x 2
## year total_production
## <dbl> <dbl>
## 1 2018 62.6
## 2 2019 76.7
## 3 2020 37.3