R-Exercises related to ‘dplyr’ and ‘ggplot2’

Overview of dataset ‘raw_df’:

The data consists of hourly renewable electricity production of YEKDEM plants from different origins and license types.

print(raw_df,n=3)

## # A tibble: 21,168 x 17
##   dt                  wind_lic geothermal_lic biogas_lic canalType_lic
##   <dttm>                 <dbl>          <dbl>      <dbl>         <dbl>
## 1 2020-05-31 23:00:00    1434.           913.       75.8         2585.
## 2 2020-05-31 22:00:00    1577.           908.       75.6         2631.
## 3 2020-05-31 21:00:00    1858.           901.       75.4         2585.
## # ... with 21,165 more rows, and 12 more variables: riverType_lic <dbl>,
## #   biomass_lic <dbl>, landfillGas_lic <dbl>, sun_lic <dbl>,
## #   reservoir_lic <dbl>, others_lic <dbl>, wind_ul <dbl>, biogas_ul <dbl>,
## #   canalType_ul <dbl>, biomass_ul <dbl>, sun_ul <dbl>, others_ul <dbl>

raw_df %>% glimpse()

## Rows: 21,168
## Columns: 17
## $ dt              <dttm> 2020-05-31 23:00:00, 2020-05-31 22:00:00, 2020-05-...
## $ wind_lic        <dbl> 1433.8132, 1577.1419, 1857.5492, 1933.0142, 2031.78...
## $ geothermal_lic  <dbl> 912.7882, 907.9303, 900.5844, 888.4561, 864.5402, 8...
## $ biogas_lic      <dbl> 75.8047, 75.6163, 75.3906, 76.7873, 76.9707, 77.575...
## $ canalType_lic   <dbl> 2584.930, 2630.602, 2585.038, 2542.381, 2594.459, 2...
## $ riverType_lic   <dbl> 316.5538, 316.6800, 356.7637, 350.1544, 377.5312, 3...
## $ biomass_lic     <dbl> 262.4994, 253.0814, 246.9268, 249.9152, 248.2336, 2...
## $ landfillGas_lic <dbl> 100.3971, 101.1378, 100.4442, 100.7307, 102.2474, 1...
## $ sun_lic         <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 2.0594, 14.2800, 48...
## $ reservoir_lic   <dbl> 2306.303, 2296.045, 2279.266, 2308.918, 2792.313, 3...
## $ others_lic      <dbl> 48.3833, 48.4011, 48.4041, 48.4199, 48.4653, 48.548...
## $ wind_ul         <dbl> 3.7751, 4.8375, 7.6659, 11.8121, 13.1070, 13.1830, ...
## $ biogas_ul       <dbl> 16.9293, 16.9227, 16.9052, 16.7517, 16.2928, 16.598...
## $ canalType_ul    <dbl> 4.1749, 4.4221, 4.4658, 4.6020, 4.6195, 4.5146, 4.6...
## $ biomass_ul      <dbl> 15.4698, 15.3609, 16.0483, 15.2271, 15.5563, 15.500...
## $ sun_ul          <dbl> 0.0582, 0.0320, 0.0335, 1.3121, 103.3267, 555.5787,...
## $ others_ul       <dbl> 0.0610, 0.0395, 0.4136, 0.5508, 0.7106, 1.3775, 2.7...

In-class Exercises:

In-class Exercise 1)

Calculate the daily total production and plot the hourly change of production for ‘wind_lic’ and 2020-04-02.

return_daily_production <- function(the_date,the_column){
  raw_df %>%
    filter(as_date(dt) == the_date) %>%
    arrange(dt) %>%
    select(dt, the_column)
}
# The_date and the_column are already defined in the beginning under 'params'.
daily_prod_df <- return_daily_production(params$the_date, params$the_column)
knitr::kable(daily_prod_df)

dt	wind_lic
2020-04-02 00:00:00	1376.317
2020-04-02 01:00:00	1484.757
2020-04-02 02:00:00	1643.236
2020-04-02 03:00:00	1863.580
2020-04-02 04:00:00	2076.822
2020-04-02 05:00:00	2367.872
2020-04-02 06:00:00	2481.355
2020-04-02 07:00:00	2323.889
2020-04-02 08:00:00	2017.700
2020-04-02 09:00:00	1966.403
2020-04-02 10:00:00	1989.430
2020-04-02 11:00:00	2168.680
2020-04-02 12:00:00	2433.737
2020-04-02 13:00:00	2701.929
2020-04-02 14:00:00	2844.760
2020-04-02 15:00:00	3019.050
2020-04-02 16:00:00	3040.508
2020-04-02 17:00:00	3189.575
2020-04-02 18:00:00	3281.175
2020-04-02 19:00:00	3483.915
2020-04-02 20:00:00	3420.224
2020-04-02 21:00:00	3102.883
2020-04-02 22:00:00	2917.997
2020-04-02 23:00:00	2803.036

Total production is 59999.

Plotting the hourly change of production:

# Start with the canvas ggplot function
ggplot(daily_prod_df) + 
  # Define aesthetics and plot type
  geom_line(aes(x=lubridate::hour(dt), y=!!as.symbol(params$the_column))) + 
  # Get predefined theme: theme minimal
  theme_minimal() + 
  # Change labels and make title parametric
  labs(x="Hour", y="Production (in MWh)", 
       title = paste0("Production of '", params$the_column, "' for ", params$the_date), 
       subtitle = "Data Source: seffaflik.epias.com.tr") + 
  # Change x axis breaks so each hour is displayed individually
  scale_x_continuous(breaks = 0:23)

In-class Exercise 2) Monthly Hydro Production Analysis

Below you can see which months have the highest hydro production and changes in terms of percentage.

raw_df %>%
  # select date-time and hydro related columns
  select(dt, canalType_lic, riverType_lic, reservoir_lic) %>%
  # calculate total hydro production
  mutate(total_hydro = canalType_lic + riverType_lic + reservoir_lic) %>%
  # group by year-month using lubridate
  group_by(year = lubridate::year(dt), month = lubridate::month(dt)) %>%
  # get monthly totals
  summarise(total_canal = sum(canalType_lic), total_river = sum(riverType_lic), total_reservoir = sum(reservoir_lic), total = sum(total_hydro)) %>%
  # ungroup
  ungroup() %>%
  # sort by total hydro production descending to get the highest production months
  arrange(desc(total)) %>%
  # rearrange columns for convenience
  relocate(total,.after=month) %>%
  # offset production by one row above so we can calculate production changes
  mutate(lead_total = lead(total,1),.after=total) %>%
  # calculate percentage increases between ranks
  mutate(perc_change = total/lead_total - 1) %>%
  # return just relevant columns from year to lead_total and percentage change
  select(year:lead_total, perc_change)

## # A tibble: 29 x 5
##     year month    total lead_total perc_change
##    <dbl> <dbl>    <dbl>      <dbl>       <dbl>
##  1  2019     5 6489784.   5675201.     0.144  
##  2  2019     4 5675201.   5087625.     0.115  
##  3  2020     5 5087625.   4809446.     0.0578 
##  4  2020     4 4809446.   4662190.     0.0316 
##  5  2020     3 4662190.   4009868.     0.163  
##  6  2019     6 4009868.   4000380.     0.00237
##  7  2019     3 4000380.   3780435.     0.0582 
##  8  2018     3 3780435.   3668319.     0.0306 
##  9  2019     1 3668319.   3638288.     0.00825
## 10  2018     5 3638288.   3316165.     0.0971 
## 11  2018     4 3316165.   3223725.     0.0287 
## 12  2019     2 3223725.   3129490.     0.0301 
## 13  2018     6 3129490.   2814036.     0.112  
## 14  2020     2 2814036.   2696472.     0.0436 
## 15  2018    12 2696472.   2288993.     0.178  
## 16  2019     7 2288993.   2067763.     0.107  
## 17  2018     1 2067763.   1994672.     0.0366 
## 18  2018     2 1994672.   1870478.     0.0664 
## 19  2018     7 1870478.   1819788.     0.0279 
## 20  2019     8 1819788.   1693711.     0.0744 
## 21  2020     1 1693711.   1628169.     0.0403 
## 22  2018     8 1628169.   1619452.     0.00538
## 23  2019     9 1619452.   1502812.     0.0776 
## 24  2019    12 1502812.   1459946.     0.0294 
## 25  2019    10 1459946.   1137300.     0.284  
## 26  2018    11 1137300.   1077448.     0.0555 
## 27  2019    11 1077448.   1069941.     0.00702
## 28  2018     9 1069941.   1054169.     0.0150 
## 29  2018    10 1054169.        NA     NA

In-class Exercise 3) Scatterplot

Let’s create a scatterplot of licensed wind versus unlicensed solar production in May, 2020 and between hours 10 and 17.

plot_df1 <- raw_df %>% filter(dt >= "2020-05-01" & dt < "2020-06-01" & lubridate::hour(dt) >= 10 & lubridate::hour(dt) <= 17) %>% transmute(hour_of_day = lubridate::hour(dt),wind_lic,sun_ul)

We use ‘geom_point’ for scatter plot. In ggplot2 there are some ready to use themes. One of them is ‘theme_minimal’. The plot with some label changes using ‘labs’. Finally let’s touch the axes a little bit (thousands separator and angle and position adjustment of x-axis) and move the legend to top.

ggplot(plot_df1) + geom_point(aes(x = wind_lic, y = sun_ul, color=as.character(hour_of_day))) + theme_minimal() + labs(x="Licensed Wind Production (MWh)", y="Unlicensed Solar Production (MWh)", color="Hour of Day", title = "Licensed Wind vs Unlicensed Solar", subtitle = "Renewable production in May 2020, between 10:00-17:00 each day") + theme(legend.position="top",axis.text.x = element_text(angle=45,hjust=1,vjust=1)) + scale_y_continuous(labels=function(x) format(x, big.mark = ".", decimal.mark = ",", scientific = FALSE)) + scale_x_continuous(labels=function(x) format(x, big.mark = ".", decimal.mark = ",", scientific = FALSE))

We could have defined the aesthetics in the first object. ggplot2 is quite flexible about it.

In-class Exercise 4) Line plot

Let’s create a line plot of licensed wind versus unlicensed solar production in May, 2020.

plot_df2 <- raw_df %>% filter(dt >= "2020-05-01" & dt < "2020-06-01") %>% select(dt,wind_lic,sun_ul)

Time series can be beautifully represented with a line plot. We are going to use pivot_longer function to convert the data from wide to long format.

plot_df2 %>% pivot_longer(.,-dt) %>% ggplot(.,aes(x=dt,y=value,color=name)) + geom_line()

In-class Exercise 5) Bar & Pie chart

Let’s create a data set where May 2020’s production is used and production type is differentiated as Licensed and Unlicensed in a different column.

plot_df3 <- raw_df %>% filter(dt >= "2020-05-01" & dt < "2020-06-01") %>% summarise(across(-dt,sum)) %>% pivot_longer(.,everything()) %>% mutate(type = ifelse( grepl("_lic+$",name),"Licensed","Unlicensed"))

Bar chart:

We use geom_bar for bar chart and use fill instead of color for understandable reasons (use fill to see the change). We also ordered the columns according to their value with reorder.

ggplot(plot_df3,aes(x=reorder(name,-value),y=value)) + geom_bar(stat="identity",aes(fill=type)) + theme(axis.text.x = element_text(angle=45))

Pie chart:

Pie chart is actually a bar chart with an extra (coord_polar).

ggplot(plot_df3 %>% filter(type=="Licensed"),aes(x="",y=value,fill=name)) + geom_bar(stat="identity",width=1) + coord_polar("y")

Other Exercises:

Exercise 1)

Find the mean and standard deviation of licensed geothermal ‘geothermal_lic’ productions in all years. (Tip: Use lubridate::year to get years from date data.)

raw_df %>%
  # select date-time and licensed geothermal production
  select(dt, geothermal_lic) %>%
  # group by year using lubridate
  group_by(year = lubridate::year(dt)) %>%    
  # get mean and standard deviation
  summarise(mean_geo = mean(geothermal_lic), sd_geo = sd(geothermal_lic)) %>%
  # ungroup
  ungroup() %>%    
  # sort by year
  arrange(year) %>%
  # return just relevant columns    
  select(year, mean_geo, sd_geo)

## # A tibble: 3 x 3
##    year mean_geo sd_geo
##   <dbl>    <dbl>  <dbl>
## 1  2018     681.   65.2
## 2  2019     799.   74.2
## 3  2020     935.   59.0

Exercise 2)

Find the hourly average unlicensed solar ‘sun_ul’ production levels for May 2020.

raw_df %>%
  # filter the year and month
  filter(lubridate::year(dt) == 2020 & lubridate::month(dt) == 05) %>%
  # select date-time and unlicensed solar production
  select(dt, sun_ul) %>%
  # group by year using lubridate
  group_by(hour = lubridate::hour(dt)) %>%    
  # get the average of production
  summarise(avg_prod = round(mean(sun_ul),2)) %>%
  # ungroup
  ungroup() %>%
  # sort by hour
  arrange(hour) %>%
  # return just relevant columns    
  select(hour, avg_prod)

## # A tibble: 24 x 2
##     hour avg_prod
##    <int>    <dbl>
##  1     0     0.17
##  2     1     0.37
##  3     2     0.7 
##  4     3     0.91
##  5     4     1.26
##  6     5    22.7 
##  7     6   305.  
##  8     7  1156.  
##  9     8  2316.  
## 10     9  3330.  
## 11    10  3995.  
## 12    11  4298.  
## 13    12  4332.  
## 14    13  4230.  
## 15    14  3947.  
## 16    15  3357.  
## 17    16  2510.  
## 18    17  1476.  
## 19    18   540.  
## 20    19    80.3 
## 21    20     0.48
## 22    21     0.06
## 23    22     0.03
## 24    23     0.06

Exercise 3)

Find the average daily percentage change of licensed biomass ‘biomass_lic’ in 2019. (e.g. Suppose daily production is 50 in day 1 and 53 in day 2, then the change should be (53-50)/50 -1 = 0.06) (Tip: Use lubridate::as_date to convert date time to date. Use lag and lead functions to offset values.)

raw_df %>%
  # filter the year
  filter(lubridate::year(dt) == 2019) %>%
  # select date-time and biomass_lic production
  select(dt, biomass_lic) %>%
  # group by day using lubridate
  group_by(date = lubridate::as_date(dt)) %>%    
  # get daily total
  summarise(daily_total = sum(biomass_lic)) %>%
  # ungroup
  ungroup() %>%
  # sort by date
  arrange(date) %>%
  # offset production by one row above so we can calculate production changes
  mutate(lag_daily_total = lag(daily_total,1)) %>%
  # calculate percentage changes
  mutate(perc_change = (daily_total - lag_daily_total)/lag_daily_total - 1) %>%
  # get the average change
  summarise(average_change = mean(perc_change, na.rm = TRUE)) %>%
  # return just relevant column  
  select(average_change)

## # A tibble: 1 x 1
##   average_change
##            <dbl>
## 1         -0.998

Exercise 4)

Find the yearly total production levels in TWh (Current values are in MWh. 1 GWh is 1000 MWh and 1 TWh is 1000 GWh). (Tip: In order to avoid a lengthy summation you can use tidyr::pivot_longer to get a long format.)

raw_df %>%
  # select all columns
  select(dt:others_ul) %>%
  # group by year using lubridate
  group_by(year = lubridate::year(dt)) %>%    
  # get total_production by summing up
  summarise(total_production = sum(wind_lic+geothermal_lic+biogas_lic+canalType_lic+riverType_lic+biomass_lic+landfillGas_lic+sun_lic+reservoir_lic+others_lic+wind_ul+biogas_ul+canalType_ul+biomass_ul+sun_ul+others_ul)/(10^6)) %>%
  # ungroup
  ungroup() %>%    
  # sort by year
  arrange(year) %>%
  # return just relevant columns   
  select(year, total_production)

## # A tibble: 3 x 2
##    year total_production
##   <dbl>            <dbl>
## 1  2018             62.6
## 2  2019             76.7
## 3  2020             37.3