#REQUIRED LIBRARIES
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2 )
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## IMPORT DATA
# Build the list of BKM monthly sector-report URLs: one entry per
# (year, month) pair for 2010-2019, ordered year-major, month-minor.

# Last digit of the year (0-9) and month number (1-12) for all 120 pairs.
year_digit <- rep(0:9, each = 12)
month_no <- rep(1:12, times = 10)
x <- setNames(
  as.list(paste0(
    "https://bkm.com.tr/secilen-aya-ait-sektorel-gelisim/?filter_year=201",
    year_digit, "&filter_month=", month_no, "&List=Listele"
  )),
  # Entry names keep the literal "0" in front of the month number
  # (e.g. "url201001", "url2010012").
  paste0("url201", year_digit, "0", month_no)
)
# GETTING DATA FROM URL LIST AND BINDING ALL MONTHS INTO ONE DATA FRAME
# Each page is downloaded once; the fourth <table> on the page is taken as the
# sector table and every row is tagged with the page's period.
# The monthly tables are collected in a list and bound in a single call, so DF
# is not re-copied on every iteration and no placeholder frame (whose empty
# logical columns may not combine with the scraped columns) is needed.
monthly_tables <- lapply(x, function(url) {
  page <- read_html(url)
  month_tbl <- html_table(html_nodes(page, "table"), fill = TRUE)[[4]]

  # Year and month digits taken from the URL query string, e.g. "20101" for
  # January 2010 (everything before "year=" is dropped, then letters, "&" and
  # "=" are stripped).
  period <- str_remove_all(
    gsub(".*year=(.+)&filter_month=", "\\1", url),
    "[A-Z]|[a-z]|&|="
  )

  # Vectorised tag; also safe when the table has zero rows.
  month_tbl$date <- rep(period, nrow(month_tbl))
  month_tbl
})

# as.data.frame() keeps DF a plain data.frame regardless of the table class
# returned by html_table().
DF <- as.data.frame(bind_rows(unname(monthly_tables)))
# DATA WRANGLING & CONTROL OF FINAL FORM OF DATA
# The period tag starts with the four-digit year; the remaining one or two
# characters are the month number.
DF[["year"]] <- substr(DF[["date"]], start = 1, stop = 4)
DF[["month"]] <- substr(DF[["date"]], start = 5, stop = 6)

# Positional rename: five scraped columns, then date, year, month.
names(DF) <- c(
  "Sector",
  "Trans_Numb_CC", "Trans_Numb_DC",
  "Trans_Amount_CC", "Trans_Amount_DC",
  "date", "year", "month"
)

# Drop rows whose Sector equals one of three fixed non-sector labels.
DF <- filter(
  DF,
  Sector != "İşyeri Grubu",
  Sector != "TOPLAM",
  Sector != "Lütfen listeyi görebilmek için yukarıdan tarih seçiniz."
)
str(DF)
## 'data.frame':    2840 obs. of  8 variables:
##  $ Sector         : chr  "ARABA KİRALAMA" "ARAÇ KİRALAMA-SATIŞ/SERVİS/YEDEK PARÇA" "BENZİN VE YAKIT İSTASYONLARI" "ÇEŞİTLİ GIDA" ...
##  $ Trans_Numb_CC  : chr  "30.540" "1.865.922" "16.655.032" "8.901.339" ...
##  $ Trans_Numb_DC  : chr  "1.485" "82.655" "1.356.094" "1.272.039" ...
##  $ Trans_Amount_CC: chr  "13,30" "523,95" "1.990,10" "756,56" ...
##  $ Trans_Amount_DC: chr  "0,22" "11,77" "60,63" "25,52" ...
##  $ date           : chr  "20101" "20101" "20101" "20101" ...
##  $ year           : chr  "2010" "2010" "2010" "2010" ...
##  $ month          : chr  "1" "1" "1" "1" ...
# Convert the four measure columns from formatted text to numbers.
# Transaction counts: remove every "." and parse (e.g. "1.865.922" -> 1865922).
DF[c("Trans_Numb_CC", "Trans_Numb_DC")] <- lapply(
  DF[c("Trans_Numb_CC", "Trans_Numb_DC")],
  function(txt) as.numeric(gsub(".", "", txt, fixed = TRUE))
)

# Transaction amounts: remove every ".", turn "," into the decimal point, then
# parse (e.g. "1.990,10" -> 1990.1).
DF[c("Trans_Amount_CC", "Trans_Amount_DC")] <- lapply(
  DF[c("Trans_Amount_CC", "Trans_Amount_DC")],
  function(txt) {
    no_dots <- gsub(".", "", txt, fixed = TRUE)
    as.numeric(gsub(",", ".", no_dots, fixed = TRUE))
  }
)

str(DF)
## 'data.frame':    2840 obs. of  8 variables:
##  $ Sector         : chr  "ARABA KİRALAMA" "ARAÇ KİRALAMA-SATIŞ/SERVİS/YEDEK PARÇA" "BENZİN VE YAKIT İSTASYONLARI" "ÇEŞİTLİ GIDA" ...
##  $ Trans_Numb_CC  : num  30540 1865922 16655032 8901339 5282218 ...
##  $ Trans_Numb_DC  : num  1485 82655 1356094 1272039 91911 ...
##  $ Trans_Amount_CC: num  13.3 524 1990.1 756.6 688.4 ...
##  $ Trans_Amount_DC: num  0.22 11.77 60.63 25.52 18.05 ...
##  $ date           : chr  "20101" "20101" "20101" "20101" ...
##  $ year           : chr  "2010" "2010" "2010" "2010" ...
##  $ month          : chr  "1" "1" "1" "1" ...
# Differences between credit card and debit card transaction counts
# Per year: total credit-card count, total debit-card count and their
# difference (credit minus debit), restricted to the years 2010-2018.
trans_num <- DF %>%
  group_by(year) %>%
  summarise(
    number_cc = sum(Trans_Numb_CC),
    number_dc = sum(Trans_Numb_DC)
  ) %>%
  mutate(diff_in_numb = number_cc - number_dc) %>%
  # year is stored as text, so compare against text.
  filter(year %in% as.character(2010:2018))

# group = 1 joins the yearly values into one series, because year is a
# character (discrete) axis.
ggplot(trans_num, aes(x = year, y = diff_in_numb, group = 1)) +
  geom_smooth(color = "blue") +
  geom_point(color = "red") +
  labs(
    title = "Differences between credit_c and debit_c transaction",
    # Subtitle and caption now describe this chart: the figures are
    # transaction counts scraped from bkm.com.tr.
    subtitle = "Number of transactions, 2010-2018",
    caption = "(based on data from BKM)"
  ) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 35, vjust = 0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Transaction Amount Comparison of the top 5 sectors

# Total amount (credit + debit) per sector, ranked from largest to smallest.
# The percentage share is computed against the sum over ALL sectors before the
# ranking is cut down to the first five rows.
sector_share <- local({
  by_sector <- summarise(
    group_by(DF, Sector),
    total_amount = sum(Trans_Amount_CC + Trans_Amount_DC)
  )
  ranked <- arrange(by_sector, desc(total_amount))
  ranked$share_percentage <- round(
    ranked$total_amount / sum(ranked$total_amount) * 100,
    2
  )
  slice(ranked, 1:5)
})
sector_share
## # A tibble: 5 x 3
##   Sector                               total_amount share_percentage
##   <chr>                                       <dbl>            <dbl>
## 1 MARKET VE ALIŞVERİŞ MERKEZLERİ            854878.            17.6 
## 2 BENZİN VE YAKIT İSTASYONLARI              480773.             9.91
## 3 GİYİM VE AKSESUAR                         399549.             8.23
## 4 ÇEŞİTLİ GIDA                              322643.             6.65
## 5 ELEKTRİK-ELEKTRONİK EŞYA, BİLGİSAYAR      311508.             6.42
# Polar bar chart of the top-5 sector shares, each bar labelled with its
# percentage.
ggplot(sector_share, aes(x = Sector, y = share_percentage, fill = Sector)) +
  geom_bar(stat = "identity") +
  coord_polar() +
  geom_text(
    # x is inherited from the plot-level mapping; the empty `x=` was removed.
    aes(y = share_percentage, label = percent(share_percentage / 100)),
    # The original wrote `vjust = 1,8`: R reads that as vjust = 1 plus a stray
    # positional 8 handed to `reverse`. Only the stray argument is dropped, so
    # labels stay at the top of each bar.
    # NOTE(review): if 1.8 was intended, set vjust = 1.8 explicitly.
    position = position_stack(vjust = 1),
    size = 4,
    angle = 0
  )

## Credit card and debit card transaction amount per year
# Sums are taken over every sector and every year present in DF (no sector or
# year filter is applied here); total_amount is credit + debit.
total <- local({
  yearly <- summarise(
    group_by(DF, year),
    total_cc = sum(Trans_Amount_CC),
    total_dc = sum(Trans_Amount_DC)
  )
  yearly$total_amount <- yearly$total_cc + yearly$total_dc
  yearly
})
total
## # A tibble: 10 x 4
##    year  total_cc total_dc total_amount
##    <chr>    <dbl>    <dbl>        <dbl>
##  1 2010   213706.    8740.      222446.
##  2 2011   265337.   13335.      278672.
##  3 2012   331818.   17961.      349779.
##  4 2013   388514.   24788.      413302.
##  5 2014   433150.   33553.      466703.
##  6 2015   490839.   43060.      533900.
##  7 2016   534068.   54800.      588869.
##  8 2017   605382.   78897.      684279.
##  9 2018   725661.  114678.      840338.
## 10 2019   402133.   72979.      475113.
# One point per year showing the combined credit + debit amount; colour and
# size both encode the same total.
# geom_point() replaces geom_jitter(): jitter adds random noise to the
# positions, which misplaces a single summary value per year and changes the
# picture on every run.
ggplot(total) +
  geom_point(
    aes(x = year, y = total_amount, color = total_amount, size = total_amount)
  ) +
  labs(
    title = "Cards Usage",
    subtitle = "Total Amount",
    caption = "(based on data from BKM)",
    y = "Tot. Credit+Debit Card Use (in Mio)",
    x = "Year"
  ) +
  # Comma-formatted axis labels, matching the transaction-count plot.
  scale_y_continuous(labels = comma)