library(readxl)
library(dplyr)
library(tidyr)
library(knitr)
2 Data Preparation
2.1 Main Data for our Project
Balance sheet, Income statement and other information for Real Sector in sectoral breakdown for the period between 2009 and 2021
# Read sector info data
<- read_excel("docs/Project_Data_TURKSTAT/Sector_info.xlsx")
sector_info
saveRDS(sector_info, "docs/Project_Data/sector_grouping.rds")
#Sectoral grouping is given in the table below.
kable(sector_info)
sector_code | sector_name | group |
---|---|---|
A | Agriculture | Agriculture |
B | Mining | Industry |
C | Manufacturing | Industry |
D | Electricity_Gas | Industry |
E | Water_Supply | Industry |
F | Construction | Construction |
G | Wholesale_Retail_Trade | Services |
H | Transportation | Services |
I | Accomodation_Food | Services |
J | Information | Services |
L | Real_Estate | Services |
M | Professional_Activities | Services |
N | Administrative_Activities | Services |
P | Education | Services |
Q | Health | Services |
R | Entertainment | Services |
S | Other_Services | Services |
# Read Excel Data for company info regarding number of employees for different company sizes.
<-
company_info read_excel("docs/Project_Data_TURKSTAT/Project_data_company_info.xlsx")
# Drop columns whose values are all NA
<- company_info %>%
company_info select_if(function(col) !all(is.na(col)))
# Drop rows whose values except first column are all NA
<- company_info %>%
company_info filter(!if_all(2:ncol(company_info), is.na))
# Fill first column with previous values
<- company_info %>%
company_info fill(1)
# Fill second column with previous values
<- company_info %>%
company_info fill(2)
# Fill NA in first row with previous values
<- company_info %>%
first_row filter_all(any_vars(is.na(.))) %>%
slice(1) %>%
as.data.frame() %>%
pivot_longer(-1) %>%
fill(value, .direction = "down") %>%
select(value) %>%
as.matrix()
1,2:ncol(company_info)] <- t(first_row)
company_info[
# Fill NA in second row with previous values
<- company_info %>%
second_row filter_all(any_vars(is.na(.))) %>%
slice(1) %>%
as.data.frame() %>%
pivot_longer(-1) %>%
fill(value, .direction = "down") %>%
select(value) %>%
as.matrix()
2,2:ncol(company_info)] <- t(second_row)
company_info[
# Drop cell values before blanks
<- company_info %>% as.matrix() %>% gsub(".*\r\n", "", .) %>% as_tibble()
company_info
#Drop columns with share (%)
<- !(company_info[3,] == "(%)") %>% as.vector()
cond <- company_info[,cond]
company_info
#Drop columns with entity types
<- !(company_info[2,] == "Entity Types") %>% as.vector()
cond_1 <- company_info[, cond_1]
company_info # Drop row with duplicate number values
<- company_info[-3,]
company_info
2,] <- as.list(company_info[2,] %>% gsub(pattern = " ", replacement = "_", company_info))
company_info[
<- paste(company_info[1,], company_info[2,], sep = "/")
header_new 1:2] <- c("sector", "size")
header_new[
names(company_info) <- header_new
<- company_info %>%
company_info filter(!(sector %in% c("", "Türkiye")))
<- company_info %>%
company_info pivot_longer(-c(sector, size)) %>%
separate(name, into = c("year", "number"), sep = "/")
$number <- company_info$number %>%
company_infogsub(pattern = ".companıes", replacement = "_companies")
%>%
company_info pivot_wider(names_from = number, values_from = value)
# A tibble: 1,105 × 5
sector size year Number_of_companies Number_…¹
<chr> <chr> <chr> <chr> <chr>
1 A-Agrıculture, forestry and fishing Total 2009 5558 32584
2 A-Agrıculture, forestry and fishing Total 2010 6731 36849
3 A-Agrıculture, forestry and fishing Total 2011 7198 41546
4 A-Agrıculture, forestry and fishing Total 2012 7312 44814
5 A-Agrıculture, forestry and fishing Total 2013 7682 47135
6 A-Agrıculture, forestry and fishing Total 2014 7944 50240
7 A-Agrıculture, forestry and fishing Total 2015 8478 54175
8 A-Agrıculture, forestry and fishing Total 2016 9226 57025
9 A-Agrıculture, forestry and fishing Total 2017 9690 58203
10 A-Agrıculture, forestry and fishing Total 2018 10194 63918
# … with 1,095 more rows, and abbreviated variable name ¹Number_of_employees
$sector_code <- company_info$sector %>% gsub("-.*", "", .)
company_info<- company_info %>%
company_info select(sector_code, everything()) %>%
select(-sector)
$value <- as.double(company_info$value)
company_info
<-
company_info left_join(company_info, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
saveRDS(company_info, "docs/Project_Data/sector_information.rds")
%>% head() company_info
# A tibble: 6 × 7
sector_code sector_name group size year number value
<chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 A Agriculture Agriculture Total 2009 Number_of_companies 5558
2 A Agriculture Agriculture Total 2009 Number_of_employees 32584
3 A Agriculture Agriculture Total 2010 Number_of_companies 6731
4 A Agriculture Agriculture Total 2010 Number_of_employees 36849
5 A Agriculture Agriculture Total 2011 Number_of_companies 7198
6 A Agriculture Agriculture Total 2011 Number_of_employees 41546
# Read Balance Sheet Data
<- read_excel("docs/Project_Data_TURKSTAT/Project_data_b_s.xlsx",
company_bs range = "A4:O4032",
col_names = c("sector_code", "accounts", 2009:2021))
<- company_bs %>%
company_bs fill(sector_code)
<-
company_bs !is.na(company_bs[,3]),]
company_bs[
<- company_bs[-1,]
company_bs
$sector_code <-
company_bs$sector_code %>%
company_bsgsub(".*\r\n", "", .) %>%
gsub("-.*", "", .)
$accounts <- company_bs$accounts %>% sub("^[^\r\n]+", "",.)
company_bs
$accounts <- company_bs$accounts %>% gsub("\r\n", "",.)
company_bs
<- company_bs %>%
company_bs pivot_longer(-c(sector_code,accounts), names_to = "year", values_to = "value")
<- company_bs %>% select(sector_code, everything())
company_bs <-
company_bs left_join(company_bs, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
$year <- as.integer(company_bs$year)
company_bs
saveRDS(company_bs, "docs/Project_Data/balance_sheet.rds")
%>% head() company_bs
# A tibble: 6 × 6
sector_code sector_name group accounts year value
<chr> <chr> <chr> <chr> <int> <dbl>
1 A Agriculture Agriculture I-Current assets 2009 3744238
2 A Agriculture Agriculture I-Current assets 2010 4946459
3 A Agriculture Agriculture I-Current assets 2011 6652950
4 A Agriculture Agriculture I-Current assets 2012 8155273
5 A Agriculture Agriculture I-Current assets 2013 10465125
6 A Agriculture Agriculture I-Current assets 2014 12143122
# Read Income Statement Data
<- read_excel("docs/Project_Data_TURKSTAT/Project_data_i_s.xlsx",
company_is range = "A4:O921",
col_names = c("sector_code", "accounts", 2009:2021))
# Fill sectors in first column with previous observations
<- company_is %>%
company_is fill(sector_code)
# Remove blank rows
<- company_is[!is.na(company_is[,3]),]
company_is
# Remove redundant first row
<- company_is[-1,]
company_is
# Remove linespaces from sectors and accounts columns so that english phrases remain
$sector_code <- company_is$sector_code %>% gsub("- .*", "", .)
company_is
# pivot data into longer (panel) format
<- company_is %>%
company_is pivot_longer(-c(sector_code, accounts), names_to = "year", values_to = "value")
<- company_is %>% select(sector_code, everything())
company_is <-
company_is left_join(company_is, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
$accounts <- company_is$accounts %>% gsub(".*\r\n", "", .)
company_is
$year <- company_is$year %>% as.integer()
company_is
saveRDS(company_is, "docs/Project_Data/income_statement.rds")
%>% head() company_is
# A tibble: 6 × 6
sector_code sector_name group accounts year value
<chr> <chr> <chr> <chr> <int> <dbl>
1 A Agriculture Agriculture A-Gross sales 2009 5268108
2 A Agriculture Agriculture A-Gross sales 2010 6507636
3 A Agriculture Agriculture A-Gross sales 2011 8440778
4 A Agriculture Agriculture A-Gross sales 2012 10095319
5 A Agriculture Agriculture A-Gross sales 2013 11998750
6 A Agriculture Agriculture A-Gross sales 2014 14979503
2.2 Other Relational Datasets:
We explore other datasets relevant to our main data that will be used in our exploratory data analysis in the following sections:
2.2.1 Employment data: Labor Cost Index and Gender Distribution
# Read Employment Data
<-
employment_male read_excel("docs/Project_Data_TURKSTAT/Project_data_employment.xlsx",
range = "Z6:AW14")
<-
employment_female read_excel("docs/Project_Data_TURKSTAT/Project_data_employment.xlsx",
range = "AY6:BW14")
# Create new column as gender
$gender <- c("male")
employment_male$gender <- c("female")
employment_female
# drop first row and columns whose values are all NA
<- employment_male[-1,] %>%
employment_male fill(1) %>%
select_if(function(col) !all(is.na(col)))
<- employment_female[-1,] %>%
employment_female fill(1) %>%
select_if(function(col) !all(is.na(col)))
# Create a quarterly time series starting 2021Q1 and ending 2022Q3
2] <- seq.Date(from = as.Date("2021-03-30"), to = as.Date("2022-09-30"), by = "quarter")
employment_male[,2] <- seq.Date(from = as.Date("2021-03-30"), to = as.Date("2022-09-30"), by = "quarter")
employment_female[,
# Rearrange tibble columns such that gender comes first
<- employment_male[,-1] %>%
employment_male select(gender, everything())
<- employment_female[,-1] %>%
employment_female select(gender, everything())
# Change column name of date column
colnames(employment_male)[2] <- "date"
colnames(employment_female)[2] <- "date"
<- rbind(employment_female, employment_male)
employment colnames(employment) <- colnames(employment) %>% gsub(".*[(]([^.]+)[)].*", "\\1", .)
<- employment %>% select(-3)
employment
$D <- employment$`D+E`/ 2
employment$E <- employment$`D+E`/ 2
employment
$S <- employment$`S+T+U`/ 3
employment$T <- employment$`S+T+U`/ 3
employment$U <- employment$`S+T+U`/ 3
employment
<- employment %>%
employment select(-c("D+E", "S+T+U")) %>%
pivot_longer(-c("gender", "date"), names_to = "sector_code", values_to = "number") %>%
left_join(., sector_info, by = "sector_code")
<- employment[!is.na(employment$sector_name),]
employment
saveRDS(employment, "docs/Project_Data/employment.rds")
employment
# A tibble: 238 × 6
gender date sector_code number sector_name group
<chr> <date> <chr> <dbl> <chr> <chr>
1 female 2021-03-30 A 1766 Agriculture Agriculture
2 female 2021-03-30 B 6 Mining Industry
3 female 2021-03-30 C 1399 Manufacturing Industry
4 female 2021-03-30 F 74 Construction Construction
5 female 2021-03-30 G 1004 Wholesale_Retail_Trade Services
6 female 2021-03-30 H 105 Transportation Services
7 female 2021-03-30 I 309 Accomodation_Food Services
8 female 2021-03-30 J 69 Information Services
9 female 2021-03-30 L 68 Real_Estate Services
10 female 2021-03-30 M 395 Professional_Activities Services
# … with 228 more rows
# Read labor cost Index data
<-
labor_cost_index read_excel("docs/Project_Data_TURKSTAT/Project_data_labor_cost_index.xlsx", range = "A4:P54")
colnames(labor_cost_index)[3] <- "date"
$date <- seq.Date(from = as.Date("2010-03-30"), to = as.Date("2022-06-30"), by = "quarter")
labor_cost_index<- labor_cost_index[,c(-1,-2)]
labor_cost_index colnames(labor_cost_index) <- colnames(labor_cost_index) %>% gsub("\\..*", "", .)
<-
labor_cost_index %>%
labor_cost_index pivot_longer(-c("date"), names_to = "sector_code", values_to = "value") %>%
left_join(.,sector_info, by="sector_code") %>%
select(sector_code, sector_name, group, everything())
<- labor_cost_index[!is.na(labor_cost_index$sector_name),]
labor_cost_index
saveRDS(labor_cost_index, "docs/Project_Data/labor_cost.rds")
%>% head() labor_cost_index
# A tibble: 6 × 5
sector_code sector_name group date value
<chr> <chr> <chr> <date> <dbl>
1 B Mining Industry 2010-03-30 59.4
2 C Manufacturing Industry 2010-03-30 57
3 D Electricity_Gas Industry 2010-03-30 70.4
4 E Water_Supply Industry 2010-03-30 54.2
5 F Construction Construction 2010-03-30 59.9
6 G Wholesale_Retail_Trade Services 2010-03-30 59
2.2.2 Economic Growth Data: Gross Domestic Product
# Read GDP Index Data
<- read_excel("docs/Project_Data_TURKSTAT/Project_data_gdp.xlsx",
gdp range = "A7:C318")
# Rename columns
colnames(gdp) <- c("sector", "year", "gdp_index")
# Filter NA rows
<- gdp %>%
gdp filter(!is.na(sector))
saveRDS(gdp, "docs/Project_Data/gdp.rds")
%>% head() gdp
# A tibble: 6 × 3
sector year gdp_index
<chr> <dbl> <dbl>
1 Total 1998 70.7
2 Total 1999 68.3
3 Total 2000 73.1
4 Total 2001 68.9
5 Total 2002 73.3
6 Total 2003 77.6
2.2.3 International Trade Data: Terms of Trade, Export and Import
# Read Terms of Trade Data
<-
tot read_excel("docs/Project_Data_TURKSTAT/Project_data_ToT.xlsx", range = "C4:BI157")
<- tot %>%
tot select(contains("ABD Doları"))
$date <- seq.Date(from = as.Date("2010-01-01"), to = as.Date("2022-09-01"), by = "month")
tot
<- tot[,26:30]
tot
<- tot %>% select(date, everything())
tot
<- tot[!is.na(tot[,3]),]
tot
<- tot %>% pivot_longer(-c("date"), names_to = "sector_code", values_to = "value")
tot
$sector_code <- tot$sector_code %>% gsub("[.].*", "",.)
tot
<- left_join(tot, sector_info, by = "sector_code") %>% select(sector_code, sector_name, group, everything())
tot
saveRDS(tot, "docs/Project_Data/terms_of_trade.rds")
%>% head() tot
# A tibble: 6 × 5
sector_code sector_name group date value
<chr> <chr> <chr> <date> <dbl>
1 A Agriculture Agriculture 2013-01-01 75.6
2 B Mining Industry 2013-01-01 66.8
3 C Manufacturing Industry 2013-01-01 99.5
4 E Water_Supply Industry 2013-01-01 93.7
5 A Agriculture Agriculture 2013-02-01 76.2
6 B Mining Industry 2013-02-01 65.5
# Read Exports Data
<- read_excel("docs/Project_Data_TURKSTAT/Project_data_exports.xlsx",
export range = "A5:D854")
<- export %>% filter(!if_all(, is.na))
export
1,1] <- as.character(2022)
export[
$Year <- export$Year %>% as.double()
export
<- export %>% fill(Year)
export
<- export[!is.na(export[,2]),]
export
colnames(export) <- c("year", "sector_code", "sector_name", "export")
$export <- gsub(pattern = "-", replacement = 0, export$export) %>% as.double()
export
<- export[export$sector_code %in% sector_info$sector_code,] %>% select(-sector_name)
export
<-
export left_join(export, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
export
# A tibble: 90 × 5
sector_code sector_name group year export
<chr> <chr> <chr> <dbl> <dbl>
1 A Agriculture Agriculture 2022 5434049.
2 B Mining Industry 2022 3508870.
3 C Manufacturing Industry 2022 178162765.
4 D Electricity_Gas Industry 2022 211875.
5 E Water_Supply Industry 2022 666611.
6 H Transportation Services 2022 15.0
7 J Information Services 2022 157891.
8 M Professional_Activities Services 2022 110.
9 R Entertainment Services 2022 10055.
10 A Agriculture Agriculture 2021 7156041.
# … with 80 more rows
# Read Imports Data
<- read_excel("docs/Project_Data_TURKSTAT/Project_data_imports.xlsx",
import range = "A5:D854")
<- import %>% filter(!if_all(, is.na))
import
1,1] <- as.character(2022)
import[
$Year <- import$Year %>% as.double()
import
<- import %>% fill(Year)
import
<- import[!is.na(import[,2]),]
import
colnames(import) <- c("year", "sector_code", "sector_name", "import")
$import <- gsub(pattern = "-", replacement = 0, import$import) %>% as.double()
import
<- import[import$sector_code %in% sector_info$sector_code,] %>% select(-sector_name)
import
<-
import left_join(import, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
import
# A tibble: 89 × 5
sector_code sector_name group year import
<chr> <chr> <chr> <dbl> <dbl>
1 A Agriculture Agriculture 2022 11311875.
2 B Mining Industry 2022 58364861.
3 C Manufacturing Industry 2022 191327777.
4 D Electricity_Gas Industry 2022 217690.
5 E Water_Supply Industry 2022 9883865.
6 H Transportation Services 2022 15.1
7 J Information Services 2022 128615.
8 M Professional_Activities Services 2022 1261.
9 R Entertainment Services 2022 13120.
10 A Agriculture Agriculture 2021 12082080.
# … with 79 more rows
#Merge exports and imports data
<- inner_join(export, import, by = c("sector_code", "sector_name", "year", "group"))
trade
saveRDS(trade, "docs/Project_Data/trade.rds")
%>% head() trade
# A tibble: 6 × 6
sector_code sector_name group year export import
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 A Agriculture Agriculture 2022 5434049. 11311875.
2 B Mining Industry 2022 3508870. 58364861.
3 C Manufacturing Industry 2022 178162765. 191327777.
4 D Electricity_Gas Industry 2022 211875. 217690.
5 E Water_Supply Industry 2022 666611. 9883865.
6 H Transportation Services 2022 15.0 15.1
2.2.4 Price Data: Producer Price Index
# Read PPI Data
<- read_excel("docs/Project_Data_TURKSTAT/Project_data_ppi_new.xlsx",
ppi_1 range = "A4:AH158")
<- ppi_1 %>%
ppi_1 fill(1) %>%
select_if(function(col) !all(is.na(col)))
colnames(ppi_1)[1:2] <- c("year", "month")
$date <- seq.Date(from = as.Date("2010-01-01"), to = as.Date("2022-10-01"), by = "month")
ppi_1<- ppi_1[,c(-1,-2)]
ppi_1 <- ppi_1 %>% select(date, everything())
ppi_1
<- ppi_1 %>% select(date, (ncol(.)-3):ncol(.))
ppi_1
colnames(ppi_1) <- colnames(ppi_1) %>% gsub("\\..*", "", .)
ppi_1
# A tibble: 154 × 5
date B C D E
<date> <dbl> <dbl> <dbl> <dbl>
1 2010-01-01 220. 161. 169. 149.
2 2010-02-01 221. 162. 187. 149.
3 2010-03-01 220. 165. 189. 151.
4 2010-04-01 221. 166. 189. 151.
5 2010-05-01 227. 166. 191. 154.
6 2010-06-01 222. 165. 191. 157.
7 2010-07-01 228. 165. 192. 157.
8 2010-08-01 229. 165. 202. 155.
9 2010-09-01 231. 166. 203. 155.
10 2010-10-01 228. 167. 202. 156.
# … with 144 more rows
# Read PPI Services Data
<- read_excel("docs/Project_Data_TURKSTAT/Project_data_ppi_services_new.xlsx",
ppi_2 range = "A4:I73")
<- ppi_2 %>%
ppi_2 fill(1) %>%
select_if(function(col) !all(is.na(col)))
colnames(ppi_2)[1:2] <- c("year", "month")
$date <- seq.Date(from = as.Date("2017-01-01"), to = as.Date("2022-09-01"), by = "month")
ppi_2<- ppi_2[,c(-1,-2)]
ppi_2 <- ppi_2 %>% select(date, everything())
ppi_2
<- ppi_2 %>% select(date, everything())
ppi_2 colnames(ppi_2) <- colnames(ppi_2) %>% gsub("\\..*", "", .)
ppi_2
# A tibble: 69 × 7
date H I J L M N
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2017-01-01 96.3 94.6 102. 95.6 96.6 95.4
2 2017-02-01 95.8 94.9 104. 95.8 97.8 95.7
3 2017-03-01 96.3 95.9 101. 95.8 98.3 95.6
4 2017-04-01 97.6 97.0 102. 96.7 99.9 97.6
5 2017-05-01 97.5 98.5 99.5 96.5 101. 98.7
6 2017-06-01 98.4 101. 99.4 98.1 99.7 101.
7 2017-07-01 101. 104. 97.0 101. 98.6 103.
8 2017-08-01 103. 105. 97.2 101. 99.3 105.
9 2017-09-01 102. 104. 98.8 100. 101. 104.
10 2017-10-01 102. 102. 99.7 105. 102. 101.
# … with 59 more rows
# Merge ppi and ppi services data
<- left_join(ppi_1, ppi_2, by = "date")
ppi <- ppi %>%
ppi pivot_longer(-c("date"), names_to = "sector_code", values_to = "index")
# Obtain conventional sector names and groupings
<-
ppi left_join(ppi, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
saveRDS(ppi, "docs/Project_Data/input_cost.rds")
%>% head() ppi
# A tibble: 6 × 5
sector_code sector_name group date index
<chr> <chr> <chr> <date> <dbl>
1 B Mining Industry 2010-01-01 220.
2 C Manufacturing Industry 2010-01-01 161.
3 D Electricity_Gas Industry 2010-01-01 169.
4 E Water_Supply Industry 2010-01-01 149.
5 H Transportation Services 2010-01-01 NA
6 I Accomodation_Food Services 2010-01-01 NA