2  Data Preparation

2.1 Main Data for our Project

Balance sheet, Income statement and other information for Real Sector in sectoral breakdown for the period between 2009 and 2021

library(readxl)
library(dplyr)
library(tidyr)
library(knitr)
# Read sector info data
sector_info <- read_excel("docs/Project_Data_TURKSTAT/Sector_info.xlsx")

saveRDS(sector_info, "docs/Project_Data/sector_grouping.rds")

#Sectoral grouping is given in the table below.
kable(sector_info)
sector_code sector_name group
A Agriculture Agriculture
B Mining Industry
C Manufacturing Industry
D Electricity_Gas Industry
E Water_Supply Industry
F Construction Construction
G Wholesale_Retail_Trade Services
H Transportation Services
I Accomodation_Food Services
J Information Services
L Real_Estate Services
M Professional_Activities Services
N Administrative_Activities Services
P Education Services
Q Health Services
R Entertainment Services
S Other_Services Services
# Read Excel Data for company info regarding number of employees for different company sizes.
company_info <- 
  read_excel("docs/Project_Data_TURKSTAT/Project_data_company_info.xlsx")

# Drop columns whose values are all NA
company_info <- company_info %>% 
  select_if(function(col) !all(is.na(col)))

# Drop rows whose values except first column are all NA
company_info <- company_info %>% 
  filter(!if_all(2:ncol(company_info), is.na))

# Fill first column with previous values
company_info <- company_info %>% 
  fill(1) 

# Fill second column with previous values
company_info <- company_info %>% 
  fill(2)

# Fill NA in first row with previous values
first_row <- company_info %>% 
  filter_all(any_vars(is.na(.))) %>% 
  slice(1) %>% 
  as.data.frame() %>% 
  pivot_longer(-1) %>% 
  fill(value, .direction = "down") %>% 
  select(value) %>% 
  as.matrix()

company_info[1,2:ncol(company_info)] <- t(first_row)

# Fill NA in second row with previous values
second_row <- company_info %>% 
  filter_all(any_vars(is.na(.))) %>% 
  slice(1) %>% 
  as.data.frame() %>% 
  pivot_longer(-1) %>% 
  fill(value, .direction = "down") %>% 
  select(value) %>% 
  as.matrix()

company_info[2,2:ncol(company_info)] <- t(second_row)

# Drop cell values before blanks
company_info <- company_info %>% as.matrix() %>% gsub(".*\r\n", "", .) %>% as_tibble()

#Drop columns with share (%) 
cond <- !(company_info[3,] == "(%)") %>% as.vector()
company_info <- company_info[,cond]

#Drop columns with entity types
cond_1 <- !(company_info[2,] == "Entity Types") %>% as.vector()
company_info <- company_info[, cond_1]
# Drop row with duplicate number values
company_info <- company_info[-3,]


company_info[2,] <- as.list(company_info[2,] %>% gsub(pattern = " ", replacement = "_", company_info))


header_new <- paste(company_info[1,], company_info[2,], sep = "/")
header_new[1:2] <-  c("sector", "size")

names(company_info) <- header_new
company_info <-  company_info %>% 
  filter(!(sector %in% c("", "Türkiye")))

company_info <- company_info %>% 
  pivot_longer(-c(sector, size)) %>% 
  separate(name, into = c("year", "number"), sep = "/")

company_info$number <- company_info$number %>% 
  gsub(pattern = ".companıes", replacement = "_companies")

company_info %>% 
  pivot_wider(names_from = number, values_from = value)
# A tibble: 1,105 × 5
   sector                              size  year  Number_of_companies Number_…¹
   <chr>                               <chr> <chr> <chr>               <chr>    
 1 A-Agrıculture, forestry and fishing Total 2009  5558                32584    
 2 A-Agrıculture, forestry and fishing Total 2010  6731                36849    
 3 A-Agrıculture, forestry and fishing Total 2011  7198                41546    
 4 A-Agrıculture, forestry and fishing Total 2012  7312                44814    
 5 A-Agrıculture, forestry and fishing Total 2013  7682                47135    
 6 A-Agrıculture, forestry and fishing Total 2014  7944                50240    
 7 A-Agrıculture, forestry and fishing Total 2015  8478                54175    
 8 A-Agrıculture, forestry and fishing Total 2016  9226                57025    
 9 A-Agrıculture, forestry and fishing Total 2017  9690                58203    
10 A-Agrıculture, forestry and fishing Total 2018  10194               63918    
# … with 1,095 more rows, and abbreviated variable name ¹​Number_of_employees
company_info$sector_code <- company_info$sector %>% gsub("-.*", "", .)
company_info <-  company_info %>% 
  select(sector_code, everything()) %>% 
  select(-sector)

company_info$value <- as.double(company_info$value)

company_info <- 
  left_join(company_info, sector_info, by = "sector_code") %>% 
  select(sector_code, sector_name, group, everything())

saveRDS(company_info, "docs/Project_Data/sector_information.rds")

company_info %>% head()
# A tibble: 6 × 7
  sector_code sector_name group       size  year  number              value
  <chr>       <chr>       <chr>       <chr> <chr> <chr>               <dbl>
1 A           Agriculture Agriculture Total 2009  Number_of_companies  5558
2 A           Agriculture Agriculture Total 2009  Number_of_employees 32584
3 A           Agriculture Agriculture Total 2010  Number_of_companies  6731
4 A           Agriculture Agriculture Total 2010  Number_of_employees 36849
5 A           Agriculture Agriculture Total 2011  Number_of_companies  7198
6 A           Agriculture Agriculture Total 2011  Number_of_employees 41546
# Read Balance Sheet Data 
company_bs <- read_excel("docs/Project_Data_TURKSTAT/Project_data_b_s.xlsx", 
                         range = "A4:O4032",  
                         col_names = c("sector_code", "accounts", 2009:2021))


company_bs <- company_bs %>% 
  fill(sector_code)

company_bs <- 
  company_bs[!is.na(company_bs[,3]),]

company_bs <- company_bs[-1,]


company_bs$sector_code <- 
  company_bs$sector_code %>% 
  gsub(".*\r\n", "", .) %>% 
  gsub("-.*", "", .)


company_bs$accounts <- company_bs$accounts %>% sub("^[^\r\n]+", "",.)

company_bs$accounts <- company_bs$accounts %>% gsub("\r\n", "",.)

company_bs <- company_bs %>% 
  pivot_longer(-c(sector_code,accounts), names_to = "year", values_to = "value")


company_bs <- company_bs %>% select(sector_code, everything())
company_bs <- 
  left_join(company_bs, sector_info, by = "sector_code") %>% 
  select(sector_code, sector_name, group, everything())

company_bs$year <- as.integer(company_bs$year)

saveRDS(company_bs, "docs/Project_Data/balance_sheet.rds")

company_bs %>% head()
# A tibble: 6 × 6
  sector_code sector_name group       accounts          year    value
  <chr>       <chr>       <chr>       <chr>            <int>    <dbl>
1 A           Agriculture Agriculture I-Current assets  2009  3744238
2 A           Agriculture Agriculture I-Current assets  2010  4946459
3 A           Agriculture Agriculture I-Current assets  2011  6652950
4 A           Agriculture Agriculture I-Current assets  2012  8155273
5 A           Agriculture Agriculture I-Current assets  2013 10465125
6 A           Agriculture Agriculture I-Current assets  2014 12143122
# Read Income Statement Data

company_is <- read_excel("docs/Project_Data_TURKSTAT/Project_data_i_s.xlsx", 
                         range = "A4:O921",  
                         col_names = c("sector_code", "accounts", 2009:2021))

# Fill sectors in first column with previous observations
company_is <- company_is %>% 
  fill(sector_code)

# Remove blank rows
company_is <- company_is[!is.na(company_is[,3]),]

# Remove redundant first row
company_is <- company_is[-1,]

# Remove linespaces from sectors and accounts columns so that english phrases remain
company_is$sector_code <- company_is$sector_code %>% gsub("- .*", "", .)


# pivot data into longer (panel) format
company_is <- company_is %>% 
  pivot_longer(-c(sector_code, accounts), names_to = "year", values_to = "value")
company_is <- company_is %>% select(sector_code,  everything())
company_is <- 
  left_join(company_is, sector_info, by = "sector_code") %>% 
  select(sector_code, sector_name, group, everything())

company_is$accounts <- company_is$accounts %>% gsub(".*\r\n", "", .)

company_is$year <- company_is$year %>% as.integer()

saveRDS(company_is, "docs/Project_Data/income_statement.rds")

company_is %>% head()
# A tibble: 6 × 6
  sector_code sector_name group       accounts       year    value
  <chr>       <chr>       <chr>       <chr>         <int>    <dbl>
1 A           Agriculture Agriculture A-Gross sales  2009  5268108
2 A           Agriculture Agriculture A-Gross sales  2010  6507636
3 A           Agriculture Agriculture A-Gross sales  2011  8440778
4 A           Agriculture Agriculture A-Gross sales  2012 10095319
5 A           Agriculture Agriculture A-Gross sales  2013 11998750
6 A           Agriculture Agriculture A-Gross sales  2014 14979503

2.2 Other Relational Datasets:

We explore other datasets relevant to our main data that will be used in our exploratory data analysis in the following sections:

2.2.1 Employment data: Labor Cost Index and Gender Distribution

# Read Employment Data 
employment_male <- 
  read_excel("docs/Project_Data_TURKSTAT/Project_data_employment.xlsx", 
             range = "Z6:AW14")
employment_female <-
  read_excel("docs/Project_Data_TURKSTAT/Project_data_employment.xlsx", 
             range = "AY6:BW14")

# Create new column as gender
employment_male$gender <- c("male")
employment_female$gender <- c("female")

# drop first row and columns whose values are all NA
employment_male <-  employment_male[-1,] %>% 
  fill(1) %>% 
  select_if(function(col) !all(is.na(col)))

employment_female <-  employment_female[-1,] %>% 
  fill(1) %>% 
  select_if(function(col) !all(is.na(col)))

# Create a quarterly time series starting 2021Q1 and ending 2022Q3

employment_male[,2] <- seq.Date(from = as.Date("2021-03-30"), to = as.Date("2022-09-30"), by = "quarter")
employment_female[,2] <- seq.Date(from = as.Date("2021-03-30"), to = as.Date("2022-09-30"), by = "quarter")

# Rearrange tibble columns such that gender comes first
employment_male <- employment_male[,-1] %>% 
  select(gender, everything())

employment_female <- employment_female[,-1] %>% 
  select(gender, everything())

# Change column name of date column
colnames(employment_male)[2] <- "date"
colnames(employment_female)[2] <- "date"


employment <- rbind(employment_female, employment_male)
colnames(employment) <- colnames(employment) %>% gsub(".*[(]([^.]+)[)].*", "\\1", .)
employment <- employment %>% select(-3)

employment$D <- employment$`D+E`/ 2
employment$E <- employment$`D+E`/ 2

employment$S <- employment$`S+T+U`/ 3
employment$T <- employment$`S+T+U`/ 3
employment$U <- employment$`S+T+U`/ 3

employment <- employment %>% 
  select(-c("D+E", "S+T+U")) %>% 
  pivot_longer(-c("gender", "date"), names_to = "sector_code", values_to = "number") %>% 
  left_join(., sector_info, by = "sector_code")

employment <- employment[!is.na(employment$sector_name),]

saveRDS(employment, "docs/Project_Data/employment.rds")

employment
# A tibble: 238 × 6
   gender date       sector_code number sector_name             group       
   <chr>  <date>     <chr>        <dbl> <chr>                   <chr>       
 1 female 2021-03-30 A             1766 Agriculture             Agriculture 
 2 female 2021-03-30 B                6 Mining                  Industry    
 3 female 2021-03-30 C             1399 Manufacturing           Industry    
 4 female 2021-03-30 F               74 Construction            Construction
 5 female 2021-03-30 G             1004 Wholesale_Retail_Trade  Services    
 6 female 2021-03-30 H              105 Transportation          Services    
 7 female 2021-03-30 I              309 Accomodation_Food       Services    
 8 female 2021-03-30 J               69 Information             Services    
 9 female 2021-03-30 L               68 Real_Estate             Services    
10 female 2021-03-30 M              395 Professional_Activities Services    
# … with 228 more rows
# Read labor cost Index data
labor_cost_index <- 
  read_excel("docs/Project_Data_TURKSTAT/Project_data_labor_cost_index.xlsx", range = "A4:P54")

colnames(labor_cost_index)[3] <- "date"
labor_cost_index$date <-  seq.Date(from = as.Date("2010-03-30"), to = as.Date("2022-06-30"), by = "quarter")
labor_cost_index <- labor_cost_index[,c(-1,-2)]
colnames(labor_cost_index) <- colnames(labor_cost_index) %>% gsub("\\..*", "", .)

labor_cost_index <- 
  labor_cost_index %>% 
  pivot_longer(-c("date"), names_to = "sector_code", values_to = "value") %>% 
  left_join(.,sector_info, by="sector_code") %>% 
  select(sector_code, sector_name, group, everything())

labor_cost_index <- labor_cost_index[!is.na(labor_cost_index$sector_name),]

saveRDS(labor_cost_index, "docs/Project_Data/labor_cost.rds")

labor_cost_index %>% head()
# A tibble: 6 × 5
  sector_code sector_name            group        date       value
  <chr>       <chr>                  <chr>        <date>     <dbl>
1 B           Mining                 Industry     2010-03-30  59.4
2 C           Manufacturing          Industry     2010-03-30  57  
3 D           Electricity_Gas        Industry     2010-03-30  70.4
4 E           Water_Supply           Industry     2010-03-30  54.2
5 F           Construction           Construction 2010-03-30  59.9
6 G           Wholesale_Retail_Trade Services     2010-03-30  59  

2.2.2 Economic Growth Data: Gross Domestic Product

# Read GDP Index Data
gdp <- read_excel("docs/Project_Data_TURKSTAT/Project_data_gdp.xlsx", 
                  range = "A7:C318")

# Rename columns
colnames(gdp) <- c("sector", "year", "gdp_index")

# Filter NA rows
gdp <- gdp %>% 
  filter(!is.na(sector))
saveRDS(gdp, "docs/Project_Data/gdp.rds")

gdp %>% head()
# A tibble: 6 × 3
  sector  year gdp_index
  <chr>  <dbl>     <dbl>
1 Total   1998      70.7
2 Total   1999      68.3
3 Total   2000      73.1
4 Total   2001      68.9
5 Total   2002      73.3
6 Total   2003      77.6

2.2.3 International Trade Data: Terms of Trade, Export and Import

# Read Terms of Trade Data
tot <- 
  read_excel("docs/Project_Data_TURKSTAT/Project_data_ToT.xlsx", range = "C4:BI157")

tot <- tot %>% 
  select(contains("ABD Doları"))

tot$date <-  seq.Date(from = as.Date("2010-01-01"), to = as.Date("2022-09-01"), by = "month")

tot <- tot[,26:30]

tot <- tot %>% select(date, everything())

tot <- tot[!is.na(tot[,3]),]

tot <- tot %>% pivot_longer(-c("date"), names_to = "sector_code", values_to = "value")

tot$sector_code <- tot$sector_code %>% gsub("[.].*", "",.)

tot <- left_join(tot, sector_info, by = "sector_code") %>% select(sector_code, sector_name, group, everything())

saveRDS(tot, "docs/Project_Data/terms_of_trade.rds")

tot %>% head()
# A tibble: 6 × 5
  sector_code sector_name   group       date       value
  <chr>       <chr>         <chr>       <date>     <dbl>
1 A           Agriculture   Agriculture 2013-01-01  75.6
2 B           Mining        Industry    2013-01-01  66.8
3 C           Manufacturing Industry    2013-01-01  99.5
4 E           Water_Supply  Industry    2013-01-01  93.7
5 A           Agriculture   Agriculture 2013-02-01  76.2
6 B           Mining        Industry    2013-02-01  65.5
# Read Exports  Data
export <- read_excel("docs/Project_Data_TURKSTAT/Project_data_exports.xlsx", 
                     range = "A5:D854")

export <- export %>% filter(!if_all(, is.na))

export[1,1] <- as.character(2022) 

export$Year <- export$Year %>% as.double()

export <- export %>% fill(Year)

export <- export[!is.na(export[,2]),]

colnames(export) <- c("year", "sector_code", "sector_name", "export")

export$export <- gsub(pattern = "-", replacement = 0, export$export) %>% as.double()

export <-  export[export$sector_code %in% sector_info$sector_code,] %>% select(-sector_name)

export <- 
  left_join(export, sector_info, by = "sector_code") %>%
  select(sector_code, sector_name, group, everything())

export
# A tibble: 90 × 5
   sector_code sector_name             group        year      export
   <chr>       <chr>                   <chr>       <dbl>       <dbl>
 1 A           Agriculture             Agriculture  2022   5434049. 
 2 B           Mining                  Industry     2022   3508870. 
 3 C           Manufacturing           Industry     2022 178162765. 
 4 D           Electricity_Gas         Industry     2022    211875. 
 5 E           Water_Supply            Industry     2022    666611. 
 6 H           Transportation          Services     2022        15.0
 7 J           Information             Services     2022    157891. 
 8 M           Professional_Activities Services     2022       110. 
 9 R           Entertainment           Services     2022     10055. 
10 A           Agriculture             Agriculture  2021   7156041. 
# … with 80 more rows
# Read Imports Data
import <- read_excel("docs/Project_Data_TURKSTAT/Project_data_imports.xlsx", 
                     range = "A5:D854")

import <- import %>% filter(!if_all(, is.na))

import[1,1] <- as.character(2022) 

import$Year <- import$Year %>% as.double()

import <- import %>% fill(Year)

import <- import[!is.na(import[,2]),]

colnames(import) <- c("year", "sector_code", "sector_name", "import")

import$import <- gsub(pattern = "-", replacement = 0, import$import) %>% as.double()

import <-  import[import$sector_code %in% sector_info$sector_code,] %>% select(-sector_name)

import <- 
  left_join(import, sector_info, by = "sector_code") %>%
  select(sector_code, sector_name, group, everything())
import
# A tibble: 89 × 5
   sector_code sector_name             group        year      import
   <chr>       <chr>                   <chr>       <dbl>       <dbl>
 1 A           Agriculture             Agriculture  2022  11311875. 
 2 B           Mining                  Industry     2022  58364861. 
 3 C           Manufacturing           Industry     2022 191327777. 
 4 D           Electricity_Gas         Industry     2022    217690. 
 5 E           Water_Supply            Industry     2022   9883865. 
 6 H           Transportation          Services     2022        15.1
 7 J           Information             Services     2022    128615. 
 8 M           Professional_Activities Services     2022      1261. 
 9 R           Entertainment           Services     2022     13120. 
10 A           Agriculture             Agriculture  2021  12082080. 
# … with 79 more rows
#Merge exports and imports data
trade <- inner_join(export, import, by = c("sector_code", "sector_name", "year", "group"))

saveRDS(trade, "docs/Project_Data/trade.rds")
trade %>% head()
# A tibble: 6 × 6
  sector_code sector_name     group        year      export      import
  <chr>       <chr>           <chr>       <dbl>       <dbl>       <dbl>
1 A           Agriculture     Agriculture  2022   5434049.   11311875. 
2 B           Mining          Industry     2022   3508870.   58364861. 
3 C           Manufacturing   Industry     2022 178162765.  191327777. 
4 D           Electricity_Gas Industry     2022    211875.     217690. 
5 E           Water_Supply    Industry     2022    666611.    9883865. 
6 H           Transportation  Services     2022        15.0        15.1

2.2.4 Price Data: Producer Price Index

# Read PPI Data
ppi_1 <- read_excel("docs/Project_Data_TURKSTAT/Project_data_ppi_new.xlsx", 
                     range = "A4:AH158")

ppi_1 <- ppi_1 %>% 
  fill(1) %>% 
  select_if(function(col) !all(is.na(col)))

colnames(ppi_1)[1:2] <- c("year", "month")
ppi_1$date <- seq.Date(from = as.Date("2010-01-01"), to = as.Date("2022-10-01"), by = "month")
ppi_1 <- ppi_1[,c(-1,-2)]
ppi_1 <- ppi_1 %>% select(date, everything())

ppi_1 <- ppi_1 %>% select(date, (ncol(.)-3):ncol(.))

colnames(ppi_1) <-  colnames(ppi_1) %>% gsub("\\..*", "", .)
ppi_1
# A tibble: 154 × 5
   date           B     C     D     E
   <date>     <dbl> <dbl> <dbl> <dbl>
 1 2010-01-01  220.  161.  169.  149.
 2 2010-02-01  221.  162.  187.  149.
 3 2010-03-01  220.  165.  189.  151.
 4 2010-04-01  221.  166.  189.  151.
 5 2010-05-01  227.  166.  191.  154.
 6 2010-06-01  222.  165.  191.  157.
 7 2010-07-01  228.  165.  192.  157.
 8 2010-08-01  229.  165.  202.  155.
 9 2010-09-01  231.  166.  203.  155.
10 2010-10-01  228.  167.  202.  156.
# … with 144 more rows
# Read PPI Services Data
ppi_2 <- read_excel("docs/Project_Data_TURKSTAT/Project_data_ppi_services_new.xlsx", 
                    range = "A4:I73")
ppi_2 <- ppi_2 %>% 
  fill(1) %>% 
  select_if(function(col) !all(is.na(col)))

colnames(ppi_2)[1:2] <- c("year", "month")

ppi_2$date <- seq.Date(from = as.Date("2017-01-01"), to = as.Date("2022-09-01"), by = "month")
ppi_2 <- ppi_2[,c(-1,-2)]
ppi_2 <- ppi_2 %>% select(date, everything())

ppi_2 <- ppi_2 %>% select(date, everything())
colnames(ppi_2) <-  colnames(ppi_2) %>% gsub("\\..*", "", .)
ppi_2
# A tibble: 69 × 7
   date           H     I     J     L     M     N
   <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 2017-01-01  96.3  94.6 102.   95.6  96.6  95.4
 2 2017-02-01  95.8  94.9 104.   95.8  97.8  95.7
 3 2017-03-01  96.3  95.9 101.   95.8  98.3  95.6
 4 2017-04-01  97.6  97.0 102.   96.7  99.9  97.6
 5 2017-05-01  97.5  98.5  99.5  96.5 101.   98.7
 6 2017-06-01  98.4 101.   99.4  98.1  99.7 101. 
 7 2017-07-01 101.  104.   97.0 101.   98.6 103. 
 8 2017-08-01 103.  105.   97.2 101.   99.3 105. 
 9 2017-09-01 102.  104.   98.8 100.  101.  104. 
10 2017-10-01 102.  102.   99.7 105.  102.  101. 
# … with 59 more rows
# Merge ppi and ppi services data
ppi <- left_join(ppi_1, ppi_2, by = "date")
ppi <- ppi %>% 
  pivot_longer(-c("date"), names_to = "sector_code", values_to = "index")

# Obtain conventional sector names and groupings
ppi <- 
  left_join(ppi, sector_info, by = "sector_code") %>% 
  select(sector_code, sector_name, group, everything())

saveRDS(ppi, "docs/Project_Data/input_cost.rds")

ppi %>% head()
# A tibble: 6 × 5
  sector_code sector_name       group    date       index
  <chr>       <chr>             <chr>    <date>     <dbl>
1 B           Mining            Industry 2010-01-01  220.
2 C           Manufacturing     Industry 2010-01-01  161.
3 D           Electricity_Gas   Industry 2010-01-01  169.
4 E           Water_Supply      Industry 2010-01-01  149.
5 H           Transportation    Services 2010-01-01   NA 
6 I           Accomodation_Food Services 2010-01-01   NA