library(readxl)
library(dplyr)
library(tidyr)
library(knitr)2 Data Preparation
2.1 Main Data for our Project
Balance sheet, Income statement and other information for Real Sector in sectoral breakdown for the period between 2009 and 2021
# Read sector info data
sector_info <- read_excel("docs/Project_Data_TURKSTAT/Sector_info.xlsx")
saveRDS(sector_info, "docs/Project_Data/sector_grouping.rds")
#Sectoral grouping is given in the table below.
kable(sector_info)| sector_code | sector_name | group |
|---|---|---|
| A | Agriculture | Agriculture |
| B | Mining | Industry |
| C | Manufacturing | Industry |
| D | Electricity_Gas | Industry |
| E | Water_Supply | Industry |
| F | Construction | Construction |
| G | Wholesale_Retail_Trade | Services |
| H | Transportation | Services |
| I | Accomodation_Food | Services |
| J | Information | Services |
| L | Real_Estate | Services |
| M | Professional_Activities | Services |
| N | Administrative_Activities | Services |
| P | Education | Services |
| Q | Health | Services |
| R | Entertainment | Services |
| S | Other_Services | Services |
# Read Excel Data for company info regarding number of employees for different company sizes.
company_info <-
read_excel("docs/Project_Data_TURKSTAT/Project_data_company_info.xlsx")
# Drop columns whose values are all NA
company_info <- company_info %>%
select_if(function(col) !all(is.na(col)))
# Drop rows whose values except first column are all NA
company_info <- company_info %>%
filter(!if_all(2:ncol(company_info), is.na))
# Fill first column with previous values
company_info <- company_info %>%
fill(1)
# Fill second column with previous values
company_info <- company_info %>%
fill(2)
# Fill NA in first row with previous values
first_row <- company_info %>%
filter_all(any_vars(is.na(.))) %>%
slice(1) %>%
as.data.frame() %>%
pivot_longer(-1) %>%
fill(value, .direction = "down") %>%
select(value) %>%
as.matrix()
company_info[1,2:ncol(company_info)] <- t(first_row)
# Fill NA in second row with previous values
second_row <- company_info %>%
filter_all(any_vars(is.na(.))) %>%
slice(1) %>%
as.data.frame() %>%
pivot_longer(-1) %>%
fill(value, .direction = "down") %>%
select(value) %>%
as.matrix()
company_info[2,2:ncol(company_info)] <- t(second_row)
# Drop cell values before blanks
company_info <- company_info %>% as.matrix() %>% gsub(".*\r\n", "", .) %>% as_tibble()
#Drop columns with share (%)
cond <- !(company_info[3,] == "(%)") %>% as.vector()
company_info <- company_info[,cond]
#Drop columns with entity types
cond_1 <- !(company_info[2,] == "Entity Types") %>% as.vector()
company_info <- company_info[, cond_1]
# Drop row with duplicate number values
company_info <- company_info[-3,]
company_info[2,] <- as.list(company_info[2,] %>% gsub(pattern = " ", replacement = "_", company_info))
header_new <- paste(company_info[1,], company_info[2,], sep = "/")
header_new[1:2] <- c("sector", "size")
names(company_info) <- header_new
company_info <- company_info %>%
filter(!(sector %in% c("", "Türkiye")))
company_info <- company_info %>%
pivot_longer(-c(sector, size)) %>%
separate(name, into = c("year", "number"), sep = "/")
company_info$number <- company_info$number %>%
gsub(pattern = ".companıes", replacement = "_companies")
company_info %>%
pivot_wider(names_from = number, values_from = value)# A tibble: 1,105 × 5
sector size year Number_of_companies Number_…¹
<chr> <chr> <chr> <chr> <chr>
1 A-Agrıculture, forestry and fishing Total 2009 5558 32584
2 A-Agrıculture, forestry and fishing Total 2010 6731 36849
3 A-Agrıculture, forestry and fishing Total 2011 7198 41546
4 A-Agrıculture, forestry and fishing Total 2012 7312 44814
5 A-Agrıculture, forestry and fishing Total 2013 7682 47135
6 A-Agrıculture, forestry and fishing Total 2014 7944 50240
7 A-Agrıculture, forestry and fishing Total 2015 8478 54175
8 A-Agrıculture, forestry and fishing Total 2016 9226 57025
9 A-Agrıculture, forestry and fishing Total 2017 9690 58203
10 A-Agrıculture, forestry and fishing Total 2018 10194 63918
# … with 1,095 more rows, and abbreviated variable name ¹Number_of_employees
company_info$sector_code <- company_info$sector %>% gsub("-.*", "", .)
company_info <- company_info %>%
select(sector_code, everything()) %>%
select(-sector)
company_info$value <- as.double(company_info$value)
company_info <-
left_join(company_info, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
saveRDS(company_info, "docs/Project_Data/sector_information.rds")
company_info %>% head()# A tibble: 6 × 7
sector_code sector_name group size year number value
<chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 A Agriculture Agriculture Total 2009 Number_of_companies 5558
2 A Agriculture Agriculture Total 2009 Number_of_employees 32584
3 A Agriculture Agriculture Total 2010 Number_of_companies 6731
4 A Agriculture Agriculture Total 2010 Number_of_employees 36849
5 A Agriculture Agriculture Total 2011 Number_of_companies 7198
6 A Agriculture Agriculture Total 2011 Number_of_employees 41546
# Read Balance Sheet Data
company_bs <- read_excel("docs/Project_Data_TURKSTAT/Project_data_b_s.xlsx",
range = "A4:O4032",
col_names = c("sector_code", "accounts", 2009:2021))
company_bs <- company_bs %>%
fill(sector_code)
company_bs <-
company_bs[!is.na(company_bs[,3]),]
company_bs <- company_bs[-1,]
company_bs$sector_code <-
company_bs$sector_code %>%
gsub(".*\r\n", "", .) %>%
gsub("-.*", "", .)
company_bs$accounts <- company_bs$accounts %>% sub("^[^\r\n]+", "",.)
company_bs$accounts <- company_bs$accounts %>% gsub("\r\n", "",.)
company_bs <- company_bs %>%
pivot_longer(-c(sector_code,accounts), names_to = "year", values_to = "value")
company_bs <- company_bs %>% select(sector_code, everything())
company_bs <-
left_join(company_bs, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
company_bs$year <- as.integer(company_bs$year)
saveRDS(company_bs, "docs/Project_Data/balance_sheet.rds")
company_bs %>% head()# A tibble: 6 × 6
sector_code sector_name group accounts year value
<chr> <chr> <chr> <chr> <int> <dbl>
1 A Agriculture Agriculture I-Current assets 2009 3744238
2 A Agriculture Agriculture I-Current assets 2010 4946459
3 A Agriculture Agriculture I-Current assets 2011 6652950
4 A Agriculture Agriculture I-Current assets 2012 8155273
5 A Agriculture Agriculture I-Current assets 2013 10465125
6 A Agriculture Agriculture I-Current assets 2014 12143122
# Read Income Statement Data
company_is <- read_excel("docs/Project_Data_TURKSTAT/Project_data_i_s.xlsx",
range = "A4:O921",
col_names = c("sector_code", "accounts", 2009:2021))
# Fill sectors in first column with previous observations
company_is <- company_is %>%
fill(sector_code)
# Remove blank rows
company_is <- company_is[!is.na(company_is[,3]),]
# Remove redundant first row
company_is <- company_is[-1,]
# Remove linespaces from sectors and accounts columns so that english phrases remain
company_is$sector_code <- company_is$sector_code %>% gsub("- .*", "", .)
# pivot data into longer (panel) format
company_is <- company_is %>%
pivot_longer(-c(sector_code, accounts), names_to = "year", values_to = "value")
company_is <- company_is %>% select(sector_code, everything())
company_is <-
left_join(company_is, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
company_is$accounts <- company_is$accounts %>% gsub(".*\r\n", "", .)
company_is$year <- company_is$year %>% as.integer()
saveRDS(company_is, "docs/Project_Data/income_statement.rds")
company_is %>% head()# A tibble: 6 × 6
sector_code sector_name group accounts year value
<chr> <chr> <chr> <chr> <int> <dbl>
1 A Agriculture Agriculture A-Gross sales 2009 5268108
2 A Agriculture Agriculture A-Gross sales 2010 6507636
3 A Agriculture Agriculture A-Gross sales 2011 8440778
4 A Agriculture Agriculture A-Gross sales 2012 10095319
5 A Agriculture Agriculture A-Gross sales 2013 11998750
6 A Agriculture Agriculture A-Gross sales 2014 14979503
2.2 Other Relational Datasets:
We explore other datasets relevant to our main data that will be used in our exploratory data analysis in the following sections:
2.2.1 Employment data: Labor Cost Index and Gender Distribution
# Read Employment Data
employment_male <-
read_excel("docs/Project_Data_TURKSTAT/Project_data_employment.xlsx",
range = "Z6:AW14")
employment_female <-
read_excel("docs/Project_Data_TURKSTAT/Project_data_employment.xlsx",
range = "AY6:BW14")
# Create new column as gender
employment_male$gender <- c("male")
employment_female$gender <- c("female")
# drop first row and columns whose values are all NA
employment_male <- employment_male[-1,] %>%
fill(1) %>%
select_if(function(col) !all(is.na(col)))
employment_female <- employment_female[-1,] %>%
fill(1) %>%
select_if(function(col) !all(is.na(col)))
# Create a quarterly time series starting 2021Q1 and ending 2022Q3
employment_male[,2] <- seq.Date(from = as.Date("2021-03-30"), to = as.Date("2022-09-30"), by = "quarter")
employment_female[,2] <- seq.Date(from = as.Date("2021-03-30"), to = as.Date("2022-09-30"), by = "quarter")
# Rearrange tibble columns such that gender comes first
employment_male <- employment_male[,-1] %>%
select(gender, everything())
employment_female <- employment_female[,-1] %>%
select(gender, everything())
# Change column name of date column
colnames(employment_male)[2] <- "date"
colnames(employment_female)[2] <- "date"
employment <- rbind(employment_female, employment_male)
colnames(employment) <- colnames(employment) %>% gsub(".*[(]([^.]+)[)].*", "\\1", .)
employment <- employment %>% select(-3)
employment$D <- employment$`D+E`/ 2
employment$E <- employment$`D+E`/ 2
employment$S <- employment$`S+T+U`/ 3
employment$T <- employment$`S+T+U`/ 3
employment$U <- employment$`S+T+U`/ 3
employment <- employment %>%
select(-c("D+E", "S+T+U")) %>%
pivot_longer(-c("gender", "date"), names_to = "sector_code", values_to = "number") %>%
left_join(., sector_info, by = "sector_code")
employment <- employment[!is.na(employment$sector_name),]
saveRDS(employment, "docs/Project_Data/employment.rds")
employment# A tibble: 238 × 6
gender date sector_code number sector_name group
<chr> <date> <chr> <dbl> <chr> <chr>
1 female 2021-03-30 A 1766 Agriculture Agriculture
2 female 2021-03-30 B 6 Mining Industry
3 female 2021-03-30 C 1399 Manufacturing Industry
4 female 2021-03-30 F 74 Construction Construction
5 female 2021-03-30 G 1004 Wholesale_Retail_Trade Services
6 female 2021-03-30 H 105 Transportation Services
7 female 2021-03-30 I 309 Accomodation_Food Services
8 female 2021-03-30 J 69 Information Services
9 female 2021-03-30 L 68 Real_Estate Services
10 female 2021-03-30 M 395 Professional_Activities Services
# … with 228 more rows
# Read labor cost Index data
labor_cost_index <-
read_excel("docs/Project_Data_TURKSTAT/Project_data_labor_cost_index.xlsx", range = "A4:P54")
colnames(labor_cost_index)[3] <- "date"
labor_cost_index$date <- seq.Date(from = as.Date("2010-03-30"), to = as.Date("2022-06-30"), by = "quarter")
labor_cost_index <- labor_cost_index[,c(-1,-2)]
colnames(labor_cost_index) <- colnames(labor_cost_index) %>% gsub("\\..*", "", .)
labor_cost_index <-
labor_cost_index %>%
pivot_longer(-c("date"), names_to = "sector_code", values_to = "value") %>%
left_join(.,sector_info, by="sector_code") %>%
select(sector_code, sector_name, group, everything())
labor_cost_index <- labor_cost_index[!is.na(labor_cost_index$sector_name),]
saveRDS(labor_cost_index, "docs/Project_Data/labor_cost.rds")
labor_cost_index %>% head()# A tibble: 6 × 5
sector_code sector_name group date value
<chr> <chr> <chr> <date> <dbl>
1 B Mining Industry 2010-03-30 59.4
2 C Manufacturing Industry 2010-03-30 57
3 D Electricity_Gas Industry 2010-03-30 70.4
4 E Water_Supply Industry 2010-03-30 54.2
5 F Construction Construction 2010-03-30 59.9
6 G Wholesale_Retail_Trade Services 2010-03-30 59
2.2.2 Economic Growth Data: Gross Domestic Product
# Read GDP Index Data
gdp <- read_excel("docs/Project_Data_TURKSTAT/Project_data_gdp.xlsx",
range = "A7:C318")
# Rename columns
colnames(gdp) <- c("sector", "year", "gdp_index")
# Filter NA rows
gdp <- gdp %>%
filter(!is.na(sector))
saveRDS(gdp, "docs/Project_Data/gdp.rds")
gdp %>% head()# A tibble: 6 × 3
sector year gdp_index
<chr> <dbl> <dbl>
1 Total 1998 70.7
2 Total 1999 68.3
3 Total 2000 73.1
4 Total 2001 68.9
5 Total 2002 73.3
6 Total 2003 77.6
2.2.3 International Trade Data: Terms of Trade, Export and Import
# Read Terms of Trade Data
tot <-
read_excel("docs/Project_Data_TURKSTAT/Project_data_ToT.xlsx", range = "C4:BI157")
tot <- tot %>%
select(contains("ABD Doları"))
tot$date <- seq.Date(from = as.Date("2010-01-01"), to = as.Date("2022-09-01"), by = "month")
tot <- tot[,26:30]
tot <- tot %>% select(date, everything())
tot <- tot[!is.na(tot[,3]),]
tot <- tot %>% pivot_longer(-c("date"), names_to = "sector_code", values_to = "value")
tot$sector_code <- tot$sector_code %>% gsub("[.].*", "",.)
tot <- left_join(tot, sector_info, by = "sector_code") %>% select(sector_code, sector_name, group, everything())
saveRDS(tot, "docs/Project_Data/terms_of_trade.rds")
tot %>% head()# A tibble: 6 × 5
sector_code sector_name group date value
<chr> <chr> <chr> <date> <dbl>
1 A Agriculture Agriculture 2013-01-01 75.6
2 B Mining Industry 2013-01-01 66.8
3 C Manufacturing Industry 2013-01-01 99.5
4 E Water_Supply Industry 2013-01-01 93.7
5 A Agriculture Agriculture 2013-02-01 76.2
6 B Mining Industry 2013-02-01 65.5
# Read Exports Data
export <- read_excel("docs/Project_Data_TURKSTAT/Project_data_exports.xlsx",
range = "A5:D854")
export <- export %>% filter(!if_all(, is.na))
export[1,1] <- as.character(2022)
export$Year <- export$Year %>% as.double()
export <- export %>% fill(Year)
export <- export[!is.na(export[,2]),]
colnames(export) <- c("year", "sector_code", "sector_name", "export")
export$export <- gsub(pattern = "-", replacement = 0, export$export) %>% as.double()
export <- export[export$sector_code %in% sector_info$sector_code,] %>% select(-sector_name)
export <-
left_join(export, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
export# A tibble: 90 × 5
sector_code sector_name group year export
<chr> <chr> <chr> <dbl> <dbl>
1 A Agriculture Agriculture 2022 5434049.
2 B Mining Industry 2022 3508870.
3 C Manufacturing Industry 2022 178162765.
4 D Electricity_Gas Industry 2022 211875.
5 E Water_Supply Industry 2022 666611.
6 H Transportation Services 2022 15.0
7 J Information Services 2022 157891.
8 M Professional_Activities Services 2022 110.
9 R Entertainment Services 2022 10055.
10 A Agriculture Agriculture 2021 7156041.
# … with 80 more rows
# Read Imports Data
import <- read_excel("docs/Project_Data_TURKSTAT/Project_data_imports.xlsx",
range = "A5:D854")
import <- import %>% filter(!if_all(, is.na))
import[1,1] <- as.character(2022)
import$Year <- import$Year %>% as.double()
import <- import %>% fill(Year)
import <- import[!is.na(import[,2]),]
colnames(import) <- c("year", "sector_code", "sector_name", "import")
import$import <- gsub(pattern = "-", replacement = 0, import$import) %>% as.double()
import <- import[import$sector_code %in% sector_info$sector_code,] %>% select(-sector_name)
import <-
left_join(import, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
import# A tibble: 89 × 5
sector_code sector_name group year import
<chr> <chr> <chr> <dbl> <dbl>
1 A Agriculture Agriculture 2022 11311875.
2 B Mining Industry 2022 58364861.
3 C Manufacturing Industry 2022 191327777.
4 D Electricity_Gas Industry 2022 217690.
5 E Water_Supply Industry 2022 9883865.
6 H Transportation Services 2022 15.1
7 J Information Services 2022 128615.
8 M Professional_Activities Services 2022 1261.
9 R Entertainment Services 2022 13120.
10 A Agriculture Agriculture 2021 12082080.
# … with 79 more rows
#Merge exports and imports data
trade <- inner_join(export, import, by = c("sector_code", "sector_name", "year", "group"))
saveRDS(trade, "docs/Project_Data/trade.rds")
trade %>% head()# A tibble: 6 × 6
sector_code sector_name group year export import
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 A Agriculture Agriculture 2022 5434049. 11311875.
2 B Mining Industry 2022 3508870. 58364861.
3 C Manufacturing Industry 2022 178162765. 191327777.
4 D Electricity_Gas Industry 2022 211875. 217690.
5 E Water_Supply Industry 2022 666611. 9883865.
6 H Transportation Services 2022 15.0 15.1
2.2.4 Price Data: Producer Price Index
# Read PPI Data
ppi_1 <- read_excel("docs/Project_Data_TURKSTAT/Project_data_ppi_new.xlsx",
range = "A4:AH158")
ppi_1 <- ppi_1 %>%
fill(1) %>%
select_if(function(col) !all(is.na(col)))
colnames(ppi_1)[1:2] <- c("year", "month")
ppi_1$date <- seq.Date(from = as.Date("2010-01-01"), to = as.Date("2022-10-01"), by = "month")
ppi_1 <- ppi_1[,c(-1,-2)]
ppi_1 <- ppi_1 %>% select(date, everything())
ppi_1 <- ppi_1 %>% select(date, (ncol(.)-3):ncol(.))
colnames(ppi_1) <- colnames(ppi_1) %>% gsub("\\..*", "", .)
ppi_1# A tibble: 154 × 5
date B C D E
<date> <dbl> <dbl> <dbl> <dbl>
1 2010-01-01 220. 161. 169. 149.
2 2010-02-01 221. 162. 187. 149.
3 2010-03-01 220. 165. 189. 151.
4 2010-04-01 221. 166. 189. 151.
5 2010-05-01 227. 166. 191. 154.
6 2010-06-01 222. 165. 191. 157.
7 2010-07-01 228. 165. 192. 157.
8 2010-08-01 229. 165. 202. 155.
9 2010-09-01 231. 166. 203. 155.
10 2010-10-01 228. 167. 202. 156.
# … with 144 more rows
# Read PPI Services Data
ppi_2 <- read_excel("docs/Project_Data_TURKSTAT/Project_data_ppi_services_new.xlsx",
range = "A4:I73")
ppi_2 <- ppi_2 %>%
fill(1) %>%
select_if(function(col) !all(is.na(col)))
colnames(ppi_2)[1:2] <- c("year", "month")
ppi_2$date <- seq.Date(from = as.Date("2017-01-01"), to = as.Date("2022-09-01"), by = "month")
ppi_2 <- ppi_2[,c(-1,-2)]
ppi_2 <- ppi_2 %>% select(date, everything())
ppi_2 <- ppi_2 %>% select(date, everything())
colnames(ppi_2) <- colnames(ppi_2) %>% gsub("\\..*", "", .)
ppi_2# A tibble: 69 × 7
date H I J L M N
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2017-01-01 96.3 94.6 102. 95.6 96.6 95.4
2 2017-02-01 95.8 94.9 104. 95.8 97.8 95.7
3 2017-03-01 96.3 95.9 101. 95.8 98.3 95.6
4 2017-04-01 97.6 97.0 102. 96.7 99.9 97.6
5 2017-05-01 97.5 98.5 99.5 96.5 101. 98.7
6 2017-06-01 98.4 101. 99.4 98.1 99.7 101.
7 2017-07-01 101. 104. 97.0 101. 98.6 103.
8 2017-08-01 103. 105. 97.2 101. 99.3 105.
9 2017-09-01 102. 104. 98.8 100. 101. 104.
10 2017-10-01 102. 102. 99.7 105. 102. 101.
# … with 59 more rows
# Merge ppi and ppi services data
ppi <- left_join(ppi_1, ppi_2, by = "date")
ppi <- ppi %>%
pivot_longer(-c("date"), names_to = "sector_code", values_to = "index")
# Obtain conventional sector names and groupings
ppi <-
left_join(ppi, sector_info, by = "sector_code") %>%
select(sector_code, sector_name, group, everything())
saveRDS(ppi, "docs/Project_Data/input_cost.rds")
ppi %>% head()# A tibble: 6 × 5
sector_code sector_name group date index
<chr> <chr> <chr> <date> <dbl>
1 B Mining Industry 2010-01-01 220.
2 C Manufacturing Industry 2010-01-01 161.
3 D Electricity_Gas Industry 2010-01-01 169.
4 E Water_Supply Industry 2010-01-01 149.
5 H Transportation Services 2010-01-01 NA
6 I Accomodation_Food Services 2010-01-01 NA