1. Introduction

1.1 Data

The dataset covers up all the recorded service information of x company’s dealers in Turkey. The data is belong the dates between 2019 and 2020. This data is obtained from x company where one of our team member is working for.In the dataset we have 561,142 rows and 17 columns which gives different kind of parameters. According to the parameters a reader can get information about details of service processes such as model number, km, warranty start & end date, prices, vehicle id etc.

The variables which data has can be seen as below:

Material_id: Material id
Vehicle_id: Vehicle id
Dealer_id: Dealer id
Job_order_number: Job order uniquq id number
material_type: Material type that used in service process
process_type: Types of process
beginning_date: Date of beginning the service process
ending_date: Date of ending the service process
quantity: Quantity of part or material used
price: Price
Model: Model of vehicle
production_date: Production year of model
job_closed_date: The date when job order status was turned to closed
vehicle_km: Kilometer of vehicle
dealer_city: City name of where the dealer is in
warranty_beginning: Date of beginning the warranty
warranty_ending: Date of ending the warranty

1.2 Preprocessing of The Data

library(readxl)
library(tidyverse)
library(lubridate)
library(openxlsx)
library(rio)
library(reshape)
library(kableExtra)
library(xaringan)

raw_data <- rio::import("https://github.com/pjournal/mef04g-madagaskar/blob/gh-pages/Data/x_vehicle_company_service_dataset_v2.xlsx?raw=True")

raw_dt=na.omit(raw_data)

Transforming the job_closed_date column from character to datetime by using a function of lubridate which is as.POSIXct

raw_dt1 = raw_dt
raw_dt1$'Is emri kapanis tarihi' = str_replace_all(raw_dt1$'Is emri kapanis tarihi', "/","-")
raw_dt1$'Is emri kapanis tarihi' <- as.POSIXct(raw_dt1$'Is emri kapanis tarihi',format = "%d-%m-%Y")

Renaming of the columns by using rename() function in order to get clear information about data.

service_dt <- dplyr::rename(raw_dt1, "Material_id" = 1,
                 "Vehicle_id" = 2,
                 "Dealer_id" = 3,
                 "Job_order_number" = 4,
                 "material_type" = 5,
                 "process_type" = 6,
                 "beginning_date" = 7,
                 "ending_date" = 8,
                 "quantity" = 9,
                 "price" = 10,
                 "Model" = 11,
                 "production_date" = 12,
                 "job_closed_date" = 13,
                 "vehicle_km" = 14,
                 "dealer_city" = 15,
                 "warranty_beginning" = 16,
                 "warranty_ending" = 17)

glimpse(service_dt)

## Rows: 552,125
## Columns: 19
## $ Material_id                <chr> "malzeme_236", "malzeme_2156", "malzeme_...
## $ Vehicle_id                 <chr> "sasi_no_9883", "sasi_no_9883", "sasi_no...
## $ Dealer_id                  <chr> "müsteri_15", "müsteri_15", "müsteri_15"...
## $ Job_order_number           <chr> "000000000127819", "000000000127819", "0...
## $ material_type              <chr> "Diger", "Parca", "Diger", "Parca", "Dig...
## $ process_type               <chr> "Ariza", "Ariza", "Ariza", "Ariza", "Ari...
## $ beginning_date             <dttm> 2019-01-22, 2019-01-22, 2019-01-22, 201...
## $ ending_date                <dttm> 2019-01-20, 2019-01-20, 2019-01-20, 201...
## $ quantity                   <dbl> 1.0, 2.0, 20.0, 1.0, 1.0, 10.0, 1.0, 1.0...
## $ price                      <dbl> 209.96415, 97.63875, 0.01485, 905.86485,...
## $ Model                      <chr> "model_23", "model_23", "model_23", "mod...
## $ production_date            <dbl> 2016, 2016, 2016, 2017, 2017, 2017, 2016...
## $ job_closed_date            <dttm> 2019-01-25, 2019-01-25, 2019-01-25, 201...
## $ vehicle_km                 <dbl> 732544, 732544, 732544, 499700, 499700, ...
## $ dealer_city                <chr> "Ankara", "Ankara", "Ankara", "Ankara", ...
## $ warranty_beginning         <dbl> 39.92077, 39.92077, 39.92077, 39.92077, ...
## $ warranty_ending            <dbl> 32.85411, 32.85411, 32.85411, 32.85411, ...
## $ `Garanti Baslangic tarihi` <dttm> 2016-10-04, 2016-10-04, 2016-10-04, 201...
## $ `Garanti Bitis tarihi`     <dttm> 2018-10-04, 2018-10-04, 2018-10-04, 201...

As it can be seen from the codes which was written at below, a new dataset(“Eliminated”) was created after the determination of the upper and lower ranges.

Q <- quantile(service_dt$vehicle_km, probs=c(.25, .75), na.rm = FALSE)
iqr <- IQR(service_dt$vehicle_km)
up <-  Q[2]+1.5*iqr # Upper Range  
low<- Q[1]-1.5*iqr # Lower Range

eliminated<- subset(service_dt, service_dt$vehicle_km> (Q[1] - 1.5*iqr) & service_dt$vehicle_km < (Q[2]+1.5*iqr))

2. Final Report

There were some important points which should be well understood carefully from this report.

2.1 Vehicle Operation Trend

The operation trend was changed in 2020 due to the pandemic issue.It should not be overlooked that this situation may change the demands in the next few years.

dt_2019<- service_dt%>%
  group_by(month=lubridate::month(beginning_date),year=lubridate::year(beginning_date)) %>%
  filter(year == 2019) %>% summarise(total_quantity = sum(quantity))
graph_2019 <- dt_2019 %>% select(month,total_quantity)
dt_2020<- service_dt%>%
  group_by(month=lubridate::month(beginning_date),year=lubridate::year(beginning_date))%>%
  filter(year == 2020) %>% summarise(total_quantity = sum(quantity))
graph_2020 <- dt_2020 %>% select(month,total_quantity)
dt_graph = merge(graph_2019, graph_2020, by="month", )
dt_graph_2 <- reshape::melt(dt_graph, id.var='month')
p<-ggplot(dt_graph_2, aes(x=month, y=value, col=variable)) +geom_line(size=0.9) + 
  ggtitle("Vehicle Operation Trend", "2019-2020 Operations in Services") + 
  xlab("Month") + ylab("Total Quantity") + labs(color="Year") + theme_classic()
p + scale_color_manual(name="Year",
                      labels = c("2019", "2020"),
                      values = c("total_quantity.x"="#D16103",
                                 "total_quantity.y"="#4E84C4"))

2.2 Proportion of Process Types

Almost 83% of the transactions made in the services consist of “Ariza”.

plot_1<- service_dt%>%
  mutate(count_total = n())%>%
  group_by(process_type)%>%
  mutate(count_vehicle_id = n())%>%
  select(process_type,count_vehicle_id, count_total)%>%
  mutate(rate_process_type = round(100*count_vehicle_id/count_total))%>%
  arrange(desc(rate_process_type))%>%
  distinct(process_type, rate_process_type)
print(plot_1)

## # A tibble: 11 x 2
## # Groups:   process_type [11]
##    process_type rate_process_type
##    <chr>                    <dbl>
##  1 Ariza                       84
##  2 P.Bakim                      6
##  3 Kontrol                      4
##  4 Hasar                        2
##  5 Kampanya                     2
##  6 Garanti                      2
##  7 Yol Yardım                   0
##  8 Dahili                       0
##  9 PDI                          0
## 10 Aksesuar                     0
## 11 P.Bak+Ariza                  0

ggplot(plot_1, aes(x="", y=rate_process_type, fill=process_type)) + geom_bar(stat="identity") + 
  coord_polar("y") + ggtitle("Proportion of Process Types") +
  theme_minimal()+ theme(axis.title.y = element_blank(), plot.title = element_text(hjust=0.5))

2.3 The Expensive Types of Model

Model 23 is the most expensive vehicle model, as it is the most frequently visited to the dealers.

plot_7 <- eliminated %>%
  group_by(Model, production_date) %>%
  summarise(total_price = sum(price)) %>%
  mutate(age_of_car = as.numeric(format(Sys.Date(), "%Y"))-production_date)%>%
  arrange(desc(total_price))
plot_7

## # A tibble: 121 x 4
## # Groups:   Model [27]
##    Model    production_date total_price age_of_car
##    <chr>              <dbl>       <dbl>      <dbl>
##  1 model_23            2016   29279644.          4
##  2 model_1             2016   23435455.          4
##  3 model_23            2017   20997581.          3
##  4 model_21            2017   20108047.          3
##  5 model_21            2018   19286777.          2
##  6 model_9             2017   13556276.          3
##  7 model_21            2016   10950340.          4
##  8 model_9             2016   10451858.          4
##  9 model_2             2015    9139873.          5
## 10 model_27            2018    8599881.          2
## # ... with 111 more rows

ggplot(plot_7, aes(x = Model, y = total_price, fill= Model))+
  geom_point(stat="identity", position = "stack") +theme(axis.text.x = element_text(angle=90, size=9, vjust=0.5,hjust=1))+
  labs(x="Model", y="Total Price (TL)", fill ="Types of Model", title = "The Total Price According to Types of Model")

plot_2<- service_dt%>%
  filter(process_type == "Ariza")%>%
  group_by(Model)%>%
  summarise(count_vehicle_id = n())%>%
  arrange(desc(Model))

ggplot(plot_2, aes(x = Model, y = count_vehicle_id, fill= Model))+ ggtitle("Model Distribution in Process Type of Ariza") +
  geom_bar(stat="identity") +theme(axis.text.x = element_text(angle=90, size=9, vjust=0.5,hjust=1))

2.4 The Usage Amount of The Material Types

It was seen that the “diğer”, “parça” and “yag” materials were ranked in order of high to low, respectively, according to their usage amounts in whole process types.

plot_5<- eliminated%>%
  group_by(process_type, material_type)%>%
  summarise(count = sum(quantity))%>%
  arrange(desc(count))
kable(plot_5,col.names = c("Process_Type", "Material_Type.", "Total_Quantity"))%>%
   kable_styling("striped", full_width = F) %>%
  scroll_box(width = "100%", height = "400px")

Process_Type	Material_Type.	Total_Quantity
Ariza	Diger	901921.05
Ariza	Parca	354913.74
Ariza	Yag	129357.86
P.Bakim	Diger	62320.25
Kontrol	Diger	31813.03
P.Bakim	Yag	31086.40
Kampanya	Diger	23604.38
Kontrol	Parca	21885.75
Hasar	Diger	21093.68
P.Bakim	Parca	17463.55
Kampanya	Yag	12190.70
Garanti	Diger	11762.80
Hasar	Parca	9591.05
Garanti	Parca	9465.15
Kampanya	Parca	7367.61
Kontrol	Yag	6934.30
Yol Yardım	Diger	5783.45
Dahili	Diger	4585.10
Yol Yardım	Parca	3265.90
Garanti	Yag	1595.60
Dahili	Parca	617.00
Yol Yardım	Yag	467.60
Aksesuar	Diger	162.00
Dahili	Yag	110.50
PDI	Diger	68.00
Ariza	Aksesuar	61.00
PDI	Parca	38.00
Hasar	Yag	30.00
PDI	Yag	24.50
Aksesuar	Parca	13.00
Kampanya	Aksesuar	13.00
P.Bak+Ariza	Yag	9.50
P.Bak+Ariza	Diger	4.00
P.Bak+Ariza	Parca	3.00
Kontrol	Aksesuar	1.00
P.Bakim	Aksesuar	1.00

  ggplot(plot_5, aes(y=count, x = process_type, fill = process_type)) +
  geom_col() +
  facet_wrap(~material_type, scales ="free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none")+
  labs(title = "Total Quantity According to Material Type ",
       x = "Process Type",
       y = "Quantity")

2.5 The Most Popular Dealers

It was observed that the transactions were concentrated in the Marmara Region(Istanbul), where industry and transportation are intense, and in the Akdeniz and Ege regions(Antalya, İzmir), where tourism activities are intense.

plot_6<- eliminated%>%
  count(dealer_city)%>%
  mutate(percentage = 100*round(n/sum(n),3))%>%
  arrange(desc(percentage))

kable(plot_6,col.names = c("Dealer_City", "Number of Process.", "Percentage"))%>%
kable_styling("striped", full_width = F) %>%
  scroll_box(width = "100%", height = "400px")

Dealer_City	Number of Process.	Percentage
İstanbul	110758	20.8
Antalya	87578	16.4
İzmir	68052	12.8
Ankara	47843	9.0
Kocaeli	42691	8.0
Bursa	24345	4.6
Denizli	21804	4.1
Gaziantep	17282	3.2
Erzurum	15446	2.9
Muğla	14238	2.7
Nevşehir	10552	2.0
Balıkesir	9945	1.9
Mersin	9692	1.8
Konya	8719	1.6
Sakarya	6632	1.2
Bolu	5316	1.0
Samsun	4766	0.9
Düzce	3571	0.7
Eskişehir	3752	0.7
Hatay	3256	0.6
Adıyaman	2211	0.4
Şanlıurfa	1958	0.4
Aksaray	1761	0.3
Aydın	1586	0.3
Uşak	1729	0.3
Çankırı	1212	0.2
Edirne	1317	0.2
Elazığ	1274	0.2
Karaman	656	0.1
Kayseri	514	0.1
Kütahya	602	0.1
Malatya	420	0.1
Sivas	433	0.1
Tekirdağ	371	0.1
Afyonkarahisar	60	0.0
Çorum	260	0.0
Diyarbakır	33	0.0
Kahramanmaraş	6	0.0
Niğde	164	0.0
Van	249	0.0

plot_6%>%
  filter(percentage >= 1.0)%>%

ggplot(aes(x=percentage, y=reorder(dealer_city, percentage), fill=percentage)) +
  geom_col() +
  scale_fill_gradient("percentage", low="thistle1", high="thistle4") +
  geom_text(aes(label = paste(format(percentage,digits=1), "%")), size=4, position = position_stack(vjust = 0.5)) +
  theme_minimal() +
  theme(legend.position = "none", plot.title = element_text(vjust = 0.5)) +
  labs(x = "Percentages",
       y = "Dealers",
       title = "The Most Popular Dealers")

Final Report

MadagaskaR

2020-12-27