1. Introduction

1.1 Data

The dataset covers up all the recorded service information of x company’s dealers in Turkey. The data is belong the dates between 2019 and 2020. This data is obtained from x company where one of our team member is working for.In the dataset we have 561,142 rows and 17 columns which gives different kind of parameters. According to the parameters a reader can get information about details of service processes such as model number, km, warranty start & end date, prices, vehicle id etc.

The variables which data has can be seen as below:

  • Material_id: Material id
  • Vehicle_id: Vehicle id
  • Dealer_id: Dealer id
  • Job_order_number: Job order uniquq id number
  • material_type: Material type that used in service process
  • process_type: Types of process
  • beginning_date: Date of beginning the service process
  • ending_date: Date of ending the service process
  • quantity: Quantity of part or material used
  • price: Price
  • Model: Model of vehicle
  • production_date: Production year of model
  • job_closed_date: The date when job order status was turned to closed
  • vehicle_km: Kilometer of vehicle
  • dealer_city: City name of where the dealer is in
  • warranty_beginning: Date of beginning the warranty
  • warranty_ending: Date of ending the warranty

1.2 Preprocessing of The Data

library(readxl)
library(tidyverse)
library(lubridate)
library(openxlsx)
library(rio)
library(reshape)
library(kableExtra)
library(xaringan)

raw_data <- rio::import("https://github.com/pjournal/mef04g-madagaskar/blob/gh-pages/Data/x_vehicle_company_service_dataset_v2.xlsx?raw=True")

raw_dt=na.omit(raw_data)

Transforming the job_closed_date column from character to datetime by using a function of lubridate which is as.POSIXct

raw_dt1 = raw_dt
raw_dt1$'Is emri kapanis tarihi' = str_replace_all(raw_dt1$'Is emri kapanis tarihi', "/","-")
raw_dt1$'Is emri kapanis tarihi' <- as.POSIXct(raw_dt1$'Is emri kapanis tarihi',format = "%d-%m-%Y")

Renaming of the columns by using rename() function in order to get clear information about data.

service_dt <- dplyr::rename(raw_dt1, "Material_id" = 1,
                 "Vehicle_id" = 2,
                 "Dealer_id" = 3,
                 "Job_order_number" = 4,
                 "material_type" = 5,
                 "process_type" = 6,
                 "beginning_date" = 7,
                 "ending_date" = 8,
                 "quantity" = 9,
                 "price" = 10,
                 "Model" = 11,
                 "production_date" = 12,
                 "job_closed_date" = 13,
                 "vehicle_km" = 14,
                 "dealer_city" = 15,
                 "warranty_beginning" = 16,
                 "warranty_ending" = 17)

glimpse(service_dt)
## Rows: 552,125
## Columns: 19
## $ Material_id                <chr> "malzeme_236", "malzeme_2156", "malzeme_...
## $ Vehicle_id                 <chr> "sasi_no_9883", "sasi_no_9883", "sasi_no...
## $ Dealer_id                  <chr> "müsteri_15", "müsteri_15", "müsteri_15"...
## $ Job_order_number           <chr> "000000000127819", "000000000127819", "0...
## $ material_type              <chr> "Diger", "Parca", "Diger", "Parca", "Dig...
## $ process_type               <chr> "Ariza", "Ariza", "Ariza", "Ariza", "Ari...
## $ beginning_date             <dttm> 2019-01-22, 2019-01-22, 2019-01-22, 201...
## $ ending_date                <dttm> 2019-01-20, 2019-01-20, 2019-01-20, 201...
## $ quantity                   <dbl> 1.0, 2.0, 20.0, 1.0, 1.0, 10.0, 1.0, 1.0...
## $ price                      <dbl> 209.96415, 97.63875, 0.01485, 905.86485,...
## $ Model                      <chr> "model_23", "model_23", "model_23", "mod...
## $ production_date            <dbl> 2016, 2016, 2016, 2017, 2017, 2017, 2016...
## $ job_closed_date            <dttm> 2019-01-25, 2019-01-25, 2019-01-25, 201...
## $ vehicle_km                 <dbl> 732544, 732544, 732544, 499700, 499700, ...
## $ dealer_city                <chr> "Ankara", "Ankara", "Ankara", "Ankara", ...
## $ warranty_beginning         <dbl> 39.92077, 39.92077, 39.92077, 39.92077, ...
## $ warranty_ending            <dbl> 32.85411, 32.85411, 32.85411, 32.85411, ...
## $ `Garanti Baslangic tarihi` <dttm> 2016-10-04, 2016-10-04, 2016-10-04, 201...
## $ `Garanti Bitis tarihi`     <dttm> 2018-10-04, 2018-10-04, 2018-10-04, 201...

As it can be seen from the codes which was written at below, a new dataset(“Eliminated”) was created after the determination of the upper and lower ranges.

Q <- quantile(service_dt$vehicle_km, probs=c(.25, .75), na.rm = FALSE)
iqr <- IQR(service_dt$vehicle_km)
up <-  Q[2]+1.5*iqr # Upper Range  
low<- Q[1]-1.5*iqr # Lower Range

eliminated<- subset(service_dt, service_dt$vehicle_km> (Q[1] - 1.5*iqr) & service_dt$vehicle_km < (Q[2]+1.5*iqr))

2. Final Report

There were some important points which should be well understood carefully from this report.

2.1 Vehicle Operation Trend

  • The operation trend was changed in 2020 due to the pandemic issue.It should not be overlooked that this situation may change the demands in the next few years.
dt_2019<- service_dt%>%
  group_by(month=lubridate::month(beginning_date),year=lubridate::year(beginning_date)) %>%
  filter(year == 2019) %>% summarise(total_quantity = sum(quantity))
graph_2019 <- dt_2019 %>% select(month,total_quantity)
dt_2020<- service_dt%>%
  group_by(month=lubridate::month(beginning_date),year=lubridate::year(beginning_date))%>%
  filter(year == 2020) %>% summarise(total_quantity = sum(quantity))
graph_2020 <- dt_2020 %>% select(month,total_quantity)
dt_graph = merge(graph_2019, graph_2020, by="month", )
dt_graph_2 <- reshape::melt(dt_graph, id.var='month')
p<-ggplot(dt_graph_2, aes(x=month, y=value, col=variable)) +geom_line(size=0.9) + 
  ggtitle("Vehicle Operation Trend", "2019-2020 Operations in Services") + 
  xlab("Month") + ylab("Total Quantity") + labs(color="Year") + theme_classic()
p + scale_color_manual(name="Year",
                      labels = c("2019", "2020"),
                      values = c("total_quantity.x"="#D16103",
                                 "total_quantity.y"="#4E84C4"))

2.2 Proportion of Process Types

  • Almost 83% of the transactions made in the services consist of “Ariza”.
plot_1<- service_dt%>%
  mutate(count_total = n())%>%
  group_by(process_type)%>%
  mutate(count_vehicle_id = n())%>%
  select(process_type,count_vehicle_id, count_total)%>%
  mutate(rate_process_type = round(100*count_vehicle_id/count_total))%>%
  arrange(desc(rate_process_type))%>%
  distinct(process_type, rate_process_type)
print(plot_1)
## # A tibble: 11 x 2
## # Groups:   process_type [11]
##    process_type rate_process_type
##    <chr>                    <dbl>
##  1 Ariza                       84
##  2 P.Bakim                      6
##  3 Kontrol                      4
##  4 Hasar                        2
##  5 Kampanya                     2
##  6 Garanti                      2
##  7 Yol Yardım                   0
##  8 Dahili                       0
##  9 PDI                          0
## 10 Aksesuar                     0
## 11 P.Bak+Ariza                  0
ggplot(plot_1, aes(x="", y=rate_process_type, fill=process_type)) + geom_bar(stat="identity") + 
  coord_polar("y") + ggtitle("Proportion of Process Types") +
  theme_minimal()+ theme(axis.title.y = element_blank(), plot.title = element_text(hjust=0.5))

2.3 The Expensive Types of Model

  • Model 23 is the most expensive vehicle model, as it is the most frequently visited to the dealers.
plot_7 <- eliminated %>%
  group_by(Model, production_date) %>%
  summarise(total_price = sum(price)) %>%
  mutate(age_of_car = as.numeric(format(Sys.Date(), "%Y"))-production_date)%>%
  arrange(desc(total_price))
plot_7
## # A tibble: 121 x 4
## # Groups:   Model [27]
##    Model    production_date total_price age_of_car
##    <chr>              <dbl>       <dbl>      <dbl>
##  1 model_23            2016   29279644.          4
##  2 model_1             2016   23435455.          4
##  3 model_23            2017   20997581.          3
##  4 model_21            2017   20108047.          3
##  5 model_21            2018   19286777.          2
##  6 model_9             2017   13556276.          3
##  7 model_21            2016   10950340.          4
##  8 model_9             2016   10451858.          4
##  9 model_2             2015    9139873.          5
## 10 model_27            2018    8599881.          2
## # ... with 111 more rows
ggplot(plot_7, aes(x = Model, y = total_price, fill= Model))+
  geom_point(stat="identity", position = "stack") +theme(axis.text.x = element_text(angle=90, size=9, vjust=0.5,hjust=1))+
  labs(x="Model", y="Total Price (TL)", fill ="Types of Model", title = "The Total Price According to Types of Model")

plot_2<- service_dt%>%
  filter(process_type == "Ariza")%>%
  group_by(Model)%>%
  summarise(count_vehicle_id = n())%>%
  arrange(desc(Model))

ggplot(plot_2, aes(x = Model, y = count_vehicle_id, fill= Model))+ ggtitle("Model Distribution in Process Type of Ariza") +
  geom_bar(stat="identity") +theme(axis.text.x = element_text(angle=90, size=9, vjust=0.5,hjust=1))

2.4 The Usage Amount of The Material Types

  • It was seen that the “diğer”, “parça” and “yag” materials were ranked in order of high to low, respectively, according to their usage amounts in whole process types.
plot_5<- eliminated%>%
  group_by(process_type, material_type)%>%
  summarise(count = sum(quantity))%>%
  arrange(desc(count))
kable(plot_5,col.names = c("Process_Type", "Material_Type.", "Total_Quantity"))%>%
   kable_styling("striped", full_width = F) %>%
  scroll_box(width = "100%", height = "400px")
Process_Type Material_Type. Total_Quantity
Ariza Diger 901921.05
Ariza Parca 354913.74
Ariza Yag 129357.86
P.Bakim Diger 62320.25
Kontrol Diger 31813.03
P.Bakim Yag 31086.40
Kampanya Diger 23604.38
Kontrol Parca 21885.75
Hasar Diger 21093.68
P.Bakim Parca 17463.55
Kampanya Yag 12190.70
Garanti Diger 11762.80
Hasar Parca 9591.05
Garanti Parca 9465.15
Kampanya Parca 7367.61
Kontrol Yag 6934.30
Yol Yardım Diger 5783.45
Dahili Diger 4585.10
Yol Yardım Parca 3265.90
Garanti Yag 1595.60
Dahili Parca 617.00
Yol Yardım Yag 467.60
Aksesuar Diger 162.00
Dahili Yag 110.50
PDI Diger 68.00
Ariza Aksesuar 61.00
PDI Parca 38.00
Hasar Yag 30.00
PDI Yag 24.50
Aksesuar Parca 13.00
Kampanya Aksesuar 13.00
P.Bak+Ariza Yag 9.50
P.Bak+Ariza Diger 4.00
P.Bak+Ariza Parca 3.00
Kontrol Aksesuar 1.00
P.Bakim Aksesuar 1.00
  ggplot(plot_5, aes(y=count, x = process_type, fill = process_type)) +
  geom_col() +
  facet_wrap(~material_type, scales ="free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none")+
  labs(title = "Total Quantity According to Material Type ",
       x = "Process Type",
       y = "Quantity")