The dataset covers up all the recorded service information of x company’s dealers in Turkey. The data is belong the dates between 2019 and 2020. This data is obtained from x company where one of our team member is working for.In the dataset we have 561,142 rows and 17 columns which gives different kind of parameters. According to the parameters a reader can get information about details of service processes such as model number, km, warranty start & end date, prices, vehicle id etc.
The variables which data has can be seen as below:
library(readxl)
library(tidyverse)
library(lubridate)
library(openxlsx)
library(rio)
library(reshape)
library(kableExtra)
library(xaringan)
raw_data <- rio::import("https://github.com/pjournal/mef04g-madagaskar/blob/gh-pages/Data/x_vehicle_company_service_dataset_v2.xlsx?raw=True")
raw_dt=na.omit(raw_data)
Transforming the job_closed_date column from character to datetime by using a function of lubridate which is as.POSIXct
raw_dt1 = raw_dt
raw_dt1$'Is emri kapanis tarihi' = str_replace_all(raw_dt1$'Is emri kapanis tarihi', "/","-")
raw_dt1$'Is emri kapanis tarihi' <- as.POSIXct(raw_dt1$'Is emri kapanis tarihi',format = "%d-%m-%Y")
Renaming of the columns by using rename() function in order to get clear information about data.
service_dt <- dplyr::rename(raw_dt1, "Material_id" = 1,
"Vehicle_id" = 2,
"Dealer_id" = 3,
"Job_order_number" = 4,
"material_type" = 5,
"process_type" = 6,
"beginning_date" = 7,
"ending_date" = 8,
"quantity" = 9,
"price" = 10,
"Model" = 11,
"production_date" = 12,
"job_closed_date" = 13,
"vehicle_km" = 14,
"dealer_city" = 15,
"warranty_beginning" = 16,
"warranty_ending" = 17)
glimpse(service_dt)
## Rows: 552,125
## Columns: 19
## $ Material_id <chr> "malzeme_236", "malzeme_2156", "malzeme_...
## $ Vehicle_id <chr> "sasi_no_9883", "sasi_no_9883", "sasi_no...
## $ Dealer_id <chr> "müsteri_15", "müsteri_15", "müsteri_15"...
## $ Job_order_number <chr> "000000000127819", "000000000127819", "0...
## $ material_type <chr> "Diger", "Parca", "Diger", "Parca", "Dig...
## $ process_type <chr> "Ariza", "Ariza", "Ariza", "Ariza", "Ari...
## $ beginning_date <dttm> 2019-01-22, 2019-01-22, 2019-01-22, 201...
## $ ending_date <dttm> 2019-01-20, 2019-01-20, 2019-01-20, 201...
## $ quantity <dbl> 1.0, 2.0, 20.0, 1.0, 1.0, 10.0, 1.0, 1.0...
## $ price <dbl> 209.96415, 97.63875, 0.01485, 905.86485,...
## $ Model <chr> "model_23", "model_23", "model_23", "mod...
## $ production_date <dbl> 2016, 2016, 2016, 2017, 2017, 2017, 2016...
## $ job_closed_date <dttm> 2019-01-25, 2019-01-25, 2019-01-25, 201...
## $ vehicle_km <dbl> 732544, 732544, 732544, 499700, 499700, ...
## $ dealer_city <chr> "Ankara", "Ankara", "Ankara", "Ankara", ...
## $ warranty_beginning <dbl> 39.92077, 39.92077, 39.92077, 39.92077, ...
## $ warranty_ending <dbl> 32.85411, 32.85411, 32.85411, 32.85411, ...
## $ `Garanti Baslangic tarihi` <dttm> 2016-10-04, 2016-10-04, 2016-10-04, 201...
## $ `Garanti Bitis tarihi` <dttm> 2018-10-04, 2018-10-04, 2018-10-04, 201...
As it can be seen from the codes which was written at below, a new dataset(“Eliminated”) was created after the determination of the upper and lower ranges.
There were some important points which should be well understood carefully from this report.
dt_2019<- service_dt%>%
group_by(month=lubridate::month(beginning_date),year=lubridate::year(beginning_date)) %>%
filter(year == 2019) %>% summarise(total_quantity = sum(quantity))
graph_2019 <- dt_2019 %>% select(month,total_quantity)
dt_2020<- service_dt%>%
group_by(month=lubridate::month(beginning_date),year=lubridate::year(beginning_date))%>%
filter(year == 2020) %>% summarise(total_quantity = sum(quantity))
graph_2020 <- dt_2020 %>% select(month,total_quantity)
dt_graph = merge(graph_2019, graph_2020, by="month", )
dt_graph_2 <- reshape::melt(dt_graph, id.var='month')
p<-ggplot(dt_graph_2, aes(x=month, y=value, col=variable)) +geom_line(size=0.9) +
ggtitle("Vehicle Operation Trend", "2019-2020 Operations in Services") +
xlab("Month") + ylab("Total Quantity") + labs(color="Year") + theme_classic()
p + scale_color_manual(name="Year",
labels = c("2019", "2020"),
values = c("total_quantity.x"="#D16103",
"total_quantity.y"="#4E84C4"))
plot_1<- service_dt%>%
mutate(count_total = n())%>%
group_by(process_type)%>%
mutate(count_vehicle_id = n())%>%
select(process_type,count_vehicle_id, count_total)%>%
mutate(rate_process_type = round(100*count_vehicle_id/count_total))%>%
arrange(desc(rate_process_type))%>%
distinct(process_type, rate_process_type)
print(plot_1)
## # A tibble: 11 x 2
## # Groups: process_type [11]
## process_type rate_process_type
## <chr> <dbl>
## 1 Ariza 84
## 2 P.Bakim 6
## 3 Kontrol 4
## 4 Hasar 2
## 5 Kampanya 2
## 6 Garanti 2
## 7 Yol Yardım 0
## 8 Dahili 0
## 9 PDI 0
## 10 Aksesuar 0
## 11 P.Bak+Ariza 0
plot_7 <- eliminated %>%
group_by(Model, production_date) %>%
summarise(total_price = sum(price)) %>%
mutate(age_of_car = as.numeric(format(Sys.Date(), "%Y"))-production_date)%>%
arrange(desc(total_price))
plot_7
## # A tibble: 121 x 4
## # Groups: Model [27]
## Model production_date total_price age_of_car
## <chr> <dbl> <dbl> <dbl>
## 1 model_23 2016 29279644. 4
## 2 model_1 2016 23435455. 4
## 3 model_23 2017 20997581. 3
## 4 model_21 2017 20108047. 3
## 5 model_21 2018 19286777. 2
## 6 model_9 2017 13556276. 3
## 7 model_21 2016 10950340. 4
## 8 model_9 2016 10451858. 4
## 9 model_2 2015 9139873. 5
## 10 model_27 2018 8599881. 2
## # ... with 111 more rows
ggplot(plot_7, aes(x = Model, y = total_price, fill= Model))+
geom_point(stat="identity", position = "stack") +theme(axis.text.x = element_text(angle=90, size=9, vjust=0.5,hjust=1))+
labs(x="Model", y="Total Price (TL)", fill ="Types of Model", title = "The Total Price According to Types of Model")
plot_2<- service_dt%>%
filter(process_type == "Ariza")%>%
group_by(Model)%>%
summarise(count_vehicle_id = n())%>%
arrange(desc(Model))
ggplot(plot_2, aes(x = Model, y = count_vehicle_id, fill= Model))+ ggtitle("Model Distribution in Process Type of Ariza") +
geom_bar(stat="identity") +theme(axis.text.x = element_text(angle=90, size=9, vjust=0.5,hjust=1))
plot_5<- eliminated%>%
group_by(process_type, material_type)%>%
summarise(count = sum(quantity))%>%
arrange(desc(count))
kable(plot_5,col.names = c("Process_Type", "Material_Type.", "Total_Quantity"))%>%
kable_styling("striped", full_width = F) %>%
scroll_box(width = "100%", height = "400px")
Process_Type | Material_Type. | Total_Quantity |
---|---|---|
Ariza | Diger | 901921.05 |
Ariza | Parca | 354913.74 |
Ariza | Yag | 129357.86 |
P.Bakim | Diger | 62320.25 |
Kontrol | Diger | 31813.03 |
P.Bakim | Yag | 31086.40 |
Kampanya | Diger | 23604.38 |
Kontrol | Parca | 21885.75 |
Hasar | Diger | 21093.68 |
P.Bakim | Parca | 17463.55 |
Kampanya | Yag | 12190.70 |
Garanti | Diger | 11762.80 |
Hasar | Parca | 9591.05 |
Garanti | Parca | 9465.15 |
Kampanya | Parca | 7367.61 |
Kontrol | Yag | 6934.30 |
Yol Yardım | Diger | 5783.45 |
Dahili | Diger | 4585.10 |
Yol Yardım | Parca | 3265.90 |
Garanti | Yag | 1595.60 |
Dahili | Parca | 617.00 |
Yol Yardım | Yag | 467.60 |
Aksesuar | Diger | 162.00 |
Dahili | Yag | 110.50 |
PDI | Diger | 68.00 |
Ariza | Aksesuar | 61.00 |
PDI | Parca | 38.00 |
Hasar | Yag | 30.00 |
PDI | Yag | 24.50 |
Aksesuar | Parca | 13.00 |
Kampanya | Aksesuar | 13.00 |
P.Bak+Ariza | Yag | 9.50 |
P.Bak+Ariza | Diger | 4.00 |
P.Bak+Ariza | Parca | 3.00 |
Kontrol | Aksesuar | 1.00 |
P.Bakim | Aksesuar | 1.00 |
ggplot(plot_5, aes(y=count, x = process_type, fill = process_type)) +
geom_col() +
facet_wrap(~material_type, scales ="free") +
theme_minimal() +
theme(legend.position = "none") +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
theme(axis.text.x = element_text(angle = 90), legend.position = "none")+
labs(title = "Total Quantity According to Material Type ",
x = "Process Type",
y = "Quantity")
plot_6<- eliminated%>%
count(dealer_city)%>%
mutate(percentage = 100*round(n/sum(n),3))%>%
arrange(desc(percentage))
kable(plot_6,col.names = c("Dealer_City", "Number of Process.", "Percentage"))%>%
kable_styling("striped", full_width = F) %>%
scroll_box(width = "100%", height = "400px")
Dealer_City | Number of Process. | Percentage |
---|---|---|
İstanbul | 110758 | 20.8 |
Antalya | 87578 | 16.4 |
İzmir | 68052 | 12.8 |
Ankara | 47843 | 9.0 |
Kocaeli | 42691 | 8.0 |
Bursa | 24345 | 4.6 |
Denizli | 21804 | 4.1 |
Gaziantep | 17282 | 3.2 |
Erzurum | 15446 | 2.9 |
Muğla | 14238 | 2.7 |
Nevşehir | 10552 | 2.0 |
Balıkesir | 9945 | 1.9 |
Mersin | 9692 | 1.8 |
Konya | 8719 | 1.6 |
Sakarya | 6632 | 1.2 |
Bolu | 5316 | 1.0 |
Samsun | 4766 | 0.9 |
Düzce | 3571 | 0.7 |
Eskişehir | 3752 | 0.7 |
Hatay | 3256 | 0.6 |
Adıyaman | 2211 | 0.4 |
Şanlıurfa | 1958 | 0.4 |
Aksaray | 1761 | 0.3 |
Aydın | 1586 | 0.3 |
Uşak | 1729 | 0.3 |
Çankırı | 1212 | 0.2 |
Edirne | 1317 | 0.2 |
Elazığ | 1274 | 0.2 |
Karaman | 656 | 0.1 |
Kayseri | 514 | 0.1 |
Kütahya | 602 | 0.1 |
Malatya | 420 | 0.1 |
Sivas | 433 | 0.1 |
Tekirdağ | 371 | 0.1 |
Afyonkarahisar | 60 | 0.0 |
Çorum | 260 | 0.0 |
Diyarbakır | 33 | 0.0 |
Kahramanmaraş | 6 | 0.0 |
Niğde | 164 | 0.0 |
Van | 249 | 0.0 |
plot_6%>%
filter(percentage >= 1.0)%>%
ggplot(aes(x=percentage, y=reorder(dealer_city, percentage), fill=percentage)) +
geom_col() +
scale_fill_gradient("percentage", low="thistle1", high="thistle4") +
geom_text(aes(label = paste(format(percentage,digits=1), "%")), size=4, position = position_stack(vjust = 0.5)) +
theme_minimal() +
theme(legend.position = "none", plot.title = element_text(vjust = 0.5)) +
labs(x = "Percentages",
y = "Dealers",
title = "The Most Popular Dealers")