library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ readr 1.3.1
## ✔ tibble 2.1.3 ✔ purrr 0.3.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ ggplot2 3.2.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readxl)
##Step I: Download Raw Data from Repository
#Create a temporary file
tmp<-tempfile(fileext="odd_car_sales_2019_09.xlsx")
#Download
download.file("https://github.com/pjournal/mef03-arslnbatu/raw/master/odd_car_sales_2019_09.xlsx?raw=true",destfile=tmp,mode="wb")
#Read excel
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * … and 5 more problems
#Remove
file.remove(tmp)
## [1] TRUE
##Step II: Remove the last two rows:
# Remove the last two rows
raw_data <- raw_data %>% slice(-c(43,44))
#First 5 raw data
head(raw_data)
## # A tibble: 6 x 10
## ...1 ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO NA 9 9 NA NA 0 0 9 9
## 2 ASTON MARTIN NA 3 3 NA NA 0 0 3 3
## 3 AUDI NA 1191 1191 NA NA 0 0 1191 1191
## 4 BENTLEY NA 0 0 NA NA 0 0 0 0
## 5 BMW NA 737 737 NA NA 0 0 737 737
## 6 CITROEN NA 694 694 NA 363 363 0 1057 1057
##Step III: Civilized the Data
# Replace NA values with 0 and label the time period with year and month.
car_data_sep_19 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2019,month=9)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
# Change the column names
colnames(car_data_sep_19) <- c("brand","local_auto","imp_auto","total_auto","local_comm","imp_comm","total_comm","local_total","imp_total","all_total","year","months")
print(car_data_sep_19,width=Inf)
## # A tibble: 42 x 12
## brand local_auto imp_auto total_auto local_comm imp_comm
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 9 9 0 0
## 2 ASTON MARTIN 0 3 3 0 0
## 3 AUDI 0 1191 1191 0 0
## 4 BENTLEY 0 0 0 0 0
## 5 BMW 0 737 737 0 0
## 6 CITROEN 0 694 694 0 363
## 7 DACIA 0 998 998 0 201
## 8 DS 0 15 15 0 0
## 9 FERRARI 0 2 2 0 0
## 10 FIAT 4904 77 4981 956 186
## total_comm local_total imp_total all_total year months
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 9 9 2019 9
## 2 0 0 3 3 2019 9
## 3 0 0 1191 1191 2019 9
## 4 0 0 0 0 2019 9
## 5 0 0 737 737 2019 9
## 6 363 0 1057 1057 2019 9
## 7 201 0 1199 1199 2019 9
## 8 0 0 15 15 2019 9
## 9 0 0 2 2 2019 9
## 10 1142 5860 263 6123 2019 9
## # … with 32 more rows
##Step IV: Save the Data
saveRDS(car_data_sep_19,file="C:\\Users\\HP\\Downloads\\odd_car_sales_2019_09.rds")
##Step V: Make Some Analysis
#calculate local and import point
data <- car_data_sep_19 %>%
filter(local_total > 0 & imp_total > 0 & brand != "TOPLAM:") %>%
select(brand,local_total,all_total,imp_total) %>%
mutate(local_point = round(local_total/all_total,2), imp_point = round(imp_total/all_total,2)) %>%
arrange(desc(local_point))
data
## # A tibble: 7 x 6
## brand local_total all_total imp_total local_point imp_point
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ISUZU 62 63 1 0.98 0.02
## 2 FIAT 5860 6123 263 0.96 0.04
## 3 TOYOTA 2387 2573 186 0.93 0.07
## 4 HONDA 1500 1720 220 0.87 0.13
## 5 RENAULT 6247 7296 1049 0.86 0.14
## 6 FORD 2447 3652 1205 0.67 0.33
## 7 HYUNDAI 564 1604 1040 0.35 0.65
# List of all total sales for all vehicles
car_data_sep_19 %>%
filter(total_auto > 0 & total_comm > 0 & brand != "TOPLAM:") %>%
select(brand,all_total) %>%
arrange(desc(all_total))
## # A tibble: 15 x 2
## brand all_total
## <chr> <dbl>
## 1 RENAULT 7296
## 2 FIAT 6123
## 3 VOLKSWAGEN 4428
## 4 FORD 3652
## 5 TOYOTA 2573
## 6 PEUGEOT 1938
## 7 HYUNDAI 1604
## 8 OPEL 1548
## 9 MERCEDES-BENZ 1401
## 10 DACIA 1199
## 11 CITROEN 1057
## 12 NISSAN 1019
## 13 KIA 647
## 14 MITSUBISHI 141
## 15 SSANGYONG 20
#Plot between local point and all total sales for brands
ggplot(data, aes(local_point, all_total, colour = brand)) +
geom_point(size=6)