# Create a temporary file
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(ggrepel)
#Create a temporary file
tmp=tempfile(fileext=".xlsx")
#Download file from repository to the temp file
download.file("https://github.com/pjournal/mef03-tugceu/blob/master/odd_retail_sales_2018_09.xlsx?raw=true", destfile=tmp,mode='wb')
#Read that excel file using readxl package's read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file's format.
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * … and 5 more problems
# Remove the temp file
file.remove(tmp)
## [1] TRUE
# Remove the last two rows because they are irrelevant (total and empty rows)
raw_data <- raw_data %>% slice(-c(43,44))
# Let's see our raw data
head(raw_data)
## # A tibble: 6 x 10
## ...1 ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO NA 13 13 NA NA 0 0 13 13
## 2 ASTON MARTIN NA 2 2 NA NA 0 0 2 2
## 3 AUDI NA 350 350 NA NA 0 0 350 350
## 4 BENTLEY NA 0 0 NA NA 0 0 0 0
## 5 BMW NA 158 158 NA NA 0 0 158 158
## 6 CITROEN NA 134 134 NA 197 197 0 331 331
# Use the same column names in your data.
colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")
# Now we replace NA values with 0 and label the time period with year and month, so when we merge the data we won't be confused.
car_data_sep_18 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2018,month=9)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
print(car_data_sep_18,width=Inf)
## # A tibble: 42 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 13 13 0 0 0
## 2 ASTON MARTIN 0 2 2 0 0 0
## 3 AUDI 0 350 350 0 0 0
## 4 BENTLEY 0 0 0 0 0 0
## 5 BMW 0 158 158 0 0 0
## 6 CITROEN 0 134 134 0 197 197
## 7 DACIA 0 1141 1141 0 319 319
## 8 DS 0 9 9 0 0 0
## 9 FERRARI 0 3 3 0 0 0
## 10 FIAT 632 57 689 789 199 988
## total_dom total_imp total_total year month
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 13 13 2018 9
## 2 0 2 2 2018 9
## 3 0 350 350 2018 9
## 4 0 0 0 2018 9
## 5 0 158 158 2018 9
## 6 0 331 331 2018 9
## 7 0 1460 1460 2018 9
## 8 0 9 9 2018 9
## 9 0 3 3 2018 9
## 10 1421 256 1677 2018 9
## # … with 32 more rows
saveRDS(car_data_sep_18,file="~/desktop/R-Homework/odd_car_sales_data_sep_18.rds")
# You can read that file by readRDS and assigning to an object
# e.g
# rds_data <- readRDS("~/YOUR_OWN_PATH/odd_car_sales_data_sep_18.rds")
Analysis 1 I wonder if there is any brand that did not sell any car in Sept 18
car_data_sep_18%>%
filter(total_total==0)%>%
select(brand_name,total_total)
## # A tibble: 4 x 2
## brand_name total_total
## <chr> <dbl>
## 1 BENTLEY 0
## 2 INFINITI 0
## 3 LAMBORGHINI 0
## 4 SMART 0
Analysis 2 Let’s show brands’ sell type in another column
car_data_sep_18%>%
filter(total_dom>0 | total_imp>0)%>%
select(brand_name,total_dom,total_imp)%>%
transmute(brand_name,
sell_type=ifelse(total_dom > total_imp,"dom","imp"))%>%
arrange((brand_name))
## # A tibble: 38 x 2
## brand_name sell_type
## <chr> <chr>
## 1 ALFA ROMEO imp
## 2 ASTON MARTIN imp
## 3 AUDI imp
## 4 BMW imp
## 5 CITROEN imp
## 6 DACIA imp
## 7 DS imp
## 8 FERRARI imp
## 9 FIAT dom
## 10 FORD dom
## # … with 28 more rows
Analysis 3 Let’s see brand’s market share in Sept 18.
car_data_sep_18%>%
select(brand_name,total_total)%>%
mutate(market_share=total_total/sum(total_total)*100)%>%
arrange(desc(market_share))
## # A tibble: 42 x 3
## brand_name total_total market_share
## <chr> <dbl> <dbl>
## 1 RENAULT 3186 13.8
## 2 FORD 2356 10.2
## 3 VOLKSWAGEN 2239 9.72
## 4 HONDA 1973 8.57
## 5 FIAT 1677 7.28
## 6 HYUNDAI 1535 6.67
## 7 DACIA 1460 6.34
## 8 NISSAN 1217 5.28
## 9 MERCEDES-BENZ 1163 5.05
## 10 TOYOTA 1127 4.89
## # … with 32 more rows
Analysis 4: Let’s visualize brands’ auto vs commercial sales on a scatter plot
car_data <- car_data_sep_18%>%
filter(total_total>0)%>%
select(brand_name,auto_total,comm_total)
car_data_plot <- ggplot(car_data)+
geom_point(aes(auto_total, comm_total), color='#00AFBB',size=1)+
geom_label_repel(aes(auto_total, comm_total,label=car_data$brand_name),size=2)+
theme_classic(base_size = 10)+
coord_fixed(ratio = 1.2)
car_data_plot