# Create a temporary file
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)

library(ggrepel)
#Create a temporary file
tmp=tempfile(fileext=".xlsx")
#Download file from repository to the temp file
download.file("https://github.com/pjournal/mef03-tugceu/blob/master/odd_retail_sales_2018_09.xlsx?raw=true", destfile=tmp,mode='wb')
#Read that excel file using readxl package's read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file's format.
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * … and 5 more problems
# Remove the temp file
file.remove(tmp)
## [1] TRUE
# Remove the last two rows because they are irrelevant (total and empty rows)
raw_data <- raw_data %>% slice(-c(43,44))

# Let's see our raw data
head(raw_data)
## # A tibble: 6 x 10
##   ...1          ...2  ...3  ...4  ...5  ...6  ...7  ...8  ...9 ...10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    13    13    NA    NA     0     0    13    13
## 2 ASTON MARTIN    NA     2     2    NA    NA     0     0     2     2
## 3 AUDI            NA   350   350    NA    NA     0     0   350   350
## 4 BENTLEY         NA     0     0    NA    NA     0     0     0     0
## 5 BMW             NA   158   158    NA    NA     0     0   158   158
## 6 CITROEN         NA   134   134    NA   197   197     0   331   331
# Use the same column names in your data.
colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")
# Now we replace NA values with 0 and label the time period with year and month, so when we merge the data we won't be confused.
car_data_sep_18 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2018,month=9)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
print(car_data_sep_18,width=Inf)
## # A tibble: 42 x 12
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       13         13        0        0          0
##  2 ASTON MARTIN        0        2          2        0        0          0
##  3 AUDI                0      350        350        0        0          0
##  4 BENTLEY             0        0          0        0        0          0
##  5 BMW                 0      158        158        0        0          0
##  6 CITROEN             0      134        134        0      197        197
##  7 DACIA               0     1141       1141        0      319        319
##  8 DS                  0        9          9        0        0          0
##  9 FERRARI             0        3          3        0        0          0
## 10 FIAT              632       57        689      789      199        988
##    total_dom total_imp total_total  year month
##        <dbl>     <dbl>       <dbl> <dbl> <dbl>
##  1         0        13          13  2018     9
##  2         0         2           2  2018     9
##  3         0       350         350  2018     9
##  4         0         0           0  2018     9
##  5         0       158         158  2018     9
##  6         0       331         331  2018     9
##  7         0      1460        1460  2018     9
##  8         0         9           9  2018     9
##  9         0         3           3  2018     9
## 10      1421       256        1677  2018     9
## # … with 32 more rows
saveRDS(car_data_sep_18,file="~/desktop/R-Homework/odd_car_sales_data_sep_18.rds")
# You can read that file by readRDS and assigning to an object 
# e.g 
# rds_data <- readRDS("~/YOUR_OWN_PATH/odd_car_sales_data_sep_18.rds")

Analysis 1 I wonder if there is any brand that did not sell any car in Sept 18

car_data_sep_18%>% 
  filter(total_total==0)%>% 
  select(brand_name,total_total)
## # A tibble: 4 x 2
##   brand_name  total_total
##   <chr>             <dbl>
## 1 BENTLEY               0
## 2 INFINITI              0
## 3 LAMBORGHINI           0
## 4 SMART                 0

Analysis 2 Let’s show brands’ sell type in another column

car_data_sep_18%>% 
  filter(total_dom>0 | total_imp>0)%>% 
  select(brand_name,total_dom,total_imp)%>% 
  transmute(brand_name,
            sell_type=ifelse(total_dom > total_imp,"dom","imp"))%>%
  arrange((brand_name))
## # A tibble: 38 x 2
##    brand_name   sell_type
##    <chr>        <chr>    
##  1 ALFA ROMEO   imp      
##  2 ASTON MARTIN imp      
##  3 AUDI         imp      
##  4 BMW          imp      
##  5 CITROEN      imp      
##  6 DACIA        imp      
##  7 DS           imp      
##  8 FERRARI      imp      
##  9 FIAT         dom      
## 10 FORD         dom      
## # … with 28 more rows

Analysis 3 Let’s see brand’s market share in Sept 18.

car_data_sep_18%>% 
  select(brand_name,total_total)%>%
  mutate(market_share=total_total/sum(total_total)*100)%>%
  arrange(desc(market_share))
## # A tibble: 42 x 3
##    brand_name    total_total market_share
##    <chr>               <dbl>        <dbl>
##  1 RENAULT              3186        13.8 
##  2 FORD                 2356        10.2 
##  3 VOLKSWAGEN           2239         9.72
##  4 HONDA                1973         8.57
##  5 FIAT                 1677         7.28
##  6 HYUNDAI              1535         6.67
##  7 DACIA                1460         6.34
##  8 NISSAN               1217         5.28
##  9 MERCEDES-BENZ        1163         5.05
## 10 TOYOTA               1127         4.89
## # … with 32 more rows

Analysis 4: Let’s visualize brands’ auto vs commercial sales on a scatter plot

car_data <- car_data_sep_18%>%
  filter(total_total>0)%>%
  select(brand_name,auto_total,comm_total)
car_data_plot <- ggplot(car_data)+
  geom_point(aes(auto_total, comm_total), color='#00AFBB',size=1)+
  geom_label_repel(aes(auto_total, comm_total,label=car_data$brand_name),size=2)+
  theme_classic(base_size = 10)+
  coord_fixed(ratio = 1.2)
car_data_plot