First we find the data on Otomotiv Distibütörleri Derneği website. Then we will find yearly sales data from 2016 to 2019. We combined this data in Konsolide_ODD_datasi.xlsx
. We will try to analyse this data and make some conclusions.
1. Year (year)
2. Brand (brand)
3. Domestic Car Sales (car_dom)
4. Imported Car Sales (car_imp)
5. Total Car Sales (car_total)
6. Domestic Light Commercial Sales (lcv_dom)
7. Imported Light Commercial Sales (lcv_imp)
8. Total Light Commercial Sales (lcv_total)
9. Domestic Total Sales (all_dom)
10. Imported Total Sales (all_imp)
11. All Sales (all_total)
Our raw excel file is in our repository. We can automatically download that file and put it in a temporary file. Then we can read that excel document into R and remove the temp file.
## Our temp file
tmp<-tempfile(fileext=".xlsx")
## Download file from repository
download.file("https://github.com/pjournal/mef03g-mujde-r/blob/master/Konsolide_ODD_datasi.xlsx?raw=true",destfile=tmp, mode='wb')
## Read that excel file using readxl library. col_names not included and first two rows skipped
raw_data<-readxl::read_excel(tmp,col_names = FALSE, skip=2)
## Remove the temp file
file.remove(tmp)
## [1] TRUE
# Let's see our data
head(raw_data)
## # A tibble: 6 x 11
## ...1 ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10 ...11
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016 ALFA ROMEO 0 634 634 0 0 0 0 634 634
## 2 2016 ASTON MARTIN 0 29 29 0 0 0 0 29 29
## 3 2016 AUDI 0 22005 22005 0 0 0 0 22005 22005
## 4 2016 BENTLEY 0 9 9 0 0 0 0 9 9
## 5 2016 BMW 0 27166 27166 0 0 0 0 27166 27166
## 6 2016 CHERY 0 144 144 0 0 0 0 144 144
## rename colnames
colnames(raw_data) <- c("year","brand","car_dom","car_imp","car_total","lcv_dom","lcv_imp","lcv_total","all_dom","all_imp","all_total")
## replace NA values with 0
car_data <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.)))
## let's see our data again
print(head(car_data),width=Inf)
## # A tibble: 6 x 11
## year brand car_dom car_imp car_total lcv_dom lcv_imp lcv_total
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016 ALFA ROMEO 0 634 634 0 0 0
## 2 2016 ASTON MARTIN 0 29 29 0 0 0
## 3 2016 AUDI 0 22005 22005 0 0 0
## 4 2016 BENTLEY 0 9 9 0 0 0
## 5 2016 BMW 0 27166 27166 0 0 0
## 6 2016 CHERY 0 144 144 0 0 0
## all_dom all_imp all_total
## <dbl> <dbl> <dbl>
## 1 0 634 634
## 2 0 29 29
## 3 0 22005 22005
## 4 0 9 9
## 5 0 27166 27166
## 6 0 144 144
## remove data which total_sales are 0
best_seller <- car_data %>% filter(car_total > 0 & lcv_total > 0)
## group by brand and sum total_sales
best_seller <- best_seller %>% select(brand,all_total) %>% group_by(brand) %>% summarise(all_total = sum(all_total))
## arrange by all_total desc
best_seller <- best_seller %>% arrange(desc(all_total))
## best_seller 6 brands
best_seller <- best_seller %>% filter(row_number() < 7)
print(best_seller, width=Inf)
## # A tibble: 6 x 2
## brand all_total
## <chr> <dbl>
## 1 RENAULT 383624
## 2 VOLKSWAGEN 354062
## 3 FIAT 344592
## 4 FORD 317304
## 5 HYUNDAI 154569
## 6 TOYOTA 151026
fancy_scientific <- function(l) {
# turn in to character string in scientific notation
l <- format(l, scientific = TRUE)
# quote the part before the exponent to keep all the digits
l <- gsub("^(.*)e", "'\\1'e", l)
# turn the 'e+' into plotmath format
l <- gsub("e", "%*%10^", l)
# return this as an expression
parse(text=l)
}
ggplot(best_seller, aes(x=brand, y=all_total, fill=brand)) + geom_bar(alpha=0.6,stat="identity") + geom_point() + scale_y_continuous(labels=fancy_scientific) + labs(x="Brand Name",y="Sales Count",title="6 Best Seller Brands",fill="Brand")
## remove data which total_sales are 0
total_sales_over_years <- car_data %>% filter(car_total > 0 & lcv_total > 0)
## group by year and sum total_sales
total_sales_over_years <- total_sales_over_years %>% select(year,all_total) %>% group_by(year) %>% summarise(all_total = sum(all_total))
## arrange by year desc
total_sales_over_years <- total_sales_over_years %>% arrange(desc(year))
## remove last year because the year is not finished
total_sales_over_years <- total_sales_over_years %>% slice(-1)
print(total_sales_over_years, width=Inf)
## # A tibble: 3 x 2
## year all_total
## <dbl> <dbl>
## 1 2018 490991
## 2 2017 776801
## 3 2016 781695
## let's convert sales by year data to pie category
percentage_data <- mutate(total_sales_over_years, sales_percentage = all_total / sum(all_total) * 100)
## we can see that there is a big decline in sales of 2018
percentage_data
## # A tibble: 3 x 3
## year all_total sales_percentage
## <dbl> <dbl> <dbl>
## 1 2018 490991 24.0
## 2 2017 776801 37.9
## 3 2016 781695 38.1
ggplot(percentage_data, aes(x="", y=sales_percentage, fill=factor(year))) + geom_bar(stat="identity", width=1) + coord_polar("y", start=0) +geom_text(aes(label = paste0(round(sales_percentage), "%")), position = position_stack(vjust = 0.5)) + scale_fill_manual(values=c("#55DDE0", "#33658A", "#2F4858")) + labs(x = NULL, y = NULL, fill = NULL, title = "Sales Percentage Over Year") + theme_classic() + theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, color = "#666666"))
## remove data which total_sales are 0
total_sales_brands_by_type <- car_data %>% filter(car_total > 0 & lcv_total > 0)
## group by year and sum total_sales
total_sales_brands_by_type <- total_sales_brands_by_type %>% select(brand, year,lcv_total, car_total) %>% group_by(brand) %>% summarise(lcv_total = sum(lcv_total), car_total = sum(car_total)) %>% select (brand, lcv_total, car_total)
## reshape our dataframe which lcv_total and car_total values are different rows by brand
total_sales_brands_by_type <- reshape(total_sales_brands_by_type, varying = c("lcv_total", "car_total"), timevar="sales_type", v.names="sales_count", times = c("lcv_total", "car_total"),direction="long")
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## create flipped stack bar chart with lcv_total - car_total data by brands
ggplot(total_sales_brands_by_type, aes(x=reorder(brand,sales_count), y=sales_count/1000)) + geom_bar(aes(fill = sales_type), stat = "identity",position ="stack") + coord_flip() + geom_point() + labs(x="Brand Name",y="Sales Count K (divided by 1000",title="Car / Commercial Sales By Brands",fill="Brand")