1 Loading Packages

library(tidyverse)
library(dplyr)
library(ggplot2)
library(knitr) 
library(RColorBrewer)

2 Data Processing

set.seed(503)
diamonds_test <- diamonds %>% mutate(diamond_id = row_number()) %>% 
  group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
diamonds_train <- anti_join(diamonds %>% mutate(diamond_id = row_number()), 
                            diamonds_test, by = "diamond_id")

3 Explanatory Data Analysis

3.1 Carat

 ggplot(data = diamonds, mapping = aes(x = carat)) + geom_histogram(bins = 500)

3.2 Color

 ggplot(diamonds, aes(x = color, y = ..)) + geom_bar(aes(y = ..count.. / sum(..count..))) + ylab("Proportion")

3.3 Clarity

diamonds %>%
  mutate(clarity = factor(clarity)) %>%
  group_by(clarity) %>%
  summarise(clarity_count = n())%>%
  arrange(desc(clarity_count)) %>%
  head(20) %>%
  
  ggplot(., aes(x=clarity_count, y = reorder(clarity, clarity_count), fill= clarity)) +
  geom_col() +
  geom_text(aes(label = clarity_count), size=3, color = "black", position = position_stack(vjust = 0.95)) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(x = "Count",
         y = "Clarity")

3.4 Cut

diamonds %>%
  #mutate(cut = factor(cut)) %>%
  group_by(cut) %>%
  summarise(count = n()) %>%
  mutate(percentage = 100*count/sum(count)) %>%
ggplot(., aes(x=cut, y = count, fill = count)) +
  geom_col() +
  scale_fill_gradient("count", low="lightpink1", high="lightpink4") +
  geom_line(aes(y = count), size = 1.2, color="black", group = 1) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs( x = "Quality of Cut",
       y = "Number of Cut")

3.5 Price and Carat

ggplot(data=diamonds, aes(x=carat, y=price)) +
  # get rid of top percentile
  scale_x_continuous(lim=c(0,quantile(diamonds$carat,0.99))) +
  scale_y_continuous(lim=c(0,quantile(diamonds$price,0.99))) +
  geom_point(fill=I('#dd3333'), color= I("black"), aes(alpha=1/10),shape=21)+ xlab("Carat") + ylab("Price")

3.6 Price and Cut

ggplot(diamonds, aes(factor(cut), price, fill=cut)) + geom_boxplot() + ggtitle("Diamond Price according Cut") + xlab("Type of Cut") + ylab("Price")

3.7 Price and Color

ggplot(diamonds, aes(factor(color), price, fill=color)) + geom_boxplot() + ggtitle("Diamond Price according Color") + xlab("Type of Color") + ylab("Price")

3.8 Cut and Color

ggplot(diamonds, aes(x = color, fill = cut)) + geom_bar(position = "fill")

3.9 Cut and Clarity

ggplot(diamonds, aes(cut, clarity)) + 
  geom_jitter(alpha = 0.05)

3.10 Price with Cut and Carat

diamonds %>%
  ggplot(mapping = aes(x = carat, y = price))+
  geom_point(mapping = aes(color = cut))

4 PCA

diamonds_pca <- princomp(as.matrix(diamonds_train[,sapply(diamonds_train, class) == "numeric"]),cor=T)
summary(diamonds_pca,loadings=TRUE)
## Importance of components:
##                          Comp.1    Comp.2    Comp.3    Comp.4      Comp.5
## Standard deviation     1.979800 1.1336225 0.8255050 0.2273068 0.218071172
## Proportion of Variance 0.653268 0.2141833 0.1135764 0.0086114 0.007925839
## Cumulative Proportion  0.653268 0.8674513 0.9810277 0.9896391 0.997564967
##                             Comp.6
## Standard deviation     0.120872645
## Proportion of Variance 0.002435033
## Cumulative Proportion  1.000000000
## 
## Loadings:
##       Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## carat  0.496                0.638  0.442  0.385
## depth         0.734 -0.671                     
## table  0.122 -0.669 -0.733                     
## x      0.501                0.114        -0.855
## y      0.494               -0.760  0.361  0.201
## z      0.493  0.103               -0.817  0.279
plot(diamonds_pca,type="l")

5 Prediction

formul<-price~carat+cut+color+clarity+depth+table+x+y+z

Diamond.lm.model<-lm(formul,diamonds_train)
diamonds_test$lmprediction<-predict(Diamond.lm.model,newdata = diamonds_test)
summary(Diamond.lm.model)
## 
## Call:
## lm(formula = formul, data = diamonds_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21446.0   -593.0   -182.4    378.8  10701.8 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  6032.795    440.677   13.690  < 2e-16 ***
## carat       11306.047     55.032  205.445  < 2e-16 ***
## cut.L         587.057     25.144   23.347  < 2e-16 ***
## cut.Q        -300.964     20.114  -14.963  < 2e-16 ***
## cut.C         148.352     17.317    8.567  < 2e-16 ***
## cut^4         -26.568     13.826   -1.922   0.0547 .  
## color.L     -1949.838     19.404 -100.484  < 2e-16 ***
## color.Q      -672.931     17.640  -38.148  < 2e-16 ***
## color.C      -161.554     16.458   -9.816  < 2e-16 ***
## color^4        25.973     15.117    1.718   0.0858 .  
## color^5       -98.819     14.278   -6.921 4.55e-12 ***
## color^6       -59.129     12.977   -4.556 5.22e-06 ***
## clarity.L    4129.825     33.880  121.897  < 2e-16 ***
## clarity.Q   -1954.978     31.602  -61.863  < 2e-16 ***
## clarity.C     997.867     27.030   36.918  < 2e-16 ***
## clarity^4    -382.021     21.572  -17.709  < 2e-16 ***
## clarity^5     242.941     17.611   13.795  < 2e-16 ***
## clarity^6      12.562     15.328    0.820   0.4125    
## clarity^7      87.466     13.525    6.467 1.01e-10 ***
## depth         -65.123      4.992  -13.045  < 2e-16 ***
## table         -29.084      3.250   -8.950  < 2e-16 ***
## x           -1020.664     34.775  -29.350  < 2e-16 ***
## y              -1.438     19.396   -0.074   0.9409    
## z             -38.006     33.858   -1.123   0.2616    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1130 on 43119 degrees of freedom
## Multiple R-squared:  0.9202, Adjusted R-squared:  0.9202 
## F-statistic: 2.162e+04 on 23 and 43119 DF,  p-value: < 2.2e-16
linear.rsquared<-1 - (sum((diamonds_test$lmprediction-diamonds_test$price)^2)/sum((diamonds_test$price-mean(diamonds_test$price))^2))
linear.rsquared
## [1] 0.9180486
ggplot(diamonds_test,aes(lmprediction,price,alpha=0.6)) +
  geom_point(color="orange")+
  geom_abline(color="black",size=1.2)+
  theme_minimal() +
  theme(legend.position = "none", plot.title = element_text(vjust = 0.5))+
  ggtitle("Predictions vs Actual Prices")