1. Introduction

1.1 Diamonds Data Set

The diamond data set is a data set that shows the price of diamonds by classifying them according to certain numerical and categorical variables. You can find variables as below:

  • carat: weight of the diamond
  • cut: quality of the cut
  • color: diamond color
  • clarity: measurement of how clear the diamond is
  • depth: total depth percentage
  • table: width of top of diamond relative to widest point
  • price: price in US dollars
  • x: length in mm
  • y: width in mm
  • z: depth in mm

1.2 Install Packages

library(tidyverse)
library(ggplot2)
library(corrplot)
library(RColorBrewer)
library(caret) 
library(rpart)
library(rpart.plot)
library(GGally)
library(gridExtra)
library(magrittr) 

1.3 Diamonds Data Set

diamonds
## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # ... with 53,930 more rows
str(diamonds)
## tibble [53,940 x 10] (S3: tbl_df/tbl/data.frame)
##  $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

2. Correlation Analysis

2.1 Price & Carat

price_carat <- ggplot(aes(x=carat, y=price), data=diamonds) +
  geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
  stat_smooth(method="lm") +
  scale_x_continuous(lim = c(0, quantile(diamonds$carat, 0.99)) ) +
  scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
  ggtitle("price vs. carat") +
  theme(plot.title = element_text(hjust = 0.5))

2.2 Price & Depth

price_depth <- ggplot(aes(x=depth, y=price), data=diamonds) +
  geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
  stat_smooth(method="lm") +
  scale_x_continuous(lim = c(0, quantile(diamonds$depth, 0.99)) ) +
  scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
  ggtitle("price vs. depth") +
  theme(plot.title = element_text(hjust = 0.5))

2.3 Price & Table

price_table <- ggplot(aes(x=table, y=price), data=diamonds) +
  geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
  stat_smooth(method="lm") +
  scale_x_continuous(lim = c(0, quantile(diamonds$table, 0.99)) ) +
  scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
  ggtitle("price vs. table") +
  theme(plot.title = element_text(hjust = 0.5))

2.4 Price & X

price_x <- ggplot(aes(x=x, y=price), data=diamonds) +
  geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
  stat_smooth(method="lm") +
  scale_x_continuous(lim = c(0, quantile(diamonds$x, 0.99)) ) +
  scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
  ggtitle("price vs. x") +
  theme(plot.title = element_text(hjust = 0.5))

2.5 Price & Y

price_y <- ggplot(aes(x=y, y=price), data=diamonds) +
  geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
  stat_smooth(method="lm") +
  scale_x_continuous(lim = c(0, quantile(diamonds$y, 0.99)) ) +
  scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
  ggtitle("price vs. y") +
  theme(plot.title = element_text(hjust = 0.5))

2.6 Price & Z

price_z <- ggplot(aes(x=z, y=price), data=diamonds) +
  geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
  stat_smooth(method="lm") +
  scale_x_continuous(lim = c(0, quantile(diamonds$z, 0.99)) ) +
  scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
  ggtitle("price vs. z") +
  theme(plot.title = element_text(hjust = 0.5))

2.7 Pairwise Scatter Plots

grid.arrange(arrangeGrob(price_carat, 
                         price_depth, price_table, 
                         ncol=2, nrow=2, 
                         layout_matrix=rbind(c(1,1), c(2,3))),
             arrangeGrob(price_x, price_y, price_z, 
                         ncol=1, nrow=3), ncol=2)

2.8 Correlation Plots and Coefficients

ggpairs(diamonds[, c("price", "carat", "depth", "table", "x", "y", "z")], 
        upper =list(continuous="cor"), title = "Correlations") + theme(plot.title = element_text(hjust = 0.5))

It is seen from the graphs that there is a very high correlation between price and carat & y variables. It is clear that the explanatory power of carat and y variables on the price will be high in the price prediction models to be established.

3. Estimation Models

3.1 Test and Train Sets

Training and test sets were created, with 20% of the main data set being test and 80% being training set.

set.seed(503)
diamonds_test <- diamonds %>% mutate(diamond_id = row_number()) %>% 
  group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()

diamonds_train <- anti_join(diamonds %>% mutate(diamond_id = row_number()), 
                            diamonds_test, by = "diamond_id")

3.2 Model Building and Estimation

3.2.1 Classification and Regression Trees (CART)

3.2.1.1 Anova Model

model<- rpart(price~carat+y+x+z+cut+color+clarity, data=diamonds_train,  method="anova")
rpart.plot(model, type=3, digits=3, fallen.leaves = TRUE)

pred<- predict(model, diamonds_test)
head(pred)
##         1         2         3         4         5         6 
## 10940.832  1050.302  5401.156  5401.156  3060.143  3060.143
Mae<- function(diamonds_test, pred) {mean(abs(diamonds_test - pred))}
Mae(diamonds_test$price, pred) 
## [1] 890.0513

3.2.1.2 Class Model

model2<- rpart(price~carat+y+x+z+cut+color+clarity, data=diamonds_train,  method="class")
pred2<- predict(model2, diamonds_test, type="class")
head(pred2)
## [1] 605 605 605 605 605 605
## 10678 Levels: 326 327 334 335 336 337 338 339 340 342 344 345 348 351 353 ... 18818
pred_roc<- predict(model2, diamonds_test)

Mae<- function(diamonds_test, pred_roc) {mean(abs(diamonds_test - pred_roc))}
Mae(diamonds_test$price, pred_roc) 
## [1] 3908.804

3.2.2 Linear Regression

linreg= lm(price~carat+y+x+z+cut+color+clarity, data= diamonds_train)
summary(linreg)
## 
## Call:
## lm(formula = price ~ carat + y + x + z + cut + color + clarity, 
##     data = diamonds_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21188.9   -597.0   -180.3    382.8  10750.6 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)    91.532     89.060    1.028   0.3041    
## carat       11168.672     53.913  207.163  < 2e-16 ***
## y              18.262     19.372    0.943   0.3458    
## x            -892.562     32.549  -27.422  < 2e-16 ***
## z            -187.957     30.647   -6.133 8.70e-10 ***
## cut.L         737.527     22.514   32.759  < 2e-16 ***
## cut.Q        -336.202     19.723  -17.046  < 2e-16 ***
## cut.C         177.820     17.055   10.426  < 2e-16 ***
## cut^4         -14.575     13.668   -1.066   0.2863    
## color.L     -1956.031     19.439 -100.623  < 2e-16 ***
## color.Q      -672.484     17.676  -38.045  < 2e-16 ***
## color.C      -159.135     16.493   -9.649  < 2e-16 ***
## color^4        25.484     15.147    1.682   0.0925 .  
## color^5      -100.358     14.309   -7.014 2.35e-12 ***
## color^6       -58.822     13.006   -4.523 6.12e-06 ***
## clarity.L    4161.591     33.866  122.884  < 2e-16 ***
## clarity.Q   -1954.488     31.670  -61.713  < 2e-16 ***
## clarity.C    1004.038     27.085   37.070  < 2e-16 ***
## clarity^4    -384.675     21.617  -17.795  < 2e-16 ***
## clarity^5     249.465     17.643   14.140  < 2e-16 ***
## clarity^6       8.907     15.358    0.580   0.5619    
## clarity^7      88.666     13.554    6.542 6.15e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1132 on 43121 degrees of freedom
## Multiple R-squared:  0.9199, Adjusted R-squared:  0.9198 
## F-statistic: 2.357e+04 on 21 and 43121 DF,  p-value: < 2.2e-16
linreg
## 
## Call:
## lm(formula = price ~ carat + y + x + z + cut + color + clarity, 
##     data = diamonds_train)
## 
## Coefficients:
## (Intercept)        carat            y            x            z        cut.L  
##      91.532    11168.672       18.262     -892.562     -187.957      737.527  
##       cut.Q        cut.C        cut^4      color.L      color.Q      color.C  
##    -336.202      177.820      -14.575    -1956.031     -672.484     -159.135  
##     color^4      color^5      color^6    clarity.L    clarity.Q    clarity.C  
##      25.484     -100.358      -58.822     4161.591    -1954.488     1004.038  
##   clarity^4    clarity^5    clarity^6    clarity^7  
##    -384.675      249.465        8.907       88.666
pred_linreg<- predict(linreg, diamonds_test)
head(pred_linreg)
##         1         2         3         4         5         6 
## 7887.8206  340.8368 4182.1728 4541.8600 1409.4117 3133.8959
Mae_linreg<- function(diamonds_test, pred_linreg) {mean(abs(diamonds_test - pred_linreg))}
Mae_linreg(diamonds_test$price, pred_linreg)  
## [1] 745.1833

4. Comparison and Conclusion

When CART and linear regression models are compared, it is seen that the model with the smallest mae value is the model established by linear regression. The fact that the R-squared and adjusted R-squared values(0.92) are quite high in the linear regression model shows that the variables in the model explain the changes in price at a high rate. Additionally, the fact that the p-value (2.2e-16) is significantly lower than 0.05 supports the significance of the model. Based on all these results, the most suitable model among these three models to estimate diamond prices is the linear regression model.