Introduction

This data set contains prices and other attributes of almost 55000 diamonds. In this report, we will try to understand relationship of price and other attributes and estimate prices of diamonds using linear regression.

Variables

price: price in US dollars ($326–$18,823) carat: weight of the diamond (0.2–5.01) cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal) color: diamond color, from D (best) to J (worst) clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)) x: length in mm (0–10.74) y: width in mm (0–58.9) z: depth in mm (0–31.8) depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43–79) table: width of top of diamond relative to widest point (43–95)

Data

diamonds %>% head(5)
## # A tibble: 5 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23  Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good    E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good    J     SI2      63.3    58   335  4.34  4.35  2.75
glimpse(diamonds)
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ve…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1,…
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 6…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 5…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 3…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2…
summary(diamonds)
##      carat               cut        color        clarity          depth      
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
##                                     J: 2808   (Other): 2531                  
##      table           price             x                y         
##  Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
##  Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
##  Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
##  Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
##                                                                   
##        z         
##  Min.   : 0.000  
##  1st Qu.: 2.910  
##  Median : 3.530  
##  Mean   : 3.539  
##  3rd Qu.: 4.040  
##  Max.   :31.800  
## 

Splitting Data

set.seed(503)
diamonds_test <- diamonds %>% mutate(diamond_id = row_number()) %>% 
    group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()

diamonds_train <- anti_join(diamonds %>% mutate(diamond_id = row_number()), 
    diamonds_test, by = "diamond_id")

diamonds_train
## # A tibble: 43,143 x 11
##    carat cut       color clarity depth table price     x     y     z diamond_id
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>      <int>
##  1 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31          2
##  2 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31          3
##  3 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63          4
##  4 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75          5
##  5 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47          7
##  6 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53          8
##  7 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49          9
##  8 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39         10
##  9 0.3   Good      J     SI1      64      55   339  4.25  4.28  2.73         11
## 10 0.23  Ideal     J     VS1      62.8    56   340  3.93  3.9   2.46         12
## # … with 43,133 more rows

Creating Models

## 
## Calls:
## model: lm(formula = price ~ carat, data = diamonds_train)
## model2: lm(formula = price ~ carat + cut + color + clarity, data = diamonds_train)
## model3: lm(formula = price ~ carat + cut + color, data = diamonds_train)
## 
## =========================================================
##                   model         model2        model3     
## ---------------------------------------------------------
##   (Intercept)  -2258.787***  -3723.666***  -3155.736***  
##                  (14.609)      (15.628)      (17.622)    
##   carat         7761.940***   8896.038***   8188.654***  
##                  (15.727)      (13.437)      (15.541)    
##   cut: .L                      707.557***   1259.702***  
##                                (22.744)      (27.712)    
##   cut: .Q                     -327.631***   -536.930***  
##                                (20.037)      (24.569)    
##   cut: .C                      185.351***    379.188***  
##                                (17.398)      (21.464)    
##   cut: ^4                       -4.259        72.087***  
##                                (13.928)      (17.232)    
##   color: .L                  -1905.565***  -1568.186***  
##                                (19.821)      (24.346)    
##   color: .Q                   -627.589***   -730.584***  
##                                (18.028)      (22.244)    
##   color: .C                   -166.849***    -99.423***  
##                                (16.849)      (20.872)    
##   color: ^4                     11.152        70.941***  
##                                (15.472)      (19.165)    
##   color: ^5                    -89.472***   -140.170***  
##                                (14.617)      (18.118)    
##   color: ^6                    -59.685***   -171.967***  
##                                (13.287)      (16.434)    
##   clarity: .L                 4250.715***                
##                                (34.533)                  
##   clarity: .Q                -1858.920***                
##                                (32.277)                  
##   clarity: .C                  936.585***                
##                                (27.624)                  
##   clarity: ^4                 -379.408***                
##                                (22.085)                  
##   clarity: ^5                  223.564***                
##                                (18.013)                  
##   clarity: ^6                    8.574                   
##                                (15.691)                  
##   clarity: ^7                  106.284***                
##                                (13.841)                  
## ---------------------------------------------------------
##   R-squared        0.850         0.916         0.871     
##   N            43143         43143         43143         
## =========================================================
##   Significance: *** = p < 0.001; ** = p < 0.01;   
##                 * = p < 0.05
prediction <- predict(model2, diamonds_test)
rs2 <- cor(diamonds_test$price,prediction)^2
rs2
## [1] 0.914327