The diamond data set is a data set that shows the price of diamonds by classifying them according to certain numerical and categorical variables. You can find variables as below:
library(tidyverse)
library(ggplot2)
library(corrplot)
library(RColorBrewer)
library(caret)
library(rpart)
library(rpart.plot)
library(GGally)
library(gridExtra)
library(magrittr)
diamonds
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # ... with 53,930 more rows
str(diamonds)
## tibble [53,940 x 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
ggplot(aes(x=carat, y=price), data=diamonds) +
price_carat <- geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
stat_smooth(method="lm") +
scale_x_continuous(lim = c(0, quantile(diamonds$carat, 0.99)) ) +
scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
ggtitle("price vs. carat") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(aes(x=depth, y=price), data=diamonds) +
price_depth <- geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
stat_smooth(method="lm") +
scale_x_continuous(lim = c(0, quantile(diamonds$depth, 0.99)) ) +
scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
ggtitle("price vs. depth") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(aes(x=table, y=price), data=diamonds) +
price_table <- geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
stat_smooth(method="lm") +
scale_x_continuous(lim = c(0, quantile(diamonds$table, 0.99)) ) +
scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
ggtitle("price vs. table") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(aes(x=x, y=price), data=diamonds) +
price_x <- geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
stat_smooth(method="lm") +
scale_x_continuous(lim = c(0, quantile(diamonds$x, 0.99)) ) +
scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
ggtitle("price vs. x") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(aes(x=y, y=price), data=diamonds) +
price_y <- geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
stat_smooth(method="lm") +
scale_x_continuous(lim = c(0, quantile(diamonds$y, 0.99)) ) +
scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
ggtitle("price vs. y") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(aes(x=z, y=price), data=diamonds) +
price_z <- geom_point(fill=I("#f77a20"), color=I("black"), shape=21) +
stat_smooth(method="lm") +
scale_x_continuous(lim = c(0, quantile(diamonds$z, 0.99)) ) +
scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99)) ) +
ggtitle("price vs. z") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(arrangeGrob(price_carat,
price_depth, price_table, ncol=2, nrow=2,
layout_matrix=rbind(c(1,1), c(2,3))),
arrangeGrob(price_x, price_y, price_z,
ncol=1, nrow=3), ncol=2)
ggpairs(diamonds[, c("price", "carat", "depth", "table", "x", "y", "z")],
upper =list(continuous="cor"), title = "Correlations") + theme(plot.title = element_text(hjust = 0.5))
It is seen from the graphs that there is a very high correlation between price and carat & y variables. It is clear that the explanatory power of carat and y variables on the price will be high in the price prediction models to be established.
Training and test sets were created, with 20% of the main data set being test and 80% being training set.
set.seed(503)
diamonds %>% mutate(diamond_id = row_number()) %>%
diamonds_test <- group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
anti_join(diamonds %>% mutate(diamond_id = row_number()),
diamonds_train <-by = "diamond_id") diamonds_test,
rpart(price~carat+y+x+z+cut+color+clarity, data=diamonds_train, method="anova")
model<-rpart.plot(model, type=3, digits=3, fallen.leaves = TRUE)
predict(model, diamonds_test)
pred<-head(pred)
## 1 2 3 4 5 6
## 10940.832 1050.302 5401.156 5401.156 3060.143 3060.143
function(diamonds_test, pred) {mean(abs(diamonds_test - pred))}
Mae<-Mae(diamonds_test$price, pred)
## [1] 890.0513
rpart(price~carat+y+x+z+cut+color+clarity, data=diamonds_train, method="class") model2<-
predict(model2, diamonds_test, type="class")
pred2<-head(pred2)
## [1] 605 605 605 605 605 605
## 10678 Levels: 326 327 334 335 336 337 338 339 340 342 344 345 348 351 353 ... 18818
predict(model2, diamonds_test)
pred_roc<-
function(diamonds_test, pred_roc) {mean(abs(diamonds_test - pred_roc))}
Mae<-Mae(diamonds_test$price, pred_roc)
## [1] 3908.804
lm(price~carat+y+x+z+cut+color+clarity, data= diamonds_train)
linreg=summary(linreg)
##
## Call:
## lm(formula = price ~ carat + y + x + z + cut + color + clarity,
## data = diamonds_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21188.9 -597.0 -180.3 382.8 10750.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91.532 89.060 1.028 0.3041
## carat 11168.672 53.913 207.163 < 2e-16 ***
## y 18.262 19.372 0.943 0.3458
## x -892.562 32.549 -27.422 < 2e-16 ***
## z -187.957 30.647 -6.133 8.70e-10 ***
## cut.L 737.527 22.514 32.759 < 2e-16 ***
## cut.Q -336.202 19.723 -17.046 < 2e-16 ***
## cut.C 177.820 17.055 10.426 < 2e-16 ***
## cut^4 -14.575 13.668 -1.066 0.2863
## color.L -1956.031 19.439 -100.623 < 2e-16 ***
## color.Q -672.484 17.676 -38.045 < 2e-16 ***
## color.C -159.135 16.493 -9.649 < 2e-16 ***
## color^4 25.484 15.147 1.682 0.0925 .
## color^5 -100.358 14.309 -7.014 2.35e-12 ***
## color^6 -58.822 13.006 -4.523 6.12e-06 ***
## clarity.L 4161.591 33.866 122.884 < 2e-16 ***
## clarity.Q -1954.488 31.670 -61.713 < 2e-16 ***
## clarity.C 1004.038 27.085 37.070 < 2e-16 ***
## clarity^4 -384.675 21.617 -17.795 < 2e-16 ***
## clarity^5 249.465 17.643 14.140 < 2e-16 ***
## clarity^6 8.907 15.358 0.580 0.5619
## clarity^7 88.666 13.554 6.542 6.15e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1132 on 43121 degrees of freedom
## Multiple R-squared: 0.9199, Adjusted R-squared: 0.9198
## F-statistic: 2.357e+04 on 21 and 43121 DF, p-value: < 2.2e-16
linreg
##
## Call:
## lm(formula = price ~ carat + y + x + z + cut + color + clarity,
## data = diamonds_train)
##
## Coefficients:
## (Intercept) carat y x z cut.L
## 91.532 11168.672 18.262 -892.562 -187.957 737.527
## cut.Q cut.C cut^4 color.L color.Q color.C
## -336.202 177.820 -14.575 -1956.031 -672.484 -159.135
## color^4 color^5 color^6 clarity.L clarity.Q clarity.C
## 25.484 -100.358 -58.822 4161.591 -1954.488 1004.038
## clarity^4 clarity^5 clarity^6 clarity^7
## -384.675 249.465 8.907 88.666
predict(linreg, diamonds_test)
pred_linreg<-head(pred_linreg)
## 1 2 3 4 5 6
## 7887.8206 340.8368 4182.1728 4541.8600 1409.4117 3133.8959
function(diamonds_test, pred_linreg) {mean(abs(diamonds_test - pred_linreg))}
Mae_linreg<-Mae_linreg(diamonds_test$price, pred_linreg)
## [1] 745.1833
When CART and linear regression models are compared, it is seen that the model with the smallest mae value is the model established by linear regression. The fact that the R-squared and adjusted R-squared values(0.92) are quite high in the linear regression model shows that the variables in the model explain the changes in price at a high rate. Additionally, the fact that the p-value (2.2e-16) is significantly lower than 0.05 supports the significance of the model. Based on all these results, the most suitable model among these three models to estimate diamond prices is the linear regression model.