Loading Packages
library(tidyverse)
library(dplyr)
library(ggplot2)
library(knitr)
library(RColorBrewer)
Data Processing
set.seed(503)
diamonds_test <- diamonds %>% mutate(diamond_id = row_number()) %>%
group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
diamonds_train <- anti_join(diamonds %>% mutate(diamond_id = row_number()),
diamonds_test, by = "diamond_id")
Explanatory Data Analysis
Carat
ggplot(data = diamonds, mapping = aes(x = carat)) + geom_histogram(bins = 500)

Color
ggplot(diamonds, aes(x = color, y = ..)) + geom_bar(aes(y = ..count.. / sum(..count..))) + ylab("Proportion")

Clarity
diamonds %>%
mutate(clarity = factor(clarity)) %>%
group_by(clarity) %>%
summarise(clarity_count = n())%>%
arrange(desc(clarity_count)) %>%
head(20) %>%
ggplot(., aes(x=clarity_count, y = reorder(clarity, clarity_count), fill= clarity)) +
geom_col() +
geom_text(aes(label = clarity_count), size=3, color = "black", position = position_stack(vjust = 0.95)) +
theme_minimal() +
theme(legend.position = "none") +
labs(x = "Count",
y = "Clarity")

Cut
diamonds %>%
#mutate(cut = factor(cut)) %>%
group_by(cut) %>%
summarise(count = n()) %>%
mutate(percentage = 100*count/sum(count)) %>%
ggplot(., aes(x=cut, y = count, fill = count)) +
geom_col() +
scale_fill_gradient("count", low="lightpink1", high="lightpink4") +
geom_line(aes(y = count), size = 1.2, color="black", group = 1) +
theme_minimal() +
theme(legend.position = "none") +
labs( x = "Quality of Cut",
y = "Number of Cut")

Price and Carat
ggplot(data=diamonds, aes(x=carat, y=price)) +
# get rid of top percentile
scale_x_continuous(lim=c(0,quantile(diamonds$carat,0.99))) +
scale_y_continuous(lim=c(0,quantile(diamonds$price,0.99))) +
geom_point(fill=I('#dd3333'), color= I("black"), aes(alpha=1/10),shape=21)+ xlab("Carat") + ylab("Price")

Price and Cut
ggplot(diamonds, aes(factor(cut), price, fill=cut)) + geom_boxplot() + ggtitle("Diamond Price according Cut") + xlab("Type of Cut") + ylab("Price")

Price and Color
ggplot(diamonds, aes(factor(color), price, fill=color)) + geom_boxplot() + ggtitle("Diamond Price according Color") + xlab("Type of Color") + ylab("Price")

Cut and Color
ggplot(diamonds, aes(x = color, fill = cut)) + geom_bar(position = "fill")

Cut and Clarity
ggplot(diamonds, aes(cut, clarity)) +
geom_jitter(alpha = 0.05)

Price with Cut and Carat
diamonds %>%
ggplot(mapping = aes(x = carat, y = price))+
geom_point(mapping = aes(color = cut))

PCA
diamonds_pca <- princomp(as.matrix(diamonds_train[,sapply(diamonds_train, class) == "numeric"]),cor=T)
summary(diamonds_pca,loadings=TRUE)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.979800 1.1336225 0.8255050 0.2273068 0.218071172
## Proportion of Variance 0.653268 0.2141833 0.1135764 0.0086114 0.007925839
## Cumulative Proportion 0.653268 0.8674513 0.9810277 0.9896391 0.997564967
## Comp.6
## Standard deviation 0.120872645
## Proportion of Variance 0.002435033
## Cumulative Proportion 1.000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## carat 0.496 0.638 0.442 0.385
## depth 0.734 -0.671
## table 0.122 -0.669 -0.733
## x 0.501 0.114 -0.855
## y 0.494 -0.760 0.361 0.201
## z 0.493 0.103 -0.817 0.279
plot(diamonds_pca,type="l")

Prediction
formul<-price~carat+cut+color+clarity+depth+table+x+y+z
Diamond.lm.model<-lm(formul,diamonds_train)
diamonds_test$lmprediction<-predict(Diamond.lm.model,newdata = diamonds_test)
summary(Diamond.lm.model)
##
## Call:
## lm(formula = formul, data = diamonds_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21446.0 -593.0 -182.4 378.8 10701.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6032.795 440.677 13.690 < 2e-16 ***
## carat 11306.047 55.032 205.445 < 2e-16 ***
## cut.L 587.057 25.144 23.347 < 2e-16 ***
## cut.Q -300.964 20.114 -14.963 < 2e-16 ***
## cut.C 148.352 17.317 8.567 < 2e-16 ***
## cut^4 -26.568 13.826 -1.922 0.0547 .
## color.L -1949.838 19.404 -100.484 < 2e-16 ***
## color.Q -672.931 17.640 -38.148 < 2e-16 ***
## color.C -161.554 16.458 -9.816 < 2e-16 ***
## color^4 25.973 15.117 1.718 0.0858 .
## color^5 -98.819 14.278 -6.921 4.55e-12 ***
## color^6 -59.129 12.977 -4.556 5.22e-06 ***
## clarity.L 4129.825 33.880 121.897 < 2e-16 ***
## clarity.Q -1954.978 31.602 -61.863 < 2e-16 ***
## clarity.C 997.867 27.030 36.918 < 2e-16 ***
## clarity^4 -382.021 21.572 -17.709 < 2e-16 ***
## clarity^5 242.941 17.611 13.795 < 2e-16 ***
## clarity^6 12.562 15.328 0.820 0.4125
## clarity^7 87.466 13.525 6.467 1.01e-10 ***
## depth -65.123 4.992 -13.045 < 2e-16 ***
## table -29.084 3.250 -8.950 < 2e-16 ***
## x -1020.664 34.775 -29.350 < 2e-16 ***
## y -1.438 19.396 -0.074 0.9409
## z -38.006 33.858 -1.123 0.2616
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1130 on 43119 degrees of freedom
## Multiple R-squared: 0.9202, Adjusted R-squared: 0.9202
## F-statistic: 2.162e+04 on 23 and 43119 DF, p-value: < 2.2e-16
linear.rsquared<-1 - (sum((diamonds_test$lmprediction-diamonds_test$price)^2)/sum((diamonds_test$price-mean(diamonds_test$price))^2))
linear.rsquared
## [1] 0.9180486
ggplot(diamonds_test,aes(lmprediction,price,alpha=0.6)) +
geom_point(color="orange")+
geom_abline(color="black",size=1.2)+
theme_minimal() +
theme(legend.position = "none", plot.title = element_text(vjust = 0.5))+
ggtitle("Predictions vs Actual Prices")
