In this study, the dplyr and tidyverse packages are used for data manipulation, while the rpart, rpart.plot and rattle packages are used to construct and visualize CART models.
library(tidyverse)
library(dplyr)
library(rpart)
library(rpart.plot)
library(rattle)
# Load the raw data set; `spambase.csv` is resolved against the working directory.
spamdata <- read.csv(file = "spambase.csv")

# Fix the RNG seed so the random train/test assignment below is reproducible.
set.seed(58)

# Prepare the modelling table:
#   * drop rows that contain missing values,
#   * randomly flag roughly 25% of the remaining rows as "test", the rest "train",
#   * recode the numeric `class` column (1 means spam) into the factor `spclass`,
#   * drop the original `class` column.
spamdata <- spamdata %>%
  filter(complete.cases(.)) %>%
  mutate(train_test = ifelse(runif(n()) < 0.25, "test", "train")) %>%
  # Read `class` from the piped (already filtered) data. Referring to
  # `spamdata$class` here would pick up the unfiltered global object, whose
  # length no longer matches once any incomplete row has been dropped.
  mutate(
    spclass = factor(
      ifelse(.data$class == 1, "Spam", "Not Spam"),
      # Explicit level order: "Not Spam" is the reference level, "Spam" the
      # second level, regardless of which labels occur in the data.
      levels = c("Not Spam", "Spam")
    )
  ) %>%
  # `tbl_df()` is deprecated; `as_tibble()` is its replacement.
  as_tibble() %>%
  select(-class)
A classification tree is created with rpart (using method = "class") and visualized with fancyRpartPlot. Classification and regression trees (CART) are types of decision trees.
Looking at the tree below, we can say that:
These are just some of the possible interpretations; others can be made as well.
# Training partition: rows flagged "train", with the flag column removed so it
# is not picked up as a predictor by the `spclass ~ .` formula.
spam_train <- spamdata %>%
  filter(train_test == "train") %>%
  select(-train_test)

# Fit a classification tree on every remaining column and draw it.
fit <- rpart(
  spclass ~ .,
  method = "class",
  data = spam_train
)
fancyRpartPlot(fit)

# Predictions of the fitted tree on the data it was trained on;
# show the first few rows.
spam_in_sample <- predict(fit)
spam_in_sample %>%
  head() %>%
  print()
## Not Spam Spam
## 1 0.11377246 0.8862275
## 2 0.06565657 0.9343434
## 3 0.09917355 0.9008264
## 4 0.09917355 0.9008264
## 5 0.89661930 0.1033807
## 6 0.89661930 0.1033807
# In-sample accuracy of the tree: turn the predicted spam probability into a
# 0/1 label with a 0.5 cut-off, put it next to the true 0/1 label, and count
# how many training rows were classified correctly (TRUE) or not (FALSE).
# `as_tibble()` replaces the deprecated `tbl_df()`.
in_sample_prediction <-
  cbind(
    spam_in_sample %>%
      as_tibble() %>%
      transmute(spam_predict = ifelse(Spam >= 0.5, 1, 0)),
    spam_train %>%
      transmute(spam_actual = ifelse(spclass == "Spam", 1, 0))
  ) %>%
  as_tibble() %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  # `.` is the whole piped table, so `nrow(.)` is the total row count and
  # `percentage` is each group's share of all rows.
  summarise(count = n(), percentage = n() / nrow(.))
print(in_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 335 0.0977
## 2 TRUE 3094 0.902
# Hold-out partition: rows flagged "test", with the flag column removed.
spam_test <- spamdata %>%
  filter(train_test == "test") %>%
  select(-train_test)

# Apply the fitted tree to the hold-out rows and show the first few predictions.
spam_predict <- predict(fit, newdata = spam_test)
spam_predict %>%
  head() %>%
  print()
## Not Spam Spam
## 1 0.06565657 0.9343434
## 2 0.89661930 0.1033807
## 3 0.09917355 0.9008264
## 4 0.06565657 0.9343434
## 5 0.06565657 0.9343434
## 6 0.11377246 0.8862275
# Out-of-sample accuracy of the tree: same procedure as for the training rows,
# applied to the hold-out predictions. The predicted spam probability is cut at
# 0.5, compared with the true 0/1 label, and hits / misses are tabulated.
# `as_tibble()` replaces the deprecated `tbl_df()`.
out_of_sample_prediction <-
  cbind(
    spam_predict %>%
      as_tibble() %>%
      transmute(spam_predict = ifelse(Spam >= 0.5, 1, 0)),
    spam_test %>%
      as_tibble() %>%
      transmute(spam_actual = ifelse(spclass == "Spam", 1, 0))
  ) %>%
  as_tibble() %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  # `.` is the whole piped table, so `nrow(.)` is the total row count and
  # `percentage` is each group's share of all rows.
  summarise(count = n(), percentage = n() / nrow(.))
print(out_of_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 113 0.0964
## 2 TRUE 1059 0.904
Logistic regression predicts a binary outcome: True or False or, in our case, Spam or Not Spam.
Some notes about Logit and Probit:
For further details about Logit and Probit
# Two binomial GLMs on the training rows, using all remaining columns as
# predictors. They differ only in the link function.
spam_logit_model <- glm(
  spclass ~ .,
  data = spam_train,
  family = binomial(link = "logit")
)
spam_probit_model <- glm(
  spclass ~ .,
  data = spam_train,
  family = binomial(link = "probit")
)
# Logit model, training rows: fitted values on the response scale.
spam_logit_in_sample <- predict(spam_logit_model, type = "response")

# Cut the fitted values at 0.5, compare with the true 0/1 label, and tabulate
# the number and share of correct (TRUE) and incorrect (FALSE) classifications.
spam_logit_in_sample_prediction <-
  data.frame(
    in_sample = 1 * (spam_logit_in_sample >= 0.5),
    actual = 1 * (spam_train$spclass == "Spam")
  ) %>%
  mutate(correct_class = in_sample == actual) %>%
  group_by(correct_class) %>%
  summarise(
    count = n(),
    percentage = n() / nrow(.)
  )

print(spam_logit_in_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 249 0.0726
## 2 TRUE 3180 0.927
# Logit model, hold-out rows: predicted values on the response scale.
spam_logit_out_of_sample <- predict(
  spam_logit_model,
  newdata = spam_test,
  type = "response"
)

# Cut the predictions at 0.5, compare with the true 0/1 label, and tabulate
# the number and share of correct (TRUE) and incorrect (FALSE) classifications.
spam_logit_out_of_sample_prediction <-
  data.frame(
    out_of_sample = 1 * (spam_logit_out_of_sample >= 0.5),
    actual = 1 * (spam_test$spclass == "Spam")
  ) %>%
  mutate(correct_class = out_of_sample == actual) %>%
  group_by(correct_class) %>%
  summarise(
    count = n(),
    percentage = n() / nrow(.)
  )

print(spam_logit_out_of_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 81 0.0691
## 2 TRUE 1091 0.931
# Probit model, training rows: fitted values on the response scale.
spam_probit_in_sample <- predict(spam_probit_model, type = "response")

# Cut the fitted values at 0.5, compare with the true 0/1 label, and tabulate
# the number and share of correct (TRUE) and incorrect (FALSE) classifications.
spam_probit_in_sample_prediction <-
  data.frame(
    in_sample = 1 * (spam_probit_in_sample >= 0.5),
    actual = 1 * (spam_train$spclass == "Spam")
  ) %>%
  mutate(correct_class = in_sample == actual) %>%
  group_by(correct_class) %>%
  summarise(
    count = n(),
    percentage = n() / nrow(.)
  )

print(spam_probit_in_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 400 0.117
## 2 TRUE 3029 0.883
# Probit model, hold-out rows: predicted values on the response scale.
spam_probit_out_of_sample <- predict(
  spam_probit_model,
  newdata = spam_test,
  type = "response"
)

# Cut the predictions at 0.5, compare with the true 0/1 label, and tabulate
# the number and share of correct (TRUE) and incorrect (FALSE) classifications.
spam_probit_out_of_sample_prediction <-
  data.frame(
    out_of_sample = 1 * (spam_probit_out_of_sample >= 0.5),
    actual = 1 * (spam_test$spclass == "Spam")
  ) %>%
  mutate(correct_class = out_of_sample == actual) %>%
  group_by(correct_class) %>%
  summarise(
    count = n(),
    percentage = n() / nrow(.)
  )

print(spam_probit_out_of_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 128 0.109
## 2 TRUE 1044 0.891
Looking at the accuracy ratios below, we can conclude that:
# Pull the share of correctly classified rows out of one accuracy summary
# (a table with a logical `correct_class` column and a `percentage` column),
# rounded to four decimals.
accuracy_of <- function(prediction_summary) {
  is_hit <- prediction_summary$correct_class %in% TRUE
  hit_share <- prediction_summary$percentage[is_hit]
  # A summary without a TRUE row means nothing was classified correctly.
  # Report 0 instead of a zero-length value, which would break `data.frame()`.
  if (length(hit_share) == 0) {
    return(0)
  }
  round(hit_share, 4)
}

# Side-by-side comparison of the three models: accuracy on the training rows
# and on the hold-out rows.
complete_benchmark <- data.frame(
  model = c(
    "CART",
    "Logistic Reg. - Logit Link",
    "Logistic Reg. - Probit Link"
  ),
  in_sample_accuracy = c(
    accuracy_of(in_sample_prediction),
    accuracy_of(spam_logit_in_sample_prediction),
    accuracy_of(spam_probit_in_sample_prediction)
  ),
  out_of_sample_accuracy = c(
    accuracy_of(out_of_sample_prediction),
    accuracy_of(spam_logit_out_of_sample_prediction),
    accuracy_of(spam_probit_out_of_sample_prediction)
  )
)
print(complete_benchmark)
## model in_sample_accuracy out_of_sample_accuracy
## 1 CART 0.9023 0.9036
## 2 Logistic Reg. - Logit Link 0.9274 0.9309
## 3 Logistic Reg. - Probit Link 0.8833 0.8908
You may click here to reach other items of my progress journal.