Importing Necessary Packages and Dataset

In this study, the dplyr and tidyverse packages are used for data manipulation, while the rpart, rpart.plot and rattle packages are used to construct and visualize CART models.

library(tidyverse) 
library(dplyr)
library(rpart)
library(rpart.plot)
library(rattle)

# Load the raw data set from the CSV file in the working directory.
spamdata <- read.csv("spambase.csv")

Pre-processing and Making Some Adjustments on Dataset

# Fix the RNG seed so the random train/test assignment below is reproducible.
set.seed(58)

# Clean the raw data and derive the modelling columns:
#   * drop every row that contains a missing value;
#   * tag roughly a quarter of the remaining rows as "test", the others as
#     "train";
#   * recode the numeric `class` column (1 = spam) into a labelled factor,
#     `spclass`, and drop the original column.
# `class` is read from the data flowing through the pipe (via `.data`) rather
# than from the global `spamdata`, so the labels stay aligned with the rows
# that survive the complete-case filter.
spamdata <- spamdata %>%
  filter(complete.cases(.)) %>%
  mutate(train_test = ifelse(runif(nrow(.)) < 0.25, "test", "train")) %>%
  mutate(
    spclass = factor(ifelse(.data$class == 1, "Spam", "Not Spam"))
  ) %>%
  as_tibble() %>%
  select(-class)

Classification and Regression Trees (CART)

A classification tree is fitted with rpart (method = "class") and visualized with fancyRpartPlot from the rattle package. Classification trees are a type of decision tree.

Looking at the tree below, we can say that:

These are just some of the possible interpretations; others can be made as well.

# Training partition: rows flagged "train", without the helper split column so
# that it is not picked up as a predictor by the `spclass ~ .` formula.
spam_train <- spamdata %>%
  filter(train_test == "train") %>%
  select(-train_test)

# Fit a classification tree (method = "class") of spclass on all other columns.
fit <- rpart(spclass ~ ., data = spam_train, method = "class")

# Draw the fitted tree.
fancyRpartPlot(fit)

In Sample Prediction

  • In-sample prediction means fitting a model on our training set and then testing it on that same training set.
  • The tables below show some of our model's predictions and its accuracy, respectively.
# Calling predict() without newdata scores the rows the tree was fitted on;
# the result holds one class-probability column per label.
spam_in_sample <- predict(fit)
spam_in_sample %>%
  head() %>%
  print()
##     Not Spam      Spam
## 1 0.11377246 0.8862275
## 2 0.06565657 0.9343434
## 3 0.09917355 0.9008264
## 4 0.09917355 0.9008264
## 5 0.89661930 0.1033807
## 6 0.89661930 0.1033807
# In-sample accuracy of the tree.
# The probability matrix is converted with as_tibble() (tbl_df() is
# deprecated), the "Spam" probability is thresholded at 0.5, and the result is
# compared with the true training labels. Both indicators are coded 1 = spam,
# 0 = not spam. The summary has one row per outcome (correct / incorrect) with
# its row count and its share of all scored rows.
in_sample_prediction <-
  spam_in_sample %>%
  as_tibble() %>%
  transmute(
    spam_predict = ifelse(Spam >= 0.5, 1, 0),
    spam_actual = ifelse(spam_train$spclass == "Spam", 1, 0)
  ) %>%
  mutate(correct_class = spam_predict == spam_actual) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count))

print(in_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           335     0.0977
## 2 TRUE           3094     0.902

Out of Sample Prediction

  • Out-of-sample prediction means fitting a model on our training set and then testing it on our test set.
  • The tables below show some of our model's predictions and its accuracy, respectively.
# Held-out partition: rows flagged "test", minus the helper split column.
spam_test <- spamdata %>%
  filter(train_test == "test") %>%
  select(-train_test)

# Score the held-out rows with the tree fitted on the training data.
spam_predict <- predict(fit, newdata = spam_test)
spam_predict %>%
  head() %>%
  print()
##     Not Spam      Spam
## 1 0.06565657 0.9343434
## 2 0.89661930 0.1033807
## 3 0.09917355 0.9008264
## 4 0.06565657 0.9343434
## 5 0.06565657 0.9343434
## 6 0.11377246 0.8862275
# Out-of-sample accuracy of the tree.
# The probability matrix is converted with as_tibble() (tbl_df() is
# deprecated), the "Spam" probability is thresholded at 0.5, and the result is
# compared with the true test labels. Both indicators are coded 1 = spam,
# 0 = not spam. The summary has one row per outcome (correct / incorrect) with
# its row count and its share of all scored rows.
out_of_sample_prediction <-
  spam_predict %>%
  as_tibble() %>%
  transmute(
    spam_predict = ifelse(Spam >= 0.5, 1, 0),
    spam_actual = ifelse(spam_test$spclass == "Spam", 1, 0)
  ) %>%
  mutate(correct_class = spam_predict == spam_actual) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count))

print(out_of_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           113     0.0964
## 2 TRUE           1059     0.904
  • In short, our model's accuracy is 90.2% and 90.4% for ‘in-sample prediction’ and ‘out-of-sample prediction’, respectively.

Logistic Regression

For further details about Logit and Probit

# Two binomial GLMs of spclass on every other column of the training data;
# they differ only in the link function.
# NOTE(review): glm treats the first factor level as "failure", so the fitted
# probabilities are for "Spam" only if "Not Spam" is the first level of
# spclass -- confirm against the preprocessing step.
spam_logit_model <- glm(
  spclass ~ .,
  family = binomial(link = "logit"),
  data = spam_train
)
spam_probit_model <- glm(
  spclass ~ .,
  family = binomial(link = "probit"),
  data = spam_train
)

Logit - In Sample

# Fitted probabilities of the logit model on the training rows
# (type = "response" returns probabilities instead of link-scale values).
spam_logit_in_sample <- predict(spam_logit_model, type = "response")

# Threshold at 0.5, compare with the true label (both coded 1 = spam), and
# tabulate how many rows were classified correctly / incorrectly.
spam_logit_in_sample_prediction <-
  data.frame(
    in_sample = as.numeric(spam_logit_in_sample >= 0.5),
    actual = as.numeric(spam_train$spclass == "Spam")
  ) %>%
  mutate(correct_class = in_sample == actual) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count))

print(spam_logit_in_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           249     0.0726
## 2 TRUE           3180     0.927

Logit - Out of Sample

# Predicted probabilities of the logit model on the held-out test rows.
spam_logit_out_of_sample <- predict(
  spam_logit_model,
  newdata = spam_test,
  type = "response"
)

# Threshold at 0.5, compare with the true label (both coded 1 = spam), and
# tabulate how many rows were classified correctly / incorrectly.
spam_logit_out_of_sample_prediction <-
  data.frame(
    out_of_sample = as.numeric(spam_logit_out_of_sample >= 0.5),
    actual = as.numeric(spam_test$spclass == "Spam")
  ) %>%
  mutate(correct_class = out_of_sample == actual) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count))

print(spam_logit_out_of_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE            81     0.0691
## 2 TRUE           1091     0.931

Probit - In Sample

# Fitted probabilities of the probit model on the training rows
# (type = "response" returns probabilities instead of link-scale values).
spam_probit_in_sample <- predict(spam_probit_model, type = "response")

# Threshold at 0.5, compare with the true label (both coded 1 = spam), and
# tabulate how many rows were classified correctly / incorrectly.
spam_probit_in_sample_prediction <-
  data.frame(
    in_sample = as.numeric(spam_probit_in_sample >= 0.5),
    actual = as.numeric(spam_train$spclass == "Spam")
  ) %>%
  mutate(correct_class = in_sample == actual) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count))

print(spam_probit_in_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           400      0.117
## 2 TRUE           3029      0.883

Probit - Out of Sample

# Predicted probabilities of the probit model on the held-out test rows.
spam_probit_out_of_sample <- predict(
  spam_probit_model,
  newdata = spam_test,
  type = "response"
)

# Threshold at 0.5, compare with the true label (both coded 1 = spam), and
# tabulate how many rows were classified correctly / incorrectly.
spam_probit_out_of_sample_prediction <-
  data.frame(
    out_of_sample = as.numeric(spam_probit_out_of_sample >= 0.5),
    actual = as.numeric(spam_test$spclass == "Spam")
  ) %>%
  mutate(correct_class = out_of_sample == actual) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count))

print(spam_probit_out_of_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           128      0.109
## 2 TRUE           1044      0.891

Benchmark

# Side-by-side comparison of the three models. For each accuracy summary the
# share of correctly classified rows (the correct_class == TRUE row) is
# extracted and rounded to four decimals; the in-sample and out-of-sample
# figures are listed in the same model order as the `model` column.
complete_benchmark <- data.frame(
  model = c(
    "CART",
    "Logistic Reg. - Logit Link",
    "Logistic Reg. - Probit Link"
  ),
  in_sample_accuracy = unlist(lapply(
    list(
      in_sample_prediction,
      spam_logit_in_sample_prediction,
      spam_probit_in_sample_prediction
    ),
    function(summary_tbl) {
      summary_tbl %>%
        filter(correct_class) %>%
        pull(percentage) %>%
        round(4)
    }
  )),
  out_of_sample_accuracy = unlist(lapply(
    list(
      out_of_sample_prediction,
      spam_logit_out_of_sample_prediction,
      spam_probit_out_of_sample_prediction
    ),
    function(summary_tbl) {
      summary_tbl %>%
        filter(correct_class) %>%
        pull(percentage) %>%
        round(4)
    }
  ))
)
print(complete_benchmark)
##                         model in_sample_accuracy out_of_sample_accuracy
## 1                        CART             0.9023                 0.9036
## 2  Logistic Reg. - Logit Link             0.9274                 0.9309
## 3 Logistic Reg. - Probit Link             0.8833                 0.8908

References

You may click here to reach other items of my progress journal.