Importing Necessary Packages and Dataset

In this study, the dplyr and tidyverse packages are used for data manipulation, and the rpart, rpart.plot and rattle packages are used to construct and visualize CART models.


# Load the packages the code below relies on: dplyr supplies `%>%` and the
# data-manipulation verbs, rpart supplies the tree-fitting function.
library(dplyr)
library(rpart)

# Read the raw data from a CSV file located in the working directory.
spamdata <- read.csv(file = "spambase.csv")

Pre-processing and Making Some Adjustments on Dataset


  # Fix the RNG seed so the random train/test split below is reproducible.
  set.seed(503)

  # Drop incomplete rows, randomly tag about a quarter of the rows as "test"
  # (the rest as "train"), and replace the numeric `class` column (1 = spam)
  # with a labelled factor column `spclass`.
  spamdata <- spamdata %>%
    filter(complete.cases(.)) %>%
    mutate(train_test = ifelse(runif(n()) < 0.25, "test", "train")) %>%
    # Use the piped `class` column, not `spamdata$class`: the latter is the
    # unfiltered vector and no longer lines up once any row has been dropped.
    mutate(spclass = factor(ifelse(class == 1, "Spam", "Not Spam"))) %>%
    select(-class) %>%
    as_tibble()

Classification and Regression Trees (CART)

A classification tree is created with rpart and visualized with rpart.plot. Classification trees, like regression trees, are a type of decision tree.

Looking at the tree below, we can say that:

# Training subset: keep the rows tagged "train" and drop the split marker so
# it is not picked up as a predictor by the `spclass ~ .` formula.
spam_train <- spamdata %>%
  filter(train_test == "train") %>%
  select(-train_test)

# Classification tree of `spclass` on every remaining column.
fit <- rpart(
  formula = spclass ~ .,
  data = spam_train,
  method = "class"
)

In Sample Prediction

  • In-sample prediction means fitting a model on the training set and then evaluating it on that same training set.
  • The tables below show some of the model's predictions and its accuracy, respectively.
# In-sample predictions from the fitted tree: one probability column per
# class level. "prob" is what predict() already defaults to for a tree
# fitted with method = "class"; it is spelled out here for readability.
spam_in_sample <- predict(fit, type = "prob")
##     Not Spam      Spam
## 1 0.11377246 0.8862275
## 2 0.06565657 0.9343434
## 3 0.09917355 0.9008264
## 4 0.09917355 0.9008264
## 5 0.89661930 0.1033807
## 6 0.89661930 0.1033807
# In-sample accuracy table for the tree: turn the predicted "Spam"
# probability into a 0/1 label at the 0.5 threshold, put it next to the
# actual 0/1 label, and report how many training rows were classified
# correctly / incorrectly together with their share of all rows.
in_sample_prediction <-
  cbind(
    spam_in_sample %>%
      as_tibble() %>%
      transmute(spam_predict = ifelse(Spam >= 0.5, 1, 0)),
    spam_train %>%
      transmute(spam_actual = ifelse(spclass == "Spam", 1, 0))
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  # summarise() has dropped the single grouping level, so sum(count) is the
  # total number of rows.
  mutate(percentage = count / sum(count))

## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           335     0.0977
## 2 TRUE           3094     0.902

Out of Sample Prediction

  • Out-of-sample prediction means fitting a model on the training set and then evaluating it on the separate test set.
  • The tables below show some of the model's predictions and its accuracy, respectively.
# Held-out subset: keep the rows tagged "test" and drop the split marker.
spam_test <- spamdata %>%
  filter(train_test == "test") %>%
  select(-train_test)

# Class probabilities of the fitted tree for the held-out rows. "prob" is
# what predict() already defaults to for a tree fitted with method = "class".
spam_predict <- predict(fit, newdata = spam_test, type = "prob")
##     Not Spam      Spam
## 1 0.06565657 0.9343434
## 2 0.89661930 0.1033807
## 3 0.09917355 0.9008264
## 4 0.06565657 0.9343434
## 5 0.06565657 0.9343434
## 6 0.11377246 0.8862275
# Out-of-sample accuracy table for the tree: turn the predicted "Spam"
# probability into a 0/1 label at the 0.5 threshold, put it next to the
# actual 0/1 label, and report how many test rows were classified
# correctly / incorrectly together with their share of all rows.
out_of_sample_prediction <-
  cbind(
    spam_predict %>%
      as_tibble() %>%
      transmute(spam_predict = ifelse(Spam >= 0.5, 1, 0)),
    spam_test %>%
      as_tibble() %>%
      transmute(spam_actual = ifelse(spclass == "Spam", 1, 0))
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  # summarise() has dropped the single grouping level, so sum(count) is the
  # total number of rows.
  mutate(percentage = count / sum(count))

## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           113     0.0964
## 2 TRUE           1059     0.904
  • In short, the accuracy of our model is 90.2% in-sample and 90.4% out-of-sample.

Logistic Regression

# Two binomial GLMs of `spclass` on all other training columns; they differ
# only in the link function (logit vs. probit).
spam_logit_model <- glm(
  formula = spclass ~ .,
  family = binomial(link = "logit"),
  data = spam_train
)
spam_probit_model <- glm(
  formula = spclass ~ .,
  family = binomial(link = "probit"),
  data = spam_train
)

Logit - In Sample

# Logit model predictions for the training rows, on the response
# (probability) scale.
spam_logit_in_sample <- predict(spam_logit_model, type = "response")

# Threshold the predictions at 0.5, compare with the actual 0/1 label, and
# report how many training rows were classified correctly / incorrectly
# together with their share of all rows.
spam_logit_in_sample_prediction <-
  data.frame(
    in_sample = (spam_logit_in_sample >= 0.5) * 1,
    actual = (spam_train$spclass == "Spam") * 1
  ) %>%
  mutate(correct_class = (in_sample == actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  # summarise() has dropped the single grouping level, so sum(count) is the
  # total number of rows.
  mutate(percentage = count / sum(count))

## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           249     0.0726
## 2 TRUE           3180     0.927

Logit - Out of Sample

# Logit model predictions for the held-out test rows, on the response
# (probability) scale.
spam_logit_out_of_sample <- predict(
  spam_logit_model,
  newdata = spam_test,
  type = "response"
)

# Threshold the predictions at 0.5, compare with the actual 0/1 label, and
# report how many test rows were classified correctly / incorrectly together
# with their share of all rows.
spam_logit_out_of_sample_prediction <-
  data.frame(
    out_of_sample = (spam_logit_out_of_sample >= 0.5) * 1,
    actual = (spam_test$spclass == "Spam") * 1
  ) %>%
  mutate(correct_class = (out_of_sample == actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  # summarise() has dropped the single grouping level, so sum(count) is the
  # total number of rows.
  mutate(percentage = count / sum(count))

## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE            81     0.0691
## 2 TRUE           1091     0.931

Probit - In Sample

# Probit model predictions for the training rows, on the response
# (probability) scale.
spam_probit_in_sample <- predict(spam_probit_model, type = "response")

# Threshold the predictions at 0.5, compare with the actual 0/1 label, and
# report how many training rows were classified correctly / incorrectly
# together with their share of all rows.
spam_probit_in_sample_prediction <-
  data.frame(
    in_sample = (spam_probit_in_sample >= 0.5) * 1,
    actual = (spam_train$spclass == "Spam") * 1
  ) %>%
  mutate(correct_class = (in_sample == actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  # summarise() has dropped the single grouping level, so sum(count) is the
  # total number of rows.
  mutate(percentage = count / sum(count))
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           400      0.117
## 2 TRUE           3029      0.883

Probit - Out of Sample

# Probit model predictions for the held-out test rows, on the response
# (probability) scale.
spam_probit_out_of_sample <- predict(
  spam_probit_model,
  newdata = spam_test,
  type = "response"
)

# Threshold the predictions at 0.5, compare with the actual 0/1 label, and
# report how many test rows were classified correctly / incorrectly together
# with their share of all rows.
spam_probit_out_of_sample_prediction <-
  data.frame(
    out_of_sample = (spam_probit_out_of_sample >= 0.5) * 1,
    actual = (spam_test$spclass == "Spam") * 1
  ) %>%
  mutate(correct_class = (out_of_sample == actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  # summarise() has dropped the single grouping level, so sum(count) is the
  # total number of rows.
  mutate(percentage = count / sum(count))

## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           128      0.109
## 2 TRUE           1044      0.891


# Side-by-side accuracy comparison of the three models. For each accuracy
# table, take the share of correctly classified rows (the row where
# `correct_class` is TRUE) and round it to four decimals.
complete_benchmark <- data.frame(
  model = c(
    "CART",
    "Logistic Reg. - Logit Link",
    "Logistic Reg. - Probit Link"
  ),
  in_sample_accuracy = c(
    in_sample_prediction %>%
      filter(correct_class) %>%
      pull(percentage) %>%
      round(4),
    spam_logit_in_sample_prediction %>%
      filter(correct_class) %>%
      pull(percentage) %>%
      round(4),
    spam_probit_in_sample_prediction %>%
      filter(correct_class) %>%
      pull(percentage) %>%
      round(4)
  ),
  out_of_sample_accuracy = c(
    out_of_sample_prediction %>%
      filter(correct_class) %>%
      pull(percentage) %>%
      round(4),
    spam_logit_out_of_sample_prediction %>%
      filter(correct_class) %>%
      pull(percentage) %>%
      round(4),
    spam_probit_out_of_sample_prediction %>%
      filter(correct_class) %>%
      pull(percentage) %>%
      round(4)
  )
)
##                         model in_sample_accuracy out_of_sample_accuracy
## 1                        CART             0.9023                 0.9036
## 2  Logistic Reg. - Logit Link             0.9274                 0.9309
## 3 Logistic Reg. - Probit Link             0.8833                 0.8908


