The objective of this project is to build a CART (Classification and Regression Tree) model to detect spam e-mails using the UCI Spambase dataset.
The dataset is imported as spambase_csv and examined with glimpse(); the class column is then inspected in detail. Finally, the dataset is split into a train and a test set: 25% of the rows go to the test set and 75% to the train set.
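The import and inspection calls are not echoed in the rendered output; a minimal sketch of what could produce it, with the file name as an assumption:
library(tidyverse)

# Read the Spambase data; the file name is illustrative.
spambase_csv <- read_csv("spambase_csv.csv")

# Inspect the number of rows/columns and the first values of each column.
glimpse(spambase_csv)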
## Rows: 4,601
## Columns: 58
## $ word_freq_make <dbl> 0.00, 0.21, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_address <dbl> 0.64, 0.28, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_all <dbl> 0.64, 0.50, 0.71, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_3d <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_our <dbl> 0.32, 0.14, 1.23, 0.63, 0.63, 1.85, 1.92...
## $ word_freq_over <dbl> 0.00, 0.28, 0.19, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_remove <dbl> 0.00, 0.21, 0.19, 0.31, 0.31, 0.00, 0.00...
## $ word_freq_internet <dbl> 0.00, 0.07, 0.12, 0.63, 0.63, 1.85, 0.00...
## $ word_freq_order <dbl> 0.00, 0.00, 0.64, 0.31, 0.31, 0.00, 0.00...
## $ word_freq_mail <dbl> 0.00, 0.94, 0.25, 0.63, 0.63, 0.00, 0.64...
## $ word_freq_receive <dbl> 0.00, 0.21, 0.38, 0.31, 0.31, 0.00, 0.96...
## $ word_freq_will <dbl> 0.64, 0.79, 0.45, 0.31, 0.31, 0.00, 1.28...
## $ word_freq_people <dbl> 0.00, 0.65, 0.12, 0.31, 0.31, 0.00, 0.00...
## $ word_freq_report <dbl> 0.00, 0.21, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_addresses <dbl> 0.00, 0.14, 1.75, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_free <dbl> 0.32, 0.14, 0.06, 0.31, 0.31, 0.00, 0.96...
## $ word_freq_business <dbl> 0.00, 0.07, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_email <dbl> 1.29, 0.28, 1.03, 0.00, 0.00, 0.00, 0.32...
## $ word_freq_you <dbl> 1.93, 3.47, 1.36, 3.18, 3.18, 0.00, 3.85...
## $ word_freq_credit <dbl> 0.00, 0.00, 0.32, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_your <dbl> 0.96, 1.59, 0.51, 0.31, 0.31, 0.00, 0.64...
## $ word_freq_font <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_000 <dbl> 0.00, 0.43, 1.16, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_money <dbl> 0.00, 0.43, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_hp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_hpl <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_george <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_650 <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_lab <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_labs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_telnet <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_857 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_data <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_415 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_85 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_technology <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_1999 <dbl> 0.00, 0.07, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_parts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_pm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_direct <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_cs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_meeting <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_original <dbl> 0.00, 0.00, 0.12, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_project <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_re <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_edu <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_table <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_conference <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ `char_freq_%3B` <dbl> 0.000, 0.000, 0.010, 0.000, 0.000, 0.000...
## $ `char_freq_%28` <dbl> 0.000, 0.132, 0.143, 0.137, 0.135, 0.223...
## $ `char_freq_%5B` <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...
## $ `char_freq_%21` <dbl> 0.778, 0.372, 0.276, 0.137, 0.135, 0.000...
## $ `char_freq_%24` <dbl> 0.000, 0.180, 0.184, 0.000, 0.000, 0.000...
## $ `char_freq_%23` <dbl> 0.000, 0.048, 0.010, 0.000, 0.000, 0.000...
## $ capital_run_length_average <dbl> 3.756, 5.114, 9.821, 3.537, 3.537, 3.000...
## $ capital_run_length_longest <dbl> 61, 101, 485, 40, 40, 15, 4, 11, 445, 43...
## $ capital_run_length_total <dbl> 278, 1028, 2259, 191, 191, 54, 112, 49, ...
## $ class <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
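The class column (1 = spam, 0 = ham) is then summarized; a sketch of the call that would produce the output below, assuming the object names above:
summary(spambase_csv$class)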
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.394 1.000 1.000
The mean of 0.394 shows that roughly 39.4% of the e-mails in the dataset are labeled as spam (class = 1).
set.seed(58) # set a seed so the same train/test split is obtained on every run
spambase_csv <- spambase_csv %>%
  filter(complete.cases(.)) %>% # drop rows with missing values
  mutate(train_test = ifelse(runif(nrow(.)) < 0.25, "test", "train"))
spam_train <- spambase_csv %>% filter(train_test == "train") %>% select(-train_test)
spam_test <- spambase_csv %>% filter(train_test == "test") %>% select(-train_test)
Note that runif() assigns each row independently, so the test share is only approximately 25%: here 1,172 of the 4,601 rows (about 25.5%) end up in the test set.
The R package rpart is used to build a model that predicts whether an e-mail is spam or ham. Since the outcome (y) is binary, taking the value 1 (spam) or 0 (ham), the method = "class" argument is passed to rpart.
The classification tree fitted on the train set is plotted with rpart.plot.
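The fitting and plotting calls are not echoed in the output; a minimal sketch, assuming the fitted object is named fit (the name used in the prediction code further below):
library(rpart)
library(rpart.plot)

# Fit a classification tree; method = "class" because the outcome is binary.
fit <- rpart(class ~ ., data = spam_train, method = "class")

# Draw the fitted tree with per-node class probabilities and sample shares.
rpart.plot(fit)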
At the root, the data is split on char_freq_%24: rows with char_freq_%24 less than 0.056 go to the left branch, and rows with char_freq_%24 greater than or equal to 0.056 go to the right. The left group holds 76% of the original sample, versus 24% for the right one. As an example, Node 7 reads: if char_freq_%24 is greater than or equal to 0.056 and word_freq_hp is less than 0.41, then the predicted probability of being spam (1) is 0.93.
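The same node logic can also be printed as plain if-then rules; a quick sketch using rpart.plot's rpart.rules():
# Print each leaf of the tree as a readable rule with its spam probability.
rpart.rules(fit)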
After fitting the model, predictions are generated for both the train and the test set.
First, the model is evaluated on the train set. The predicted class probabilities are obtained with predict(), converted to 0/1 labels with a 0.5 threshold via ifelse(), and compared with the actual labels. The results are grouped by whether the classification is correct (TRUE/FALSE) and summarized as counts and percentages.
spam_predict_in_sample <- as_tibble(predict(fit)) # class probabilities for the train set
names(spam_predict_in_sample) <- c("Ham", "Spam") # rename the probability columns for readability
print(head(spam_predict_in_sample))
## # A tibble: 6 x 2
## Ham Spam
## <dbl> <dbl>
## 1 0.114 0.886
## 2 0.0657 0.934
## 3 0.0992 0.901
## 4 0.0992 0.901
## 5 0.897 0.103
## 6 0.897 0.103
in_sample_prediction <-
  cbind(
    spam_predict_in_sample %>%
      transmute(spam_predict = ifelse(Spam >= 0.5, 1, 0)), # threshold probabilities at 0.5
    spam_train %>%
      transmute(spam_actual = ifelse(class == 1, 1, 0))
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n(), percentage = round(n() / nrow(.), 4))
print(in_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 335 0.0977
## 2 TRUE 3094 0.902
The in-sample accuracy of the model is 90.2%.
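Accuracy alone hides which type of error the model makes; a quick confusion-matrix sketch for the train set, assuming the objects defined above:
# Cross-tabulate predicted vs. actual labels on the train set.
table(predicted = ifelse(spam_predict_in_sample$Spam >= 0.5, 1, 0),
      actual = spam_train$class)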
In this part, the model is evaluated on the test set; the same procedure as in the in-sample analysis is applied.
spam_predict_in_test_set <- as_tibble(predict(fit, newdata = spam_test)) # test-set probabilities
names(spam_predict_in_test_set) <- c("Ham", "Spam")
print(spam_predict_in_test_set)
## # A tibble: 1,172 x 2
## Ham Spam
## <dbl> <dbl>
## 1 0.0657 0.934
## 2 0.897 0.103
## 3 0.0992 0.901
## 4 0.0657 0.934
## 5 0.0657 0.934
## 6 0.114 0.886
## 7 0.0657 0.934
## 8 0.0992 0.901
## 9 0.0625 0.938
## 10 0.114 0.886
## # ... with 1,162 more rows
out_of_sample_prediction <-
  cbind(
    spam_predict_in_test_set %>%
      transmute(spam_predict = ifelse(Spam >= 0.5, 1, 0)),
    spam_test %>%
      transmute(spam_actual = ifelse(class == 1, 1, 0))
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n(), percentage = round(n() / nrow(.), 4))
print(out_of_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 113 0.0964
## 2 TRUE 1059 0.904
The out-of-sample accuracy of the model is 90.4%.
Logistic regression is an algorithm used to predict a binary outcome from a set of independent variables. In this model, the binary outcome is class and the independent variables are the remaining columns.
Logit and probit are two link functions for a binomial GLM; both models are built with glm.
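The two links differ only in how the linear predictor eta is mapped to a probability; a tiny sketch of the inverse link functions in base R:
# Inverse logit link: 1 / (1 + exp(-eta)); returns 0.5 at eta = 0.
plogis(0)
# Inverse probit link: the standard normal CDF; also 0.5 at eta = 0.
pnorm(0)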
spam_logit_model <- glm(class ~ ., data = spam_train, family = binomial(link = "logit"))
spam_probit_model <- glm(class ~ ., data = spam_train, family = binomial(link = "probit"))
In-sample and out-of-sample analyses are performed for both logistic regression models in turn.
spam_logit_in_sample <- predict(spam_logit_model, type = "response") # fitted probabilities
spam_logit_in_sample_prediction <-
  data.frame(in_sample = (spam_logit_in_sample >= 0.5) * 1, # threshold at 0.5
             actual = (spam_train$class == 1) * 1) %>%
  mutate(correct_class = (in_sample == actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n(), percentage = round(n() / nrow(.), 4))
print(spam_logit_in_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 249 0.0726
## 2 TRUE 3180 0.927
spam_logit_out_of_sample <- predict(spam_logit_model, newdata = spam_test, type = "response")
spam_logit_out_of_sample_prediction <-
  data.frame(out_of_sample = (spam_logit_out_of_sample >= 0.5) * 1,
             actual = (spam_test$class == 1) * 1) %>%
  mutate(correct_class = (out_of_sample == actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n(), percentage = round(n() / nrow(.), 4))
print(spam_logit_out_of_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 81 0.0691
## 2 TRUE 1091 0.931
spam_probit_in_sample <- predict(spam_probit_model, type = "response")
spam_probit_in_sample_prediction <-
  data.frame(in_sample = (spam_probit_in_sample >= 0.5) * 1,
             actual = (spam_train$class == 1) * 1) %>%
  mutate(correct_class = (in_sample == actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n(), percentage = round(n() / nrow(.), 4))
print(spam_probit_in_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 400 0.117
## 2 TRUE 3029 0.883
spam_probit_out_of_sample <- predict(spam_probit_model, newdata = spam_test, type = "response")
spam_probit_out_of_sample_prediction <-
  data.frame(out_of_sample = (spam_probit_out_of_sample >= 0.5) * 1,
             actual = (spam_test$class == 1) * 1) %>%
  mutate(correct_class = (out_of_sample == actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n(), percentage = round(n() / nrow(.), 4)) # round added for consistency with the other summaries
print(spam_probit_out_of_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 128 0.109
## 2 TRUE 1044 0.891
Finally, the accuracies of the three models are collected in a single benchmark table.
complete_benchmark <- data.frame(
  model = c("CART", "Logistic Reg. - Logit", "Logistic Reg. - Probit"),
  in_sample_accuracy = c(
    in_sample_prediction %>% filter(correct_class) %>% pull(percentage) %>% round(4),
    spam_logit_in_sample_prediction %>% filter(correct_class) %>% pull(percentage) %>% round(4),
    spam_probit_in_sample_prediction %>% filter(correct_class) %>% pull(percentage) %>% round(4)
  ),
  out_of_sample_accuracy = c(
    out_of_sample_prediction %>% filter(correct_class) %>% pull(percentage) %>% round(4),
    spam_logit_out_of_sample_prediction %>% filter(correct_class) %>% pull(percentage) %>% round(4),
    spam_probit_out_of_sample_prediction %>% filter(correct_class) %>% pull(percentage) %>% round(4)
  )
)
print(complete_benchmark)
## model in_sample_accuracy out_of_sample_accuracy
## 1 CART 0.9023 0.9036
## 2 Logistic Reg. - Logit 0.9274 0.9309
## 3 Logistic Reg. - Probit 0.8833 0.8908
As a result, the logit logistic regression model has the highest accuracy both in sample and out of sample, at 92.74% and 93.09% respectively.