1 Import Dataset

# Load the spam data set into `spamdata`.
# The file is addressed by its full path rather than via setwd(), so the
# working directory of the R session is left untouched.
data_dir <- "C:/Users/ahmet/Desktop/MEF_BDA/BDA-503/Week_6/HW_Spam_Detection"
spamdata <- read.csv(file = file.path(data_dir, "spam_Data.csv"))
# Preview the first rows of the data.
head(spamdata)

##   word_freq_make word_freq_address word_freq_all word_freq_3d word_freq_our
## 1           0.00              0.64          0.64            0          0.32
## 2           0.21              0.28          0.50            0          0.14
## 3           0.06              0.00          0.71            0          1.23
## 4           0.00              0.00          0.00            0          0.63
## 5           0.00              0.00          0.00            0          0.63
## 6           0.00              0.00          0.00            0          1.85
##   word_freq_over word_freq_remove word_freq_internet word_freq_order
## 1           0.00             0.00               0.00            0.00
## 2           0.28             0.21               0.07            0.00
## 3           0.19             0.19               0.12            0.64
## 4           0.00             0.31               0.63            0.31
## 5           0.00             0.31               0.63            0.31
## 6           0.00             0.00               1.85            0.00
##   word_freq_mail word_freq_receive word_freq_will word_freq_people
## 1           0.00              0.00           0.64             0.00
## 2           0.94              0.21           0.79             0.65
## 3           0.25              0.38           0.45             0.12
## 4           0.63              0.31           0.31             0.31
## 5           0.63              0.31           0.31             0.31
## 6           0.00              0.00           0.00             0.00
##   word_freq_report word_freq_addresses word_freq_free word_freq_business
## 1             0.00                0.00           0.32               0.00
## 2             0.21                0.14           0.14               0.07
## 3             0.00                1.75           0.06               0.06
## 4             0.00                0.00           0.31               0.00
## 5             0.00                0.00           0.31               0.00
## 6             0.00                0.00           0.00               0.00
##   word_freq_email word_freq_you word_freq_credit word_freq_your word_freq_font
## 1            1.29          1.93             0.00           0.96              0
## 2            0.28          3.47             0.00           1.59              0
## 3            1.03          1.36             0.32           0.51              0
## 4            0.00          3.18             0.00           0.31              0
## 5            0.00          3.18             0.00           0.31              0
## 6            0.00          0.00             0.00           0.00              0
##   word_freq_000 word_freq_money word_freq_hp word_freq_hpl word_freq_george
## 1          0.00            0.00            0             0                0
## 2          0.43            0.43            0             0                0
## 3          1.16            0.06            0             0                0
## 4          0.00            0.00            0             0                0
## 5          0.00            0.00            0             0                0
## 6          0.00            0.00            0             0                0
##   word_freq_650 word_freq_lab word_freq_labs word_freq_telnet word_freq_857
## 1             0             0              0                0             0
## 2             0             0              0                0             0
## 3             0             0              0                0             0
## 4             0             0              0                0             0
## 5             0             0              0                0             0
## 6             0             0              0                0             0
##   word_freq_data word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 1              0             0            0                    0           0.00
## 2              0             0            0                    0           0.07
## 3              0             0            0                    0           0.00
## 4              0             0            0                    0           0.00
## 5              0             0            0                    0           0.00
## 6              0             0            0                    0           0.00
##   word_freq_parts word_freq_pm word_freq_direct word_freq_cs word_freq_meeting
## 1               0            0             0.00            0                 0
## 2               0            0             0.00            0                 0
## 3               0            0             0.06            0                 0
## 4               0            0             0.00            0                 0
## 5               0            0             0.00            0                 0
## 6               0            0             0.00            0                 0
##   word_freq_original word_freq_project word_freq_re word_freq_edu
## 1               0.00                 0         0.00          0.00
## 2               0.00                 0         0.00          0.00
## 3               0.12                 0         0.06          0.06
## 4               0.00                 0         0.00          0.00
## 5               0.00                 0         0.00          0.00
## 6               0.00                 0         0.00          0.00
##   word_freq_table word_freq_conference char_freq_.3B char_freq_.28
## 1               0                    0          0.00         0.000
## 2               0                    0          0.00         0.132
## 3               0                    0          0.01         0.143
## 4               0                    0          0.00         0.137
## 5               0                    0          0.00         0.135
## 6               0                    0          0.00         0.223
##   char_freq_.5B char_freq_.21 char_freq_.24 char_freq_.23
## 1             0         0.778         0.000         0.000
## 2             0         0.372         0.180         0.048
## 3             0         0.276         0.184         0.010
## 4             0         0.137         0.000         0.000
## 5             0         0.135         0.000         0.000
## 6             0         0.000         0.000         0.000
##   capital_run_length_average capital_run_length_longest
## 1                      3.756                         61
## 2                      5.114                        101
## 3                      9.821                        485
## 4                      3.537                         40
## 5                      3.537                         40
## 6                      3.000                         15
##   capital_run_length_total class
## 1                      278     1
## 2                     1028     1
## 3                     2259     1
## 4                      191     1
## 5                      191     1
## 6                       54     1

# Column-by-column overview of the data: name, type and first values.
# NOTE(review): glimpse() is not base R -- presumably dplyr/tibble is attached
# in a setup step that is not shown here; confirm.
glimpse(spamdata)

## Rows: 4,601
## Columns: 58
## $ word_freq_make             <dbl> 0.00, 0.21, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_address          <dbl> 0.64, 0.28, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_all              <dbl> 0.64, 0.50, 0.71, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_3d               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_our              <dbl> 0.32, 0.14, 1.23, 0.63, 0.63, 1.85, 1.92...
## $ word_freq_over             <dbl> 0.00, 0.28, 0.19, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_remove           <dbl> 0.00, 0.21, 0.19, 0.31, 0.31, 0.00, 0.00...
## $ word_freq_internet         <dbl> 0.00, 0.07, 0.12, 0.63, 0.63, 1.85, 0.00...
## $ word_freq_order            <dbl> 0.00, 0.00, 0.64, 0.31, 0.31, 0.00, 0.00...
## $ word_freq_mail             <dbl> 0.00, 0.94, 0.25, 0.63, 0.63, 0.00, 0.64...
## $ word_freq_receive          <dbl> 0.00, 0.21, 0.38, 0.31, 0.31, 0.00, 0.96...
## $ word_freq_will             <dbl> 0.64, 0.79, 0.45, 0.31, 0.31, 0.00, 1.28...
## $ word_freq_people           <dbl> 0.00, 0.65, 0.12, 0.31, 0.31, 0.00, 0.00...
## $ word_freq_report           <dbl> 0.00, 0.21, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_addresses        <dbl> 0.00, 0.14, 1.75, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_free             <dbl> 0.32, 0.14, 0.06, 0.31, 0.31, 0.00, 0.96...
## $ word_freq_business         <dbl> 0.00, 0.07, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_email            <dbl> 1.29, 0.28, 1.03, 0.00, 0.00, 0.00, 0.32...
## $ word_freq_you              <dbl> 1.93, 3.47, 1.36, 3.18, 3.18, 0.00, 3.85...
## $ word_freq_credit           <dbl> 0.00, 0.00, 0.32, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_your             <dbl> 0.96, 1.59, 0.51, 0.31, 0.31, 0.00, 0.64...
## $ word_freq_font             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_000              <dbl> 0.00, 0.43, 1.16, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_money            <dbl> 0.00, 0.43, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_hp               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_hpl              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_george           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_650              <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_lab              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_labs             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_telnet           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_857              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_data             <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_415              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_85               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_technology       <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_1999             <dbl> 0.00, 0.07, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_parts            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_pm               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_direct           <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_cs               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_meeting          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_original         <dbl> 0.00, 0.00, 0.12, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_project          <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_re               <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_edu              <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_table            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_conference       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ char_freq_.3B              <dbl> 0.000, 0.000, 0.010, 0.000, 0.000, 0.000...
## $ char_freq_.28              <dbl> 0.000, 0.132, 0.143, 0.137, 0.135, 0.223...
## $ char_freq_.5B              <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...
## $ char_freq_.21              <dbl> 0.778, 0.372, 0.276, 0.137, 0.135, 0.000...
## $ char_freq_.24              <dbl> 0.000, 0.180, 0.184, 0.000, 0.000, 0.000...
## $ char_freq_.23              <dbl> 0.000, 0.048, 0.010, 0.000, 0.000, 0.000...
## $ capital_run_length_average <dbl> 3.756, 5.114, 9.821, 3.537, 3.537, 3.000...
## $ capital_run_length_longest <int> 61, 101, 485, 40, 40, 15, 4, 11, 445, 43...
## $ capital_run_length_total   <int> 278, 1028, 2259, 191, 191, 54, 112, 49, ...
## $ class                      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...

# Number of rows and columns in the data set.
dim(spamdata)

## [1] 4601   58

2 Split to Training and Test Set

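Split the spam dataset into a training set and a test set. 80% of the rows (split ratio = 0.80) are sampled at random for training; the remaining 20% are held out for testing.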

# Reproducible random split: a fixed seed makes the sampled rows repeatable.
set.seed(50)
# Number of training rows: 80% of the data, rounded down.
smp_size <- floor(nrow(spamdata) * 0.80)
# Row numbers drawn (without replacement) for the training set.
train_ind <- sample.int(nrow(spamdata), size = smp_size)
# Sampled rows form the training set; all remaining rows form the test set.
test <- spamdata[-train_ind, ]
train <- spamdata[train_ind, ]

3 Build the Model

Build a classification tree on the training data and plot the fitted tree.

# Fit a classification tree that predicts `class` from every other column of
# the training set, then draw the fitted tree.
spam_tree <- rpart(formula = class ~ ., data = train, method = "class")
fancyRpartPlot(spam_tree)

4 Explication

# Print the complexity-parameter (CP) table of the fitted tree: the variables
# used, the root node error, and per-split relative / cross-validated error.
printcp(spam_tree)

## 
## Classification tree:
## rpart(formula = class ~ ., data = train, method = "class")
## 
## Variables actually used in tree construction:
## [1] capital_run_length_total char_freq_.21            char_freq_.24           
## [4] word_freq_free           word_freq_hp             word_freq_remove        
## 
## Root node error: 1472/3680 = 0.4
## 
## n= 3680 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.474864      0   1.00000 1.00000 0.020189
## 2 0.155571      1   0.52514 0.54484 0.017014
## 3 0.043478      2   0.36957 0.42935 0.015543
## 4 0.042799      3   0.32609 0.33696 0.014073
## 5 0.027174      4   0.28329 0.31250 0.013629
## 6 0.010190      5   0.25611 0.28057 0.013008
## 7 0.010000      6   0.24592 0.27310 0.012855

# Plot the complexity-parameter table of the fitted tree.
plotcp(spam_tree)

5 Test Prediction

We can now use the fitted tree to predict the class of each row in the test dataset.

5.1 Confusion Matrix

# Predict a class label for every row of the held-out test set.
predict_unseen <- predict(spam_tree, newdata = test, type = "class")
# Confusion matrix: actual labels in rows, predicted labels in columns.
cm <- table(test$class, predict_unseen)
cm

##    predict_unseen
##       0   1
##   0 554  26
##   1  62 279

There are 833 correct predictions and 88 incorrect predictions. Of the 88 incorrect predictions: * 26 are false positives (actual class 0 predicted as 1) * 62 are false negatives (actual class 1 predicted as 0)

5.2 Performance Measurement

# Accuracy = correctly classified rows (diagonal of the confusion matrix)
# divided by all classified rows.
accuracy_Test <- sum(diag(cm)) / sum(cm)
print(paste("Accuracy for test", accuracy_Test))

## [1] "Accuracy for test 0.904451682953312"

6 Tune the hyper-parameters

# Compute the classification accuracy of a fitted model on held-out data.
#
# Args:
#   fit:  A fitted model whose predict() method accepts `type = "class"`
#         (for example an rpart classification tree).
#   data: Data frame to score; the true labels are read from its `class`
#         column. Defaults to the global `test` set.
#
# Returns:
#   The share of rows in `data` whose predicted class equals `class`.
accuracy_tune <- function(fit, data = test) {
  # Score the model passed in as `fit` (not a fixed global model), so that
  # different candidate fits can actually be compared with each other.
  predicted <- predict(fit, data, type = "class")
  # Confusion matrix: actual labels in rows, predicted labels in columns.
  table_mat <- table(data$class, predicted)
  sum(diag(table_mat)) / sum(table_mat)
}

# Tree-growing settings for the tuned model: split nodes with at least 4
# observations, keep at least round(5 / 3) = 2 observations per leaf, limit
# the tree to depth 3, and set the complexity parameter to 0.
control <- rpart.control(
  minsplit = 4,
  minbucket = round(5 / 3),
  maxdepth = 3,
  cp = 0
)
# Refit the classification tree on the training set with these settings.
tune_fit <- rpart(
  formula = class ~ .,
  data = train,
  method = "class",
  control = control
)

# Predict the test set with the tuned tree.
predict_tune <- predict(tune_fit, newdata = test, type = "class")
# Confusion matrix: actual labels in rows, predicted labels in columns.
cm_fit <- table(test$class, predict_tune)
cm_fit

##    predict_tune
##       0   1
##   0 540  40
##   1  58 283

# Accuracy of the tuned tree on the test set: diagonal of its confusion
# matrix divided by the total number of test rows.
accuracy_test_tune <- sum(diag(cm_fit)) / sum(cm_fit)
print(paste("Accuracy for test", accuracy_test_tune))

## [1] "Accuracy for test 0.893593919652552"

SPAM Detection HW

Ahmet Emin Saricaoglu

20 12 2020