# Point the session at the homework folder, then load the spam data set
# from the CSV file stored there.
setwd("C:/Users/ahmet/Desktop/MEF_BDA/BDA-503/Week_6/HW_Spam_Detection")
spamdata <- read.csv("spam_Data.csv")
# Preview the first rows of the data frame.
head(spamdata)
## word_freq_make word_freq_address word_freq_all word_freq_3d word_freq_our
## 1 0.00 0.64 0.64 0 0.32
## 2 0.21 0.28 0.50 0 0.14
## 3 0.06 0.00 0.71 0 1.23
## 4 0.00 0.00 0.00 0 0.63
## 5 0.00 0.00 0.00 0 0.63
## 6 0.00 0.00 0.00 0 1.85
## word_freq_over word_freq_remove word_freq_internet word_freq_order
## 1 0.00 0.00 0.00 0.00
## 2 0.28 0.21 0.07 0.00
## 3 0.19 0.19 0.12 0.64
## 4 0.00 0.31 0.63 0.31
## 5 0.00 0.31 0.63 0.31
## 6 0.00 0.00 1.85 0.00
## word_freq_mail word_freq_receive word_freq_will word_freq_people
## 1 0.00 0.00 0.64 0.00
## 2 0.94 0.21 0.79 0.65
## 3 0.25 0.38 0.45 0.12
## 4 0.63 0.31 0.31 0.31
## 5 0.63 0.31 0.31 0.31
## 6 0.00 0.00 0.00 0.00
## word_freq_report word_freq_addresses word_freq_free word_freq_business
## 1 0.00 0.00 0.32 0.00
## 2 0.21 0.14 0.14 0.07
## 3 0.00 1.75 0.06 0.06
## 4 0.00 0.00 0.31 0.00
## 5 0.00 0.00 0.31 0.00
## 6 0.00 0.00 0.00 0.00
## word_freq_email word_freq_you word_freq_credit word_freq_your word_freq_font
## 1 1.29 1.93 0.00 0.96 0
## 2 0.28 3.47 0.00 1.59 0
## 3 1.03 1.36 0.32 0.51 0
## 4 0.00 3.18 0.00 0.31 0
## 5 0.00 3.18 0.00 0.31 0
## 6 0.00 0.00 0.00 0.00 0
## word_freq_000 word_freq_money word_freq_hp word_freq_hpl word_freq_george
## 1 0.00 0.00 0 0 0
## 2 0.43 0.43 0 0 0
## 3 1.16 0.06 0 0 0
## 4 0.00 0.00 0 0 0
## 5 0.00 0.00 0 0 0
## 6 0.00 0.00 0 0 0
## word_freq_650 word_freq_lab word_freq_labs word_freq_telnet word_freq_857
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## word_freq_data word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 1 0 0 0 0 0.00
## 2 0 0 0 0 0.07
## 3 0 0 0 0 0.00
## 4 0 0 0 0 0.00
## 5 0 0 0 0 0.00
## 6 0 0 0 0 0.00
## word_freq_parts word_freq_pm word_freq_direct word_freq_cs word_freq_meeting
## 1 0 0 0.00 0 0
## 2 0 0 0.00 0 0
## 3 0 0 0.06 0 0
## 4 0 0 0.00 0 0
## 5 0 0 0.00 0 0
## 6 0 0 0.00 0 0
## word_freq_original word_freq_project word_freq_re word_freq_edu
## 1 0.00 0 0.00 0.00
## 2 0.00 0 0.00 0.00
## 3 0.12 0 0.06 0.06
## 4 0.00 0 0.00 0.00
## 5 0.00 0 0.00 0.00
## 6 0.00 0 0.00 0.00
## word_freq_table word_freq_conference char_freq_.3B char_freq_.28
## 1 0 0 0.00 0.000
## 2 0 0 0.00 0.132
## 3 0 0 0.01 0.143
## 4 0 0 0.00 0.137
## 5 0 0 0.00 0.135
## 6 0 0 0.00 0.223
## char_freq_.5B char_freq_.21 char_freq_.24 char_freq_.23
## 1 0 0.778 0.000 0.000
## 2 0 0.372 0.180 0.048
## 3 0 0.276 0.184 0.010
## 4 0 0.137 0.000 0.000
## 5 0 0.135 0.000 0.000
## 6 0 0.000 0.000 0.000
## capital_run_length_average capital_run_length_longest
## 1 3.756 61
## 2 5.114 101
## 3 9.821 485
## 4 3.537 40
## 5 3.537 40
## 6 3.000 15
## capital_run_length_total class
## 1 278 1
## 2 1028 1
## 3 2259 1
## 4 191 1
## 5 191 1
## 6 54 1
# Print a column-by-column overview of the data: each column's type and its
# leading values.
glimpse(spamdata)
## Rows: 4,601
## Columns: 58
## $ word_freq_make <dbl> 0.00, 0.21, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_address <dbl> 0.64, 0.28, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_all <dbl> 0.64, 0.50, 0.71, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_3d <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_our <dbl> 0.32, 0.14, 1.23, 0.63, 0.63, 1.85, 1.92...
## $ word_freq_over <dbl> 0.00, 0.28, 0.19, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_remove <dbl> 0.00, 0.21, 0.19, 0.31, 0.31, 0.00, 0.00...
## $ word_freq_internet <dbl> 0.00, 0.07, 0.12, 0.63, 0.63, 1.85, 0.00...
## $ word_freq_order <dbl> 0.00, 0.00, 0.64, 0.31, 0.31, 0.00, 0.00...
## $ word_freq_mail <dbl> 0.00, 0.94, 0.25, 0.63, 0.63, 0.00, 0.64...
## $ word_freq_receive <dbl> 0.00, 0.21, 0.38, 0.31, 0.31, 0.00, 0.96...
## $ word_freq_will <dbl> 0.64, 0.79, 0.45, 0.31, 0.31, 0.00, 1.28...
## $ word_freq_people <dbl> 0.00, 0.65, 0.12, 0.31, 0.31, 0.00, 0.00...
## $ word_freq_report <dbl> 0.00, 0.21, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_addresses <dbl> 0.00, 0.14, 1.75, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_free <dbl> 0.32, 0.14, 0.06, 0.31, 0.31, 0.00, 0.96...
## $ word_freq_business <dbl> 0.00, 0.07, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_email <dbl> 1.29, 0.28, 1.03, 0.00, 0.00, 0.00, 0.32...
## $ word_freq_you <dbl> 1.93, 3.47, 1.36, 3.18, 3.18, 0.00, 3.85...
## $ word_freq_credit <dbl> 0.00, 0.00, 0.32, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_your <dbl> 0.96, 1.59, 0.51, 0.31, 0.31, 0.00, 0.64...
## $ word_freq_font <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_000 <dbl> 0.00, 0.43, 1.16, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_money <dbl> 0.00, 0.43, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_hp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_hpl <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_george <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_650 <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_lab <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_labs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_telnet <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_857 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_data <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_415 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_85 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_technology <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_1999 <dbl> 0.00, 0.07, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_parts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_pm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_direct <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_cs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_meeting <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_original <dbl> 0.00, 0.00, 0.12, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_project <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_re <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_edu <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00...
## $ word_freq_table <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ word_freq_conference <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ char_freq_.3B <dbl> 0.000, 0.000, 0.010, 0.000, 0.000, 0.000...
## $ char_freq_.28 <dbl> 0.000, 0.132, 0.143, 0.137, 0.135, 0.223...
## $ char_freq_.5B <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...
## $ char_freq_.21 <dbl> 0.778, 0.372, 0.276, 0.137, 0.135, 0.000...
## $ char_freq_.24 <dbl> 0.000, 0.180, 0.184, 0.000, 0.000, 0.000...
## $ char_freq_.23 <dbl> 0.000, 0.048, 0.010, 0.000, 0.000, 0.000...
## $ capital_run_length_average <dbl> 3.756, 5.114, 9.821, 3.537, 3.537, 3.000...
## $ capital_run_length_longest <int> 61, 101, 485, 40, 40, 15, 4, 11, 445, 43...
## $ capital_run_length_total <int> 278, 1028, 2259, 191, 191, 54, 112, 49, ...
## $ class <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
# Report the number of rows and columns of the data frame.
dim(spamdata)
## [1] 4601 58
Split the spam dataset into a training set and a test set, using a split ratio of 0.80 (80% training, 20% test).
# Hold out 20% of the rows for testing; the other 80% are used for training.
set.seed(50)  # fixed seed so the random split is reproducible
smp_size <- floor(nrow(spamdata) * 0.80)
# Draw the row numbers of the training rows without replacement.
train_ind <- sample.int(nrow(spamdata), size = smp_size)
test <- spamdata[-train_ind, ]
train <- spamdata[train_ind, ]
Build the model with the training data and plot the resulting decision tree.
# Fit a classification tree that predicts `class` from every other column
# of the training data.
spam_tree <- rpart(
  formula = class ~ .,
  data = train,
  method = "class"
)
# Draw the fitted tree, then print its complexity-parameter (CP) table.
fancyRpartPlot(spam_tree)
printcp(spam_tree)
##
## Classification tree:
## rpart(formula = class ~ ., data = train, method = "class")
##
## Variables actually used in tree construction:
## [1] capital_run_length_total char_freq_.21 char_freq_.24
## [4] word_freq_free word_freq_hp word_freq_remove
##
## Root node error: 1472/3680 = 0.4
##
## n= 3680
##
## CP nsplit rel error xerror xstd
## 1 0.474864 0 1.00000 1.00000 0.020189
## 2 0.155571 1 0.52514 0.54484 0.017014
## 3 0.043478 2 0.36957 0.42935 0.015543
## 4 0.042799 3 0.32609 0.33696 0.014073
## 5 0.027174 4 0.28329 0.31250 0.013629
## 6 0.010190 5 0.25611 0.28057 0.013008
## 7 0.010000 6 0.24592 0.27310 0.012855
# Plot the complexity-parameter table of the fitted tree.
plotcp(spam_tree)
We can now use the model to predict the classes of the test dataset.
# Predict a class label for every row of the held-out test set.
predict_unseen <- predict(object = spam_tree, newdata = test, type = "class")
# Confusion matrix: actual classes in rows, predicted classes in columns.
cm <- table(test[["class"]], predict_unseen)
cm
## predict_unseen
## 0 1
## 0 554 26
## 1 62 279
There are 833 correct predictions and 88 incorrect predictions. Of the 88 incorrect predictions: * 26 are false positives * 62 are false negatives
# Accuracy = correctly classified rows (the diagonal of the confusion
# matrix) divided by the total number of test rows.
accuracy_Test <- sum(diag(cm)) / sum(cm)
print(paste("Accuracy for test", accuracy_Test))
## [1] "Accuracy for test 0.904451682953312"
# Compute the classification accuracy of a fitted model on held-out data.
#
# Arguments:
#   fit     - a fitted model whose predict() method accepts type = "class".
#   newdata - data frame to score; its true labels are read from the `class`
#             column. Defaults to the global `test` data frame.
# Returns:
#   The share of rows in `newdata` whose predicted class equals the actual
#   class (diagonal of the confusion matrix over its total).
accuracy_tune <- function(fit, newdata = test) {
  # Score the model passed in as `fit` rather than a fixed global tree, so
  # that different candidate fits yield different accuracies.
  predict_unseen <- predict(fit, newdata, type = "class")
  table_mat <- table(newdata$class, predict_unseen)
  sum(diag(table_mat)) / sum(table_mat)
}
# Tuning settings for a second tree: small nodes are allowed to split, the
# tree is capped at depth 3, and cp = 0 means no split is rejected on
# complexity grounds.
control <- rpart.control(
  minsplit = 4,
  minbucket = round(5 / 3),  # evaluates to 2
  maxdepth = 3,
  cp = 0
)
# Refit the classification tree on the training data with these settings.
tune_fit <- rpart(class ~ ., data = train, method = "class", control = control)
# Score the tuned tree on the test set and tabulate actual (rows) against
# predicted (columns) classes.
predict_tune <- predict(tune_fit, test, type = "class")
cm_fit <- table(test$class, predict_tune)
cm_fit
## predict_tune
## 0 1
## 0 540 40
## 1 58 283
# Accuracy of the tuned tree: correct predictions (diagonal of the confusion
# matrix) over all test rows.
accuracy_test_tune <- sum(diag(cm_fit)) / sum(cm_fit)
print(paste("Accuracy for test", accuracy_test_tune))
## [1] "Accuracy for test 0.893593919652552"