# Load the raw Spambase file; it ships without a header row, so readr is told
# not to treat the first line as column names.
raw_df <- read_csv("data/spambase.data", col_names = FALSE)

# Column labels, in file order: 54 frequency columns that share the "FREQ_"
# prefix, followed by three LENGTH_* columns and the IS_SPAM column.
clean_names <- c(
  paste0(
    "FREQ_",
    c(
      "MAKE", "ADDRESS", "ALL", "3D", "OUR", "OVER", "REMOVE", "INTERNET",
      "ORDER", "MAIL", "RECEIVE", "WILL", "PEOPLE", "REPORT", "ADDRESSES",
      "FREE", "BUSINESS", "EMAIL", "YOU", "CREDIT", "YOUR", "FONT", "000",
      "MONEY", "HP", "HPL", "GEORGE", "650", "LAB", "LABS", "TELNET", "857",
      "DATA", "415", "85", "TECHNOLOGY", "1999", "PARTS", "PM", "DIRECT",
      "CS", "MEETING", "ORIGINAL", "PROJECT", "RE", "EDU", "TABLE",
      "CONFERENCE", ";", "(", "[", "!", "$", "#"
    )
  ),
  "LENGTH_AVERAGE", "LENGTH_LONGEST", "LENGTH_TOTAL",
  "IS_SPAM"
)

# Same data as raw_df, relabelled with the names above.
clean_df <- raw_df
names(clean_df) <- clean_names

# Preview the first five rows, showing only the first five and last five
# columns so the table stays readable.
knitr::kable(
  head(clean_df[c(1:5, 54:58)], 5),
  format = "markdown",
  align = "cc"
)
FREQ_MAKE | FREQ_ADDRESS | FREQ_ALL | FREQ_3D | FREQ_OUR | FREQ_# | LENGTH_AVERAGE | LENGTH_LONGEST | LENGTH_TOTAL | IS_SPAM |
---|---|---|---|---|---|---|---|---|---|
0.00 | 0.64 | 0.64 | 0 | 0.32 | 0.000 | 3.756 | 61 | 278 | 1 |
0.21 | 0.28 | 0.50 | 0 | 0.14 | 0.048 | 5.114 | 101 | 1028 | 1 |
0.06 | 0.00 | 0.71 | 0 | 1.23 | 0.010 | 9.821 | 485 | 2259 | 1 |
0.00 | 0.00 | 0.00 | 0 | 0.63 | 0.000 | 3.537 | 40 | 191 | 1 |
0.00 | 0.00 | 0.00 | 0 | 0.63 | 0.000 | 3.537 | 40 | 191 | 1 |
We are going to split the data frame into a training set containing 85% of the rows, and use the remaining 15% as a held-out test set to check our prediction rate.
# Fix the RNG seed so the random split below is reproducible.
set.seed(9999)

# 85% of the rows (rounded down) go to the training set.
sample_size <- floor(0.85 * nrow(clean_df))

# Draw the training row numbers without replacement; every row that was not
# drawn ends up in the test set.
train_index <- sample.int(nrow(clean_df), size = sample_size)
train_df <- clean_df[train_index, ]
test_df <- clean_df[-train_index, ]

# Preview the first five test rows (first five and last five columns only).
knitr::kable(
  head(test_df[c(1:5, 54:58)], 5),
  format = "markdown",
  align = "cc"
)
FREQ_MAKE | FREQ_ADDRESS | FREQ_ALL | FREQ_3D | FREQ_OUR | FREQ_# | LENGTH_AVERAGE | LENGTH_LONGEST | LENGTH_TOTAL | IS_SPAM |
---|---|---|---|---|---|---|---|---|---|
0.00 | 0.00 | 0.00 | 0 | 0.90 | 0.000 | 2.083 | 7 | 25 | 1 |
0.00 | 0.00 | 1.42 | 0 | 0.71 | 0.000 | 1.971 | 24 | 205 | 1 |
0.00 | 0.00 | 0.00 | 0 | 0.52 | 0.147 | 2.145 | 38 | 339 | 1 |
0.15 | 0.45 | 1.05 | 0 | 0.45 | 0.000 | 5.301 | 130 | 774 | 1 |
0.18 | 0.00 | 0.18 | 0 | 1.57 | 0.000 | 1.733 | 12 | 442 | 1 |
# Fit a classification tree that models IS_SPAM from every other column of
# the training set.
classification_tree <- rpart(
  IS_SPAM ~ .,
  data = train_df,
  method = "class"
)

# Draw the fitted tree using a blue box palette.
rpart.plot(
  classification_tree,
  type = 3,
  box.palette = "Blues"
)
## CP Table of Classification Tree
We will check the CP (complexity parameter) table of the classification tree before we prune the tree. It is not necessary to prune it here since our dataset is small, but it is good practice to prune it for performance optimization.
# Pull the CP table out of the fitted tree and render it as a markdown table.
cls_tree_cpt <- classification_tree[["cptable"]]
knitr::kable(
  cls_tree_cpt,
  format = "markdown",
  align = "cc"
)
CP | nsplit | rel error | xerror | xstd |
---|---|---|---|---|
0.4754742 | 0 | 1.0000000 | 1.0000000 | 0.0199566 |
0.1451929 | 1 | 0.5245258 | 0.5493787 | 0.0167963 |
0.0464356 | 2 | 0.3793329 | 0.4401570 | 0.0154377 |
0.0405494 | 3 | 0.3328973 | 0.3603663 | 0.0142293 |
0.0327011 | 4 | 0.2923479 | 0.3348594 | 0.0137959 |
0.0117724 | 5 | 0.2596468 | 0.2890778 | 0.0129496 |
0.0100000 | 6 | 0.2478744 | 0.2773054 | 0.0127160 |
# Predict a class label for every held-out row with the fitted tree.
predict_df <- predict(classification_tree, test_df, type = "class")

# Confusion table: rows are the actual IS_SPAM values, columns the predictions.
prediction_table <- table(test_df$IS_SPAM, predict_df)

# Accuracy = share of rows whose predicted label equals the actual label.
# Labels are compared directly rather than adding cells [1, 1] and [2, 2] of
# the table: that indexing assumes a 2x2 table and fails with "subscript out
# of bounds" when the test rows happen to contain only one class.
# na.rm = TRUE mirrors table(), which leaves NA values out of its counts.
prediction_rate <- mean(
  as.character(predict_df) == as.character(test_df$IS_SPAM),
  na.rm = TRUE
)

# Report the accuracy as a percentage rounded to two decimals.
print(paste0(
  "Our CART model has ",
  round(prediction_rate * 100, 2),
  "% correct prediction rate."
))
## [1] "Our CART model has 90.01% correct prediction rate."