Preparing data for analysis

# Read the raw file; col_names = FALSE makes readr treat the first row as data
# rather than as a header.
raw_df <- read_csv("data/spambase.data", col_names = FALSE)

# Descriptive column names, in file order: 54 FREQ_* columns, three LENGTH_*
# columns, and the IS_SPAM label (58 names in total).
clean_names <- c(
  "FREQ_MAKE", "FREQ_ADDRESS", "FREQ_ALL", "FREQ_3D", "FREQ_OUR",
  "FREQ_OVER", "FREQ_REMOVE", "FREQ_INTERNET", "FREQ_ORDER", "FREQ_MAIL",
  "FREQ_RECEIVE", "FREQ_WILL", "FREQ_PEOPLE", "FREQ_REPORT",
  "FREQ_ADDRESSES", "FREQ_FREE", "FREQ_BUSINESS", "FREQ_EMAIL", "FREQ_YOU",
  "FREQ_CREDIT", "FREQ_YOUR", "FREQ_FONT", "FREQ_000", "FREQ_MONEY",
  "FREQ_HP", "FREQ_HPL", "FREQ_GEORGE", "FREQ_650", "FREQ_LAB", "FREQ_LABS",
  "FREQ_TELNET", "FREQ_857", "FREQ_DATA", "FREQ_415", "FREQ_85",
  "FREQ_TECHNOLOGY", "FREQ_1999", "FREQ_PARTS", "FREQ_PM", "FREQ_DIRECT",
  "FREQ_CS", "FREQ_MEETING", "FREQ_ORIGINAL", "FREQ_PROJECT", "FREQ_RE",
  "FREQ_EDU", "FREQ_TABLE", "FREQ_CONFERENCE",
  "FREQ_;", "FREQ_(", "FREQ_[", "FREQ_!", "FREQ_$", "FREQ_#",
  "LENGTH_AVERAGE", "LENGTH_LONGEST", "LENGTH_TOTAL",
  "IS_SPAM"
)

# Same data as raw_df, with the descriptive names attached.
clean_df <- raw_df
names(clean_df) <- clean_names

# Preview the first five rows of the first five and last five columns.
knitr::kable(
  head(clean_df[c(1:5, 54:58)], n = 5),
  format = "markdown",
  align = "cc"
)
FREQ_MAKE FREQ_ADDRESS FREQ_ALL FREQ_3D FREQ_OUR FREQ_# LENGTH_AVERAGE LENGTH_LONGEST LENGTH_TOTAL IS_SPAM
0.00 0.64 0.64 0 0.32 0.000 3.756 61 278 1
0.21 0.28 0.50 0 0.14 0.048 5.114 101 1028 1
0.06 0.00 0.71 0 1.23 0.010 9.821 485 2259 1
0.00 0.00 0.00 0 0.63 0.000 3.537 40 191 1
0.00 0.00 0.00 0 0.63 0.000 3.537 40 191 1

Splitting the data frame into training and test sets

We are going to split the data frame, using 85% of the rows as the training dataset and the remaining 15% as a test dataset to check our prediction rate.

# Fix the RNG seed so the random split below is reproducible.
set.seed(9999)

# Number of training rows: 85% of the data, rounded down.
sample_size <- floor(0.85 * nrow(clean_df))

# Draw the training row numbers without replacement.
train_index <- sample.int(nrow(clean_df), size = sample_size)

# Sampled rows form the training set; every remaining row goes to the test set.
train_df <- clean_df[train_index, ]
test_df <- clean_df[-train_index, ]

# Preview the first five test rows of the first five and last five columns.
knitr::kable(
  head(test_df[c(1:5, 54:58)], n = 5),
  format = "markdown",
  align = "cc"
)
FREQ_MAKE FREQ_ADDRESS FREQ_ALL FREQ_3D FREQ_OUR FREQ_# LENGTH_AVERAGE LENGTH_LONGEST LENGTH_TOTAL IS_SPAM
0.00 0.00 0.00 0 0.90 0.000 2.083 7 25 1
0.00 0.00 1.42 0 0.71 0.000 1.971 24 205 1
0.00 0.00 0.00 0 0.52 0.147 2.145 38 339 1
0.15 0.45 1.05 0 0.45 0.000 5.301 130 774 1
0.18 0.00 0.18 0 1.57 0.000 1.733 12 442 1

Building Classification Tree

# Fit a classification tree that predicts IS_SPAM from all other columns of
# the training set.
classification_tree <- rpart(
  IS_SPAM ~ .,
  data = train_df,
  method = "class"
)

# Plot the fitted tree.
rpart.plot(
  classification_tree,
  type = 3,
  box.palette = "Blues"
)

## CP Table of Classification Tree

We will check the CP (complexity parameter) table of the classification tree before we prune the tree. Pruning is not strictly necessary here since our dataset is small, but it is good practice to prune a tree to optimize its performance.

# Pull the complexity-parameter table out of the fitted tree and render it.
cls_tree_cpt <- classification_tree[["cptable"]]
knitr::kable(
  cls_tree_cpt,
  format = "markdown",
  align = "cc"
)
CP nsplit rel error xerror xstd
0.4754742 0 1.0000000 1.0000000 0.0199566
0.1451929 1 0.5245258 0.5493787 0.0167963
0.0464356 2 0.3793329 0.4401570 0.0154377
0.0405494 3 0.3328973 0.3603663 0.0142293
0.0327011 4 0.2923479 0.3348594 0.0137959
0.0117724 5 0.2596468 0.2890778 0.0129496
0.0100000 6 0.2478744 0.2773054 0.0127160

Measuring our prediction rate

# Predict a class label for every row of the held-out test set.
predict_df <- predict(classification_tree, test_df, type = "class")

# Cross-tabulation of actual labels (rows) against predicted labels (columns).
prediction_table <- table(test_df$IS_SPAM, predict_df)

# Accuracy: share of test rows whose predicted label equals the actual label.
# The labels are compared directly (as text) instead of summing the
# hard-coded cells [1, 1] and [2, 2], which assumes a full 2x2 table and
# fails with a subscript error when a class is missing from the test labels.
# Rows with a missing label are skipped, as table() skips them.
prediction_rate <- mean(
  as.character(predict_df) == as.character(test_df$IS_SPAM),
  na.rm = TRUE
)

# Report the accuracy as a percentage rounded to two decimals.
print(paste0(
  "Our CART model has ",
  round(prediction_rate * 100, 2),
  "% correct prediction rate."
))
## [1] "Our CART model has 90.01% correct prediction rate."

References

How to split data into training/testing sets using sample function

Regression Tree