esoph
In order to examine this dataset, the following libraries are required and loaded. This dataset is already available in base-R under the name of esoph
; therefore, there is no need for reading and loading the dataset.
## Rows: 88
## Columns: 5
## $ agegp <ord> 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 2...
## $ alcgp <ord> 0-39g/day, 0-39g/day, 0-39g/day, 0-39g/day, 40-79, 40-79,...
## $ tobgp <ord> 0-9g/day, 10-19, 20-29, 30+, 0-9g/day, 10-19, 20-29, 30+,...
## $ ncases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...
## $ ncontrols <dbl> 40, 10, 6, 5, 27, 7, 4, 7, 2, 1, 2, 1, 1, 1, 2, 60, 14, 7...
## agegp alcgp tobgp ncases ncontrols
## 25-34:15 0-39g/day:23 0-9g/day:24 Min. : 0.000 Min. : 1.00
## 35-44:15 40-79 :23 10-19 :24 1st Qu.: 0.000 1st Qu.: 3.00
## 45-54:16 80-119 :21 20-29 :20 Median : 1.000 Median : 6.00
## 55-64:16 120+ :21 30+ :20 Mean : 2.273 Mean :11.08
## 65-74:15 3rd Qu.: 4.000 3rd Qu.:14.00
## 75+ :11 Max. :17.000 Max. :60.00
esoph
which comes long in “R” package.If the data set is small, sometimes a boxplot may not be very accurate, as the quartiles are not well estimated from the data and may give a falsely inflated or deflated figure. In those cases, plotting the raw data may be more desirable. This can be done using a strip chart.
We cannot get all inferences about the dataset just by looking at the stripcharts. More detailed analysis continues below.
esoph %>%
group_by(agegp) %>%
summarise(total_cases = sum(ncases),
total_controls = sum(ncontrols),
percentage = 100 * total_cases / (total_cases+total_controls)) %>%
ggplot(., aes(x = agegp, y = percentage, fill = agegp)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Proportion of Cancer Cases over Age Groups", subtitle = "Data Source: `esoph`", x = 'Age Groups', y = "% of Cancer Cases") +
theme_minimal() +
theme(legend.position = "none") +
geom_text(aes(label = paste(format(percentage,digits=1), "%")), size=4.5, position = position_stack(vjust = 0.5))
esoph %>%
group_by(alcgp) %>%
summarise(total_cases = sum(ncases),
total_controls = sum(ncontrols),
percentage = 100 * total_cases / (total_cases+total_controls)) %>%
ggplot(., aes(x = alcgp, y = percentage, fill = alcgp)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Proportion of Cancer Cases over Alcohol Consumption", subtitle = "Data Source: `esoph`", x = 'Alcohol Consumption', y = "% of Cancer Cases") +
theme_minimal() +
theme(legend.position = "none") +
geom_text(aes(label = paste(format(percentage,digits=1), "%")), size=4.5, position = position_stack(vjust = 0.5))
esoph %>%
group_by(tobgp) %>%
summarise(total_cases = sum(ncases),
total_controls = sum(ncontrols),
percentage = 100 * total_cases / (total_cases+total_controls)) %>%
ggplot(., aes(x = tobgp, y = percentage, fill = tobgp)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Proportion of Cancer Cases over Tobacco Consumption", subtitle = "Data Source: `esoph`", x = 'Tobacco Consumption', y = "% of Cancer Cases") +
theme_minimal() +
theme(legend.position = "none") +
geom_text(aes(label = paste(format(percentage,digits=1), "%")), size=4.5, position = position_stack(vjust = 0.5))
esoph %>%
select(-tobgp, -ncontrols) %>%
group_by(agegp, alcgp) %>%
summarize(total_cases = sum(ncases)) %>%
group_by(agegp) %>%
mutate(percentage = 100 * total_cases / sum(total_cases)) %>%
filter(percentage != "NaN" & percentage != 0) %>%
ggplot(., aes(x = agegp, y = percentage, fill = alcgp)) +
geom_col(stat = "identity", position = "fill") +
theme_minimal() +
geom_text(aes(label = paste(format(percentage,digits=1), "%")), size=4, position = "fill", hjust = 0.5, vjust = 1.1) +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Stacked Bar Chart of Case Distribution of Alcohol Consumption by Age Groups", subtitle = "Data Source: `esoph`", x = "Age Groups", y = "% of Cancer Cases", fill = "Alcohol Consumption")
esoph %>%
select(-alcgp, -ncontrols) %>%
group_by(agegp, tobgp) %>%
summarize(total_cases = sum(ncases)) %>%
group_by(agegp) %>%
mutate(percentage = 100 * total_cases / sum(total_cases)) %>%
filter(percentage != "NaN" & percentage != 0) %>%
ggplot(., aes(x = agegp, y = percentage, fill = tobgp)) +
geom_col(stat = "identity", position = "fill") +
theme_minimal() +
geom_text(aes(label = paste(format(percentage,digits=1), "%")), size=4, position = "fill", hjust = 0.5, vjust = 1.1) +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Stacked Bar Chart of Case Distribution of Tobacco Consumption by Age Groups", subtitle = "Data Source: `esoph`", x = "Age Groups", y = "% of Cancer Cases", fill = "Tobacco Consumption")
esoph %>%
select(-agegp, -ncontrols) %>%
group_by(tobgp, alcgp) %>%
summarize(total_cases = sum(ncases)) %>%
group_by(tobgp) %>%
mutate(percentage = 100 * total_cases / sum(total_cases)) %>%
filter(percentage != "NaN" & percentage != 0) %>%
ggplot(., aes(x = tobgp, y = percentage, fill = alcgp)) +
geom_col(stat = "identity", position = "fill") +
theme_minimal() +
geom_text(aes(label = paste(format(percentage,digits=1), "%")), size=4, position = "fill", hjust = 0.5, vjust = 1.1) +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Stacked Bar Chart of Case Distribution of Alcohol Consumption by Tobacco Groups", subtitle = "Data Source: `esoph`", x = "Tobacco Consumption", y = "% of Cancer Cases", fill = "Alcohol Consumption")
esoph %>%
select(-agegp, -ncontrols) %>%
group_by(alcgp, tobgp) %>%
summarize(total_cases = sum(ncases)) %>%
group_by(alcgp) %>%
mutate(percentage = 100 * total_cases / sum(total_cases)) %>%
filter(percentage != "NaN" & percentage != 0) %>%
ggplot(., aes(x = alcgp, y = percentage, fill = tobgp)) +
geom_col(stat = "identity", position = "fill") +
theme_minimal() +
geom_text(aes(label = paste(format(percentage,digits=1), "%")), size=4, position = "fill", hjust = 0.5, vjust = 1.1) +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Stacked Bar Chart of Case Distribution of Tobacco Consumption by Alcohol Groups", subtitle = "Data Source: `esoph`", x = "Alcohol Consumption", y = "% of Cancer Cases", fill = "Tobacco Consumption")
esoph %>%
group_by(agegp) %>%
mutate(total_cases = sum(ncases),
total_controls = sum(ncontrols),
percentage = 100 * total_cases / (total_cases+total_controls)) %>%
ggplot(., aes(x = alcgp, y = tobgp, fill = percentage)) +
geom_tile() +
facet_wrap(~agegp) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_gradient2(low="white", high="red3", guide="colorbar") +
labs(title = "Heatmap of Cancer Cases", x = "Alcohol Consumption", subtitle = "Data Source: `esoph`", y = "Tobacco Consumption", fill = "Cancer Cases (%)")
ggplot(esoph, aes(x = as.factor(agegp), y = ncases, color = alcgp)) +
geom_jitter(size=1.5, position = position_jitter(width = 0.4)) +
theme_minimal() +
scale_color_manual(values=c("green2", "blue2", "yellow2", "red2")) +
labs(title = "Jitter Plot of 'ncases' by Alcohol and Age Groups", x = "Age Groups", y = "Number of Cases", color = "Alcohol Consumption")
require(graphics) # for mosaicplot
## Re-arrange data for a mosaic plot
ttt <- table(esoph$agegp, esoph$alcgp, esoph$tobgp)
o <- with(esoph, order(tobgp, alcgp, agegp))
ttt[ttt == 1] <- esoph$ncases[o]
tt1 <- table(esoph$agegp, esoph$alcgp, esoph$tobgp)
tt1[tt1 == 1] <- esoph$ncontrols[o]
tt <- array(c(ttt, tt1), c(dim(ttt),2),
c(dimnames(ttt), list(c("Cancer", "Control"))))
mosaicplot(tt, main = "Mosaicplot of Cancer Case Distribution", color = TRUE)
esoph$percentage <- esoph$ncases / (esoph$ncontrols+esoph$ncases) #The new column is created to show the cancer percentages
model <- lm(percentage ~ agegp + tobgp + alcgp, data = esoph) #Linear model is created in order to apply anova test
anova(model)
## Analysis of Variance Table
##
## Response: percentage
## Df Sum Sq Mean Sq F value Pr(>F)
## agegp 5 1.23742 0.247484 19.4982 1.918e-12 ***
## tobgp 3 0.03361 0.011205 0.8828 0.454
## alcgp 3 0.86514 0.288381 22.7203 1.333e-10 ***
## Residuals 76 0.96464 0.012693
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
According to the results of the ANOVA test, it was observed that while age and alcohol groups had a serious effect, tobacco groups did not make a significant difference with the p-value 45%.
To build a better model, we can decide whether we should remove tobacco groups from our model by looking at the AIC values.
## [1] 69.62955
## [1] 63.43588
AIC (Akaike’s Information Criterion) allows to compare models with different distributions and with different number of parameters. The best fitting model is the model with the smallest AIC-value. When we remove the tobacco group from our model, we see a serious decrease in AIC value. Therefore, we will proceed without adding tobacco groups to our model.
model <- glm(percentage ~ agegp + alcgp, data = esoph, family = binomial(link = "logit")) #Logistic regression
summary(model)
##
## Call:
## glm(formula = percentage ~ agegp + alcgp, family = binomial(link = "logit"),
## data = esoph)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8486 -0.2650 -0.1225 0.1672 1.1824
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.69863 0.37820 -4.491 7.08e-06 ***
## agegp.L 2.45976 1.06194 2.316 0.0205 *
## agegp.Q -0.94555 0.95610 -0.989 0.3227
## agegp.C -0.01623 0.90430 -0.018 0.9857
## agegp^4 0.44297 0.81404 0.544 0.5863
## agegp^5 -0.17601 0.65825 -0.267 0.7892
## alcgp.L 1.41043 0.64271 2.195 0.0282 *
## alcgp.Q -0.09357 0.60333 -0.155 0.8768
## alcgp.C 0.16453 0.56916 0.289 0.7725
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23.2729 on 87 degrees of freedom
## Residual deviance: 8.5657 on 79 degrees of freedom
## AIC: 63.436
##
## Number of Fisher Scoring iterations: 6
predict_cancer_percentages <- data.frame()
for (i in 1:6) {
for (j in 1:4) {
predict_cancer_percentages[i,j] <- plogis(predict(model, data.frame(agegp = unique(esoph$agegp)[i], alcgp = unique(esoph$alcgp)[j]))) #Prediction
}
}
pivot_longer(predict_cancer_percentages, cols=everything(), names_to = "Alcohol_Consumption", values_to = "Cancer_Percentage") %>%
add_column(.before="Alcohol_Consumption", Age_Group = c(rep("25-34",4),rep("35-44",4),rep("45-54",4),rep("55-64",4),rep("65-74",4),rep("75+",4))) %>%
ggplot(.,aes(x=Age_Group, y=Cancer_Percentage, fill = Alcohol_Consumption)) +
geom_bar(stat = "identity", position = "dodge") +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_discrete(name = "Alcohol Consumption (gm/day)", labels = c("0-39", "40-79", "80-119", "120+")) +
labs(title = "Predicted Cancer Risk Percentages among Alcohol and Age Groups", subtitle = "Prediction Based on Logistic Regression", x = "Age Groups", y = "Predicted Percentage")
predict_cancer_percentages <- data.frame(row.names = c("25-34 years","35-44 years","45-54 years","55-64 years","65-74 years","75+ years"))
for (i in 1:6) {
for (j in 1:4) {
predict_cancer_percentages[i,j] <- paste(round(100*(plogis(predict(model, data.frame(agegp = unique(esoph$agegp)[i], alcgp = unique(esoph$alcgp)[j])))),0),"%",sep="")
}
}
colnames(predict_cancer_percentages) <- c("0-39 gm/day","40-79 gm/day","80-119 gm/day","120+ gm/day")
kable(predict_cancer_percentages, caption = "Predicted Cancer Percentages corresp. to Age and Alcohol Groups")
0-39 gm/day | 40-79 gm/day | 80-119 gm/day | 120+ gm/day | |
---|---|---|---|---|
25-34 years | 1% | 2% | 3% | 7% |
35-44 years | 2% | 5% | 7% | 14% |
45-54 years | 9% | 19% | 26% | 41% |
55-64 years | 12% | 25% | 34% | 50% |
65-74 years | 13% | 26% | 34% | 51% |
75+ years | 15% | 30% | 40% | 56% |
pred_length <- nrow(esoph)
fit_glm_error <- c()
fit_glm_sq_error <- c()
for(i in 1:pred_length){
fit_glm <- glm(percentage ~ agegp + alcgp, family = binomial(link = "logit"), data = esoph[-i,]) #Leave-one-out Cross Validation
fit_glm_pred <- (predict(fit_glm, esoph[i,]))^2
fit_glm_error[i] <- esoph$percentage[i] - fit_glm_pred
fit_glm_sq_error[i] = (esoph$percentage[i] - fit_glm_pred)^2
}
hist(fit_glm_error, breaks = 50, xlim = range(-50,50), title = "Histogram of Errors", xlab = "Fitted GLM Errors")
When the histogram of the errors is drawn, we see that it is normally distributed. A few errors appear to be more than 30. Apart from that, since we obviously want to minimize the error, the fact that the errors are mostly close to zero shows that we have established a good model. In order to evaluate the model better, RMSE is calculated as follows:
## [1] 42.17921
Young People Survey