set.seed(20250811)
# install.packages(c("pROC","randomForest","MASS")) # only if missing
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
csv_path <- "diabetes_binary_5050split_health_indicators_BRFSS2015.csv"
df <- read.csv(csv_path)
binary_cols <- c(
"Diabetes_binary","HighBP","HighChol","CholCheck","Smoker","Stroke",
"HeartDiseaseorAttack","PhysActivity","Fruits","Veggies",
"HvyAlcoholConsump","AnyHealthcare","NoDocbcCost","DiffWalk","Sex"
)
df[binary_cols] <- lapply(df[binary_cols], factor)
df$GenHlth <- factor(df$GenHlth, ordered = TRUE, levels = sort(unique(df$GenHlth)))
df$Education <- factor(df$Education, ordered = TRUE, levels = sort(unique(df$Education)))
df$Income <- factor(df$Income, ordered = TRUE, levels = sort(unique(df$Income)))
df$Age <- factor(df$Age, ordered = TRUE, levels = sort(unique(df$Age)))
df$Diabetes_binary <- factor(df$Diabetes_binary, levels = c(0,1), labels = c("no","yes"))
cat("\nOutcome distribution:\n"); print(prop.table(table(df$Diabetes_binary)))
##
## Outcome distribution:
##
## no yes
## 0.5 0.5
stopifnot(all(c("BMI","MentHlth","PhysHlth") %in% names(df)))
strat_split <- function(data, prop = 0.7, y = "Diabetes_binary") {
idx_yes <- which(data[[y]] == "yes")
idx_no <- which(data[[y]] == "no")
n_yes <- floor(length(idx_yes) * prop)
n_no <- floor(length(idx_no) * prop)
tr_idx <- c(sample(idx_yes, n_yes), sample(idx_no, n_no))
tr_idx <- sample(tr_idx)
list(train = data[tr_idx, , drop = FALSE],
test = data[-tr_idx, , drop = FALSE])
}
set.seed(20250811)
spl <- strat_split(df, prop = 0.70, y = "Diabetes_binary")
train <- droplevels(spl$train)
test <- droplevels(spl$test)
if (length(unique(test$Diabetes_binary)) < 2) { # ensure both classes present
set.seed(20250811 + 1)
spl <- strat_split(df, prop = 0.70, y = "Diabetes_binary")
train <- droplevels(spl$train)
test <- droplevels(spl$test)
}
cat("\nTest outcome distribution:\n"); print(table(test$Diabetes_binary))
##
## Test outcome distribution:
##
## no yes
## 10604 10604
# Persist to disk (optional but helpful if R restarts)
saveRDS(train, "train.rds"); saveRDS(test, "test.rds")
train <- readRDS("train.rds"); test <- readRDS("test.rds")
glm_formula <- Diabetes_binary ~ .
# Logistic
log_fit <- glm(glm_formula, data = train, family = binomial(link = "logit"))
# Random Forest (smaller trees for speed; increase later)
p_all <- ncol(train) - 1
mtry_val <- max(2, floor(sqrt(p_all)))
rf_fit <- randomForest(glm_formula, data = train, ntree = 300, mtry = mtry_val,
importance = TRUE, na.action = na.omit)
# LDA (requires numeric matrix)
mm_formula <- ~ . - Diabetes_binary
x_train_mm <- model.matrix(mm_formula, data = train)[, -1, drop = FALSE]
x_test_mm <- model.matrix(mm_formula, data = test)[, -1, drop = FALSE]
nzv <- which(apply(x_train_mm, 2, function(z) length(unique(z)) > 1))
x_train_mm <- x_train_mm[, nzv, drop = FALSE]
x_test_mm <- x_test_mm[, intersect(colnames(x_train_mm), colnames(x_test_mm)), drop = FALSE]
x_test_mm <- x_test_mm[, colnames(x_train_mm), drop = FALSE]
cs <- colMeans(x_train_mm); ss <- apply(x_train_mm, 2, sd); ss[ss==0] <- 1
x_train_lda <- scale(x_train_mm, center = cs, scale = ss)
x_test_lda <- scale(x_test_mm, center = cs, scale = ss)
lda_fit <- lda(x = x_train_lda, grouping = train$Diabetes_binary)
saveRDS(list(log_fit=log_fit, rf_fit=rf_fit, lda_fit=lda_fit,
cs=cs, ss=ss, nzv=nzv, cols=colnames(x_train_mm)),
"models_quick.rds")
models <- readRDS("models_quick.rds")
log_fit <- models$log_fit; rf_fit <- models$rf_fit; lda_fit <- models$lda_fit
cs <- models$cs; ss <- models$ss; cols <- models$cols
# Rebuild LDA test matrix aligned (safe if you ran earlier)
x_test_mm <- model.matrix(~ . - Diabetes_binary, data = test)[, -1, drop = FALSE]
x_test_mm <- x_test_mm[, intersect(cols, colnames(x_test_mm)), drop = FALSE]
x_test_mm <- x_test_mm[, cols, drop = FALSE]
x_test_lda <- scale(x_test_mm, center = cs, scale = ss)
# Helper metrics
bin_metrics <- function(truth, prob_yes, pred_class) {
truth <- factor(truth, levels = c("no","yes"))
pred_class <- factor(pred_class, levels = c("no","yes"))
cm <- table(Predicted = pred_class, Actual = truth)
TP <- cm["yes","yes"]; TN <- cm["no","no"]; FP <- cm["yes","no"]; FN <- cm["no","yes"]
accuracy <- (TP + TN) / sum(cm)
precision <- if ((TP + FP) > 0) TP / (TP + FP) else NA_real_
recall <- if ((TP + FN) > 0) TP / (TP + FN) else NA_real_
f1 <- if (!is.na(precision) && !is.na(recall) && (precision + recall) > 0) {
2 * precision * recall / (precision + recall)
} else NA_real_
roc_obj <- tryCatch(pROC::roc(truth, as.numeric(prob_yes),
levels = c("no","yes"), direction = "<", quiet = TRUE),
error = function(e) NULL)
auc_val <- if (is.null(roc_obj)) NA_real_ else as.numeric(pROC::auc(roc_obj))
list(cm=cm, accuracy=accuracy, precision=precision, recall=recall, f1=f1, auc=auc_val)
}
y_test <- test$Diabetes_binary
# Logistic
log_prob <- as.numeric(predict(log_fit, newdata = test, type = "response"))
log_pred <- ifelse(log_prob > 0.5, "yes", "no")
log_mets <- bin_metrics(y_test, log_prob, log_pred)
# RF
rf_prob <- as.numeric(predict(rf_fit, newdata = test, type = "prob")[,"yes"])
rf_pred <- predict(rf_fit, newdata = test, type = "response")
rf_mets <- bin_metrics(y_test, rf_prob, rf_pred)
# LDA
lda_pred_obj <- predict(lda_fit, newdata = x_test_lda)
lda_prob <- as.numeric(lda_pred_obj$posterior[,"yes"])
lda_pred <- lda_pred_obj$class
lda_mets <- bin_metrics(y_test, lda_prob, lda_pred)
cat("\n=== Test Metrics: Logistic ===\n"); print(log_mets[c("cm","accuracy","precision","recall","f1")]); cat("AUC:", log_mets$auc, "\n")
##
## === Test Metrics: Logistic ===
## $cm
## Actual
## Predicted no yes
## no 7755 2391
## yes 2849 8213
##
## $accuracy
## [1] 0.7529234
##
## $precision
## [1] 0.7424516
##
## $recall
## [1] 0.774519
##
## $f1
## [1] 0.7581464
## AUC: 0.8290489
cat("\n=== Test Metrics: LDA ===\n"); print(lda_mets[c("cm","accuracy","precision","recall","f1")]); cat("AUC:", lda_mets$auc, "\n")
##
## === Test Metrics: LDA ===
## $cm
## Actual
## Predicted no yes
## no 7678 2333
## yes 2926 8271
##
## $accuracy
## [1] 0.7520275
##
## $precision
## [1] 0.73868
##
## $recall
## [1] 0.7799887
##
## $f1
## [1] 0.7587725
## AUC: 0.8278599
cat("\n=== Test Metrics: RF ===\n"); print(rf_mets[c("cm","accuracy","precision","recall","f1")]); cat("AUC:", rf_mets$auc, "\n")
##
## === Test Metrics: RF ===
## $cm
## Actual
## Predicted no yes
## no 7566 2273
## yes 3038 8331
##
## $accuracy
## [1] 0.7495756
##
## $precision
## [1] 0.7327821
##
## $recall
## [1] 0.7856469
##
## $f1
## [1] 0.7582943
## AUC: 0.822329
summ_test <- data.frame(
Model = c("Logistic","LDA","Random Forest"),
Accuracy = c(log_mets$accuracy, lda_mets$accuracy, rf_mets$accuracy),
F1 = c(log_mets$f1, lda_mets$f1, rf_mets$f1),
ROC_AUC = c(log_mets$auc, lda_mets$auc, rf_mets$auc)
)
summ_test[ , -1] <- round(summ_test[ , -1], 4) # round everything except Model
cat("\n===== Test-set summary =====\n"); print(summ_test)
##
## ===== Test-set summary =====
## Model Accuracy F1 ROC_AUC
## 1 Logistic 0.7529 0.7581 0.8290
## 2 LDA 0.7520 0.7588 0.8279
## 3 Random Forest 0.7496 0.7583 0.8223
# Logistic: Odds ratios + Wald CI
co <- coef(summary(log_fit))
OR <- exp(coef(log_fit))
SE <- co[, "Std. Error"]
lower <- exp(coef(log_fit) - 1.96 * SE)
upper <- exp(coef(log_fit) + 1.96 * SE)
log_or <- data.frame(term = names(OR), OR = OR, CI_low = lower, CI_high = upper)
log_or <- log_or[order(-abs(log_or$OR - 1)), ]
cat("\nTop 15 Odds Ratios (Wald CI):\n")
##
## Top 15 Odds Ratios (Wald CI):
print(head(log_or, 15), row.names = FALSE)
## term OR CI_low CI_high
## Age.L 10.20280129 8.53607442 12.19496797
## GenHlth.L 5.20658633 4.73972578 5.71943241
## CholCheck1 3.80309807 3.14046835 4.60554074
## HighBP1 2.05919384 1.96585885 2.15696020
## (Intercept) 0.01106813 0.00842357 0.01454295
## HighChol1 1.74273338 1.66683773 1.82208476
## HvyAlcoholConsump1 0.46858261 0.41807616 0.52519058
## Age.Q 0.56393383 0.47768725 0.66575226
## Sex1 1.32199542 1.26358444 1.38310653
## HeartDiseaseorAttack1 1.31900363 1.23403090 1.40982741
## Income.L 0.68437104 0.62901684 0.74459648
## GenHlth.Q 0.70377985 0.65583667 0.75522779
## Stroke1 1.24584781 1.13178084 1.37141107
## DiffWalk1 1.19060461 1.12044237 1.26516041
## Age.C 0.81257302 0.69604889 0.94860422
# RF: Importance (top 15)
rf_imp <- importance(rf_fit, type = 2)[,1]
rf_imp <- sort(rf_imp, decreasing = TRUE)
cat("\nTop 15 Feature Importances (RF, MeanDecreaseGini):\n")
##
## Top 15 Feature Importances (RF, MeanDecreaseGini):
print(head(rf_imp, 15))
## BMI GenHlth Age
## 3150.1662 2445.2616 2317.7279
## HighBP Income PhysHlth
## 1801.7608 1514.0479 1337.3348
## MentHlth Education HighChol
## 1031.6065 1021.2078 983.4499
## DiffWalk Fruits Smoker
## 614.9496 480.2136 478.8976
## Sex PhysActivity HeartDiseaseorAttack
## 475.0977 425.1304 405.3385
# LDA: Loadings
lda_load <- data.frame(feature = rownames(lda_fit$scaling),
LD1 = lda_fit$scaling[,1], row.names = NULL)
lda_load <- lda_load[order(-abs(lda_load$LD1)), ]
cat("\nTop 15 LDA loadings (|LD1|):\n")
##
## Top 15 LDA loadings (|LD1|):
print(head(lda_load, 15), row.names = FALSE)
## feature LD1
## GenHlth.L 0.43874257
## BMI 0.34491855
## HighBP1 0.32597186
## Age.L 0.29501629
## HighChol1 0.22421508
## HvyAlcoholConsump1 -0.10757572
## CholCheck1 0.10662857
## Sex1 0.09790900
## Income.L -0.09317045
## GenHlth.Q -0.08703876
## HeartDiseaseorAttack1 0.08132589
## GenHlth.C -0.07593671
## DiffWalk1 0.07316943
## Age.C -0.06365464
## Age.Q -0.04381448
# Helper: make K-fold indices (stratified)
kfold_indices <- function(y, k = 10) {
y <- factor(y, levels = c("no","yes"))
idx_no <- which(y == "no"); idx_yes <- which(y == "yes")
folds_no <- split(sample(idx_no), rep(1:k, length.out = length(idx_no)))
folds_yes <- split(sample(idx_yes), rep(1:k, length.out = length(idx_yes)))
Map(function(a,b) c(a,b), folds_no, folds_yes)
}
bin_metrics <- function(truth, prob_yes, pred_class) {
truth <- factor(truth, levels = c("no","yes"))
pred_class <- factor(pred_class, levels = c("no","yes"))
cm <- table(Predicted = pred_class, Actual = truth)
TP <- cm["yes","yes"]; TN <- cm["no","no"]; FP <- cm["yes","no"]; FN <- cm["no","yes"]
accuracy <- (TP + TN) / sum(cm)
precision <- if ((TP + FP) > 0) TP / (TP + FP) else NA_real_
recall <- if ((TP + FN) > 0) TP / (TP + FN) else NA_real_
f1 <- if (!is.na(precision) && !is.na(recall) && (precision + recall) > 0) {
2 * precision * recall / (precision + recall)
} else NA_real_
roc_obj <- tryCatch(pROC::roc(truth, as.numeric(prob_yes),
levels = c("no","yes"), direction = "<", quiet = TRUE),
error = function(e) NULL)
auc_val <- if (is.null(roc_obj)) NA_real_ else as.numeric(pROC::auc(roc_obj))
c(accuracy=accuracy, precision=precision, recall=recall, f1=f1, roc_auc=auc_val)
}
set.seed(20250811)
k <- 10
folds <- kfold_indices(train$Diabetes_binary, k)
# ---- Choose one model at a time by toggling which block you run ----
## (A) Logistic CV
glm_formula <- Diabetes_binary ~ .
cv_log <- replicate(k, NA_real_)
mets <- matrix(NA_real_, nrow = k, ncol = 5,
dimnames = list(NULL, c("accuracy","precision","recall","f1","roc_auc")))
for (i in seq_len(k)) {
te_idx <- folds[[i]]; tr_idx <- setdiff(seq_len(nrow(train)), te_idx)
tr <- train[tr_idx,]; te <- train[te_idx,]
fit <- glm(glm_formula, data = tr, family = binomial)
pr <- as.numeric(predict(fit, newdata = te, type = "response"))
pc <- ifelse(pr > 0.5, "yes", "no")
mets[i,] <- bin_metrics(te$Diabetes_binary, pr, pc)
}
cat("\nLogistic 10-fold CV (mean):\n"); print(colMeans(mets, na.rm = TRUE))
##
## Logistic 10-fold CV (mean):
## accuracy precision recall f1 roc_auc
## 0.7480199 0.7354077 0.7748768 0.7545789 0.8253636
## (B) Random Forest CV (smaller ntree for speed)
# mets <- matrix(NA_real_, nrow = k, ncol = 5,
# dimnames = list(NULL, c("accuracy","precision","recall","f1","roc_auc")))
# p_all <- ncol(train) - 1; mtry_val <- max(2, floor(sqrt(p_all)))
# for (i in seq_len(k)) {
# te_idx <- folds[[i]]; tr_idx <- setdiff(seq_len(nrow(train)), te_idx)
# tr <- train[tr_idx,]; te <- train[te_idx,]
# fit <- randomForest(Diabetes_binary ~ ., data = tr, ntree = 300, mtry = mtry_val,
# importance = FALSE, na.action = na.omit)
# pr <- as.numeric(predict(fit, newdata = te, type = "prob")[,"yes"])
# pc <- predict(fit, newdata = te, type = "response")
# mets[i,] <- bin_metrics(te$Diabetes_binary, pr, pc)
# }
# cat("\nRF 10-fold CV (mean):\n"); print(colMeans(mets, na.rm = TRUE))
## (C) LDA CV
# mets <- matrix(NA_real_, nrow = k, ncol = 5,
# dimnames = list(NULL, c("accuracy","precision","recall","f1","roc_auc")))
# for (i in seq_len(k)) {
# te_idx <- folds[[i]]; tr_idx <- setdiff(seq_len(nrow(train)), te_idx)
# tr <- train[tr_idx,]; te <- train[te_idx,]
# x_tr <- model.matrix(~ . - Diabetes_binary, data = tr)[, -1, drop = FALSE]
# x_te <- model.matrix(~ . - Diabetes_binary, data = te)[, -1, drop = FALSE]
# nzv <- which(apply(x_tr, 2, function(z) length(unique(z)) > 1))
# x_tr <- x_tr[, nzv, drop = FALSE]
# x_te <- x_te[, intersect(colnames(x_tr), colnames(x_te)), drop = FALSE]
# x_te <- x_te[, colnames(x_tr), drop = FALSE]
# cs <- colMeans(x_tr); ss <- apply(x_tr, 2, sd); ss[ss==0] <- 1
# x_trs <- scale(x_tr, center = cs, scale = ss)
# x_tes <- scale(x_te, center = cs, scale = ss)
# fit <- lda(x = x_trs, grouping = tr$Diabetes_binary)
# pred <- predict(fit, newdata = x_tes)
# pr <- as.numeric(pred$posterior[,"yes"])
# pc <- pred$class
# mets[i,] <- bin_metrics(te$Diabetes_binary, pr, pc)
# }
# cat("\nLDA 10-fold CV (mean):\n"); print(colMeans(mets, na.rm = TRUE))
## Warning: package 'ggplot2' was built under R version 4.4.3
| Model | Accuracy | F1 | ROC_AUC |
|---|---|---|---|
| Logistic | 0.7529 | 0.7581 | 0.8290 |
| LDA | 0.7520 | 0.7588 | 0.8279 |
| Random Forest | 0.7496 | 0.7583 | 0.8223 |
Test set Accuracy by Model
ROC curves for Logistic Regression, Random Forest, and LDA
Random Forest: Top 15 features by Gini importance
Logistic Regression: Top odds ratios with 95% CI