set.seed(20250811)

# install.packages(c("pROC","randomForest","MASS"))  # only if missing
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
csv_path <- "diabetes_binary_5050split_health_indicators_BRFSS2015.csv"
df <- read.csv(csv_path)

binary_cols <- c(
  "Diabetes_binary","HighBP","HighChol","CholCheck","Smoker","Stroke",
  "HeartDiseaseorAttack","PhysActivity","Fruits","Veggies",
  "HvyAlcoholConsump","AnyHealthcare","NoDocbcCost","DiffWalk","Sex"
)
df[binary_cols] <- lapply(df[binary_cols], factor)

df$GenHlth   <- factor(df$GenHlth,   ordered = TRUE, levels = sort(unique(df$GenHlth)))
df$Education <- factor(df$Education, ordered = TRUE, levels = sort(unique(df$Education)))
df$Income    <- factor(df$Income,    ordered = TRUE, levels = sort(unique(df$Income)))
df$Age       <- factor(df$Age,       ordered = TRUE, levels = sort(unique(df$Age)))
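# Note: ordered factors get orthogonal polynomial contrasts by default in R,
# which is why terms like Age.L / Age.Q / Age.C (linear/quadratic/cubic)
# show up in the coefficient and loading tables later on.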

# Relabel the outcome (already a factor from the loop above) as "no"/"yes"
df$Diabetes_binary <- factor(df$Diabetes_binary, levels = c(0,1), labels = c("no","yes"))

cat("\nOutcome distribution:\n"); print(prop.table(table(df$Diabetes_binary)))
## 
## Outcome distribution:
## 
##  no yes 
## 0.5 0.5
stopifnot(all(c("BMI","MentHlth","PhysHlth") %in% names(df)))
strat_split <- function(data, prop = 0.7, y = "Diabetes_binary") {
  idx_yes <- which(data[[y]] == "yes")
  idx_no  <- which(data[[y]] == "no")
  n_yes   <- floor(length(idx_yes) * prop)
  n_no    <- floor(length(idx_no)  * prop)
  tr_idx  <- c(sample(idx_yes, n_yes), sample(idx_no, n_no))
  tr_idx  <- sample(tr_idx)
  list(train = data[tr_idx, , drop = FALSE],
       test  = data[-tr_idx, , drop = FALSE])
}
set.seed(20250811)
spl <- strat_split(df, prop = 0.70, y = "Diabetes_binary")
train <- droplevels(spl$train)
test  <- droplevels(spl$test)

if (length(unique(test$Diabetes_binary)) < 2) {  # ensure both classes present
  set.seed(20250811 + 1)
  spl <- strat_split(df, prop = 0.70, y = "Diabetes_binary")
  train <- droplevels(spl$train)
  test  <- droplevels(spl$test)
}

cat("\nTest outcome distribution:\n"); print(table(test$Diabetes_binary))
## 
## Test outcome distribution:
## 
##    no   yes 
## 10604 10604
# Persist to disk (optional but helpful if R restarts)
saveRDS(train, "train.rds"); saveRDS(test, "test.rds")
train <- readRDS("train.rds"); test <- readRDS("test.rds")
glm_formula <- Diabetes_binary ~ .

# Logistic
log_fit <- glm(glm_formula, data = train, family = binomial(link = "logit"))

# Random Forest (smaller trees for speed; increase later)
p_all <- ncol(train) - 1
mtry_val <- max(2, floor(sqrt(p_all)))
rf_fit <- randomForest(glm_formula, data = train, ntree = 300, mtry = mtry_val,
                       importance = TRUE, na.action = na.omit)
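# print(rf_fit) additionally reports the out-of-bag (OOB) error rate, a
# built-in generalization estimate that needs no separate validation split.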

# LDA (requires numeric matrix)
mm_formula <- ~ . - Diabetes_binary
x_train_mm <- model.matrix(mm_formula, data = train)[, -1, drop = FALSE]
x_test_mm  <- model.matrix(mm_formula, data = test)[,  -1, drop = FALSE]
nzv <- which(apply(x_train_mm, 2, function(z) length(unique(z)) > 1))  # keep only non-constant columns
x_train_mm <- x_train_mm[, nzv, drop = FALSE]
# Align test columns to the training design; pad any dummy column missing
# from the test matrix with zeros so the reordering below cannot fail
miss <- setdiff(colnames(x_train_mm), colnames(x_test_mm))
if (length(miss)) x_test_mm <- cbind(x_test_mm, matrix(0, nrow(x_test_mm), length(miss), dimnames = list(NULL, miss)))
x_test_mm  <- x_test_mm[, colnames(x_train_mm), drop = FALSE]
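# Standardizing with training means/SDs below avoids test-set leakage; it does
# not change LDA's class predictions (LDA is affine-invariant) but puts the
# LD1 loadings on a per-SD scale so their magnitudes are comparable.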
cs <- colMeans(x_train_mm); ss <- apply(x_train_mm, 2, sd); ss[ss==0] <- 1
x_train_lda <- scale(x_train_mm, center = cs, scale = ss)
x_test_lda  <- scale(x_test_mm,  center = cs, scale = ss)
lda_fit <- lda(x = x_train_lda, grouping = train$Diabetes_binary)

saveRDS(list(log_fit=log_fit, rf_fit=rf_fit, lda_fit=lda_fit,
             cs=cs, ss=ss, nzv=nzv, cols=colnames(x_train_mm)),
        "models_quick.rds")
models <- readRDS("models_quick.rds")
log_fit <- models$log_fit; rf_fit <- models$rf_fit; lda_fit <- models$lda_fit
cs <- models$cs; ss <- models$ss; cols <- models$cols

# Rebuild the aligned LDA test matrix (safe to re-run, e.g. after an R restart)
x_test_mm  <- model.matrix(~ . - Diabetes_binary, data = test)[, -1, drop = FALSE]
miss <- setdiff(cols, colnames(x_test_mm))  # pad dummies absent from test with zeros
if (length(miss)) x_test_mm <- cbind(x_test_mm, matrix(0, nrow(x_test_mm), length(miss), dimnames = list(NULL, miss)))
x_test_mm  <- x_test_mm[, cols, drop = FALSE]
x_test_lda <- scale(x_test_mm, center = cs, scale = ss)

# Helper metrics
bin_metrics <- function(truth, prob_yes, pred_class) {
  truth <- factor(truth, levels = c("no","yes"))
  pred_class <- factor(pred_class, levels = c("no","yes"))
  cm <- table(Predicted = pred_class, Actual = truth)
  TP <- cm["yes","yes"]; TN <- cm["no","no"]; FP <- cm["yes","no"]; FN <- cm["no","yes"]
  accuracy  <- (TP + TN) / sum(cm)
  precision <- if ((TP + FP) > 0) TP / (TP + FP) else NA_real_
  recall    <- if ((TP + FN) > 0) TP / (TP + FN) else NA_real_
  f1        <- if (!is.na(precision) && !is.na(recall) && (precision + recall) > 0) {
    2 * precision * recall / (precision + recall)
  } else NA_real_
  roc_obj <- tryCatch(pROC::roc(truth, as.numeric(prob_yes),
                                levels = c("no","yes"), direction = "<", quiet = TRUE),
                      error = function(e) NULL)
  auc_val <- if (is.null(roc_obj)) NA_real_ else as.numeric(pROC::auc(roc_obj))
  list(cm=cm, accuracy=accuracy, precision=precision, recall=recall, f1=f1, auc=auc_val)
}
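# Quick sanity check on a toy input (illustrative only, not part of the
# analysis): truth no,no,yes,yes with one false positive gives accuracy 0.75.
# bin_metrics(c("no","no","yes","yes"), c(0.2, 0.6, 0.7, 0.9),
#             c("no","yes","yes","yes"))$accuracy   # 0.75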

y_test <- test$Diabetes_binary

# Logistic
log_prob <- as.numeric(predict(log_fit, newdata = test, type = "response"))
log_pred <- ifelse(log_prob > 0.5, "yes", "no")
log_mets <- bin_metrics(y_test, log_prob, log_pred)
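# Optional: 0.5 is a natural cutoff for this 50/50 outcome, but a threshold
# can also be tuned, e.g. by maximizing Youden's J on the ROC curve. A sketch:
# roc_log <- pROC::roc(y_test, log_prob, levels = c("no","yes"),
#                      direction = "<", quiet = TRUE)
# pROC::coords(roc_log, x = "best", best.method = "youden")  # best threshold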

# RF
rf_prob <- as.numeric(predict(rf_fit, newdata = test, type = "prob")[,"yes"])
rf_pred <- predict(rf_fit, newdata = test, type = "response")
rf_mets <- bin_metrics(y_test, rf_prob, rf_pred)

# LDA
lda_pred_obj <- predict(lda_fit, newdata = x_test_lda)
lda_prob <- as.numeric(lda_pred_obj$posterior[,"yes"])
lda_pred <- lda_pred_obj$class
lda_mets <- bin_metrics(y_test, lda_prob, lda_pred)

cat("\n=== Test Metrics: Logistic ===\n"); print(log_mets[c("cm","accuracy","precision","recall","f1")]); cat("AUC:", log_mets$auc, "\n")
## 
## === Test Metrics: Logistic ===
## $cm
##          Actual
## Predicted   no  yes
##       no  7755 2391
##       yes 2849 8213
## 
## $accuracy
## [1] 0.7529234
## 
## $precision
## [1] 0.7424516
## 
## $recall
## [1] 0.774519
## 
## $f1
## [1] 0.7581464
## AUC: 0.8290489
cat("\n=== Test Metrics: LDA ===\n");      print(lda_mets[c("cm","accuracy","precision","recall","f1")]); cat("AUC:", lda_mets$auc, "\n")
## 
## === Test Metrics: LDA ===
## $cm
##          Actual
## Predicted   no  yes
##       no  7678 2333
##       yes 2926 8271
## 
## $accuracy
## [1] 0.7520275
## 
## $precision
## [1] 0.73868
## 
## $recall
## [1] 0.7799887
## 
## $f1
## [1] 0.7587725
## AUC: 0.8278599
cat("\n=== Test Metrics: RF ===\n");       print(rf_mets[c("cm","accuracy","precision","recall","f1")]); cat("AUC:", rf_mets$auc, "\n")
## 
## === Test Metrics: RF ===
## $cm
##          Actual
## Predicted   no  yes
##       no  7566 2273
##       yes 3038 8331
## 
## $accuracy
## [1] 0.7495756
## 
## $precision
## [1] 0.7327821
## 
## $recall
## [1] 0.7856469
## 
## $f1
## [1] 0.7582943
## AUC: 0.822329
summ_test <- data.frame(
  Model    = c("Logistic","LDA","Random Forest"),
  Accuracy = c(log_mets$accuracy, lda_mets$accuracy, rf_mets$accuracy),
  F1       = c(log_mets$f1,       lda_mets$f1,       rf_mets$f1),
  ROC_AUC  = c(log_mets$auc,      lda_mets$auc,      rf_mets$auc)
)

summ_test[ , -1] <- round(summ_test[ , -1], 4)  # round everything except Model
cat("\n===== Test-set summary =====\n"); print(summ_test)
## 
## ===== Test-set summary =====
##           Model Accuracy     F1 ROC_AUC
## 1      Logistic   0.7529 0.7581  0.8290
## 2           LDA   0.7520 0.7588  0.8279
## 3 Random Forest   0.7496 0.7583  0.8223
# Logistic: Odds ratios + Wald CI
co <- coef(summary(log_fit))
OR  <- exp(coef(log_fit))
SE  <- co[, "Std. Error"]
lower <- exp(coef(log_fit) - 1.96 * SE)
upper <- exp(coef(log_fit) + 1.96 * SE)
log_or <- data.frame(term = names(OR), OR = OR, CI_low = lower, CI_high = upper)
# Rank by distance of the OR from 1. Note this is asymmetric on the OR scale
# (a protective OR of 0.5 ranks below the equivalent risk OR of 2); ranking
# by |log(OR)| would treat the two directions symmetrically.
log_or <- log_or[order(-abs(log_or$OR - 1)), ]
cat("\nTop 15 Odds Ratios (Wald CI):\n")
## 
## Top 15 Odds Ratios (Wald CI):
print(head(log_or, 15), row.names = FALSE)
##                   term          OR     CI_low     CI_high
##                  Age.L 10.20280129 8.53607442 12.19496797
##              GenHlth.L  5.20658633 4.73972578  5.71943241
##             CholCheck1  3.80309807 3.14046835  4.60554074
##                HighBP1  2.05919384 1.96585885  2.15696020
##            (Intercept)  0.01106813 0.00842357  0.01454295
##              HighChol1  1.74273338 1.66683773  1.82208476
##     HvyAlcoholConsump1  0.46858261 0.41807616  0.52519058
##                  Age.Q  0.56393383 0.47768725  0.66575226
##                   Sex1  1.32199542 1.26358444  1.38310653
##  HeartDiseaseorAttack1  1.31900363 1.23403090  1.40982741
##               Income.L  0.68437104 0.62901684  0.74459648
##              GenHlth.Q  0.70377985 0.65583667  0.75522779
##                Stroke1  1.24584781 1.13178084  1.37141107
##              DiffWalk1  1.19060461 1.12044237  1.26516041
##                  Age.C  0.81257302 0.69604889  0.94860422
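# The intervals above are Wald CIs built from asymptotic SEs. A sketch of the
# (usually preferred) profile-likelihood version, left commented because the
# profiling is slow on ~49k training rows:
# or_ci_prof <- exp(confint(log_fit))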
# RF: Importance (top 15)
rf_imp <- importance(rf_fit, type = 2)[,1]
rf_imp <- sort(rf_imp, decreasing = TRUE)
cat("\nTop 15 Feature Importances (RF, MeanDecreaseGini):\n")
## 
## Top 15 Feature Importances (RF, MeanDecreaseGini):
print(head(rf_imp, 15))
##                  BMI              GenHlth                  Age 
##            3150.1662            2445.2616            2317.7279 
##               HighBP               Income             PhysHlth 
##            1801.7608            1514.0479            1337.3348 
##             MentHlth            Education             HighChol 
##            1031.6065            1021.2078             983.4499 
##             DiffWalk               Fruits               Smoker 
##             614.9496             480.2136             478.8976 
##                  Sex         PhysActivity HeartDiseaseorAttack 
##             475.0977             425.1304             405.3385
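# One way to chart these (base graphics); randomForest also ships varImpPlot(rf_fit):
# barplot(rev(head(rf_imp, 15)), horiz = TRUE, las = 1, cex.names = 0.7,
#         main = "RF: Top 15 by MeanDecreaseGini")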
# LDA: Loadings
lda_load <- data.frame(feature = rownames(lda_fit$scaling),
                       LD1 = lda_fit$scaling[,1], row.names = NULL)
lda_load <- lda_load[order(-abs(lda_load$LD1)), ]
cat("\nTop 15 LDA loadings (|LD1|):\n")
## 
## Top 15 LDA loadings (|LD1|):
print(head(lda_load, 15), row.names = FALSE)
##                feature         LD1
##              GenHlth.L  0.43874257
##                    BMI  0.34491855
##                HighBP1  0.32597186
##                  Age.L  0.29501629
##              HighChol1  0.22421508
##     HvyAlcoholConsump1 -0.10757572
##             CholCheck1  0.10662857
##                   Sex1  0.09790900
##               Income.L -0.09317045
##              GenHlth.Q -0.08703876
##  HeartDiseaseorAttack1  0.08132589
##              GenHlth.C -0.07593671
##              DiffWalk1  0.07316943
##                  Age.C -0.06365464
##                  Age.Q -0.04381448
# Helper: make K-fold indices (stratified)
kfold_indices <- function(y, k = 10) {
  y <- factor(y, levels = c("no","yes"))
  idx_no  <- which(y == "no");  idx_yes <- which(y == "yes")
  folds_no  <- split(sample(idx_no),  rep(1:k, length.out = length(idx_no)))
  folds_yes <- split(sample(idx_yes), rep(1:k, length.out = length(idx_yes)))
  Map(function(a,b) c(a,b), folds_no, folds_yes)
}
# Redefined from above: identical metrics, but returned as a named vector so
# each CV fold fills one row of the `mets` matrix.
bin_metrics <- function(truth, prob_yes, pred_class) {
  truth <- factor(truth, levels = c("no","yes"))
  pred_class <- factor(pred_class, levels = c("no","yes"))
  cm <- table(Predicted = pred_class, Actual = truth)
  TP <- cm["yes","yes"]; TN <- cm["no","no"]; FP <- cm["yes","no"]; FN <- cm["no","yes"]
  accuracy  <- (TP + TN) / sum(cm)
  precision <- if ((TP + FP) > 0) TP / (TP + FP) else NA_real_
  recall    <- if ((TP + FN) > 0) TP / (TP + FN) else NA_real_
  f1        <- if (!is.na(precision) && !is.na(recall) && (precision + recall) > 0) {
    2 * precision * recall / (precision + recall)
  } else NA_real_
  roc_obj <- tryCatch(pROC::roc(truth, as.numeric(prob_yes),
                                levels = c("no","yes"), direction = "<", quiet = TRUE),
                      error = function(e) NULL)
  auc_val <- if (is.null(roc_obj)) NA_real_ else as.numeric(pROC::auc(roc_obj))
  c(accuracy=accuracy, precision=precision, recall=recall, f1=f1, roc_auc=auc_val)
}
set.seed(20250811)
k <- 10
folds <- kfold_indices(train$Diabetes_binary, k)
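# Optional sanity check: the folds should exactly partition the training rows
stopifnot(identical(sort(unname(unlist(folds))), seq_len(nrow(train))))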

# ---- Choose one model at a time by toggling which block you run ----

## (A) Logistic CV
glm_formula <- Diabetes_binary ~ .
mets <- matrix(NA_real_, nrow = k, ncol = 5,
               dimnames = list(NULL, c("accuracy","precision","recall","f1","roc_auc")))
for (i in seq_len(k)) {
  te_idx <- folds[[i]]; tr_idx <- setdiff(seq_len(nrow(train)), te_idx)
  tr <- train[tr_idx,]; te <- train[te_idx,]
  fit <- glm(glm_formula, data = tr, family = binomial)
  pr  <- as.numeric(predict(fit, newdata = te, type = "response"))
  pc  <- ifelse(pr > 0.5, "yes", "no")
  mets[i,] <- bin_metrics(te$Diabetes_binary, pr, pc)
}
cat("\nLogistic 10-fold CV (mean):\n"); print(colMeans(mets, na.rm = TRUE))
## 
## Logistic 10-fold CV (mean):
##  accuracy precision    recall        f1   roc_auc 
## 0.7480199 0.7354077 0.7748768 0.7545789 0.8253636
## (B) Random Forest CV (smaller ntree for speed)
# mets <- matrix(NA_real_, nrow = k, ncol = 5,
#                dimnames = list(NULL, c("accuracy","precision","recall","f1","roc_auc")))
# p_all <- ncol(train) - 1; mtry_val <- max(2, floor(sqrt(p_all)))
# for (i in seq_len(k)) {
#   te_idx <- folds[[i]]; tr_idx <- setdiff(seq_len(nrow(train)), te_idx)
#   tr <- train[tr_idx,]; te <- train[te_idx,]
#   fit <- randomForest(Diabetes_binary ~ ., data = tr, ntree = 300, mtry = mtry_val,
#                       importance = FALSE, na.action = na.omit)
#   pr  <- as.numeric(predict(fit, newdata = te, type = "prob")[,"yes"])
#   pc  <- predict(fit, newdata = te, type = "response")
#   mets[i,] <- bin_metrics(te$Diabetes_binary, pr, pc)
# }
# cat("\nRF 10-fold CV (mean):\n"); print(colMeans(mets, na.rm = TRUE))

## (C) LDA CV
# mets <- matrix(NA_real_, nrow = k, ncol = 5,
#                dimnames = list(NULL, c("accuracy","precision","recall","f1","roc_auc")))
# for (i in seq_len(k)) {
#   te_idx <- folds[[i]]; tr_idx <- setdiff(seq_len(nrow(train)), te_idx)
#   tr <- train[tr_idx,]; te <- train[te_idx,]
#   x_tr <- model.matrix(~ . - Diabetes_binary, data = tr)[, -1, drop = FALSE]
#   x_te <- model.matrix(~ . - Diabetes_binary, data = te)[, -1, drop = FALSE]
#   nzv  <- which(apply(x_tr, 2, function(z) length(unique(z)) > 1))
#   x_tr <- x_tr[, nzv, drop = FALSE]
#   miss <- setdiff(colnames(x_tr), colnames(x_te))  # pad fold-absent dummies with zeros
#   if (length(miss)) x_te <- cbind(x_te, matrix(0, nrow(x_te), length(miss), dimnames = list(NULL, miss)))
#   x_te <- x_te[, colnames(x_tr), drop = FALSE]
#   cs <- colMeans(x_tr); ss <- apply(x_tr, 2, sd); ss[ss==0] <- 1
#   x_trs <- scale(x_tr, center = cs, scale = ss)
#   x_tes <- scale(x_te, center = cs, scale = ss)
#   fit <- lda(x = x_trs, grouping = tr$Diabetes_binary)
#   pred <- predict(fit, newdata = x_tes)
#   pr   <- as.numeric(pred$posterior[,"yes"])
#   pc   <- pred$class
#   mets[i,] <- bin_metrics(te$Diabetes_binary, pr, pc)
# }
# cat("\nLDA 10-fold CV (mean):\n"); print(colMeans(mets, na.rm = TRUE))

Results: Key Outputs

Test-set performance (Accuracy, F1, ROC-AUC):

          Model  Accuracy      F1  ROC_AUC
       Logistic    0.7529  0.7581   0.8290
            LDA    0.7520  0.7588   0.8279
  Random Forest    0.7496  0.7583   0.8223

[Figure: Test-set Accuracy by Model]

[Figure: ROC curves for Logistic Regression, Random Forest, and LDA]

[Figure: Random Forest: Top 15 features by Gini importance]

[Figure: Logistic Regression: Top odds ratios with 95% CI]
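
A minimal sketch of how the ROC figure can be reproduced with pROC's
base-graphics plot method, assuming y_test, log_prob, rf_prob, and lda_prob
from the test-set evaluation above are still in scope:

roc_log <- pROC::roc(y_test, log_prob, levels = c("no","yes"), direction = "<", quiet = TRUE)
roc_rf  <- pROC::roc(y_test, rf_prob,  levels = c("no","yes"), direction = "<", quiet = TRUE)
roc_lda <- pROC::roc(y_test, lda_prob, levels = c("no","yes"), direction = "<", quiet = TRUE)
plot(roc_log, col = "steelblue", legacy.axes = TRUE,
     main = "ROC curves: Logistic, Random Forest, LDA")
plot(roc_rf,  col = "darkgreen", add = TRUE)
plot(roc_lda, col = "firebrick", add = TRUE)
legend("bottomright",
       legend = sprintf("%s (AUC = %.3f)", c("Logistic","RF","LDA"),
                        c(log_mets$auc, rf_mets$auc, lda_mets$auc)),
       col = c("steelblue","darkgreen","firebrick"), lwd = 2)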