set.seed(20250811)

# install.packages(c("pROC","randomForest","MASS"))  # only if missing
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
csv_path <- "diabetes_binary_5050split_health_indicators_BRFSS2015.csv"
df <- read.csv(csv_path)

binary_cols <- c(
  "Diabetes_binary","HighBP","HighChol","CholCheck","Smoker","Stroke",
  "HeartDiseaseorAttack","PhysActivity","Fruits","Veggies",
  "HvyAlcoholConsump","AnyHealthcare","NoDocbcCost","DiffWalk","Sex"
)
df[binary_cols] <- lapply(df[binary_cols], factor)

df$GenHlth   <- factor(df$GenHlth,   ordered = TRUE, levels = sort(unique(df$GenHlth)))
df$Education <- factor(df$Education, ordered = TRUE, levels = sort(unique(df$Education)))
df$Income    <- factor(df$Income,    ordered = TRUE, levels = sort(unique(df$Income)))
df$Age       <- factor(df$Age,       ordered = TRUE, levels = sort(unique(df$Age)))
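# Note: ordered factors get orthogonal polynomial contrasts by default in R,
# which is why terms like Age.L / Age.Q / Age.C (linear/quadratic/cubic)
# show up in the coefficient and loading tables later on.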

# Relabel the outcome (already a factor from the loop above) as "no"/"yes"
df$Diabetes_binary <- factor(df$Diabetes_binary, levels = c(0,1), labels = c("no","yes"))

cat("\nOutcome distribution:\n"); print(prop.table(table(df$Diabetes_binary)))
## 
## Outcome distribution:
## 
##  no yes 
## 0.5 0.5
stopifnot(all(c("BMI","MentHlth","PhysHlth") %in% names(df)))
strat_split <- function(data, prop = 0.7, y = "Diabetes_binary") {
  idx_yes <- which(data[[y]] == "yes")
  idx_no  <- which(data[[y]] == "no")
  n_yes   <- floor(length(idx_yes) * prop)
  n_no    <- floor(length(idx_no)  * prop)
  tr_idx  <- c(sample(idx_yes, n_yes), sample(idx_no, n_no))
  tr_idx  <- sample(tr_idx)
  list(train = data[tr_idx, , drop = FALSE],
       test  = data[-tr_idx, , drop = FALSE])
}
set.seed(20250811)
spl <- strat_split(df, prop = 0.70, y = "Diabetes_binary")
train <- droplevels(spl$train)
test  <- droplevels(spl$test)

if (length(unique(test$Diabetes_binary)) < 2) {  # ensure both classes present
  set.seed(20250811 + 1)
  spl <- strat_split(df, prop = 0.70, y = "Diabetes_binary")
  train <- droplevels(spl$train)
  test  <- droplevels(spl$test)
}

cat("\nTest outcome distribution:\n"); print(table(test$Diabetes_binary))
## 
## Test outcome distribution:
## 
##    no   yes 
## 10604 10604
# Persist to disk (optional but helpful if R restarts)
saveRDS(train, "train.rds"); saveRDS(test, "test.rds")
train <- readRDS("train.rds"); test <- readRDS("test.rds")
glm_formula <- Diabetes_binary ~ .

# Logistic
log_fit <- glm(glm_formula, data = train, family = binomial(link = "logit"))

# Random Forest (smaller trees for speed; increase later)
p_all <- ncol(train) - 1
mtry_val <- max(2, floor(sqrt(p_all)))
rf_fit <- randomForest(glm_formula, data = train, ntree = 300, mtry = mtry_val,
                       importance = TRUE, na.action = na.omit)
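# print(rf_fit) additionally reports the out-of-bag (OOB) error rate, a
# built-in generalization estimate that needs no separate validation split.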

# LDA (requires numeric matrix)
mm_formula <- ~ . - Diabetes_binary
x_train_mm <- model.matrix(mm_formula, data = train)[, -1, drop = FALSE]
x_test_mm  <- model.matrix(mm_formula, data = test)[,  -1, drop = FALSE]
nzv <- which(apply(x_train_mm, 2, function(z) length(unique(z)) > 1))  # keep only non-constant columns
x_train_mm <- x_train_mm[, nzv, drop = FALSE]
# Align test columns to the training design; pad any dummy column missing
# from the test matrix with zeros so the reordering below cannot fail
miss <- setdiff(colnames(x_train_mm), colnames(x_test_mm))
if (length(miss)) x_test_mm <- cbind(x_test_mm, matrix(0, nrow(x_test_mm), length(miss), dimnames = list(NULL, miss)))
x_test_mm  <- x_test_mm[, colnames(x_train_mm), drop = FALSE]
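# Standardizing with training means/SDs below avoids test-set leakage; it does
# not change LDA's class predictions (LDA is affine-invariant) but puts the
# LD1 loadings on a per-SD scale so their magnitudes are comparable.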
cs <- colMeans(x_train_mm); ss <- apply(x_train_mm, 2, sd); ss[ss==0] <- 1
x_train_lda <- scale(x_train_mm, center = cs, scale = ss)
x_test_lda  <- scale(x_test_mm,  center = cs, scale = ss)
lda_fit <- lda(x = x_train_lda, grouping = train$Diabetes_binary)

saveRDS(list(log_fit=log_fit, rf_fit=rf_fit, lda_fit=lda_fit,
             cs=cs, ss=ss, nzv=nzv, cols=colnames(x_train_mm)),
        "models_quick.rds")
models <- readRDS("models_quick.rds")
log_fit <- models$log_fit; rf_fit <- models$rf_fit; lda_fit <- models$lda_fit
cs <- models$cs; ss <- models$ss; cols <- models$cols

# Rebuild the aligned LDA test matrix (safe to re-run, e.g. after an R restart)
x_test_mm  <- model.matrix(~ . - Diabetes_binary, data = test)[, -1, drop = FALSE]
miss <- setdiff(cols, colnames(x_test_mm))  # pad dummies absent from test with zeros
if (length(miss)) x_test_mm <- cbind(x_test_mm, matrix(0, nrow(x_test_mm), length(miss), dimnames = list(NULL, miss)))
x_test_mm  <- x_test_mm[, cols, drop = FALSE]
x_test_lda <- scale(x_test_mm, center = cs, scale = ss)

# Helper metrics
bin_metrics <- function(truth, prob_yes, pred_class) {
  truth <- factor(truth, levels = c("no","yes"))
  pred_class <- factor(pred_class, levels = c("no","yes"))
  cm <- table(Predicted = pred_class, Actual = truth)
  TP <- cm["yes","yes"]; TN <- cm["no","no"]; FP <- cm["yes","no"]; FN <- cm["no","yes"]
  accuracy  <- (TP + TN) / sum(cm)
  precision <- if ((TP + FP) > 0) TP / (TP + FP) else NA_real_
  recall    <- if ((TP + FN) > 0) TP / (TP + FN) else NA_real_
  f1        <- if (!is.na(precision) && !is.na(recall) && (precision + recall) > 0) {
    2 * precision * recall / (precision + recall)
  } else NA_real_
  roc_obj <- tryCatch(pROC::roc(truth, as.numeric(prob_yes),
                                levels = c("no","yes"), direction = "<", quiet = TRUE),
                      error = function(e) NULL)
  auc_val <- if (is.null(roc_obj)) NA_real_ else as.numeric(pROC::auc(roc_obj))
  list(cm=cm, accuracy=accuracy, precision=precision, recall=recall, f1=f1, auc=auc_val)
}
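# Quick sanity check on a toy input (illustrative only, not part of the
# analysis): truth no,no,yes,yes with one false positive gives accuracy 0.75.
# bin_metrics(c("no","no","yes","yes"), c(0.2, 0.6, 0.7, 0.9),
#             c("no","yes","yes","yes"))$accuracy   # 0.75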

y_test <- test$Diabetes_binary

# Logistic
log_prob <- as.numeric(predict(log_fit, newdata = test, type = "response"))
log_pred <- ifelse(log_prob > 0.5, "yes", "no")
log_mets <- bin_metrics(y_test, log_prob, log_pred)
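# Optional: 0.5 is a natural cutoff for this 50/50 outcome, but a threshold
# can also be tuned, e.g. by maximizing Youden's J on the ROC curve. A sketch:
# roc_log <- pROC::roc(y_test, log_prob, levels = c("no","yes"),
#                      direction = "<", quiet = TRUE)
# pROC::coords(roc_log, x = "best", best.method = "youden")  # best threshold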

# RF
rf_prob <- as.numeric(predict(rf_fit, newdata = test, type = "prob")[,"yes"])
rf_pred <- predict(rf_fit, newdata = test, type = "response")
rf_mets <- bin_metrics(y_test, rf_prob, rf_pred)

# LDA
lda_pred_obj <- predict(lda_fit, newdata = x_test_lda)
lda_prob <- as.numeric(lda_pred_obj$posterior[,"yes"])
lda_pred <- lda_pred_obj$class
lda_mets <- bin_metrics(y_test, lda_prob, lda_pred)

cat("\n=== Test Metrics: Logistic ===\n"); print(log_mets[c("cm","accuracy","precision","recall","f1")]); cat("AUC:", log_mets$auc, "\n")
## 
## === Test Metrics: Logistic ===
## $cm
##          Actual
## Predicted   no  yes
##       no  7755 2391
##       yes 2849 8213
## 
## $accuracy
## [1] 0.7529234
## 
## $precision
## [1] 0.7424516
## 
## $recall
## [1] 0.774519
## 
## $f1
## [1] 0.7581464
## AUC: 0.8290489
cat("\n=== Test Metrics: LDA ===\n");      print(lda_mets[c("cm","accuracy","precision","recall","f1")]); cat("AUC:", lda_mets$auc, "\n")
## 
## === Test Metrics: LDA ===
## $cm
##          Actual
## Predicted   no  yes
##       no  7678 2333
##       yes 2926 8271
## 
## $accuracy
## [1] 0.7520275
## 
## $precision
## [1] 0.73868
## 
## $recall
## [1] 0.7799887
## 
## $f1
## [1] 0.7587725
## AUC: 0.8278599
cat("\n=== Test Metrics: RF ===\n");       print(rf_mets[c("cm","accuracy","precision","recall","f1")]); cat("AUC:", rf_mets$auc, "\n")
## 
## === Test Metrics: RF ===
## $cm
##          Actual
## Predicted   no  yes
##       no  7566 2273
##       yes 3038 8331
## 
## $accuracy
## [1] 0.7495756
## 
## $precision
## [1] 0.7327821
## 
## $recall
## [1] 0.7856469
## 
## $f1
## [1] 0.7582943
## AUC: 0.822329
summ_test <- data.frame(
  Model    = c("Logistic","LDA","Random Forest"),
  Accuracy = c(log_mets$accuracy, lda_mets$accuracy, rf_mets$accuracy),
  F1       = c(log_mets$f1,       lda_mets$f1,       rf_mets$f1),
  ROC_AUC  = c(log_mets$auc,      lda_mets$auc,      rf_mets$auc)
)

summ_test[ , -1] <- round(summ_test[ , -1], 4)  # round everything except Model
cat("\n===== Test-set summary =====\n"); print(summ_test)
## 
## ===== Test-set summary =====
##           Model Accuracy     F1 ROC_AUC
## 1      Logistic   0.7529 0.7581  0.8290
## 2           LDA   0.7520 0.7588  0.8279
## 3 Random Forest   0.7496 0.7583  0.8223
# Logistic: Odds ratios + Wald CI
co <- coef(summary(log_fit))
OR  <- exp(coef(log_fit))
SE  <- co[, "Std. Error"]
lower <- exp(coef(log_fit) - 1.96 * SE)
upper <- exp(coef(log_fit) + 1.96 * SE)
log_or <- data.frame(term = names(OR), OR = OR, CI_low = lower, CI_high = upper)
# Rank by distance of the OR from 1. Note this is asymmetric on the OR scale
# (a protective OR of 0.5 ranks below the equivalent risk OR of 2); ranking
# by |log(OR)| would treat the two directions symmetrically.
log_or <- log_or[order(-abs(log_or$OR - 1)), ]
cat("\nTop 15 Odds Ratios (Wald CI):\n")
## 
## Top 15 Odds Ratios (Wald CI):
print(head(log_or, 15), row.names = FALSE)
##                   term          OR     CI_low     CI_high
##                  Age.L 10.20280129 8.53607442 12.19496797
##              GenHlth.L  5.20658633 4.73972578  5.71943241
##             CholCheck1  3.80309807 3.14046835  4.60554074
##                HighBP1  2.05919384 1.96585885  2.15696020
##            (Intercept)  0.01106813 0.00842357  0.01454295
##              HighChol1  1.74273338 1.66683773  1.82208476
##     HvyAlcoholConsump1  0.46858261 0.41807616  0.52519058
##                  Age.Q  0.56393383 0.47768725  0.66575226
##                   Sex1  1.32199542 1.26358444  1.38310653
##  HeartDiseaseorAttack1  1.31900363 1.23403090  1.40982741
##               Income.L  0.68437104 0.62901684  0.74459648
##              GenHlth.Q  0.70377985 0.65583667  0.75522779
##                Stroke1  1.24584781 1.13178084  1.37141107
##              DiffWalk1  1.19060461 1.12044237  1.26516041
##                  Age.C  0.81257302 0.69604889  0.94860422
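# The intervals above are Wald CIs built from asymptotic SEs. A sketch of the
# (usually preferred) profile-likelihood version, left commented because the
# profiling is slow on ~49k training rows:
# or_ci_prof <- exp(confint(log_fit))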
# RF: Importance (top 15)
rf_imp <- importance(rf_fit, type = 2)[,1]
rf_imp <- sort(rf_imp, decreasing = TRUE)
cat("\nTop 15 Feature Importances (RF, MeanDecreaseGini):\n")
## 
## Top 15 Feature Importances (RF, MeanDecreaseGini):
print(head(rf_imp, 15))
##                  BMI              GenHlth                  Age 
##            3150.1662            2445.2616            2317.7279 
##               HighBP               Income             PhysHlth 
##            1801.7608            1514.0479            1337.3348 
##             MentHlth            Education             HighChol 
##            1031.6065            1021.2078             983.4499 
##             DiffWalk               Fruits               Smoker 
##             614.9496             480.2136             478.8976 
##                  Sex         PhysActivity HeartDiseaseorAttack 
##             475.0977             425.1304             405.3385
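# One way to chart these (base graphics); randomForest also ships varImpPlot(rf_fit):
# barplot(rev(head(rf_imp, 15)), horiz = TRUE, las = 1, cex.names = 0.7,
#         main = "RF: Top 15 by MeanDecreaseGini")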
# LDA: Loadings
lda_load <- data.frame(feature = rownames(lda_fit$scaling),
                       LD1 = lda_fit$scaling[,1], row.names = NULL)
lda_load <- lda_load[order(-abs(lda_load$LD1)), ]
cat("\nTop 15 LDA loadings (|LD1|):\n")
## 
## Top 15 LDA loadings (|LD1|):
print(head(lda_load, 15), row.names = FALSE)
##                feature         LD1
##              GenHlth.L  0.43874257
##                    BMI  0.34491855
##                HighBP1  0.32597186
##                  Age.L  0.29501629
##              HighChol1  0.22421508
##     HvyAlcoholConsump1 -0.10757572
##             CholCheck1  0.10662857
##                   Sex1  0.09790900
##               Income.L -0.09317045
##              GenHlth.Q -0.08703876
##  HeartDiseaseorAttack1  0.08132589
##              GenHlth.C -0.07593671
##              DiffWalk1  0.07316943
##                  Age.C -0.06365464
##                  Age.Q -0.04381448
# Helper: make K-fold indices (stratified)
kfold_indices <- function(y, k = 10) {
  y <- factor(y, levels = c("no","yes"))
  idx_no  <- which(y == "no");  idx_yes <- which(y == "yes")
  folds_no  <- split(sample(idx_no),  rep(1:k, length.out = length(idx_no)))
  folds_yes <- split(sample(idx_yes), rep(1:k, length.out = length(idx_yes)))
  Map(function(a,b) c(a,b), folds_no, folds_yes)
}
# Redefined from above: identical metrics, but returned as a named vector so
# each CV fold fills one row of the `mets` matrix.
bin_metrics <- function(truth, prob_yes, pred_class) {
  truth <- factor(truth, levels = c("no","yes"))
  pred_class <- factor(pred_class, levels = c("no","yes"))
  cm <- table(Predicted = pred_class, Actual = truth)
  TP <- cm["yes","yes"]; TN <- cm["no","no"]; FP <- cm["yes","no"]; FN <- cm["no","yes"]
  accuracy  <- (TP + TN) / sum(cm)
  precision <- if ((TP + FP) > 0) TP / (TP + FP) else NA_real_
  recall    <- if ((TP + FN) > 0) TP / (TP + FN) else NA_real_
  f1        <- if (!is.na(precision) && !is.na(recall) && (precision + recall) > 0) {
    2 * precision * recall / (precision + recall)
  } else NA_real_
  roc_obj <- tryCatch(pROC::roc(truth, as.numeric(prob_yes),
                                levels = c("no","yes"), direction = "<", quiet = TRUE),
                      error = function(e) NULL)
  auc_val <- if (is.null(roc_obj)) NA_real_ else as.numeric(pROC::auc(roc_obj))
  c(accuracy=accuracy, precision=precision, recall=recall, f1=f1, roc_auc=auc_val)
}
set.seed(20250811)
k <- 10
folds <- kfold_indices(train$Diabetes_binary, k)
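# Optional sanity check: the folds should exactly partition the training rows
stopifnot(identical(sort(unname(unlist(folds))), seq_len(nrow(train))))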

# ---- Choose one model at a time by toggling which block you run ----

## (A) Logistic CV
glm_formula <- Diabetes_binary ~ .
mets <- matrix(NA_real_, nrow = k, ncol = 5,
               dimnames = list(NULL, c("accuracy","precision","recall","f1","roc_auc")))
for (i in seq_len(k)) {
  te_idx <- folds[[i]]; tr_idx <- setdiff(seq_len(nrow(train)), te_idx)
  tr <- train[tr_idx,]; te <- train[te_idx,]
  fit <- glm(glm_formula, data = tr, family = binomial)
  pr  <- as.numeric(predict(fit, newdata = te, type = "response"))
  pc  <- ifelse(pr > 0.5, "yes", "no")
  mets[i,] <- bin_metrics(te$Diabetes_binary, pr, pc)
}
cat("\nLogistic 10-fold CV (mean):\n"); print(colMeans(mets, na.rm = TRUE))
## 
## Logistic 10-fold CV (mean):
##  accuracy precision    recall        f1   roc_auc 
## 0.7480199 0.7354077 0.7748768 0.7545789 0.8253636
## (B) Random Forest CV (smaller ntree for speed)
# mets <- matrix(NA_real_, nrow = k, ncol = 5,
#                dimnames = list(NULL, c("accuracy","precision","recall","f1","roc_auc")))
# p_all <- ncol(train) - 1; mtry_val <- max(2, floor(sqrt(p_all)))
# for (i in seq_len(k)) {
#   te_idx <- folds[[i]]; tr_idx <- setdiff(seq_len(nrow(train)), te_idx)
#   tr <- train[tr_idx,]; te <- train[te_idx,]
#   fit <- randomForest(Diabetes_binary ~ ., data = tr, ntree = 300, mtry = mtry_val,
#                       importance = FALSE, na.action = na.omit)
#   pr  <- as.numeric(predict(fit, newdata = te, type = "prob")[,"yes"])
#   pc  <- predict(fit, newdata = te, type = "response")
#   mets[i,] <- bin_metrics(te$Diabetes_binary, pr, pc)
# }
# cat("\nRF 10-fold CV (mean):\n"); print(colMeans(mets, na.rm = TRUE))

## (C) LDA CV
# mets <- matrix(NA_real_, nrow = k, ncol = 5,
#                dimnames = list(NULL, c("accuracy","precision","recall","f1","roc_auc")))
# for (i in seq_len(k)) {
#   te_idx <- folds[[i]]; tr_idx <- setdiff(seq_len(nrow(train)), te_idx)
#   tr <- train[tr_idx,]; te <- train[te_idx,]
#   x_tr <- model.matrix(~ . - Diabetes_binary, data = tr)[, -1, drop = FALSE]
#   x_te <- model.matrix(~ . - Diabetes_binary, data = te)[, -1, drop = FALSE]
#   nzv  <- which(apply(x_tr, 2, function(z) length(unique(z)) > 1))
#   x_tr <- x_tr[, nzv, drop = FALSE]
#   miss <- setdiff(colnames(x_tr), colnames(x_te))  # pad fold-absent dummies with zeros
#   if (length(miss)) x_te <- cbind(x_te, matrix(0, nrow(x_te), length(miss), dimnames = list(NULL, miss)))
#   x_te <- x_te[, colnames(x_tr), drop = FALSE]
#   cs <- colMeans(x_tr); ss <- apply(x_tr, 2, sd); ss[ss==0] <- 1
#   x_trs <- scale(x_tr, center = cs, scale = ss)
#   x_tes <- scale(x_te, center = cs, scale = ss)
#   fit <- lda(x = x_trs, grouping = tr$Diabetes_binary)
#   pred <- predict(fit, newdata = x_tes)
#   pr   <- as.numeric(pred$posterior[,"yes"])
#   pc   <- pred$class
#   mets[i,] <- bin_metrics(te$Diabetes_binary, pr, pc)
# }
# cat("\nLDA 10-fold CV (mean):\n"); print(colMeans(mets, na.rm = TRUE))

Results: Key Outputs

Test-set performance (Accuracy, F1, ROC-AUC):

          Model  Accuracy      F1  ROC_AUC
       Logistic    0.7529  0.7581   0.8290
            LDA    0.7520  0.7588   0.8279
  Random Forest    0.7496  0.7583   0.8223

[Figure: Test-set Accuracy by Model]

[Figure: ROC curves for Logistic Regression, Random Forest, and LDA]

[Figure: Random Forest: Top 15 features by Gini importance]

[Figure: Logistic Regression: Top odds ratios with 95% CI]
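
A minimal sketch of how the ROC figure can be reproduced with pROC's
base-graphics plot method, assuming y_test, log_prob, rf_prob, and lda_prob
from the test-set evaluation above are still in scope:

roc_log <- pROC::roc(y_test, log_prob, levels = c("no","yes"), direction = "<", quiet = TRUE)
roc_rf  <- pROC::roc(y_test, rf_prob,  levels = c("no","yes"), direction = "<", quiet = TRUE)
roc_lda <- pROC::roc(y_test, lda_prob, levels = c("no","yes"), direction = "<", quiet = TRUE)
plot(roc_log, col = "steelblue", legacy.axes = TRUE,
     main = "ROC curves: Logistic, Random Forest, LDA")
plot(roc_rf,  col = "darkgreen", add = TRUE)
plot(roc_lda, col = "firebrick", add = TRUE)
legend("bottomright",
       legend = sprintf("%s (AUC = %.3f)", c("Logistic","RF","LDA"),
                        c(log_mets$auc, rf_mets$auc, lda_mets$auc)),
       col = c("steelblue","darkgreen","firebrick"), lwd = 2)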