## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
library(BORG)

## ----basic-usage--------------------------------------------------------------
# Create sample data
set.seed(42)
data <- data.frame(
  x1 = rnorm(100),
  x2 = rnorm(100),
  y = rnorm(100)
)

# Define a split
train_idx <- 1:70
test_idx <- 71:100

# Inspect the split
result <- borg_inspect(data, train_idx = train_idx, test_idx = test_idx)
result

## ----overlap-detection--------------------------------------------------------
# Accidental overlap in indices
bad_result <- borg_inspect(data, train_idx = 1:60, test_idx = 51:100)
bad_result

## ----diagnosis-mode-----------------------------------------------------------
# Spatial data with coordinates
set.seed(42)
spatial_data <- data.frame(
  lon = runif(200, -10, 10),
  lat = runif(200, -10, 10),
  elevation = rnorm(200, 500, 100),
  response = rnorm(200)
)

# Let BORG diagnose and create CV folds
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response")
result

## ----validation-mode----------------------------------------------------------
# Validate a manual split
risk <- borg(spatial_data, train_idx = 1:150, test_idx = 151:200)
risk

## ----plot-results, fig.width=7, fig.height=5----------------------------------
# Plot the risk assessment
plot(risk)

## ----summary-results----------------------------------------------------------
# Generate methods text for publications
summary(result)

## ----spatial-example----------------------------------------------------------
result_spatial <- borg(spatial_data, coords = c("lon", "lat"), target = "response")
result_spatial$diagnosis@recommended_cv

## ----temporal-example---------------------------------------------------------
temporal_data <- data.frame(
  date = seq(as.Date("2020-01-01"), by = "day", length.out = 200),
  value = cumsum(rnorm(200))
)
result_temporal <- borg(temporal_data, time = "date", target = "value")
result_temporal$diagnosis@recommended_cv

## ----grouped-example----------------------------------------------------------
grouped_data <- data.frame(
  site = rep(1:20, each = 10),
  measurement = rnorm(200)
)
result_grouped <- borg(grouped_data, groups = "site", target = "measurement")
result_grouped$diagnosis@recommended_cv

## ----target-leakage-----------------------------------------------------------
# Simulate target leakage
leaky_data <- data.frame(
  x = rnorm(100),
  leaked_feature = rnorm(100), # Will be made leaky
  outcome = rnorm(100)
)

# Make leaked_feature highly correlated with outcome
leaky_data$leaked_feature <- leaky_data$outcome + rnorm(100, sd = 0.05)

result <- borg_inspect(
  leaky_data,
  train_idx = 1:70,
  test_idx = 71:100,
  target = "outcome"
)
result

## ----group-leakage------------------------------------------------------------
# Simulate clinical data with patient IDs
clinical_data <- data.frame(
  patient_id = rep(1:10, each = 10),
  visit = rep(1:10, times = 10),
  measurement = rnorm(100)
)

# Random split ignoring patients (BAD)
set.seed(123)
all_idx <- sample(100)
train_idx <- all_idx[1:70]
test_idx <- all_idx[71:100]

# Check for group leakage
result <- borg_inspect(
  clinical_data,
  train_idx = train_idx,
  test_idx = test_idx,
  groups = "patient_id"
)
result

## ----cv-folds-----------------------------------------------------------------
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response", v = 5)

# Number of folds
length(result$folds)

# First fold's train/test sizes
cat(
  "Fold 1 - Train:", length(result$folds[[1]]$train),
  "Test:", length(result$folds[[1]]$test), "\n"
)

## ----certificate--------------------------------------------------------------
# Create a certificate
cert <- borg_certificate(result$diagnosis, data = spatial_data)
cert

## ----export, eval=FALSE-------------------------------------------------------
# # Export to file
# borg_export(result$diagnosis, spatial_data, "validation.yaml")
# borg_export(result$diagnosis, spatial_data, "validation.json")

## ----methods-text-------------------------------------------------------------
# Default APA style
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response")
methods_text <- summary(result)

## ----methods-nature, eval=FALSE-----------------------------------------------
# # Nature style
# summary(result, style = "nature")
#
# # Ecology style
# summary(result, style = "ecology")

## ----methods-with-comparison, eval=FALSE--------------------------------------
# comparison <- borg_compare_cv(spatial_data, response ~ lon + lat,
#                               coords = c("lon", "lat"))
# summary(result, comparison = comparison)

## ----compare-cv---------------------------------------------------------------
comparison <- borg_compare_cv(
  spatial_data,
  formula = response ~ lon + lat,
  coords = c("lon", "lat"),
  v = 5,
  repeats = 5 # Use more repeats in practice
)
print(comparison)

## ----compare-cv-plot, fig.width=7, fig.height=5-------------------------------
plot(comparison)

## ----power-analysis-----------------------------------------------------------
# Clustered data: 20 sites, 10 observations each
clustered_data <- data.frame(
  site = rep(1:20, each = 10),
  value = rep(rnorm(20, sd = 2), each = 10) + rnorm(200, sd = 0.5)
)
pw <- borg_power(clustered_data, groups = "site", target = "value")
print(pw)
summary(pw)