## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) library(BORG) # Check package availability has_caret <- requireNamespace("caret", quietly = TRUE) has_recipes <- requireNamespace("recipes", quietly = TRUE) has_rsample <- requireNamespace("rsample", quietly = TRUE) has_mlr3 <- requireNamespace("mlr3", quietly = TRUE) ## ----base-r------------------------------------------------------------------- # Create data data <- iris set.seed(42) n <- nrow(data) train_idx <- sample(n, 0.7 * n) test_idx <- setdiff(1:n, train_idx) # Validate the split borg(data, train_idx = train_idx, test_idx = test_idx) ## ----preprocessing-pattern---------------------------------------------------- # CORRECT: Fit preprocessing on training data only train_data <- data[train_idx, ] train_means <- colMeans(train_data[, 1:4]) train_sds <- apply(train_data[, 1:4], 2, sd) # Apply train statistics to both sets scaled_train <- scale(data[train_idx, 1:4], center = train_means, scale = train_sds) scaled_test <- scale(data[test_idx, 1:4], center = train_means, scale = train_sds) ## ----caret-preprocess, eval = has_caret--------------------------------------- library(caret) data(mtcars) train_idx <- 1:25 test_idx <- 26:32 # BAD: preProcess on full data (LEAKS!) pp_bad <- preProcess(mtcars[, -1], method = c("center", "scale")) borg_inspect(pp_bad, train_idx, test_idx, data = mtcars) # GOOD: preProcess on training data only pp_good <- preProcess(mtcars[train_idx, -1], method = c("center", "scale")) borg_inspect(pp_good, train_idx, test_idx, data = mtcars) ## ----caret-traincontrol, eval = FALSE----------------------------------------- # # Standard caret workflow with spatial data # spatial_data <- data.frame( # lon = runif(200, 0, 100), # lat = runif(200, 0, 100), # response = rnorm(200) # ) # # # This will warn/error if random CV is inappropriate # ctrl <- borg_trainControl( # data = spatial_data, # coords = c("lon", "lat"), # method = "cv", # number = 5 # ) # # If spatial autocorrelation detected, blocks random CV # # Use auto_block = TRUE to automatically switch to spatial blocking ## ----tidymodels-recipes, eval = has_recipes && has_rsample-------------------- library(recipes) library(rsample) data(mtcars) set.seed(123) split <- initial_split(mtcars, prop = 0.8) train_idx <- split$in_id test_idx <- setdiff(seq_len(nrow(mtcars)), train_idx) # BAD: Recipe prepped on full data rec_bad <- recipe(mpg ~ ., data = mtcars) |> step_normalize(all_numeric_predictors()) |> prep() # Uses full mtcars! borg_inspect(rec_bad, train_idx, test_idx, data = mtcars) # GOOD: Recipe prepped on training only rec_good <- recipe(mpg ~ ., data = training(split)) |> step_normalize(all_numeric_predictors()) |> prep() borg_inspect(rec_good, train_idx, test_idx, data = mtcars) ## ----borg-vfold, eval = FALSE------------------------------------------------- # # Standard rsample # folds <- vfold_cv(data, v = 5) # Random folds # # # BORG-guarded version # folds <- borg_vfold_cv( # data = spatial_data, # coords = c("lon", "lat"), # v = 5, # auto_block = TRUE # Switches to spatial blocking if needed # ) ## ----borg-group-vfold, eval = FALSE------------------------------------------- # # For grouped data # folds <- borg_group_vfold_cv( # data = clinical_data, # group = patient_id, # v = 5 # ) ## ----borg-initial-split, eval = FALSE----------------------------------------- # # For temporal data - enforces chronological ordering # split <- borg_initial_split( # data = ts_data, # time = "date", # prop = 0.8 # ) ## ----rsample-validation, eval = has_rsample----------------------------------- # Validate existing rsample objects ts_data <- data.frame( date = seq(as.Date("2020-01-01"), by = "day", length.out = 200), value = cumsum(rnorm(200)) ) rolling <- rolling_origin( data = ts_data, initial = 100, assess = 20, cumulative = FALSE ) # Check for temporal leakage borg_inspect(rolling, train_idx = NULL, test_idx = NULL) ## ----mlr3-example, eval = has_mlr3-------------------------------------------- # library(mlr3) # # # Create task # task <- TaskClassif$new("iris", iris, target = "Species") # # # Create resampling # resampling <- rsmp("cv", folds = 5) # resampling$instantiate(task) # # # Validate first fold # train_idx <- resampling$train_set(1) # test_idx <- resampling$test_set(1) # borg_inspect(task, train_idx, test_idx) ## ----temporal-basic----------------------------------------------------------- set.seed(123) n <- 365 ts_data <- data.frame( date = seq(as.Date("2020-01-01"), by = "day", length.out = n), value = cumsum(rnorm(n)), feature = rnorm(n) ) # Chronological split train_idx <- 1:252 test_idx <- 253:365 # Validate temporal ordering result <- borg(ts_data, train_idx = train_idx, test_idx = test_idx, time = "date") result ## ----rolling-origin, eval = has_rsample--------------------------------------- rolling <- rolling_origin( data = ts_data, initial = 200, assess = 30, cumulative = FALSE ) # Validate the resampling scheme borg_inspect(rolling, train_idx = NULL, test_idx = NULL) ## ----spatial-basic------------------------------------------------------------ set.seed(456) n <- 200 spatial_data <- data.frame( lon = runif(n, -10, 10), lat = runif(n, -10, 10), response = rnorm(n), predictor = rnorm(n) ) # Geographic split (west vs east) train_idx <- which(spatial_data$lon < 0) test_idx <- which(spatial_data$lon >= 0) # Validate with spatial awareness result <- borg(spatial_data, train_idx = train_idx, test_idx = test_idx, coords = c("lon", "lat")) result ## ----spatial-auto------------------------------------------------------------- # Let BORG generate spatially-blocked folds result <- borg(spatial_data, coords = c("lon", "lat"), target = "response", v = 5) result$diagnosis@recommended_cv # Access the folds length(result$folds) ## ----grouped-workflow--------------------------------------------------------- # Clinical trial data with repeated measures clinical_data <- data.frame( patient_id = rep(1:50, each = 4), visit = rep(1:4, times = 50), outcome = rnorm(200) ) # Let BORG create leave-group-out folds result <- borg(clinical_data, groups = "patient_id", target = "outcome", v = 5) result$diagnosis@recommended_cv # Verify no patient appears in both train and test fold1 <- result$folds[[1]] train_patients <- unique(clinical_data$patient_id[fold1$train]) test_patients <- unique(clinical_data$patient_id[fold1$test]) length(intersect(train_patients, test_patients)) # Should be 0 ## ----pipeline-validation------------------------------------------------------ # Build a workflow data <- iris set.seed(789) n <- nrow(data) train_idx <- sample(n, 0.7 * n) test_idx <- setdiff(1:n, train_idx) # Validate everything result <- borg_validate(list( data = data, train_idx = train_idx, test_idx = test_idx )) result ## ----pipeline-bad------------------------------------------------------------- # Workflow with overlap (common mistake) bad_workflow <- list( data = iris, train_idx = 1:100, test_idx = 51:150 # Overlaps! ) result <- borg_validate(bad_workflow) result ## ----assimilate--------------------------------------------------------------- # Workflow with fixable issues workflow <- list( data = iris, train_idx = 1:100, test_idx = 51:150 # Overlap ) # Attempt automatic repair fixed <- borg_assimilate(workflow) if (length(fixed$unfixable) > 0) { cat("Partial assimilation:", length(fixed$unfixable), "risk(s) require manual fix:", paste(fixed$unfixable, collapse = ", "), "\n") } else { cat("Assimilation complete:", length(fixed$fixed), "risk(s) corrected\n") }