---
title: "Train-Test Splitting for Symbolic Regression"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Train-Test Splitting for Symbolic Regression}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

``` r
# Install the Python backend (only needs to be done once).
leaf::install_leaf()
```

``` r
# Load package
library(leaf)

if (!backend_available()) {
  message("Install backend with leaf::install_leaf()")
}
```

``` r
set.seed(42)
N <- 50L

# Generate features
x1 <- runif(N, min = 1, max = 40)
x2 <- runif(N, min = 0, max = 2)

# Generate target: y = 0.23·log(x1) + 2·x1·x2 + noise
y <- 0.23 * log(x1) + 2 * x1 * x2 + rnorm(N, mean = 0, sd = 1e-2)

data <- data.frame(x1 = x1, x2 = x2, y = y)

# Train/test split
train_ratio <- 0.8
train_index <- sample(seq_len(nrow(data)), size = floor(train_ratio * nrow(data)))
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

print(head(train_data))
#>           x1        x2         y
#> 24 37.920061 0.9587971 73.541917
#> 34 27.721619 1.2912638 72.354745
#> 25  4.215065 0.3948207  3.653854
#> 28 36.323787 0.7509799 55.387777
#> 43  2.459810 0.4171399  2.265693
#> 14 10.961724 1.1329768 25.403471
print(head(test_data))
#>           x1        x2         y
#> 6  21.244742 1.4975908 64.337530
#> 7  29.726944 1.3545537 81.320448
#> 8   6.251997 0.3425287  4.705442
#> 15 19.029420 1.6993794 65.346714
#> 18  5.582007 1.6563170 18.897027
#> 19 19.524886 1.3864096 54.831677
```

``` r
# Initialise the symbolic regressor
regressor <- leaf::SymbolicRegressor$new(
  engine = "rsrm",
  loss = "MSE",
  num_iterations = 2L,
  threshold = 1e-10,
  operators = c("+"),
  base = list(verbose = FALSE),
  mcts = list(times = 20L),
  gp = list(times = 6L)
)
```

``` r
# Stage 1: Discover equation skeletons
search_results <- regressor$search_equations(
  data = train_data,
  formula = "y ~ f( log(x1), x1*x2 )",
  normalization = "divide_by_gmd"
)
#> 1. Processing data for equation search based on formula...
#> 2. Running engine 'rsrm' over 1 folds using up to 1 processes...
#> -- FINAL RESULTS --
#> Episode: 2/2
#> time: 7.88s
#> loss: 8.57761298360967e-05
#> form: F
#> HOF:
#>                  equation complexity  loss
#> 0              41.2303*X2          2  0.18
#> 1  0.1944*X1 + 40.8084*X2          4  0.00
#> ---
#> task:dataset_d5a244a5-e53e-4009-bca7-6c3555d75360 expr:0.19442414717846374*X1 + 40.80839593452161*X2 Loss_MSE:0.00 Test 0/1.
#> final result:
#> success rate : 0%
#> average discovery time is 7.885 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]: [[6, 0, 6]]
#> 3. Found 2 raw skeletons. Deduplicating...

print("=== Search results ===")
#> [1] "=== Search results ==="
print(search_results)
#>                 Equation Complexity
#> 0               β1⋅x1⋅x2          3
#> 1  β1⋅log(x1) + β2⋅x1⋅x2          6
```

``` r
# Stage 2: Fit parameters and compute loss
fit_results <- regressor$fit(
  data = train_data
)
#> Fitting parameters for 2 equations...
#> Parameter fitting complete.

print("\n=== Fit results ===")
#> [1] "\n=== Fit results ==="
print(fit_results)
#>                 Equation Complexity         Loss
#> 0               β1⋅x1⋅x2          3 1.754847e-01
#> 1  β1⋅log(x1) + β2⋅x1⋅x2          6 8.577613e-05
```

``` r
# Stage 3: Evaluate additional metrics
eval_table <- regressor$evaluate(
  metrics = c("R2", "Elbow"),
  data = test_data
)

print("\n=== Additional metrics ===")
#> [1] "\n=== Additional metrics ==="
print(regressor$get_pareto_front())
#>                 Equation Complexity         Loss        R2       Elbow
#> 0               β1⋅x1⋅x2          3 1.754847e-01 0.9999509 0.990636907
#> 1  β1⋅log(x1) + β2⋅x1⋅x2          6 8.577613e-05 0.9999999 0.009363093
```