---
title: "Binary classification with leaf"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Binary classification with leaf}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

``` r
library(leaf)

if (!backend_available()) {
  message("Install backend with leaf::install_leaf()")
}
```

``` r
set.seed(42)
N <- 50L

# Generate features
x1 <- runif(N, min = 1, max = 40)
x2 <- runif(N, min = 0, max = 2)

# Generate target: y = log(x1) * x2 - 3
score <- log(x1) * x2 - 3
y <- as.integer(score > 0)

train_data <- data.frame(x1 = x1, x2 = x2, y = y)
head(train_data)
#>         x1         x2 y
#> 1 36.67744 0.66685442 0
#> 2 37.54594 0.69349650 0
#> 3 12.15944 0.79697082 0
#> 4 33.38746 1.56938555 1
#> 5 26.02808 0.07787298 0
#> 6 21.24474 1.49759077 1
```

# Stage 1: Initialize the symbolic regressor

``` r
regressor = leaf::SymbolicRegressor$new(
  engine='rsrm',
  num_iterations=4L,
  loss='BinaryCrossEntropy',
  max_params=2L,
  base = list(verbose = FALSE)
)
```

# Stage 2: Discover equation skeletons

``` r
search_results = regressor$search_equations(
  data = train_data,
  formula = "y ~ f(x1, x2)",
  normalization = 'divide_by_gmd'
)
#> 1. Processing data for equation search based on formula...
#> 2. Running engine 'rsrm' over 1 folds using up to 1 processes...
#> -- FINAL RESULTS --
#> Episode: 1/4
#> time: 2.35s
#> loss: 1.1102230246251565e-16
#> form: F
#> HOF:
#>   equation complexity loss
#> 0 0 0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00
#> 1 -0.3228 1 0.68
#> 2 0.3432*X2 2 0.66
#> 3 3.5531*X2 - 5.8267 3 0.29
#> 4 4.0450*X1*X2 - 11.8264 4 0.14
#> 5 2206.4145*X1*log(X2) - 1482.6623 5 0.00
#> ---
#> task:dataset_d4bf7f82-d8e4-4933-a520-12d671bd251c expr:776.389030930181*X1*X2 + -2939.9324468385626/X2 Loss_BinaryCrossEntropy:0.00 Test 0/1.
#> final result:
#> success rate : 100%
#> average discovery time is 2.357 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]: [[248, 0, 247]]
#> 3. Found 6 raw skeletons. Deduplicating...

print("=== Search results ===")
#> [1] "=== Search results ==="

print(search_results)
#>   Equation Complexity
#> 0 -1⋅β1 1
#> 1 β1 1
#> 2 β1⋅x2 2
#> 3 β1⋅x2 + -1⋅β2 3
#> 4 β1⋅x1⋅x2 + -1⋅β2 4
#> 5 β1⋅x1⋅log(x2) + -1⋅β2 5
```

# Stage 3: Fit parameters and compute loss

``` r
regressor$fit(data = train_data)
#> Fitting parameters for 6 equations...
#> Parameter fitting complete.
#>   Equation Complexity Loss
#> 0 -1⋅β1 1 6.802920e-01
#> 1 β1 1 6.802920e-01
#> 2 β1⋅x2 2 6.562670e-01
#> 3 β1⋅x2 + -1⋅β2 3 2.870362e-01
#> 4 β1⋅x1⋅x2 + -1⋅β2 4 1.448630e-01
#> 5 β1⋅x1⋅log(x2) + -1⋅β2 5 1.110223e-16
```

# Stage 4: Evaluate additional metrics

``` r
regressor$evaluate(metrics = c('TSS', 'Elbow'))
#>   Equation Complexity Loss TSS Elbow
#> 1 β1 1 6.802920e-01 0.0000000 NaN
#> 2 β1⋅x2 2 6.562670e-01 0.0000000 NaN
#> 3 β1⋅x2 + -1⋅β2 3 2.870362e-01 0.7536946 0.02398125
#> 4 β1⋅x1⋅x2 + -1⋅β2 4 1.448630e-01 0.8834154 0.01048082
#> 5 β1⋅x1⋅log(x2) + -1⋅β2 5 1.110223e-16 1.0000000 0.02632977

# Show results
print(regressor$get_pareto_front())
#>   Equation Complexity Loss TSS Elbow
#> 1 β1 1 6.802920e-01 0.0000000 NaN
#> 2 β1⋅x2 2 6.562670e-01 0.0000000 NaN
#> 3 β1⋅x2 + -1⋅β2 3 2.870362e-01 0.7536946 0.02398125
#> 4 β1⋅x1⋅x2 + -1⋅β2 4 1.448630e-01 0.8834154 0.01048082
#> 5 β1⋅x1⋅log(x2) + -1⋅β2 5 1.110223e-16 1.0000000 0.02632977
```