--- title: "Getting Started with leaf" author: "Your Name" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Getting Started with leaf} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- # Introduction This vignette demonstrates how to get started with the `leaf` R package, a high-level interface to the LEAF Symbolic Regression framework. > **Note:** Before using `leaf`, you need to install the Python backend by running `leaf::install_leaf()`. ``` r # Install the Python backend (only needs to be done once). leaf::install_leaf() ``` # Load package ``` r library(leaf) if (!backend_available()) { message("Install backend with leaf::install_leaf()") } ``` # Initialize a symbolic regression analysis ``` r regressor <- SymbolicRegressor$new( engine = "rsrm", num_iterations=4L, loss = "MSE", operators = c("+", "-", "*", "/"), threshold = 1e-10, base = list(verbose = FALSE) ) ``` # Load the data ``` r train_data <- leaf_data("eg_train") test_data <- leaf_data("eg_test") print(head(train_data)) #> X x1 x2 y #> 1 12 37.45222 1.51908854 114.625592 #> 2 4 26.02808 0.07787298 4.804276 #> 3 37 9.09870 0.17996103 3.780866 #> 4 8 26.62270 0.52217593 28.528340 #> 5 3 33.38746 1.56938555 105.608897 #> 6 6 29.72694 1.35455366 81.320448 print(head(test_data)) #> X x1 x2 y #> 1 13 10.961724 1.1329768 25.40347 #> 2 39 24.859367 0.6104367 31.09740 #> 3 30 29.766229 1.1632080 70.04424 #> 4 45 38.345487 1.4681886 113.42693 #> 5 17 5.582007 1.6563170 18.89703 #> 6 48 38.867698 1.4879493 116.50896 ``` # Define the formula The formula specifies the transformed inputs and grouping structure. - Terms inside f(...) define the inputs (x1, x2, etc.) - The part after | defines grouping variables if specified Functional transformations are allowed inside the formula. ``` r formula <- "y ~ f(x1, x2, log(x2))" ``` # Search for candidate equations ``` r search_results <- regressor$search_equations( data = train_data, formula = formula ) #> 1. 
Processing data for equation search based on formula... #> 2. Running engine 'rsrm' over 1 folds using up to 1 processes... #> -- FINAL RESULTS -- #> Episode: 4/4 #> time: 60.08s #> loss: 0.00014448195748697755 #> form: X2/X1+1-X1+F #> HOF: #> equation complexity loss #> 0 0 0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00 #> 1 44.5014 1 1292.93 #> 2 33.4330*X2 2 450.49 #> 3 17.8258*X1*X2 3 0.15 #> 4 17.6940*X1*X2 + 0.5466 4 0.03 #> 5 X1*(17.6544*X2 + 0.3209) 5 0.02 #> 6 17.6484*X1*X2 + 0.2050*X1 + 0.2815 6 0.01 #> 7 17.6599*X1*X2 + 0.7257 - 0.0709/X1 7 0.01 #> 8 X1*(17.6404*X2*(X1 + 0.7876) + 1.0622)/(X1 + 0.7876) 8 0.00 #> 9 17.6249*X1*X2 + 0.1456*X1 + 0.0418*X2 + 0.4554 - 0.0461/X1 11 0.00 #> 10 -0.0423*X1**2 + 17.6418*X1*X2 + 0.2866*X1 + 0.3691 - 0.0330/X1 12 0.00 #> 11 17.6423*X1*X2 + 0.0970*X1 + 0.5986 - 0.1043/X1 + 0.0048/X1**2 13 0.00 #> --- #> task:dataset_923a85b0-5900-4aa2-a712-ab798821ae56 expr:17.642277203891876*X1*X2 + 0.09700466267074608*X1 + 0.5985664260163593 + -0.10427594286280444/X1 + 0.004757006682361494/X1**2 Loss_MSE:0.00 Test 0/1. #> final result: #> success rate : 0% #> average discovery time is 60.086 seconds #> Number of equations looked at (per test) [Total, Timed out, Successful]: [[3955, 0, 3955]] #> 3. Found 12 raw skeletons. Deduplicating... head(search_results) #> Equation Complexity #> 0 β1 1 #> 1 β1⋅x2 2 #> 2 β1⋅x1⋅x2 3 #> 3 x1⋅(β1⋅x2 + β2) 4 #> 4 β1⋅x1⋅x2 + β2 4 #> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6 ``` # Fit equation parameters ``` r fit_results <- regressor$fit( data = train_data, do_cv = FALSE ) #> Fitting parameters for 11 equations... #> Parameter fitting complete. 
head(fit_results) #> Equation Complexity Loss #> 0 β1 1 1.292927e+03 #> 1 β1⋅x2 2 4.504898e+02 #> 2 β1⋅x1⋅x2 3 1.513596e-01 #> 3 x1⋅(β1⋅x2 + β2) 4 2.168116e-02 #> 4 β1⋅x1⋅x2 + β2 4 3.160112e-02 #> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6 6.814638e-03 ``` # Evaluate equations ``` r eval_table <- regressor$evaluate( metrics = c("RMSE", "R2"), data = test_data ) head(eval_table) #> Equation Complexity Loss RMSE #> 0 β1 1 1.292927e+03 38.53714791 #> 1 β1⋅x2 2 4.504898e+02 29.85585277 #> 2 β1⋅x1⋅x2 3 1.513596e-01 0.37579394 #> 3 x1⋅(β1⋅x2 + β2) 4 2.168116e-02 0.20061360 #> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6 6.814638e-03 0.08439777 #> 6 β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1 7 6.803462e-03 0.06765065 #> R2 #> 0 -0.01053178 #> 1 0.39347348 #> 2 0.99990391 #> 3 0.99997262 #> 5 0.99999515 #> 6 0.99999689 ``` # Make predictions ``` r preds <- regressor$predict( data = test_data, equation_id = 3 ) head(preds) #> [1] 25.12047 30.96778 70.01509 113.60330 18.63938 116.68777 ``` # Inspect equation ``` r regressor$show_equation(equation_id = 3) #> Details for Equation ID: 3 #> ---------------------------------------- #> Equation: x1⋅(β1⋅x2 + β2) #> ---------------------------------------- #> Full Expression per Group: #> #> Group 'default_group': #> 2.00163686843539⋅x₁⋅x₂ + 0.0238483758781546⋅x₁ ``` # See final results ``` r print(regressor$get_pareto_front()) #> Equation Complexity #> 0 β1 1 #> 1 β1⋅x2 2 #> 2 β1⋅x1⋅x2 3 #> 3 x1⋅(β1⋅x2 + β2) 4 #> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6 #> 6 β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1 7 #> 7 x1⋅(β1⋅x2⋅(β2 + x1) + β3)⋅(β4 + x1)^-1 9 #> 8 β1⋅x1⋅x2 + β2⋅x1 + β3⋅x2 + β4 + -1⋅β5⋅x1^-1 11 #> 9 -1⋅β1⋅x1^2 + β2⋅x1⋅x2 + β3⋅x1 + β4 + -1⋅β5⋅x1^-1 12 #> 10 β1⋅x1⋅x2 + β2⋅x1 + β3 + -1⋅β4⋅x1^-1 + β5⋅x1^2^-1 13 #> Loss RMSE R2 #> 0 1.292927e+03 38.53714791 -0.01053178 #> 1 4.504898e+02 29.85585277 0.39347348 #> 2 1.513596e-01 0.37579394 0.99990391 #> 3 2.168116e-02 0.20061360 0.99997262 #> 5 6.814638e-03 0.08439777 0.99999515 #> 6 6.803462e-03 0.06765065 0.99999689 #> 7 4.223240e-04 0.01968863 0.99999974 #> 8 
3.611868e-04 0.03950045 0.99999894 #> 9 1.582007e-04 0.01388075 0.99999987 #> 10 1.444818e-04 0.01629805 0.99999982 ``` # Plot pareto front ``` r results <- regressor$get_results() plot_pareto(results, 'Complexity', 'RMSE', y_bigger_better = FALSE) #> $plot ``` ![plot of chunk plot](figures/getting_started-plot-1.png) ``` #> #> $pareto_points #> Equation Complexity #> 0 β1 1 #> 1 β1⋅x2 2 #> 2 β1⋅x1⋅x2 3 #> 4 β1⋅x1⋅x2 + β2 4 #> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6 #> 6 β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1 7 #> 7 x1⋅(β1⋅x2⋅(β2 + x1) + β3)⋅(β4 + x1)^-1 9 #> 9 -1⋅β1⋅x1^2 + β2⋅x1⋅x2 + β3⋅x1 + β4 + -1⋅β5⋅x1^-1 12 #> Loss RMSE R2 .pareto #> 0 1.292927e+03 38.53714791 -0.01053178 TRUE #> 1 4.504898e+02 29.85585277 0.39347348 TRUE #> 2 1.513596e-01 0.37579394 0.99990391 TRUE #> 4 3.160112e-02 0.09294067 0.99999412 TRUE #> 5 6.814638e-03 0.08439777 0.99999515 TRUE #> 6 6.803462e-03 0.06765065 0.99999689 TRUE #> 7 4.223240e-04 0.01968863 0.99999974 TRUE #> 9 1.582007e-04 0.01388075 0.99999987 TRUE #> #> $all #> Equation Complexity #> 0 β1 1 #> 1 β1⋅x2 2 #> 2 β1⋅x1⋅x2 3 #> 3 x1⋅(β1⋅x2 + β2) 4 #> 4 β1⋅x1⋅x2 + β2 4 #> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6 #> 6 β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1 7 #> 7 x1⋅(β1⋅x2⋅(β2 + x1) + β3)⋅(β4 + x1)^-1 9 #> 8 β1⋅x1⋅x2 + β2⋅x1 + β3⋅x2 + β4 + -1⋅β5⋅x1^-1 11 #> 9 -1⋅β1⋅x1^2 + β2⋅x1⋅x2 + β3⋅x1 + β4 + -1⋅β5⋅x1^-1 12 #> 10 β1⋅x1⋅x2 + β2⋅x1 + β3 + -1⋅β4⋅x1^-1 + β5⋅x1^2^-1 13 #> Loss RMSE R2 .pareto #> 0 1.292927e+03 38.53714791 -0.01053178 TRUE #> 1 4.504898e+02 29.85585277 0.39347348 TRUE #> 2 1.513596e-01 0.37579394 0.99990391 TRUE #> 3 2.168116e-02 0.20061360 0.99997262 FALSE #> 4 3.160112e-02 0.09294067 0.99999412 TRUE #> 5 6.814638e-03 0.08439777 0.99999515 TRUE #> 6 6.803462e-03 0.06765065 0.99999689 TRUE #> 7 4.223240e-04 0.01968863 0.99999974 TRUE #> 8 3.611868e-04 0.03950045 0.99999894 FALSE #> 9 1.582007e-04 0.01388075 0.99999987 TRUE #> 10 1.444818e-04 0.01629805 0.99999982 FALSE ``` # Compute optimal threshold (only for classification tasks) Note on Task Types: The 
find_optimal_threshold() method is specifically designed for binary classification tasks. If your target variable is continuous (regression), you should use the raw predictions directly. If your target is binary (classification), this step helps transform continuous scores into class labels (0 or 1). ``` r threshold <- regressor$find_optimal_threshold( equation_id = 1, data = train_data, k_folds = 5 ) #> The model isn't a classification model. print(threshold) #> numeric(0) ``` # Save and load analysis ``` r # Save current analysis path <- tempfile(fileext = ".pkl") regressor$save(path) #> SymbolicRegressor instance successfully saved to C:\Users\user\AppData\Local\Temp\Rtmp4kIIBE\file6dcc18c269b1.pkl # Load it later regressor_loaded <- load_leaf_analysis(path) #> SymbolicRegressor instance loaded successfully from C:\Users\user\AppData\Local\Temp\Rtmp4kIIBE\file6dcc18c269b1.pkl # Make predictions with loaded model preds_loaded <- regressor_loaded$predict( data = test_data, equation_id = 3 ) head(preds_loaded) #> [1] 25.12047 30.96778 70.01509 113.60330 18.63938 116.68777 ```