---
title: "Minimal Example: Quick Start"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Minimal Example: Quick Start}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

``` r
# Install the Python backend (only needs to be done once).
leaf::install_leaf()
```

``` r
# Load package
library(leaf)

# Only emit a hint (not an error) when the backend is missing.
if (!backend_available()) {
  message("Install backend with leaf::install_leaf()")
}
```

``` r
# Initialize the symbolic regressor
regressor <- leaf::SymbolicRegressor$new(
  engine = "rsrm",
  loss = "PoissonDeviance",
  num_iterations = 3,
  base = list(verbose = FALSE)
)
```

``` r
# Load data
train_data <- leaf::leaf_data("GDM")
head(train_data)
#> Archipelago species A T H Dm Di Do y
#> 1 Canary Islands Spiders 278 1.2 2423 381 61 61 85
#> 2 Canary Islands Spiders 729 2.0 2425 414 57 57 98
#> 3 Canary Islands Spiders 378 11.0 1484 333 28 254 137
#> 4 Canary Islands Spiders 2058 12.0 3711 286 28 28 229
#> 5 Canary Islands Spiders 1532 15.0 1950 195 61 61 159
#> 6 Canary Islands Spiders 1725 23.0 807 95 11 95 86
```

``` r
# Stage 1: Generate subsets
# Subsets are built per value of `species`; `n_splits = "logo"` is presumably
# a leave-one-group-out scheme (TODO confirm against the leaf documentation).
folds <- leaf::generate_group_subsets(
  train_data,
  group_cols = "species",
  n_splits = "logo"
)
```

``` r
# Stage 2: Discover equation skeletons
# The search is run once per fold; the engine log for each fold follows.
search_results <- regressor$search_equations(
  data = train_data,
  formula = "y ~ f(A, T | species)",
  normalization = "divide_by_gmd",
  folds = folds
)
#> 1. Processing data for equation search based on formula...
#> 2. Running engine 'rsrm' over 5 folds using up to 1 processes...
#> -- FINAL RESULTS --
#> Episode: 3/3
#> time: 59.85s
#> loss: 18.63331198471096
#> form: X1+X2**3+1+X1*X2**3+F
#> HOF:
#> equation complexity loss
#> 0 0 0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00
#> 1 36.4917 1 96.17
#> 2 67.6781*X1 + 1 2 65.94
#> 3 42.0845*X1**0.5 3 35.03
#> 4 12.9398*X1**1.2886 4 34.70
#> 5 39.7451*X1**0.5 - 4.7380*X2 5 32.45
#> 6 77.5044*X1**0.5 + 5.4535*X2 - 8.1433 6 32.45
#> 7 27.7485*X1**0.5 + 0.0751/X2**2 7 30.95
#> 8 ((78.8156*X1**0.5 - 6.3010)*(X2 - 0.1690) + 0.1148)/(X2 - 0.1690) 8 22.56
#> 9 ((X1*X2 - 0.6460)*(69.7163*X1 + 7.6966*X2 + 2.1710) - 2.2056)/(X1*X2 - 0.6460) 10 20.24
#> 10 31.0360*X1**0.5 + 6.4929 + 1.0065/(X2 + 0.0105*exp(-154.1501*X1*X2**2)) 13 20.11
#> 11 75.4312*X1**0.5 - 1.1332*X2**0.25 - 5.4296/(X2 + 0.5335*exp(-5.5694*X2**2)) 14 18.63
#> ---
#> task:dataset_93df7a40-b8c5-41c1-827b-9b00049c4968 expr:75.43118432679253*X1**0.5 + -1.133214801020806*X2**0.25 + -5.429620123418192/(0.5334978123305567*exp(-5.569384925113638*X2**2) + X2) Loss_PoissonDeviance:18.63 Test 0/1.
#> final result:
#> success rate : 0%
#> average discovery time is 59.858 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]: [[2808, 0, 2808]]
#> -- FINAL RESULTS --
#> Episode: 3/3
#> time: 59.38s
#> loss: 17.59757848151243
#> form: X1*X2+X2+X1+1/(X2 + 1)+F
#> HOF:
#> equation complexity loss
#> 0 0 0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00
#> 1 36.4460 1 96.17
#> 2 36.5029 - 0.0141*X1 3 36.30
#> 3 48.7534*X1**0.7881 4 34.70
#> 4 8.8407*exp(0.8668*X1) - 1.7462 5 33.96
#> 5 0.1729*X1**0.5 + 42.9840*X2 + 7.0218 6 32.45
#> 6 ((82.3106*X1 + 5.0661)*(X2 - 0.6089) + 2.4446)/(X2 - 0.6089) 7 29.88
#> 7 ((77.8477*X1**0.5 - 5.6270)*(X2 - 1.7642) - 4.7431)/(X2 - 1.7642) 8 24.16
#> 8 (7.1006*X2 + (42.3987*X1**0.5 - 3.7818)*(X2 - 0.7305))/(X2 - 0.7305) 9 22.80
#> 9 44.5543*X1**0.5 - 2.0770/(X2 + 0.1104) + 0.0850/X1 10 22.03
#> 10 ((X2 - 2.1283)*(22.4709*X1*X2 + 17.4438*X1 - 0.8406*X2) - 0.9144)/(X2 - 2.1283) 11 19.68
#> 11 40.1253*X1**0.5 - 5.9888*exp(-1.3785*X2) + 4.1379/(1 - 0.6646/X2) 12 17.60
#> ---
#> task:dataset_f0546bc5-399f-4a77-9cfe-f6ca2e85cef6 expr:40.1252851669824*X1**0.5 + -5.988838918042952*exp(-1.378456644669992*X2) + -4.137867123616743/(0.6645899121484922/X2 - 1) Loss_PoissonDeviance:17.60 Test 0/1.
#> final result:
#> success rate : 0%
#> average discovery time is 59.387 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]: [[2622, 0, 2622]]
#> -- FINAL RESULTS --
#> Episode: 3/3
#> time: 58.05s
#> loss: 5.962939720605339
#> form: exp(C*X1**2)*exp(C*X2)+F
#> HOF:
#> equation complexity loss
#> 0 0 0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00
#> 1 36.5019 1 28.81
#> 2 70.6407*X2 + 16.8314 3 26.82
#> 3 4.0612*X2**3 + 16.4502 4 25.46
#> 4 3.9899*X1 + 12.5641*X2 + 12.9638 5 18.22
#> 5 39.0775*X1 + 1.0661*X2**1.5 + 1.9261 6 16.05
#> 6 35.3703*X1 - 18.9153*X2*log(X2) + 14.4395 7 15.47
#> 7 ((38.5484*X1 + 2.6461)*(X1*X2 + 1.7735) + 0.1864)/(X1*X2 + 1.7735) 8 10.50
#> 8 (7.2087*X1*X2 + (36.3870*X1 + 0.0429)*(X1*X2 + 0.0587))/(X1*X2 + 0.0587) 10 9.31
#> 9 67.1542*X1 - 10.2438*X2/(X2 - 0.6284/X1**2) + 5.5596 11 9.07
#> 10 58.3751*X1 - 1.7373*X2 + 4.1695*exp(-1.4453*X1**2 + 2.0496*X2) 12 8.01
#> 11 30.6219*X1 + 7.3247*X2 - 0.0773*exp(1.8572*X1**2*X2 - 53102.3205*X2**2) 14 5.96
#> ---
#> task:dataset_6c36e797-aad1-4cd4-95c6-270def6e9c44 expr:30.621939528772103*X1 + 7.324680913287321*X2 + -0.07725981565640147*exp(-53102.32045940429*X2**2)*exp(1.8572477531564031*X1**2*X2) Loss_PoissonDeviance:5.96 Test 0/1.
#> final result:
#> success rate : 0%
#> average discovery time is 58.057 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]: [[2504, 0, 2504]]
#> -- FINAL RESULTS --
#> Episode: 3/3
#> time: 108.04s
#> loss: 19.50435371631782
#> form: X1**0.5+F
#> HOF:
#> equation complexity loss
#> 0 0 0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00
#> 1 67.2097 1 96.17
#> 2 34.2120 2 96.17
#> 3 38.7829*X1 + 14.0659 3 36.30
#> 4 77.0625*X1**0.5 - 4.6835 4 35.01
#> 5 33.4054 + 1.3282*exp(-0.5158*X1) 5 33.95
#> 6 (33.6519*X1*(0.3080*X2 - 1) - 1.2838)/(0.3080*X2 - 1) 6 30.02
#> 7 ((72.1724*X1 + 7.6670)*(1.7186*X2 - 1) + 0.8432)/(1.7186*X2 - 1) 7 27.44
#> 8 ((43.5431*X1 + 10.5063)*(X2**0.5 - 0.3359) - 0.2592)/(X2**0.5 - 0.3359) 8 26.13
#> 9 ((X1**0.5 + 0.6660*X2)*(35.2524*X1 + 16.7830) + 1.9087)/(X1**0.5 + 0.6660*X2) 9 24.65
#> 10 2.2859*X1*X2**3 + 31.7147*X1 - 2.8622*X2**3 + 16.9945 10 24.57
#> 11 69.5182*X1 + 7.5648 - 1.6803/(X1**(-0.5) - 1.5438*X2) 11 23.53
#> 12 38.4328*X1**0.5 + 8.0335 - 10.8348/(X1**(-0.5) - 0.1255*X2) 12 19.95
#> 13 53.1144*X1**0.5/(0.5510*X2 + 1) + 0.3749*X1/(0.2771*X2 - 1) - 1.2407/(0.0908*X2 - 1) 15 19.50
#> ---
#> task:dataset_3913e139-bfed-404b-90a1-36cc9bc4f774 expr:53.11439018348418*X1**0.5/(--0.5509822956642375*X2 + 1) + -0.3748504829813716*X1/(-0.27709108049022535*X2 + 1) + 1.2406776184902666/(-0.09082438278008342*X2 + 1) Loss_PoissonDeviance:19.50 Test 0/1.
#> final result:
#> success rate : 0%
#> average discovery time is 108.041 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]: [[2542, 0, 2542]]
#> -- FINAL RESULTS --
#> Episode: 3/3
#> time: 56.66s
#> loss: 18.762297920666533
#> form: 1+X1+F
#> HOF:
#> equation complexity loss
#> 0 1 0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00
#> 1 36.5557 1 96.17
#> 2 33.8333*X1 + 15.7211 3 36.30
#> 3 10.9228*X1**1.4121 4 34.70
#> 4 44.6580*X1 + 5.2839*X2 + 10.4877 5 34.02
#> 5 34.9336*X1**0.5 - 3.9694*X2 + 11.9954 6 32.45
#> 6 44.0426*X1**0.5 - 0.1832 - 0.6801/X2 7 31.41
#> 7 39.8360*X1 - 0.7837*X2**4 + 8.0852*X2 + 11.7166 8 28.08
#> 8 31.3763*X1 - 0.4998*X2**4 - 0.4158*X2**2 + 9.4675 9 26.03
#> 9 0.1881*X1*X2**3 + 28.2664*X1 - 1.9153*X2**3 + 19.3747 10 24.57
#> 10 11.3111*X1**2 + 0.2430*X1 + 1.4187*X1/log(X2) + 3.8953*X2 + 8.7231 13 19.21
#> 11 3.7521*X1**3 - 11.1122*X1 + 9.0340*X1/log(1/X2) + 5.3617*X2 + 21.6463 14 18.76
#> ---
#> task:dataset_9ebaecd3-4c1d-405a-bc9c-86afabe3bbf4 expr:--3.7520887270100056*X1**3 + -11.11221617132173*X1 + 9.03397575247341*X1/log(1/X2) + 5.36166614199925*X2 + 21.64631742931612 Loss_PoissonDeviance:18.76 Test 0/1.
#> final result:
#> success rate : 0%
#> average discovery time is 56.657 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]: [[2515, 0, 2515]]
#> 3. Found 62 raw skeletons. Deduplicating...
print("=== Search results ===")
#> [1] "=== Search results ==="
head(search_results)
#> Equation Complexity
#> 0 u1 1
#> 1 u1⋅A + u2 3
#> 2 u1⋅A^0.5 3
#> 3 u1⋅A^0.79 3
#> 4 u1⋅A^1.29 3
#> 5 u1⋅A^1.41 3
```

``` r
# Stage 3: Fit the parameters of each skeleton on the training data
# and compute the loss.
fit_results <- regressor$fit(data = train_data)
#> Fitting parameters for 48 equations...
#> Parameter fitting complete.
# print() shows escape sequences literally, so the banner carries no "\n";
# it now matches the "=== Search results ===" banner used earlier.
print("=== Fit ===")
#> [1] "=== Fit ==="
head(fit_results)
#> Equation Complexity Loss
#> 0 u1 1 33.52978
#> 1 u1⋅A + u2 3 16.13773
#> 2 u1⋅A^0.5 3 22.30063
#> 3 u1⋅A^0.79 3 34.60230
#> 4 u1⋅A^1.29 3 78.06473
#> 5 u1⋅A^1.41 3 92.01513
```

``` r
# Stage 4: Evaluate additional metrics
# The returned table is stored but not printed; the metrics appear as the
# PseudoR2 and Elbow columns of the Pareto front shown below.
eval_table <- regressor$evaluate(metrics = c("PseudoR2", "Elbow"))

# Show results
pf <- regressor$get_pareto_front()
pf
#> Equation Complexity
#> 0 u1 1
#> 7 u2⋅-1⋅A + u1 3
#> 8 u1⋅A^0.5 + -1⋅u2 4
#> 11 u1⋅A + u2⋅T + u3 5
#> 15 u1⋅A^0.5 + -1⋅u2⋅T + u3 6
#> 21 u1⋅A + -1⋅u2⋅T^4 + u3⋅T + u4 8
#> 22 (u4 + (T + -1⋅u3)⋅(u1⋅A + u2))⋅(T + -1⋅u5)^-1 9
#> 27 u1⋅A + u2 + -1⋅u3⋅(u5⋅-1⋅T + A^-1⋅u4)^-1 10
#> 36 u1⋅A^0.5 + u2 + -1⋅u3⋅(u5⋅-1⋅T + A^-1⋅u4)^-1 11
#> 37 (-1⋅u5 + (A⋅T + -1⋅u1)⋅(u2⋅A + u3⋅T + u4))⋅(A⋅T + -1⋅u6)^-1 13
#> Loss PseudoR2 Elbow
#> 0 33.529782 0.0000000 0.10618305
#> 7 16.137732 0.5187045 0.67001869
#> 8 15.710403 0.5314493 0.30059450
#> 11 13.468675 0.5983071 0.46352610
#> 15 13.018381 0.6117368 0.11457889
#> 21 11.148259 0.6675117 -0.10550209
#> 22 9.841585 0.7064823 -0.05861504
#> 27 8.250629 0.7539313 0.22047479
#> 36 6.658312 0.8014210 0.41404096
#> 37 5.906289 0.8238495 0.42228590
```