# Binary classification with leaf ----

# Load leaf and check that its computational backend is installed;
# without the backend, the search/fit calls below will not work.
library(leaf)
if (!backend_available()) {
  message("Install backend with leaf::install_leaf()")
}
# Fix the RNG so the simulated data set is reproducible
set.seed(42)

n_obs <- 50L

# Simulate two independent uniform predictors
x1 <- runif(n_obs, min = 1, max = 40)
x2 <- runif(n_obs, min = 0, max = 2)

# Binary label: positive class whenever log(x1) * x2 - 3 > 0
y <- as.integer(log(x1) * x2 - 3 > 0)

train_data <- data.frame(x1 = x1, x2 = x2, y = y)
head(train_data)
#>         x1         x2 y
#> 1 36.67744 0.66685442 0
#> 2 37.54594 0.69349650 0
#> 3 12.15944 0.79697082 0
#> 4 33.38746 1.56938555 1
#> 5 26.02808 0.07787298 0
#> 6 21.24474 1.49759077 1

# Stage 1: Initialize the symbolic regressor ----

# Configure the symbolic regressor: the 'rsrm' engine searches for
# equation skeletons, scored with binary cross-entropy, with at most
# two free parameters per candidate equation.
regressor <- leaf::SymbolicRegressor$new(
  engine = "rsrm",
  num_iterations = 4L,
  loss = "BinaryCrossEntropy",
  max_params = 2L,
  base = list(verbose = FALSE)
)

# Stage 2: Discover equation skeletons ----

# Run the equation search: model y as an unknown function of x1 and x2,
# normalizing features by their Gini mean difference before searching.
search_results <- regressor$search_equations(
  data = train_data,
  formula = "y ~ f(x1, x2)",
  normalization = "divide_by_gmd"
)
#> 1. Processing data for equation search based on formula...
#> 2. Running engine 'rsrm' over 1 folds using up to 1 processes...
#> -- FINAL RESULTS --
#> Episode: 1/4
#> time: 2.35s
#> loss: 1.1102230246251565e-16
#> form: F
#> HOF:
#>                            equation  complexity                                                                                                   loss
#> 0                                 0           0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00
#> 1                           -0.3228           1                                                                                                   0.68
#> 2                         0.3432*X2           2                                                                                                   0.66
#> 3                3.5531*X2 - 5.8267           3                                                                                                   0.29
#> 4            4.0450*X1*X2 - 11.8264           4                                                                                                   0.14
#> 5  2206.4145*X1*log(X2) - 1482.6623           5                                                                                                   0.00
#> ---
#> 
task:dataset_d4bf7f82-d8e4-4933-a520-12d671bd251c expr:776.389030930181*X1*X2 + -2939.9324468385626/X2 Loss_BinaryCrossEntropy:0.00 Test 0/1.
#> final result:
#> success rate : 100%
#> average discovery time is 2.357 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]:  [[248, 0, 247]]
#> 3. Found 6 raw skeletons. Deduplicating...

# Show the deduplicated equation skeletons found by the search
banner <- "=== Search results ==="
print(banner)
print(search_results)
#>                Equation Complexity
#> 0                 -1⋅β1          1
#> 1                    β1          1
#> 2                 β1⋅x2          2
#> 3         β1⋅x2 + -1⋅β2          3
#> 4      β1⋅x1⋅x2 + -1⋅β2          4
#> 5 β1⋅x1⋅log(x2) + -1⋅β2          5

# Stage 3: Fit parameters and compute loss ----

# Fit the free parameters (the betas) of each discovered skeleton
# on the training data; prints a per-equation loss table as it runs.
regressor$fit(data = train_data)
#> Fitting parameters for 6 equations...
#> Parameter fitting complete.
#>                Equation Complexity         Loss
#> 0                 -1⋅β1          1 6.802920e-01
#> 1                    β1          1 6.802920e-01
#> 2                 β1⋅x2          2 6.562670e-01
#> 3         β1⋅x2 + -1⋅β2          3 2.870362e-01
#> 4      β1⋅x1⋅x2 + -1⋅β2          4 1.448630e-01
#> 5 β1⋅x1⋅log(x2) + -1⋅β2          5 1.110223e-16

# Stage 4: Evaluate additional metrics ----

# Score the fitted equations on extra metrics beyond the training loss
metric_names <- c("TSS", "Elbow")
regressor$evaluate(metrics = metric_names)
#>                Equation Complexity         Loss       TSS      Elbow
#> 1                    β1          1 6.802920e-01 0.0000000        NaN
#> 2                 β1⋅x2          2 6.562670e-01 0.0000000        NaN
#> 3         β1⋅x2 + -1⋅β2          3 2.870362e-01 0.7536946 0.02398125
#> 4      β1⋅x1⋅x2 + -1⋅β2          4 1.448630e-01 0.8834154 0.01048082
#> 5 β1⋅x1⋅log(x2) + -1⋅β2          5 1.110223e-16 1.0000000 0.02632977

# Display the Pareto front: equations that trade off loss vs. complexity
pareto <- regressor$get_pareto_front()
print(pareto)
#>                Equation Complexity         Loss       TSS      Elbow
#> 1                    β1          1 6.802920e-01 0.0000000        NaN
#> 2                 β1⋅x2          2 6.562670e-01 0.0000000        NaN
#> 3         β1⋅x2 + -1⋅β2          3 2.870362e-01 0.7536946 0.02398125
#> 4      β1⋅x1⋅x2 + -1⋅β2          4 1.448630e-01 0.8834154 0.01048082
#> 5 β1⋅x1⋅log(x2) + -1⋅β2          5 1.110223e-16 1.0000000 0.02632977