This vignette demonstrates large-scale covariate screening with
logistic regression in lsReg. The workflow is identical to the linear
case: fit the base model once, allocate working memory, then test each
candidate covariate without re-fitting the base model. Here the test
statistic is the likelihood ratio test (LRT).
We use the same simulated dataset as the linear regression vignette.
The binary outcome ylog was generated from a logistic model
with x1, x2, z5, and
z9 as predictors. The variables z1 through
z10 are the candidates to be screened.
library(lsReg)
datafile <- system.file("extdata", "simulated_data.rds", package = "lsReg")
dat <- readRDS(datafile)
head(dat[, c("ylog", "x1", "x2", "z1", "z2", "z5", "z9")])
#> ylog x1 x2 z1 z2 z5 z9
#> 1 0 -1.52199116 1 0 1 0 0.5931028
#> 2 1 -0.96937043 1 0 0 0 0.6778513
#> 3 0 -0.38494124 0 0 0 0 0.8131135
#> 4 0 -0.24568841 0 1 1 0 0.2320682
#> 5 1 -0.49640532 0 0 1 0 0.7719654
#> 6 1 0.07975954 0 0 1 0 0.2162713

basemdl <- glm(ylog ~ x1 + x2, data = dat, family = binomial)
summary(basemdl)
#>
#> Call:
#> glm(formula = ylog ~ x1 + x2, family = binomial, data = dat)
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.16492 0.09298 -1.774 0.07609 .
#> x1 0.82005 0.07669 10.694 < 2e-16 ***
#> x2 0.42391 0.13692 3.096 0.00196 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 1386.2 on 999 degrees of freedom
#> Residual deviance: 1236.8 on 997 degrees of freedom
#> AIC: 1242.8
#>
#> Number of Fisher Scoring iterations: 4

After each call to addcovar() the parameter estimate is in
mem$fitdata$betab and the LRT chi-square statistic is in
mem$testvalue.
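For orientation, the screening loop that fills the results table has
roughly the following shape. This is a sketch, not verbatim lsReg code:
the allocation call allocmem() and the exact addcovar() signature are
assumptions, while mem$fitdata$betab and mem$testvalue are the fields
described above.

## Sketch of the screening loop (assumed API; consult the package
## documentation for the real allocation call and addcovar() signature)
candidates <- paste0("z", 1:10)
mem <- allocmem(basemdl, dat)            # hypothetical: allocate memory once
results <- data.frame(variable  = candidates,
                      estimate  = NA_real_,
                      statistic = NA_real_)
for (i in seq_along(candidates)) {
  addcovar(mem, dat[[candidates[i]]])    # assumed to update mem in place
  results$estimate[i]  <- mem$fitdata$betab  # estimate for the candidate
  results$statistic[i] <- mem$testvalue      # LRT chi-square statistic
}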
Under the null hypothesis, the LRT statistic is asymptotically
chi-square distributed with 1 degree of freedom, so the p-values follow
directly:
results$pvalue <- pchisq(results$statistic, df = 1, lower.tail = FALSE)
print(results, digits = 4)
#> variable estimate statistic pvalue
#> 1 z1 0.35269 1.85561 1.731e-01
#> 2 z2 0.11901 0.75648 3.844e-01
#> 3 z3 -0.03377 0.05714 8.111e-01
#> 4 z4 -0.36539 3.93736 4.722e-02
#> 5 z5 0.77603 21.53147 3.481e-06
#> 6 z6 0.27795 1.41635 2.340e-01
#> 7 z7 -0.12305 0.26608 6.060e-01
#> 8 z8 -0.03611 0.02523 8.738e-01
#> 9 z9 -0.70644 8.79893 3.014e-03
#> 10 z10 0.12600 0.28733 5.919e-01

The variables z5 and z9 have the largest
test statistics and the smallest p-values, consistent with the
data-generating model.
We verify that lsReg matches a full glm()
fit for z5 and z9. The LRT statistic is the
difference in deviance between the base model and the full model, which
anova() reports directly.
# Full glm() fits including each candidate; the LRT statistic is the
# deviance difference reported by anova()
glm_z5 <- glm(ylog ~ x1 + x2 + z5, data = dat, family = binomial)
glm_z9 <- glm(ylog ~ x1 + x2 + z9, data = dat, family = binomial)
glm_est_z5 <- coef(glm_z5)["z5"]
glm_stat_z5 <- anova(basemdl, glm_z5, test = "LRT")$Deviance[2]
glm_est_z9 <- coef(glm_z9)["z9"]
glm_stat_z9 <- anova(basemdl, glm_z9, test = "LRT")$Deviance[2]

# Corresponding lsReg results from the screening table
lsreg_est_z5 <- results$estimate[results$variable == "z5"]
lsreg_stat_z5 <- results$statistic[results$variable == "z5"]
lsreg_est_z9 <- results$estimate[results$variable == "z9"]
lsreg_stat_z9 <- results$statistic[results$variable == "z9"]
comparison <- data.frame(
variable = c("z5", "z9"),
glm_estimate = c(glm_est_z5, glm_est_z9),
lsreg_estimate = c(lsreg_est_z5, lsreg_est_z9),
glm_statistic = c(glm_stat_z5, glm_stat_z9),
lsreg_statistic = c(lsreg_stat_z5, lsreg_stat_z9)
)
print(comparison, digits = 6)
#> variable glm_estimate lsreg_estimate glm_statistic lsreg_statistic
#> z5 z5 0.776032 0.776032 21.53147 21.53147
#> z9 z9 -0.706441 -0.706441 8.79893 8.79893

The estimates and LRT statistics from lsReg match those
from glm() to numerical precision.
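The agreement can also be checked programmatically from the comparison
data frame built above; both calls should return TRUE.

# Programmatic check of the agreement shown above
all.equal(comparison$glm_estimate, comparison$lsreg_estimate)
all.equal(comparison$glm_statistic, comparison$lsreg_statistic)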