Here we present the analysis of the alcohol consumption data considered in Zhang et al. (2022). The data are made available in the package; see ?student. This dataset was originally collected and studied by Cortez and Silva (2008). In this analysis, we suppose that the binary response alc is subject to a false negative rate of 5%. We warn readers that the code presented here may take a long time to run on a personal computer; indeed, for the paper, the computations were performed on the “Baobab” and “Yggdrasil” HPC clusters provided by the University of Geneva.
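
As a quick first look (not part of the original analysis), the data can be loaded and inspected as follows; we only assume that the student data frame ships with IBpaper and that alc is its first column, as used in the code below.

# load and inspect the data (assumes IBpaper is installed)
library(IBpaper)
data(student)
str(student)        # alc is the binary response (first column)
table(student$alc)  # observed distribution of the response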

Setting

# packages
library(IBpaper)
library(ib)

# for simulating logistic responses with misclassification
misclassification <- function(object, control, extra){
  ftd <- fitted(object)        # fitted probabilities
  n <- length(ftd)
  N <- n * control$H           # total number of responses to simulate
  u1 <- 0.0                    # false positive rate
  u2 <- 0.05                   # false negative rate
  # probability of observing a 1 under misclassification
  mu_star <- u1 * (1 - rep(ftd, control$H)) + (1 - u2) * rep(ftd, control$H)
  matrix(ifelse(runif(N) > mu_star, 0, 1), ncol = control$H)
}
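
As an illustration of what this function produces (a toy example, not part of the analysis), it returns an n x H matrix of simulated 0/1 responses; with u1 = 0 and u2 = 0.05, only false negatives can occur.

# toy sanity check of misclassification() on a small simulated logistic fit
set.seed(123)
toy <- data.frame(y = rbinom(100, 1, 0.5), x = rnorm(100))
toy_fit <- glm(y ~ x, family = binomial(link = "logit"), data = toy)
sim_y <- misclassification(toy_fit, control = list(H = 5), extra = NULL)
dim(sim_y) # 100 x 5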

# Verify where a value lies given a confidence interval
check_ci <- function(ci,theta){
  if(!is.matrix(ci)) ci <- as.matrix(ci)
  if(ncol(ci)!=2) ci <- t(ci)
  if(nrow(ci)!=length(theta)) stop("verify ci and theta dimensions match")
  apply(cbind(ci,theta), 1, function(x){
    if(x[3] <= min(x[1:2])){"left"}
    else if(x[3] >= max(x[1:2])){"right"}else{"center"}
  })
}
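
For instance (illustrative only), check_ci() labels a value inside the interval as "center", below it as "left", and above it as "right":

check_ci(rbind(c(-1, 1), c(2, 3), c(-3, -2)), c(0, 0, 0))
# "center" "left"   "right"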

# IB specifics
H <- 200 # number of simulated estimators
ib_control <- ibControl(H = H, sim = misclassification, maxit = 100L) # control for iterative bootstrap (IB)
B <- 200 # number of bootstrap replicates
set.seed(432)
seeds <- sample.int(1e7,B) # seeds for bootstrap

# For coverage
alpha <- 0.05 # nominal level for confidence interval

Real data analysis

The code for the real data analysis is provided below.

## Fit initial estimator
fit_initial <- glm(alc ~ ., family = binomial(link = "logit"), 
                   data = student)

## Fit the MLE
x <- as.matrix(student[,-1]) # covariates
y <- student[,1]             # binary response alc
fit_mle <- logistic_misclassification_mle(x, y, fp = 0, fn = 0.05)

## Fit the JINI
fit_ib <- ib(fit_initial, control = ib_control)

## Retrieve coefficients
beta_hat <- fit_mle # logistic_misclassification_mle returns the vector of coefficient estimates
beta_tilde <- coef(fit_ib)
p <- length(beta_hat)

## Standard errors for the MLE (based on Fisher information)
se1 <- sqrt(diag(inverse_FIM(x,beta_hat,0.0,0.05)))

## Asymptotic confidence interval for MLE
ci_mle <- beta_hat + se1 * matrix(rep(qnorm(c(alpha/2, 1 - alpha/2)), p), ncol = 2, byrow = TRUE)

## Standard errors for the JINI (estimated via bootstrap)
logistic_object <- make_logistic(x, beta_tilde)
boot <- matrix(nrow = B, ncol = p)
for(b in 1:B){
  ##------- simulate the process ---------
  # simulate logistic
  y <- simulation(logistic_object, control=list(seed=b, sim=misclassification))

  ##------ Initial estimation ----------------
  fit_tmp <- glm(y ~ x, family=binomial(link="logit"))

  ##------ IB bias correction ------------
  ib_control$seed <- seeds[b] # update seed in control
  fit_ib <- ib(fit_tmp, control = ib_control)
  boot[b,] <- getEst(fit_ib)
}

se2 <- sqrt(diag(cov(boot)))

## Asymptotic confidence interval for JINI
ci_jini <- beta_tilde + se2 * matrix(rep(qnorm(c(alpha/2, 1 - alpha/2)), p), ncol = 2, byrow = TRUE)
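
The point estimates and 95% confidence intervals of the two estimators can then be compared side by side, for example (illustrative, not part of the original output):

# side-by-side summary of the MLE and JINI results
round(cbind(mle = beta_hat, mle_low = ci_mle[, 1], mle_up = ci_mle[, 2],
            jini = beta_tilde, jini_low = ci_jini[, 1], jini_up = ci_jini[, 2]), 3)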

Numerical experiment

For the numerical experiment, we use beta_tilde obtained in the real data analysis as the true parameter (stored as beta below).

MC <- 1e4 # number of simulations
beta <- beta_tilde # true parameter: the JINI estimate from the real data analysis
logistic_object <- make_logistic(x, beta_tilde) # for simulating responses
# seed for RNG
set.seed(5847329)
seed <- vector("list",3)
seed$process <- sample.int(1e7,MC)
seed$ib <- sample.int(1e7,MC)
seed$boot <- sample.int(1e7,B)

# for saving results
res <- list(
  mle = matrix(nrow = MC, ncol = p),          # MLE estimates
  lci_mle = matrix(nrow = MC, ncol = p),      # lengths of MLE confidence intervals
  coverage_mle = matrix(nrow = MC, ncol = p), # position of true parameter w.r.t. MLE CI
  ib = matrix(nrow = MC, ncol = p),           # JINI estimates
  lci_ib = matrix(nrow = MC, ncol = p),       # lengths of JINI confidence intervals
  coverage_ib = matrix(nrow = MC, ncol = p)   # position of true parameter w.r.t. JINI CI
)

ib_control2 <- ib_control # copy for bootstrap

# Simulations (warning: this loop takes a very long time to run)
for(m in 1:MC){
  ##------- simulate the process ---------
  # simulate logistic
  y <- simulation(logistic_object, 
                  control = list(seed = seed$process[m], 
                                 sim = misclassification))
  
  ##------ Initial estimation ----------------
  fit_initial <- glm(y ~ x, family=binomial(link="logit"))
  
  ##------ MLE estimation ----------------
  fit_mle <- logistic_misclassification_mle(x,y,0.0,0.05)
  beta_hat <- fit_mle
  se1 <- sqrt(diag(inverse_FIM(x, beta_hat, 0.0, 0.05)))
  ci_mle <- beta_hat + se1 * matrix(rep(qnorm(c(alpha/2, 1 - alpha/2)), p), ncol = 2, byrow = TRUE)
  res$mle[m,] <- beta_hat
  res$lci_mle[m,] <- apply(ci_mle,1,diff)
  res$coverage_mle[m,] <- check_ci(ci_mle,beta)
  
  ##------ IB bias correction ------------
  ib_control$seed <- seed$ib[m]
  fit_ib <- ib(fit_initial, control = ib_control)
  beta_tilde <- getEst(fit_ib)
  
  # bootstrap approximation of covariance
  lo <- make_logistic(x, beta_tilde)
  boot <- matrix(nrow = B, ncol = p)
  for(b in 1:B){
    ##------- simulate the process ---------
    # simulate logistic
    y <- simulation(lo, control=list(seed=b, sim=misclassification))
    
    ##------ MLE estimation ----------------
    fit_tmp <- glm(y~x, family=binomial(link="logit"))
    
    ##------ IB bias correction ------------
    ib_control2$seed <- seed$boot[b]
    fit_ib <- ib(fit_tmp, control = ib_control2)
    boot[b,] <- getEst(fit_ib)
  }
  se2 <- sqrt(diag(cov(boot)))
  ci_ib <- beta_tilde + se2 * matrix(rep(qnorm(c(alpha/2, 1 - alpha/2)), p), ncol = 2, byrow = TRUE)
  res$ib[m,] <- beta_tilde
  res$lci_ib[m,] <- apply(ci_ib,1,diff)
  res$coverage_ib[m,] <- check_ci(ci_ib,beta)
  
  cat(m,"\n")
}
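
Once the simulations are complete, the empirical coverage and average interval lengths can be summarised, for instance, as follows (illustrative):

# empirical coverage: proportion of intervals containing the true parameter
colMeans(res$coverage_mle == "center")
colMeans(res$coverage_ib == "center")
# average confidence interval lengths
colMeans(res$lci_mle)
colMeans(res$lci_ib)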

References

Cortez, Paulo, and Alice Maria Gonçalves Silva. 2008. “Using Data Mining to Predict Secondary School Student Performance.” In Proceedings of 5th Annual Future Business Technology Conference, Porto, 2008, 5–12. EUROSIS-ETI.

Zhang, Yuming, Yanyuan Ma, Samuel Orso, Mucyo Karemera, Maria-Pia Victoria-Feser, and Stéphane Guerrier. 2022. “A Flexible Bias Correction Method Based on Inconsistent Estimators.” https://arxiv.org/pdf/2204.07907.pdf.