Here we present the analysis of the real data on alcohol consumption studied in Zhang et al. (2022). The data are made available in the package; see ?student. This dataset was originally collected and studied by Cortez and Silva (2008). In this analysis, we suppose that the binary response alc is subject to a false negative rate of 5%. We warn readers that the code presented here may take a long time to run on a personal computer; for the paper, the computations were performed on the “Baobab” and “Yggdrasil” HPC clusters provided by the University of Geneva.
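Concretely, if u1 denotes the false positive rate (here 0) and u2 the false negative rate (here 0.05), a positive response is observed with probability u1 * (1 - pi) + (1 - u2) * pi, where pi is the probability of a true positive under the logistic model. The short sketch below, which uses a hypothetical value for this probability, illustrates how a single misclassified response can be generated; the misclassification() function defined in the setup code applies the same scheme to the fitted values of a glm object.
# illustration only: generate one misclassified response for a hypothetical probability
u1 <- 0.0     # false positive rate
u2 <- 0.05    # false negative rate
p_true <- 0.7 # hypothetical probability of a true positive
mu_star <- u1 * (1 - p_true) + (1 - u2) * p_true # probability of observing a positive
y_obs <- as.integer(runif(1) <= mu_star)         # observed (possibly misclassified) response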
# packages
library(IBpaper)
library(ib)
# for simulating logistic responses subject to misclassification
misclassification <- function(object, control, extra){
  ftd <- fitted(object)
  n <- length(ftd)
  N <- n * control$H
  u1 <- 0.0  # false positive rate
  u2 <- 0.05 # false negative rate
  # probability of observing a positive response
  mu_star <- u1 * (1 - rep(ftd, control$H)) + (1 - u2) * rep(ftd, control$H)
  matrix(ifelse(runif(N) > mu_star, 0, 1), ncol = control$H)
}
# Verify where a value lies given a confidence interval
check_ci <- function(ci, theta){
  if(!is.matrix(ci)) ci <- as.matrix(ci)
  if(ncol(ci) != 2) ci <- t(ci)
  if(nrow(ci) != length(theta)) stop("verify ci and theta dimensions match")
  apply(cbind(ci, theta), 1, function(x){
    if(x[3] <= min(x[1:2])) "left"
    else if(x[3] >= max(x[1:2])) "right"
    else "center"
  })
}
# IB specifics
H <- 200 # number of simulated estimators
ib_control <- ibControl(H = H, sim = misclassification, maxit = 100L) # control for iterative bootstrap (IB)
B <- 200 # number of bootstrap replicates
set.seed(432)
seeds <- sample.int(1e7,B) # seeds for bootstrap
# For coverage
alpha <- 0.05 # nominal level for confidence interval
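As a quick illustration (not part of the analysis itself), check_ci() returns "center" when the value lies inside the interval, and "left" or "right" when it lies below or above it:
# hypothetical intervals and values
check_ci(ci = rbind(c(-1, 1), c(0, 2)), theta = c(0.5, 3)) # "center" "right"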
The code for the real data analysis is provided below.
## Fit initial estimator
fit_initial <- glm(alc ~ ., family = binomial(link = "logit"),
data = student)
## Fit the MLE
x <- as.matrix(student[,-1])
y <- student[,1]
fit_mle <- logistic_misclassification_mle(x, y, fp = 0, fn = 0.05)
## Fit the JINI
fit_ib <- ib(fit_initial, control = ib_control)
## Retrieve coefficients
beta_hat <- fit_mle
beta_tilde <- coef(fit_ib)
p <- length(beta_hat)
## Standard errors for the MLE (based on Fisher information)
se1 <- sqrt(diag(inverse_FIM(x,beta_hat,0.0,0.05)))
## Asymptotic confidence interval for MLE
ci_mle <- beta_hat + se1 * matrix(rep(qnorm(c(alpha/2, 1 - alpha/2)), p), ncol = 2, byrow = TRUE)
## Standard errors for the JINI (estimated via bootstrap)
logistic_object <- make_logistic(x, beta_tilde)
boot <- matrix(nrow = B, ncol = p)
for(b in 1:B){
  ##------- simulate the process ---------
  # simulate logistic
  y <- simulation(logistic_object, control = list(seed = b, sim = misclassification))
  ##------ Initial estimation ----------------
  fit_tmp <- glm(y ~ x, family = binomial(link = "logit"))
  ##------ IB bias correction ------------
  ib_control$seed <- seeds[b] # update seed in control
  fit_ib <- ib(fit_tmp, control = ib_control)
  boot[b,] <- getEst(fit_ib)
}
se2 <- sqrt(diag(cov(boot)))
## Asymptotic confidence interval for JINI
ci_jini <- beta_tilde + se2 * matrix(rep(qnorm(c(alpha/2, 1 - alpha/2)), p), ncol = 2, byrow = TRUE)
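The estimates obtained above can, for instance, be gathered in a single table; this summary step is not part of the original code and only uses objects created above.
## Collect point estimates, standard errors and confidence bounds
results <- data.frame(mle = beta_hat, se_mle = se1,
                      lower_mle = ci_mle[, 1], upper_mle = ci_mle[, 2],
                      jini = beta_tilde, se_jini = se2,
                      lower_jini = ci_jini[, 1], upper_jini = ci_jini[, 2])
round(results, 3)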
For the numerical experiment, we use beta_tilde from the real data analysis as the true parameter.
MC <- 1e4 # number of simulations
beta <- beta_tilde # true parameter values, used to assess coverage below
logistic_object <- make_logistic(x, beta_tilde) # for simulating responses
# seed for RNG
set.seed(5847329)
seed <- vector("list",3)
seed$process <- sample.int(1e7,MC)
seed$ib <- sample.int(1e7,MC)
seed$boot <- sample.int(1e7,B)
# for saving results
res <- list(
  mle = matrix(nrow = MC, ncol = p),
  lci_mle = matrix(nrow = MC, ncol = p),
  coverage_mle = matrix(nrow = MC, ncol = p),
  ib = matrix(nrow = MC, ncol = p),
  lci_ib = matrix(nrow = MC, ncol = p),
  coverage_ib = matrix(nrow = MC, ncol = p)
)
ib_control2 <- ib_control # copy for bootstrap
# Simulations (warning: this takes a very long time to run)
for(m in 1:MC){
  ##------- simulate the process ---------
  # simulate logistic
  y <- simulation(logistic_object,
                  control = list(seed = seed$process[m],
                                 sim = misclassification))
  ##------ Initial estimation ----------------
  fit_initial <- glm(y ~ x, family = binomial(link = "logit"))
  ##------ MLE estimation ----------------
  fit_mle <- logistic_misclassification_mle(x, y, 0.0, 0.05)
  beta_hat <- fit_mle
  se1 <- sqrt(diag(inverse_FIM(x, beta_hat, 0.0, 0.05)))
  ci_mle <- beta_hat + se1 * matrix(rep(qnorm(c(alpha/2, 1 - alpha/2)), p), ncol = 2, byrow = TRUE)
  res$mle[m,] <- beta_hat
  res$lci_mle[m,] <- apply(ci_mle, 1, diff)
  res$coverage_mle[m,] <- check_ci(ci_mle, beta)
  ##------ IB bias correction ------------
  ib_control$seed <- seed$ib[m]
  fit_ib <- ib(fit_initial, control = ib_control)
  beta_tilde <- getEst(fit_ib)
  # bootstrap approximation of covariance
  lo <- make_logistic(x, beta_tilde)
  boot <- matrix(nrow = B, ncol = p)
  for(b in 1:B){
    ##------- simulate the process ---------
    # simulate logistic
    y <- simulation(lo, control = list(seed = b, sim = misclassification))
    ##------ Initial estimation ----------------
    fit_tmp <- glm(y ~ x, family = binomial(link = "logit"))
    ##------ IB bias correction ------------
    ib_control2$seed <- seed$boot[b]
    fit_ib <- ib(fit_tmp, control = ib_control2)
    boot[b,] <- getEst(fit_ib)
  }
  se2 <- sqrt(diag(cov(boot)))
  ci_ib <- beta_tilde + se2 * matrix(rep(qnorm(c(alpha/2, 1 - alpha/2)), p), ncol = 2, byrow = TRUE)
  res$ib[m,] <- beta_tilde
  res$lci_ib[m,] <- apply(ci_ib, 1, diff)
  res$coverage_ib[m,] <- check_ci(ci_ib, beta)
  cat(m, "\n")
}
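Once the Monte Carlo loop has completed, the empirical coverage and average confidence interval lengths can be summarized from res; the following post-processing step is only a suggestion and is not part of the original code.
## Empirical coverage: proportion of intervals containing the true parameter
coverage_mle <- colMeans(res$coverage_mle == "center")
coverage_ib <- colMeans(res$coverage_ib == "center")
## Average confidence interval length
length_mle <- colMeans(res$lci_mle)
length_ib <- colMeans(res$lci_ib)
round(rbind(coverage_mle, coverage_ib, length_mle, length_ib), 3)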
Cortez, Paulo, and Alice Maria Gonçalves Silva. 2008. “Using Data Mining to Predict Secondary School Student Performance.” In Proceedings of 5th Annual Future Business Technology Conference, Porto, 2008, 5–12. EUROSIS-ETI.
Zhang, Yuming, Yanyuan Ma, Samuel Orso, Mucyo Karemera, Maria-Pia Victoria-Feser, and Stéphane Guerrier. 2022. “A Flexible Bias Correction Method Based on Inconsistent Estimators.” https://arxiv.org/pdf/2204.07907.pdf.