Multiple testing in ANOVA and SEM

Author

Enrico Toffalini, Irene Alfarone

Published

April 17, 2025

Overview

We present two simulated scenarios to illustrate the issue of multiple testing, by looking at false positive rates and power in:

  • ANOVA

  • Structural Equation Modeling (SEM)

ANOVA

We simulated 5000 ANOVA with 3 binary predictors (A, B, C) and one continuous predictor (D), under H0 and H1 (y = 1 + 0.20*C + residual).

Code
aovsim = function(niter = NA, h1 = NA, N = NA) {
  
  significanceCount = rep(NA,niter)
  ms = as.data.frame(matrix(NA, nrow = niter, ncol = 15))
  
  for(i in 1:niter){
    A = rbinom(N, 1, .5)
    B = rbinom(N, 1, .5)
    C = rbinom(N, 1, .5)
    D = rnorm(N, 0, 1)
    residual = rnorm(N, 0, 1)
    
    if(h1){
      y = 1 + 0.20*C + residual
    } else {
      y = 1 + residual
    }
    
    df = data.frame(A, B, C, D, y)
    
    ps = Anova(aov(y~A*B*C*D, data = df), type="II")$"Pr(>F)"
    ps = ps[!is.na(ps)]
    
    significanceCount[i] = sum(ps<0.05)
    ms[i, 1:length(ps)] = ps
  }

  colnames(ms) = rownames(Anova(aov(y~A*B*C*D,data = df)))[-nrow(Anova(aov(y~A*B*C*D,data = df)))]
  
  return(list(
    ms = ms,
    signCount = significanceCount,
    hist = hist(significanceCount),
    percSignCount = mean(significanceCount > 0)
  ))
}
Code
library(car)

N = 1e+05
A = rbinom(N, 1, .5)
B = rbinom(N, 1, .5)
C = rbinom(N, 1, .5)
D = rnorm(N, 0, 1)
residual = rnorm(N, 0, 1)

y = 1 + 0.20*C + residual

df = data.frame(A, B, C, D, y)

Anova(aov(y~A*B*C*D, data = df), type="II")
Anova Table (Type II tests)

Response: y
          Sum Sq    Df  F value  Pr(>F)    
A              0     1   0.3252 0.56850    
B              0     1   0.1929 0.66049    
C            878     1 883.1740 < 2e-16 ***
D              1     1   0.6116 0.43417    
A:B            0     1   0.2360 0.62713    
A:C            0     1   0.0006 0.98016    
B:C            1     1   1.0365 0.30864    
A:D            0     1   0.1576 0.69133    
B:D            4     1   4.3097 0.03790 *  
C:D            0     1   0.2052 0.65053    
A:B:C          1     1   1.3219 0.25026    
A:B:D          0     1   0.0298 0.86295    
A:C:D          0     1   0.0648 0.79902    
B:C:D          3     1   2.7058 0.09999 .  
A:B:C:D        2     1   2.3440 0.12577    
Residuals  99426 99984                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Results of the simulation

Under H1 (true effect of C)
Metric Value
Power (before correction) 88.8%
Power (after correction; FDR) 60.6%
At least one false positive (before correction) 49.7%
At least one false positive (after correction; FDR) 6.8%

SEM

The simulated SEM model consists of six latent variables, each measured by ten indicators, with a true effect F3 ~ F1 under H1.

Code
library(lavaan)
semsim = function(niter = NA, h1 = NA, N = NA, k = 6, w = 10, loading = 0.5) {
  
  generate_lavaan_model = function(k, w) {
    model_lines = sapply(1:k, function(f) {
      lhs = paste0("F", f, " =~ ")
      rhs = paste0("F", f, "_item", 1:w, collapse = " + ")
      paste0(lhs, rhs)
    })
    paste(model_lines, collapse = "\n")
  }
  
  significanceCount = rep(NA, niter)
  ms.sem = as.data.frame(matrix(NA, nrow = niter, ncol = 15))
  
  for(i in 1:niter){
    residual_sd = sqrt(1 - loading^2)
    latent_vars = matrix(rnorm(N * k), nrow = N, ncol = k)
    colnames(latent_vars) = paste0("F", 1:k)
    
    if (h1) {
      latent_vars[, "F3"] = 0 + 0.15*latent_vars[, "F1"] + rnorm(N, 0, 1)
    }
    
    observed_list = vector("list", k)
    for (j in 1:k) {
      indicators = matrix(
        loading * latent_vars[, j] + residual_sd * matrix(rnorm(N * w), nrow = N),
        nrow = N, ncol = w
      )
      colnames(indicators) = paste0("F", j, "_item", 1:w)
      observed_list[[j]] = indicators
    }
    
    df = as.data.frame(do.call(cbind, observed_list))
    
    modelM = generate_lavaan_model(k = 6,w = 10)
    model = paste(modelM,
                  "\n
              F6 ~ F1 + F2 + F3 + F4 + F5 \n
              F5 ~ F1 + F2 + F3 + F4 \n
              F4 ~ F1 + F2 + F3 \n
              F3 ~ F1 + F2 \n
              F2 ~ F1")
    
    fit = sem(model,df)
    pe = summary(fit)$pe
    ps = pe$pvalue[pe$op=="~"]
    
    significanceCount[i] = sum(ps < 0.05)
    #  print(paste("i:",i))
    #  print(paste("sign.count:",significanceCount[i]))
    ms.sem[i, 1:length(ps)] = ps
  } 
  
  colnames(ms.sem) = paste(pe$lhs[pe$op == "~"], pe$op[pe$op == "~"], pe$rhs[pe$op == "~"])
  
  return(list(
    ms.sem = ms.sem,
    significanceCount = significanceCount,
    hist = hist(significanceCount),
    percSigPaths = mean(significanceCount > 0)
  ))
}

Latent Factors

Measurement model

Celestial SEM (from Celestial Mediation)

Results of the simulation

Under H1 (true path F3 ~ F1)
Metric Value
Power (before correction) 94.6%
Power (after correction; FDR) 72.8%
At least one false positive (before correction) 49.6%
At least one false positive (after correction; FDR) 6.1%