PSYC 575 Multilevel Modeling (2021 Fall): R Codes (Week 10)

# To install a package, run the following ONCE (and only once on your computer)
# install.packages("psych")
library(here)  # makes reading data more consistent
library(tidyverse)  # for data manipulation and plotting
library(haven)  # for importing SPSS/SAS/Stata data
library(lme4)  # for multilevel analysis
library(glmmTMB)  # for longitudinal analysis
library(sjPlot)  # for plotting
library(modelsummary)  # for making tables
library(interactions)  # for interaction plots
theme_set(theme_bw())  # Theme; just my personal preference

# Read in the data from gitHub (need Internet access)
curran_wide <- read_sav("https://github.com/MultiLevelAnalysis/Datasets-third-edition-Multilevel-book/raw/master/chapter%205/Curran/CurranData.sav")
# Make id a factor
curran_wide  # print the data

# Using the new `tidyr::pivot_longer()` function
curran_long <- curran_wide %>%
  pivot_longer(
    c(anti1:anti4, read1:read4),  # variables that are repeated measures
    # Convert 8 columns to 3: 2 columns each for anti/read (.value), and
    # one column for time
    names_to = c(".value", "time"),
    # Extract the names "anti"/"read" from the names of the variables for the
    # value columns, and then the number to the "time" column
    names_pattern = "(anti|read)([1-4])",
    # Convert the "time" column to integers
    names_transform = list(time = as.integer)
  ) %>%
  mutate(time0 = time - 1)

Covariance Structure

curran_wide %>%
  select(read1:read4) %>%
  cov(use = "pair") %>%
  print(digits = 2)

m0_ols <- lm(read ~ factor(time), data = curran_long)
diag(sigma(m0_ols), nrow = 4) %>%
  round(digits = 2)

m0_ri <- glmmTMB(read ~ factor(time) + (1 | id),
  data = curran_long,
  REML = TRUE
)
vc_m0_ri <- VarCorr(m0_ri)
(matrix(vc_m0_ri[[1]]$id[1, 1], nrow = 4, ncol = 4) +
  diag(attr(vc_m0_ri[[1]], "sc")^2, nrow = 4)) %>%
  print(digits = 2)

# Compute phase1 and phase2
curran_long <- curran_long %>%
  mutate(phase1 = pmin(time0, 1),  # anything bigger than 1 becomes 1
         phase2 = pmax(time0 - 1, 0))  # set time to start at TIME 1, and then make
                                       # anything smaller than 0 to 0

# Fit the piecewise growth model
m_pw <- glmmTMB(read ~ phase1 + phase2 + (phase1 + phase2 | id),
  data = curran_long, REML = TRUE,
  # The default optimizer did not converge; try optim
  control = glmmTMBControl(
    optimizer = optim,
    optArgs = list(method = "BFGS")
  )
)
vc_mpw <- VarCorr(m_pw)
# Covariance
(cbind(1, c(0, 1, 1, 1), c(0, 0, 1, 2)) %*%
  vc_mpw[[1]]$id[1:3, 1:3] %*%
  rbind(1, c(0, 1, 1, 1), c(0, 0, 1, 2)) +
  diag(attr(vc_mpw[[1]], "sc")^2, nrow = 4)) %>%
  print(digits = 2)

Autoregressive(1) Error Structure

# Add the AR(1) structure
m_pw_ar1 <- glmmTMB(
  read ~ phase1 + phase2 + (phase1 + phase2 | id) +
    ar1(0 + factor(time) | id),
  dispformula = ~0,
  data = curran_long, REML = TRUE,
  # The default optimizer did not converge; try optim
  control = glmmTMBControl(
    optimizer = optim,
    optArgs = list(method = "BFGS")
  )
)
vc_mpw_ar1 <- VarCorr(m_pw_ar1)
# Covariance
(cbind(1, c(0, 1, 1, 1), c(0, 0, 1, 2)) %*%
  vc_mpw_ar1[[1]]$id[1:3, 1:3] %*%
  rbind(1, c(0, 1, 1, 1), c(0, 0, 1, 2)) +
  unname(vc_mpw_ar1[[1]]$id.1[1:4, 1:4])) %>%
  print(digits = 2)

Likelihood Ratio Test

anova(m_pw, m_pw_ar1)  # Not statistically significant

Intensive Longitudinal Data

# Download the data from
# https://www.pilesofvariance.com/Chapter8/SPSS/SPSS_Chapter8.zip, and put it 
# into the "data_files" folder
zip_data <- here("data_files", "SPSS_Chapter8.zip")
# download.file("https://www.pilesofvariance.com/Chapter8/SPSS/SPSS_Chapter8.zip", 
#               zip_data)
stress_data <- read_sav(
  unz(zip_data,
      "SPSS_Chapter8/SPSS_Chapter8.sav"))
stress_data

Data Exploration

# First, separate the time-varying variables into within-person and
# between-person levels
stress_data <- stress_data %>%
  # Center mood (originally 1-5) at 1 for interpretation (so it becomes 0-4)
  # Also women to factor
  mutate(mood1 = mood - 1,
         women = factor(women, levels = c(0, 1), 
                        labels = c("men", "women"))) %>%
  group_by(PersonID) %>%
  # The `across()` function can be used to operate the same procedure on
  # multiple variables
  mutate(across(c(symptoms, mood1, stressor),
                # The `.` means the variable to be operated on
                list("pm" = ~ mean(., na.rm = TRUE),
                     "pmc" = ~ . - mean(., na.rm = TRUE)))) %>%
  ungroup()
stress_data

# Use `datasummary_skim` for a quick summary
datasummary_skim(stress_data %>%
                   select(-PersonID, -session, -studyday, -dayofweek,
                          -weekend, -mood,
                          # PMC for binary variable is not very meaningful
                          -stressor_pmc))

	Unique (#)	Missing (%)	Mean	SD	Min	Median	Max
baseage	103	0	80.1	6.1	69.7	80.0	95.3
symptoms	7	1	1.3	1.3	0.0	1.0	5.0
stressor	3	1	0.4	0.5	0.0	0.0	1.0
mood1	13	1	0.2	0.4	0.0	0.0	2.6
symptoms_pm	24	0	1.3	1.2	0.0	1.0	5.0
symptoms_pmc	44	1	−0.0	0.7	−2.4	0.0	3.0
mood1_pm	33	0	0.2	0.3	0.0	0.1	1.5
mood1_pmc	98	1	−0.0	0.3	−0.7	0.0	1.3
stressor_pm	8	0	0.4	0.3	0.0	0.4	1.0

stress_data %>%
    select(symptoms_pm, symptoms_pmc,
           mood1_pm, mood1_pmc, stressor_pm, stressor) %>%
    psych::pairs.panels(jiggle = TRUE, factor = 0.5, ellipses = FALSE,
                        cex.cor = 1, cex = 0.5)

Spaghetti Plot

# Plotting mood
p1 <- ggplot(stress_data, aes(x = studyday, y = mood1)) +
  # add lines to connect the data for each person
  geom_line(aes(color = factor(PersonID), group = PersonID)) +
  # add a mean trajectory
  stat_summary(fun = "mean", col = "red", size = 1, geom = "line") +
  # suppress legend
  guides(color = "none")
# Plotting symptoms
p2 <- ggplot(stress_data, aes(x = studyday, y = symptoms)) +
  # add lines to connect the data for each person
  geom_line(aes(color = factor(PersonID), group = PersonID)) +
  # add a mean trajectory
  stat_summary(fun = "mean", col = "red", size = 1, geom = "line") +
  # suppress legend
  guides(color = "none")
gridExtra::grid.arrange(p1, p2, ncol = 2)

We can see that there is not a clear trend across time, but instead fluctuations.

stress_data %>%
  # randomly sample 40 individuals
  filter(PersonID %in% sample(unique(PersonID), 40)) %>%
  ggplot(aes(x = studyday, y = symptoms)) +
  geom_point(size = 0.5) +
  geom_line() +  # add lines to connect the data for each person
  facet_wrap(~ PersonID, ncol = 10)

Unconditional Models

The outcome is symptoms, and the time-varying predictors are mood1 and stressor. Let’s look at the ICC for each of them.

m0_symp <- glmmTMB(symptoms ~ (1 | PersonID), data = stress_data,
                   REML = TRUE)
# If changing outcome, use the `update` function to avoid recompiling
m0_mood <- glmmTMB(mood1 ~ (1 | PersonID), data = stress_data,
                   REML = TRUE)
# Use family = bernoulli for a binary outcome; to be discussed in a later week
m0_stre <- glmmTMB(stressor ~ (1 | PersonID), data = stress_data,
                   family = binomial, REML = TRUE)

# Symptoms
(vc_m0sy <- VarCorr(m0_symp))  # shows the random effect SDs

icc_symp <- vc_m0sy[[1]]$PersonID[1, 1] /
  (vc_m0sy[[1]]$PersonID[1, 1] + attr(vc_m0sy[[1]], "sc")^2)
# Mood
(vc_m0mo <- VarCorr(m0_mood))  # shows the random effect SDs

icc_mood <- vc_m0mo[[1]]$PersonID[1, 1] /
  (vc_m0mo[[1]]$PersonID[1, 1] + attr(vc_m0mo[[1]], "sc")^2)
# Stressor (sigma is fixed to pi^2 / 3)
(vc_m0st <- VarCorr(m0_stre))  # shows the random effect SDs

# The error variance is pi^2 / 3 for a binary outcome
icc_stre <- vc_m0st[[1]]$PersonID[1, 1] /
  (vc_m0st[[1]]$PersonID[1, 1] + pi^2 / 3)
# Print ICCs
c(symptoms = icc_symp,
  mood1 = icc_mood,
  stressor = icc_stre)

Check the Trend of the Data

One thing to consider with longitudinal data in a short time is whether there may be trends in the data due to contextual factors (e.g., weekend effect; shared life events in certain time). If this is the case, a common practice is to de-trend the data; otherwise associations between predictors and the outcome may be merely due to contextual factor happening on certain days.

Here there is no strong reason to think the data contain trends, as shown in the graphs. We can further test the linear trend:

m0_trend <- glmmTMB(symptoms ~ studyday + (studyday | PersonID),
    data = stress_data,
    REML = TRUE
)
summary(m0_trend)

which was not significant. It there was a trend, we should include the time variable when studying the time-varying covariates.

Covariance Structure

# AR(1)
m0_ar1 <- glmmTMB(
  symptoms ~ ar1(factor(studyday) + 0 | PersonID),
  data = stress_data,
  REML = TRUE
)
anova(m0_symp, m0_ar1)  # not significant

Time-Varying Covariate

m1 <- glmmTMB(
    symptoms ~ (mood1_pm + mood1_pmc) * women + (mood1_pmc | PersonID),
    data = stress_data,
    REML = TRUE,
    # The default optimizer did not converge; try optim
    control = glmmTMBControl(
        optimizer = optim,
        optArgs = list(method = "BFGS")
    )
)
summary(m1)

Plotting

plot_model(m1,
    type = "pred", show.data = TRUE,
    title = "", dot.size = 0.5,
    jitter = 0.1
)

Interaction Plots

interact_plot(m1,
              pred = "mood1_pm",
              modx = "women",
              plot.points = TRUE,
              point.size = 0.5,
              point.alpha = 0.2,
              x.label = "Negative Mood (Between-Person)",
              y.label = "Physical Symptoms",
              jitter = 0.1)

interact_plot(m1,
              pred = "mood1_pmc",
              modx = "women",
              plot.points = TRUE,
              point.size = 0.5,
              point.alpha = 0.2,
              x.label = "Daily Negative Mood (Within-Person)",
              y.label = "Daily Physical Symptoms",
              jitter = 0.1)

Between/Within effects

broom.mixed::augment(m1) %>%
    mutate(mood1 = mood1_pm + mood1_pmc) %>%
    ggplot(aes(x = mood1, y = symptoms, color = factor(PersonID))) +
    # Add points
    geom_point(size = 0.5, alpha = 0.2) +
    # Add within-cluster lines
    geom_smooth(aes(y = .fitted),
        method = "lm", se = FALSE, size = 0.5
    ) +
    # Add group means
    stat_summary(
        aes(x = mood1_pm, y = .fitted, fill = factor(PersonID)),
        color = "red", # add border
        fun = mean,
        geom = "point",
        shape = 24,
        # use triangles
        size = 2.5
    ) +
    # Add between coefficient
    geom_smooth(aes(x = mood1_pm, y = .fitted),
        method = "lm", se = FALSE,
        color = "black"
    ) +
    facet_grid(~women) +
    labs(y = "Daily Symptoms") +
    # Suppress legend
    guides(color = "none", fill = "none")

Add stressor

# Convert stressor to a factor
stress_data$stressor <- factor(stress_data$stressor,
                               levels = c(0, 1),
                               labels = c("stressor-free day", "stressor day"))
m2 <- glmmTMB(
    symptoms ~ (mood1_pm + mood1_pmc) * women + stressor_pm + stressor +
        (mood1_pmc + stressor | PersonID),
    data = stress_data,
    REML = TRUE,
    # The default optimizer did not converge; try optim
    control = glmmTMBControl(
        optimizer = optim,
        optArgs = list(method = "BFGS")
    )
)

Plotting

# Contextual effect of stressor
plot_model(m2,
    type = "pred", terms = "stressor_pm",
    show.data = TRUE, jitter = 0.1,
    title = "", dot.size = 0.5
)

# Within-person effect
plot_model(m2,
    type = "pred", terms = "stressor",
    show.data = TRUE, jitter = 0.1,
    title = "", dot.size = 0.5
)

Bonus: Generalized Estimating Equations (GEE)

GEE is an alternative estimation method for clustered data, as discussed in 12.2 of Snijders and Bosker. It is a popular method especially for longitudinal data in some areas of research, as it offers some robustness against misspecification in the random effect structure (e.g., autoregressive errors, missing random slopes, etc). It accounts for the covariance structure by using a working correlation matrix, such as the exchangeable (aka compound symmetry) structure. On the other hand, it only estimates fixed effects (or sometimes called marginal effects), so we cannot known whether some slopes are different across persons, but only the average slopes. GEE estimates the fixed effects by iteratively calculating the working correlation matrix and updating the fixed effect estimates with OLS. At the end, it computes the standard errors using the cluster-robust sandwich estimator.

GEE tends to require a relatively large sample size, and is usually less efficient than MLM for the same model (i.e., GEE gives larger standard errors, wider confidence intervals, and thus has less statistical power), which can be seen in the example below.

stress_data_lw <- stress_data %>%
  drop_na(mood1_pm, mood1_pmc, women, stressor_pm, stressor) %>%
  # Also need to convert to a data frame (instead of tibble)
  as.data.frame()

library(geepack)
m2_gee <- geeglm(symptoms ~ (mood1_pm + mood1_pmc) * women +
                   stressor_pm + stressor,
                 data = stress_data_lw,
                 id = PersonID,
                 corstr = "exchangeable",
                 std.err = "san.se")
summary(m2_gee)

# Small sample correction (Mancl & DeRouen, 2001) for the robust standard errors
library(geesmv)
m2_vcov_md <- GEE.var.md(symptoms ~ (mood1_pm + mood1_pmc) * women +
                           stressor_pm + stressor,
                         data = stress_data_lw,
                         id = "PersonID",  # need to be string
                         corstr = "exchangeable")

library(lmtest)  # For testing using corrected standard error
coeftest(m2_gee, diag(m2_vcov_md$cov.beta))

# Ignore clustering
m2_lm <- lm(
    symptoms ~ (mood1_pm + mood1_pmc) * women +
        stressor_pm + stressor,
    data = stress_data_lw
)
# Compare the fixed effects
msummary(
    list(
        OLS = m2_lm,
        MLM = m2,
        `GEE (Sandwich SE)` = m2_gee,
        `GEE (Sandwich SE + small-sample correction)` = m2_gee
    ),
    vcov = list(
        OLS = NULL,
        MLM = NULL,
        `GEE (Sandwich SE)` = NULL,
        `GEE (Sandwich SE + small-sample correction)` = 
        diag(m2_vcov_md$cov.beta)
    )
)

	OLS	MLM	GEE (Sandwich SE)	GEE (Sandwich SE + small-sample correction)
(Intercept)	0.565	0.452	0.549	0.549
	(0.144)	(0.238)	(0.229)	(0.264)
mood1_pm	2.978	3.437	3.195	3.195
	(0.468)	(0.786)	(0.917)	(1.311)
mood1_pmc	−0.211	−0.064	−0.127	−0.127
	(0.399)	(0.292)	(0.257)	(0.301)
womenwomen	−0.082	0.039	−0.073	−0.073
	(0.152)	(0.252)	(0.273)	(0.310)
stressor_pm	0.890	0.849	0.904	0.904
	(0.218)	(0.301)	(0.324)	(0.343)
stressorstressor day	0.072	0.064	0.067	0.067
	(0.142)	(0.101)	(0.093)	(0.094)
mood1_pm × womenwomen	−1.684	−1.957	−1.897	−1.897
	(0.506)	(0.857)	(0.939)	(1.343)
mood1_pmc × womenwomen	0.371	0.251	0.286	0.286
	(0.451)	(0.333)	(0.298)	(0.342)
sd__(Intercept)		0.732
sd__mood1_pmc		0.215
sd__stressorstressor day		0.268
cor__(Intercept).mood1_pmc		0.557
cor__(Intercept).stressorstressor day		0.967
cor__mood1_pmc.stressorstressor day		0.713
sd__Observation		0.780
Num.Obs.	513	513
R2	0.253
R2 Adj.	0.243
AIC	1616.9	1436.7
BIC	1655.1	1500.3
Log.Lik.	−799.450	−703.364
Std.Errors	OLS	MLM	GEE (Sandwich SE)	GEE (Sandwich SE + small-sample correction)
n.clusters			105.000	105.000
max.cluster.size			5.000	5.000
alpha			0.521	0.521
gamma			1.323	1.323

As shown above, for OLS regression, which ignores the temporal dependence of the data, the standard errors for the between-level coefficients are too small. The results with MLM and with GEE are comparable, although GEE only gives the fixed effect estimates. Overall, GEE tends to have larger standard errors, especially when taking into account the samll-sample correction. In sum, the pros and cons of GEE are:

R Codes (Week 10)

Authors

Affiliations

Published

DOI

Load Packages and Import Data

Covariance Structure

Autoregressive(1) Error Structure

Likelihood Ratio Test

Intensive Longitudinal Data

Data Exploration

Spaghetti Plot

Unconditional Models

Check the Trend of the Data

Covariance Structure

Time-Varying Covariate

Plotting

Interaction Plots

Between/Within effects

Add `stressor`

Plotting

Bonus: Generalized Estimating Equations (GEE)

Footnotes

R Codes (Week 10)

Authors

Affiliations

Published

DOI

Load Packages and Import Data

Covariance Structure

Autoregressive(1) Error Structure

Likelihood Ratio Test

Intensive Longitudinal Data

Data Exploration

Spaghetti Plot

Unconditional Models

Check the Trend of the Data

Covariance Structure

Time-Varying Covariate

Plotting

Interaction Plots

Between/Within effects

Add stressor

Plotting

Bonus: Generalized Estimating Equations (GEE)

Footnotes

Add `stressor`