Advanced group-sequential and adaptive confirmatory clinical trial designs, with R practicals using rpact: Efficient use of futility and efficacy interim analyses in group-sequential designs

Author
Affiliation

Kaspar Rufibach

Methods, Collaboration, and Outreach Group (MCO), PD Data and Statistical Sciences, Roche Basel

Published

December 2, 2024

1 Purpose of this document

This R markdown file provides the code accompanying the first theory block in the BBS course Advanced group-sequential and adaptive confirmatory clinical trial designs, with R practicals using rpact.

2 Setup

Load rpact and further packages and define shortcut for sample size function.

# Load rpact
library(rpact)
library(knitr)

Attache Paket: 'knitr'
Das folgende Objekt ist maskiert 'package:rpact':

    kable
# shortcut for sample size function, to streamline code
myGetSampleSizeSurvival <- function(design){
    tmp <- getSampleSizeSurvival(design = design,
                        lambda2 = log(2) / m1, hazardRatio = hr,
                        dropoutRate1 = dout1, dropoutRate2 = dout2, dropoutTime = douttime,
                        accrualTime = accrualTime, accrualIntensity = accrualIntensity,
                        maxNumberOfSubjects = maxNumberOfSubjects)  
    return(tmp)
}

3 Specify details of example trial

# design parameters
alpha <- 0.05
beta <- 0.2
m1 <- 6 * 12
m2 <- 8 * 12
hr <- m1 / m2

# constant for computation of variance for log(hr)
# as a function of P(randomized to A)
pA <- 1 / 2     # 1:1 randomization
kappa <- (pA * (1 - pA)) ^ (-1)

# timing parameters
dout1 <- 0.025
dout2 <- 0.025
douttime <- 12
accrualTime <- 0:6
accrualIntensity <- seq(6, 42, by = 6)
maxNumberOfSubjects <- 1200

# informal futility boundary
inform_bound <- 1

4 How much do we gain with interim analyses?

The following code allows to compute all the numbers that appear in the first section of the slide deck. We do not show all the numbers here and invite everyone to get them out of the corresponding objects.

# Required events without interim
nevent <- getSampleSizeSurvival(lambda1 = getLambdaByMedian(m2), lambda2 = getLambdaByMedian(m1), 
                                sided = 2, alpha = alpha, beta = beta)
nevent <- ceiling(nevent$maxNumberOfEvents)
nevent 
[1] 380
# timing of interims (information fraction)
i1 <- 0.3
i2 <- 2 / 3
infofrac <- c(i1, i2, 1)

# OBF design without initial interim at 30% (to get cumulative alpha-spending function 
# for these visits) 
design_tmp <- getDesignGroupSequential(informationRates = infofrac[-1],
                                       sided = 1, alpha = alpha / 2, beta = beta,
                                       typeOfDesign = "asOF", futilityBounds = c(-6))

design <- getDesignGroupSequential(informationRates = infofrac,
                                   sided = 1, alpha = alpha / 2, beta = beta,
                                   futilityBounds = c(log(inform_bound), -6),
                                   typeOfDesign = "asUser", 
                                   userAlphaSpending = c(0.00001, design_tmp$alphaSpent))

samplesize <- myGetSampleSizeSurvival(design)

# end of recruitment
recrend <- samplesize$totalAccrualTime
recrend
[1] 31.57143
# total sample size
nevents_i1 <- as.vector(ceiling(samplesize$eventsPerStage))
nevent_gs <- max(nevents_i1)
nevent_gs
[1] 408
# stopping probabilities at futility and efficacy interim under H0 and H1
designChar <- getDesignCharacteristics(design)
stopProbsH0 <- getPowerAndAverageSampleNumber(design, theta = 0, nMax = designChar$shift)
stopProbsH1 <- getPowerAndAverageSampleNumber(design, theta = 1, nMax = designChar$shift)

stopFutIA_H0 <- stopProbsH0$earlyStop["stage = 1", 1]
stopEffIA_H0 <- stopProbsH0$earlyStop["stage = 2", 1]
c(stopFutIA_H0, stopEffIA_H0)
  stage = 1   stage = 2 
0.500010000 0.006000121 
stopFutIA_H1 <- stopProbsH1$earlyStop["stage = 1", 1]
stopEffIA_H1 <- stopProbsH1$earlyStop["stage = 2", 1]
c(stopFutIA_H1, stopEffIA_H1)
stage = 1 stage = 2 
0.0595379 0.4400694 
# Expected number of events under H0 and H1
expH0 <- samplesize$expectedEventsH0
expH1 <- samplesize$expectedEventsH1
c(expH0, expH1)
[1] 264.3069 331.0536
# clinical cutoffs
maxNumberOfSubjects1 <- maxNumberOfSubjects / 2
maxNumberOfSubjects2 <- maxNumberOfSubjects / 2
maxNumberOfSubjects <- maxNumberOfSubjects1 + maxNumberOfSubjects2

# time to interim cutoff
time <- 0:100 

# under H0
probEventH0 <- getEventProbabilities(time = time, lambda2 = log(2) / m1, 
                                     hazardRatio = 1, dropoutRate1 = dout1, 
                                     dropoutRate2 = dout2, dropoutTime = douttime,
                                     accrualTime = accrualTime, accrualIntensity = accrualIntensity, 
                                     maxNumberOfSubjects = maxNumberOfSubjects)
expEventH0 <- probEventH0$overallEventProbabilities * maxNumberOfSubjects   
timefixH0 <- min(time[expEventH0 >= nevent])
timeeventsH0 <- c(min(time[expEventH0 >= nevents_i1[1]]), 
                  min(time[expEventH0 >= nevents_i1[2]]), 
                  min(time[expEventH0 >= nevents_i1[3]]))
expdurH0 <- timeeventsH0[1] * stopFutIA_H0 + timeeventsH0[2] * stopEffIA_H0 + 
  timeeventsH0[3] * (1 - stopFutIA_H0 - stopEffIA_H0)

# number of events under H0 at end of accrual
recrend_H0_n <- floor(min(expEventH0[time >= recrend]))

# under H1
probEventH1 <- getEventProbabilities(time = time, lambda2 = log(2) / m1, hazardRatio = hr,
                                     dropoutRate1 = dout1, dropoutRate2 = dout2, 
                                     dropoutTime = douttime, accrualTime = accrualTime, 
                                     accrualIntensity = accrualIntensity, 
                                     maxNumberOfSubjects = maxNumberOfSubjects)
expEventH1 <- probEventH1$overallEventProbabilities * maxNumberOfSubjects   
timefixH1 <- min(time[expEventH1 >= nevent])
timeeventsH1 <- c(min(time[expEventH1 >= nevents_i1[1]]), 
                  min(time[expEventH1 >= nevents_i1[2]]), 
                  min(time[expEventH1 >= nevents_i1[3]]))
expdurH1 <- timeeventsH1[1] * stopFutIA_H1 + timeeventsH1[2] * stopEffIA_H1 + 
  timeeventsH1[3] * (1 - stopFutIA_H1 - stopEffIA_H1)
c(expdurH0, expdurH1)
stage = 1 stage = 1 
 46.39165  59.37703 
# same design as above but now without futility
design_eff_only <- getDesignGroupSequential(informationRates = infofrac,
                                   typeOfDesign = "asOF", sided = 1, 
                                   alpha = alpha / 2, beta = beta)

samplesize_eff_only <- myGetSampleSizeSurvival(design_eff_only)
nevent_eff_only <- ceiling(samplesize_eff_only$maxNumberOfEvents)

# same design as above but now with binding futility interim
design_binding <- getDesignGroupSequential(informationRates = infofrac, 
                                           typeOfDesign = "asOF", sided = 1, 
                                           alpha = alpha / 2, beta = beta, futilityBounds = c(0, -6), 
                                           bindingFutility = TRUE)

samplesize_binding <- myGetSampleSizeSurvival(design_binding)
nevent_binding <- ceiling(samplesize_binding$maxNumberOfEvents)
nevent_binding
[1] 401

5 Optimal use and timing of interim analyses

5.1 Efficacy

5.1.1 Bias

For code and recommendationas how to handle bias in group-sequential designs we refer to Vignette #7 on the rpact vignettes webpage.

The standard rpact output provides a median unbiased estimate. For details on the various types of biases in group-sequential designs and approaches we refer to Wassmer and Brannath (2016).

5.1.2 Adding a late efficacy interim

In what follows, we generate the table used in the slide deck to illustrate addition of a late efficacy interim analysis.

# illustrate effect of late efficacy on MDD and local significance level

# timing of interims (information fraction)
j1 <- 2 / 3
j2 <- 0.85
infofrac_late <- c(j1, j2, 1)

# OBF standard design
design_late1 <- getDesignGroupSequential(informationRates = infofrac_late[c(1, 3)],
                                   typeOfDesign = "asOF", sided = 2, alpha = alpha, 
                                   beta = beta)
ss_late1 <- myGetSampleSizeSurvival(design_late1)
nevents_late1 <- ceiling(nevent * c(j1, 1))

# add late interim
design_late2 <- getDesignGroupSequential(informationRates = infofrac_late,
                                         typeOfDesign = "asOF", sided = 2, alpha = alpha, 
                                         beta = beta)
ss_late2 <- myGetSampleSizeSurvival(design_late2)
nevents_late2 <- ceiling(nevent * c(j1, j2, 1))

# assemble table
tab_late <- data.frame(matrix(NA, ncol = 5, nrow = 4))
colnames(tab_late) <- c("", "quantity", paste("info = ", 
                                              round(infofrac_late, 2)[1:2], sep = ""), "final")
tab_late[c(1, 3), 1] <- paste("Design ", 1:2, sep = "")
tab_late[c(1, 3), 2] <- "MDD"
tab_late[c(2, 4), 2] <- "local significance level"
tab_late[1, c(3, 5)] <- round(ss_late1$criticalValuesEffectScaleLower, 3)
tab_late[2, c(3, 5)] <- format.pval(ss_late1$criticalValuesPValueScale, 3)
tab_late[3, 3:5] <- round(ss_late2$criticalValuesEffectScaleLower, 3)
tab_late[4, 3:5] <- format.pval(ss_late2$criticalValuesPValueScale, 3)
kable(tab_late)
quantity info = 0.67 info = 0.85 final
Design 1 MDD 0.731 NA 0.816
NA local significance level 0.0121 NA 0.0463
Design 2 MDD 0.733 0.784 0.813
NA local significance level 0.0121 0.0265 0.0404

5.2 Futility

See backup of this file for the table from the slides.

6 Backup

6.1 Efficacy

6.1.1 MDD

Minimal detectable differences can easily be extracted from getSampleSizeSurvival objects:

hrMDD <- as.vector(samplesize$criticalValuesEffectScale)
hrMDD
[1] 0.4625061 0.7375959 0.8209002

6.2 Futility

6.2.1 Conditional power

As a first approach to determine an interim boundary for futility we reproduce the conditional power plot.

# calculate condition power for interim HR ranging from 0.6 to 1.5
hrs <- seq(0.6, 1.5, by = 0.01)
cpower0 <- rep(NA,length(hrs))
cpower <- cpower0

for (i in 1:length(hrs)){
  
  # generate dataset that contains result up to interim
  results <- getDataset(
    overallEvents = nevents_i1[1],
    overallLogRanks = log(hrs[i]) / sqrt(kappa /nevents_i1[1]),
    overallAllocationRatio = 1)
  
  # proper object that can be used by rpact
  stageResults <- getStageResults(design, dataInput = results, directionUpper = FALSE)
  
  # compute conditional power under H1: theta_1 = 0.75
  cpower[i] <- getConditionalPower(stageResults, nPlanned = diff(nevents_i1), 
                                   thetaH1 = hr)$conditionalPower[3]
  
  # compute conditional power under H0: theta_1 = 1
  cpower0[i] <- getConditionalPower(stageResults, nPlanned = diff(nevents_i1), 
                                    thetaH1 = 1)$conditionalPower[3]
}

# what interim effect gives a conditional power of 20%?
condpow <- 0.2
hr_int_cp <- min(hrs[cpower <= condpow])
hr_int_cp
[1] 1.28
# p-value corresponding to that effect
# z = log(hr_int_cp) / sqrt(kappa / D_int) --> p = P(N(0, 1) <= z) = 1 - Phi(|z|)
z <- - log(hr_int_cp) / sqrt(kappa / nevents_i1[1])
p_int_cp <- 1 - pnorm(z)
p_int_cp
[1] 0.9144856
# check this p-value using rpact:
design_cp <- getDesignGroupSequential(informationRates = infofrac,
                                   typeOfDesign = "asOF", sided = 1, 
                                   alpha = alpha / 2, 
                                   beta = beta,
                                   futilityBounds = c(z, -6), 
                                   bindingFutility = FALSE)

samplesize_cp <- myGetSampleSizeSurvival(design_cp)
samplesize_cp$futilityBoundsPValueScale[1, 1]
[1] 0.9144856

Here you can find more details on conditional power computations in rpact, and also how to switch between different scales, i.e. Z-score, hazard ratio, etc.

And now plot the conditional power functions compute above:

par(las = 1, mfrow = c(1, 1), mar = c(4.5, 4.5, 2, 1))
plot(hrs, cpower, type = "n", xlab = expression("hazard ratio observed at interim"),
     ylab = "conditional power", ylim = c(0, 1), axes = FALSE, main = 
       expression("CP("*theta*") after futility interim, under treatment effect "*theta[1]*" used for powering"))
axis(1, at = seq(0.6, 10, by = 0.1))
axis(2, at = seq(0, 1, by = 0.1))
abline(v = seq(0.6, 10, by = 0.1), h = seq(0, 1, by = 0.1), col = gray(0.9))
segments(0, condpow, hr_int_cp, condpow, lty = 2, col = 3, lwd = 3)
segments(hr_int_cp, 0, hr_int_cp, condpow, lty = 2, col = 3, lwd = 3)
lines(hrs, cpower, col = 2, lwd = 4)
lines(hrs, cpower0, col = 4, lwd = 4)
legend(0.9, 0.9, paste("hazard ratio after interim: ", c(hr, 1), sep = ""), 
       col = c(2, 4), lwd = 4, bty = "n")

6.2.2 Stopping probabilities

An alternative way of defining an interim boundary for futility, especially when we use the pivotal Phase 3 with futility interim for the LIP, is to find a sweet spot by trading off false-decision probabilities at the interim. To this end, assume \[ \hat \theta \sim N(\theta, \sqrt{4 / d_1}). \] We are then interested in the probability of continuation (or stopping, simply one minus) computed as: \[ P_\theta(\hat \theta \le \theta_\text{int}) \ = \ \Phi\left(\frac{\theta_\text{int} - \theta}{\sqrt{4 / d_1}}\right), \] where \(\theta_\text{int}\) is an interim boundary. Below the corresponding plot.

# calculate stopping probabilities for interim HR ranging from 0.6 to 1.5
hrs2 <- seq(0.6, 1.2, by = 0.01)

# under H0
stopprob0 <- 1 - pnorm((log(hrs2) - log(1)) / sqrt(kappa / nevents_i1[1]))

# under H1
stopprob1 <- 1 - pnorm((log(hrs2) - log(hr)) / sqrt(kappa / nevents_i1[1]))

# interim boundary
sp_bound <- 0.9
fp <- max((1 - stopprob0)[hrs <= sp_bound]) 
fn <- min(stopprob1[hrs <= sp_bound])
c(fp, fn)
[1] 0.2795253 0.1560030

With these quantities, generate the plot.

par(las = 1, mfrow = c(1, 1), mar = c(4.5, 4.5, 2, 4.5))
plot(hrs2, stopprob0, type = "n", xlab = expression("interim boundary "*hat(theta)[int]),
     ylab = "", ylim = c(0, 1), axes = FALSE, main = "interim stopping probabilities")
axis(1, at = seq(0.6, 10, by = 0.1))
abline(v = seq(0.6, 10, by = 0.05), h = seq(0, 1, by = 0.1), col = gray(0.9))
legend(0.75, 1, paste("false-", c("negative", "positive"), ": hazard ratio: ", c(hr, 1), sep = ""), 
       col = c(2, 4), lwd = 4, bty = "n")

axis(2, at = seq(0, 1, by = 0.1), labels = seq(0, 1, by = 0.1), col.axis = 2, line = 0.5)
mtext("false-negative probability", 2, line = 3, col = 2, las = 3)
axis(4, at = seq(0, 1, by = 0.1), labels = seq(0, 1, by = 0.1), col.axis = 4, line = 0.5)
mtext("false-positive probability", 4, line = 3, col = 4, las = 3)

lines(hrs2, stopprob1, col = 2, lwd = 4)
lines(hrs2, 1 - stopprob0, col = 4, lwd = 4)

segments(min(hrs2), fn, sp_bound, fn, col = 2, lty = 2, lwd = 4)
segments(sp_bound, fn, sp_bound, 0, col = 2, lty = 2, lwd = 4)

segments(max(hrs2), fp, sp_bound, fp, col = 4, lty = 2, lwd = 4)

6.2.3 \(\beta\)-spending

Finally, we illustrate how \(\beta-\)-spending can be specified.

# compare designs with no futility vs. a design with beta-spending

# no futility
design0 <- getDesignGroupSequential(sided = 1, alpha = alpha / 2, beta = beta,
                                        informationRates = infofrac,
                                        typeOfDesign = "asOF", bindingFutility = FALSE)
Warning: 'bindingFutility' (FALSE) will be ignored
samplesize0 <- myGetSampleSizeSurvival(design0)

# beta-spending, non-binding
design_beta <- getDesignGroupSequential(sided = 1, alpha = alpha / 2, beta = beta,
                                   informationRates = infofrac,
                                   typeOfDesign = "asOF",
                                   typeBetaSpending = "bsOF")
samplesize_beta <- myGetSampleSizeSurvival(design_beta)
nevent_beta <- ceiling(samplesize_beta$maxNumberOfEvents)

# generate table 
tab_beta <- data.frame(matrix(NA, nrow = 11, ncol = 3))
colnames(tab_beta) <- c("quantity", "no futility interim", "beta-spending")
tab_beta[, 1] <- c("number of events", 
                   "efficacy boundary 1 (effect size)", "efficacy boundary 1 (p-value)",
                   "efficacy boundary 2 (effect size)", "efficacy boundary 2 (p-value)",
                   "efficacy boundary 3 (effect size)", "efficacy boundary 3 (p-value)",
                   "futility boundary 1 (effect size)", "futility boundary 1 (p-value)",
                   "futility boundary 2 (effect size)", "futility boundary 2 (p-value)")

tab_beta[1, 2:3] <- ceiling(c(samplesize0$maxNumberOfEvents, samplesize_beta$maxNumberOfEvents))
tab_beta[c(2, 4, 6), 2] <- round(samplesize0$criticalValuesEffectScale, 2)
tab_beta[c(2, 4, 6), 3] <- round(samplesize_beta$criticalValuesEffectScale, 2)

tab_beta[c(2, 4, 6) + 1, 2] <- round(samplesize0$criticalValuesPValueScale, 2)
tab_beta[c(2, 4, 6) + 1, 3] <- round(samplesize_beta$criticalValuesPValueScale, 2)

tab_beta[c(8, 10), 3] <- round(samplesize_beta$futilityBoundsEffectScale, 2)
tab_beta[c(9, 11), 3] <- format.pval(samplesize_beta$futilityBoundsPValueScale, 2)
kable(tab_beta, align = "lrr")
quantity no futility interim beta-spending
number of events 385.00 419
efficacy boundary 1 (effect size) 0.48 0.5
efficacy boundary 1 (p-value) 0.00 0
efficacy boundary 2 (effect size) 0.73 0.74
efficacy boundary 2 (p-value) 0.01 0.01
efficacy boundary 3 (effect size) 0.82 0.82
efficacy boundary 3 (p-value) 0.02 0.02
futility boundary 1 (effect size) NA 1.09
futility boundary 1 (p-value) NA 0.68
futility boundary 2 (effect size) NA 0.87
futility boundary 2 (p-value) NA 0.12

We see that by adding two futility interims based on \(\beta\)-spending, we increase the maximal number of events from 385 to `tab_beta[1, 3]’. To compute the power loss of adding the futilities, conservatively assuming they will be adhered to, we compute the power of the design with futilities using the number of events of the design without futilities.

# power of beta-spending design at the number of events without beta-spending
power <- getPowerSurvival(design_beta, 
                          maxNumberOfEvents = ceiling(samplesize0$maxNumberOfEvents),
                          maxNumberOfSubjects = maxNumberOfSubjects,
                          lambda2 = log(2) / m1, hazardRatio = hr,
                          dropoutRate1 = dout1, dropoutRate2 = dout2, dropoutTime = douttime,
                          accrualTime = accrualTime, accrualIntensity = accrualIntensity,
                          directionUpper = FALSE)

# power, as compared to the specified 80%
power$overallReject
[1] 0.7664614

6.2.4 Power loss

Finally, we specify the power loss of adding the various futility boundaries. To this, we proceed as follows:

  1. Generate a set of trials with hazard ratio at interim and final, without any interim analysis stopping. The nice thing about rpact is that we can still add informationRates, i.e. we get a set of datasets that simulate trials until the prespecified maximal number of events, and these simulation datasets contain the hazard ratio estimates at the time when we have reached informationRates% of events.

  2. From these datasets we can then extract those that jump over the interim boundary and are significant at the end. Simply computing their proportion with respect to the number of simulations gives an estimate of the power.

# generate a set of trials with HR at interim and final, without futility interim stopping
design_sim <- getDesignGroupSequential(informationRates = infofrac[c(1, 3)],
                                   sided = 1, alpha = alpha / 2, 
                                   beta = beta,
                                   typeOfDesign = "asUser",
                                   userAlphaSpending = c(0, 0.025),
                                   futilityBounds = -6)
Changed type of design to 'noEarlyEfficacy'
samplesize_sim <- myGetSampleSizeSurvival(design_sim)

nsim <- 10 ^ 4
simulationResult <- 
  getSimulationSurvival(design_sim, 
                        lambda2 = log(2) / m1, hazardRatio = hr,
                        dropoutRate1 = dout1, dropoutRate2 = dout2, dropoutTime = douttime,
                        accrualTime = accrualTime, accrualIntensity = accrualIntensity,
                        maxNumberOfSubjects = maxNumberOfSubjects,
                        plannedEvents = as.vector(ceiling(samplesize_sim$eventsPerStage)),
                        directionUpper = FALSE, maxNumberOfIterations = nsim,
                        maxNumberOfRawDatasetsPerStage = 1, seed = 2)

# get aggregate datasets from all simulation runs
aggregateSimulationData <- getData(simulationResult)

# power taking futility into account is proportion of significant trials that ran to the end
# use MDD from initial design with efficacy interim for final analysis
hrs_interim <- subset(aggregateSimulationData, stageNumber == 1, select = "hazardRatioEstimateLR")
hrs_final <- subset(aggregateSimulationData, stageNumber == 2, select = "hazardRatioEstimateLR")

# now assess power loss for the two interim boundaries we discuss

# futility interim analysis informal boundary of 1
survive_interim <- (hrs_interim <= inform_bound)
survive_final <- (hrs_final <= samplesize$criticalValuesEffectScale[3])
loss_inform <- mean(survive_interim & survive_final)

# futility interim analysis based on conditional power
survive_interim <- (hrs_interim <= hr_int_cp)
survive_final <- (hrs_final <= samplesize$criticalValuesEffectScale[3])
loss_cp <- mean(survive_interim & survive_final)

# stopping probabilities
survive_interim <- (hrs_interim <= sp_bound)
survive_final <- (hrs_final <= samplesize$criticalValuesEffectScale[3])
loss_sp <- mean(survive_interim & survive_final)

# power loss from beta-spending design
pl_spending <- power$overallReject

# generate output table
tab_pl <- data.frame(matrix(NA, ncol = 2, nrow = 4))
colnames(tab_pl) <- c("boundary", "power")
rownames(tab_pl) <- c("Design 1 (informal)", "Design 2 (conditional power)", 
                      "Design 3 (stopping probabilities)", "Design 4 (beta-spending)")
tab_pl[, 1] <- round(c(inform_bound, hr_int_cp, sp_bound, NA), 2)
tab_pl[, 2] <- round(c(loss_inform, loss_cp, loss_sp, 1 - design_beta$beta), 2)
tab_pl[tab_pl == "NA"] <- ""
kable(tab_pl)
boundary power
Design 1 (informal) 1.00 0.78
Design 2 (conditional power) 1.28 0.80
Design 3 (stopping probabilities) 0.90 0.72
Design 4 (beta-spending) NA 0.80

7 MIRROS

The original MIRROS publication is available here. The accompanying code is available on github here.

The Bayesian predictive power computations after not stopping at an interim based on point or interval knowledge are described in this publication. The corresponding R package is bpp, available on CRAN.

8 References

Wassmer, G., and W. Brannath. 2016. Group Sequential and Confirmatory Adaptive Designs in Clinical Trials. Springer.