Manuscript Authors: Schott, E., Rhemtulla, M., & Byers-Heinlein, K.
Code Written by Mijke Rhemtulla and Esther Schott email: esther.schott@mail.concordia.ca or mrhemtulla@ucdavis.edu if you have comments/questions

1 Preparation:

1.1 Setup: load necessary libraries

set.seed(123)

library(viridis)
library(tidyverse)
library(cowplot)
library(GroupSeq)
library(ldbounds)
library(pwr)

1.2 Create templates for graphs, & bins for colour mapping between p and t values

p0bins = seq(0,1,by=.01)

all.figs = list(
  guides(fill=FALSE),
                theme(text = element_text(size=20),
                  axis.line.y=element_blank(),
                      axis.ticks.y = element_blank(),
                      axis.text.y = element_blank(),
                  axis.text.x = element_text(size=22)))

t.plots = list(scale_x_continuous(limits =c(-4,4)), ylab("")
               
              )
theme_set(theme_cowplot(font_size=18))

2 Figure 1: t-and p-distributions under random sampling

Figure 1. Distribution of t-statistics and p-values when the null hypothesis is true (top) and when there is a true effect size of .5 (bottom). Left: curves represent the likelihood of observing each value of t. Axes above these plots show p-values corresponding to values of t. Right: distributions represent the likelihood of observing each value of p. The color gradient depicts the correspondence of particular t-values with p-values.

2.1 Generate data for distributions

#simulate data for t-distributions 
Nsim = 1000000
n <- 12 #per group
t0bins = qt(1-p0bins, df= (n*2-2))
d1 <- .5 #effect size for H1 distribution
ncp1 <- d1*sqrt(n/2) #noncentrality parameter for H1 t distribution

#draw a lot of values from the H0 and H1 t-distributions to plot
data = data.frame(sim =1:Nsim,
                  t_0 = rt(n = Nsim, 
                           df =(n*2-2), 
                           ncp=0), 
                  t_1 = rt(n = Nsim, 
                           df =(n*2-2), 
                           ncp=ncp1))

#create vectors of p-values corresponding to the t-values drawn 

data = data %>% 
  mutate(p_0 =  pt(t_0, 
                   df=(n*2-2), 
                   lower.tail = ifelse(t_0 > 0,
                                       FALSE ,
                                       TRUE)),
         p_1 =  pt(t_1, 
                   df=(n*2-2), 
                   lower.tail = ifelse(t_1 > 0,
                                       FALSE ,
                                       TRUE)))

2.2 Prepare for creating graph

# convert from wide to long
data.long = data %>% 
  gather(condition,measurement, t_0:p_1) %>%
  separate(col=condition, into=c("para","hyp"), sep = "_")

# convert to wide repeated measures format
data.wide = data.long %>% spread(key = para, measurement)

2.3 Create Fig 1

p.plots =  list(scale_x_continuous(limits =c(0,1)),
                ylab(""),ylim(0,.085*Nsim)
      )

     
# plot ts
# alternative hyp true
plot.t.H1.sim1 = data.wide %>% 
  mutate(bins=cut(t, breaks=unique(t0bins))) %>% 
  filter(hyp ==1) %>%
  {ggplot(.,aes(x=t, fill=bins
                )) +
      geom_histogram(binwidth=0.1)+
      scale_fill_manual(drop=FALSE,
                                values = c(viridis(nlevels(.$bins)/2),
                                           viridis(nlevels(.$bins)/2, 
                                                   direction = -1)))+
      t.plots +
      all.figs+
      ylab("Null Hyp. False")+
      xlab(expression(italic("t")))
  }
# null hyp true
plot.t.H0.sim1 = data.wide %>% 
  mutate(bins=cut(t, breaks=unique(t0bins))) %>% 
  filter(hyp ==0) %>%
  {ggplot(.,aes(x=t, fill=bins)) + 
      geom_histogram(binwidth=0.1)+
      scale_fill_manual(drop=FALSE,
                                 values = c(viridis(nlevels(.$bins)/2),
                                            viridis(nlevels(.$bins)/2, 
                                                    direction = -1)))+
      t.plots +
      all.figs+
      ylab("Null Hyp. True")+xlab("")
    }

      
#plot p-values
# null hyp true
plot.p.H0.sim1 =  data.wide %>% 
  mutate(bins.p=cut(p, breaks=p0bins)) %>%
    filter(hyp ==0) %>%
  {ggplot(.,aes(x=p, fill=bins.p)) +
      geom_histogram(binwidth=0.01)+
      scale_fill_manual(drop=FALSE,
                        values = viridis(nlevels(.$bins.p)))+
      all.figs+p.plots+xlab("")
      }

# null hyp false
  plot.p.H1.sim1 =  data.wide %>% 
  mutate(bins.p=cut(p, breaks=p0bins)) %>%
    filter(hyp ==1) %>%
  {ggplot(.,aes(x=p, fill=bins.p)) +
      geom_histogram(binwidth=0.01)+
     scale_fill_manual(drop=FALSE,
                       values = viridis(nlevels(.$bins.p)))+
      all.figs+ p.plots+
            xlab(expression(italic("p")))

      }


# Combine the four figures without legends
(sim1 = plot_grid(plot.t.H0.sim1, plot.p.H0.sim1,plot.t.H1.sim1, plot.p.H1.sim1, align = "h",axis = "b"))

## Warning: Removed 617 rows containing non-finite values (stat_bin).

## Warning: Removed 100 rows containing missing values (geom_bar).

## Warning: Removed 9210 rows containing non-finite values (stat_bin).

## Warning: Removed 100 rows containing missing values (geom_bar).

ggsave("Fig1.png", width= 12, height=5)

3 Figure 2: t- and p-distribution with data peeking

Distribution of t-statistics and p-values when the following data peeking rule is used: Begin with N = 16, then if p > .05, add N = 4 participants and recompute p, and so on until N = 32 or p < .05. Left: the histogram shows the distribution of t-statistics under the data peeking procedure. The black curve shows the regular t-distribution with 31 df; vertical lines represent critical values for a 2-tailed t-test based on the regular t-distribution. The shading of the distributions represent the correspondence between t- and p-values. Right: distribution of p-values resulting from the data peeking procedure; the black line shows the expected (uniform) distribution of p-values under a fixed-N procedure when the null hypothesis is true.

3.1 Simulate Data

Procedure: run 16 participants and check for statistical significance after N = 16, 20, 24, 28, 32 participants

n1 <- 16
n2 <- n3 <- n4 <- n5 <- 4
phigh <- 1 #phigh: what p-value is so big that you would stop/abandon? 

#warning: this takes a while to run! 
# reduce the number of simulations to 100000 or 10000 to make it run faster 
Nsim = 1000000


p <- rep(NA, Nsim)
t <- rep(NA, Nsim)
n <- rep(NA, Nsim)
for(i in 1:Nsim){
  group <- rnorm(n1)
  t[i] <- t.test(group)$stat
  p[i] <- t.test(group)$p.value
    if (p[i] > .05 & p[i] < phigh){
    group <- c(group, rnorm(n2))
    t[i] <- t.test(group)$stat
    p[i] <- t.test(group)$p.value
    if (p[i] > .05 & p[i] < phigh){
      group <- c(group, rnorm(n3))
      t[i] <- t.test(group)$stat
      p[i] <- t.test(group)$p.value
      if (p[i] > .05 & p[i] < phigh){
        group <- c(group, rnorm(n4))
        t[i] <- t.test(group)$stat
        p[i] <- t.test(group)$p.value
        if (p[i] > .05 & p[i] < phigh){
          group <- c(group, rnorm(n5))
          t[i] <- t.test(group)$stat
          p[i] <- t.test(group)$p.value
        }
      }
    }
  }
  n[i] <- length(group)
}

#save as dataframe
sim.Fig2SP = data.frame(t = t, 
                          p = p,
                      N =n)

# create critical values & t-curve assuming random sampling for plotting 
tcrit0_right <- qt(.025, df = (31), lower.tail = FALSE)
tcrit0_left <- qt(.975, df = (31), lower.tail = FALSE)
#t density curve to overlay: 
df = data.frame(x = seq(-4, 4, by = .001),
                dx = NA)
df = df %>%
  mutate(dx =dt(x, df = 31))

# mean number of simulations that end up with statistically significant effect despite effect size of 0
mean(sim.Fig2SP$p<.05) # 11.5 %

## [1] 0.114857

3.2 Create Figure 2

p.plots = list(scale_x_continuous(limits =c(0,1)),
                ylab(""),ylim(0,.085*Nsim)
      )

# add break points of histogram to plotting dataframe
sim.Fig2SP = sim.Fig2SP %>% 
  mutate(bins.t=cut(t, breaks=unique(t0bins))) 
ncols= nlevels(sim.Fig2SP$bins.t)
# plot t-values under data peeking plan
(plot.t.sim2 =  ggplot() +
      geom_histogram(data = sim.Fig2SP,aes(x=t, fill=bins.t),
                     binwidth=0.1)+
      scale_fill_manual(drop=FALSE,
                       values = c(viridis(ncols/2), 
                                 viridis(ncols/2, 
                                        direction = -1)))+
      t.plots + all.figs+
      geom_vline(aes(xintercept= tcrit0_left), 
                 colour = "red", 
                 size=1.1)+
      geom_vline(aes(xintercept= tcrit0_right), 
                 colour = "red", 
                 size=1)+
       geom_line(data=df,aes(y=dx*Nsim/10,
                            x=x),
                colour="black",
                size=1)+ 
    xlab(expression(italic("t")))
)

## Warning: Removed 1153 rows containing non-finite values (stat_bin).

## Warning: Removed 100 rows containing missing values (geom_bar).

# plot p-values
plot.p.sim2 =  sim.Fig2SP %>% 
  mutate(bins.p=cut(p, breaks=p0bins)) %>% 
  {ggplot(.,aes(x=p, fill=bins.p)) +
      geom_histogram(binwidth=0.01)+
      geom_hline(aes(yintercept= Nsim/100), colour = "black", size=0.8)+
          scale_fill_manual(drop=FALSE,
                        values = viridis(nlevels(.$bins.p)))+
      all.figs +p.plots+  xlab(expression(italic("p")))}
      
(sim2 = plot_grid(plot.t.sim2, plot.p.sim2, labels = c("A", "B")))

## Warning: Removed 1153 rows containing non-finite values (stat_bin).

## Warning: Removed 100 rows containing missing values (geom_bar).

ggsave("Fig2.jpg", width= 12, height=4)

4 Figure 3 random walks of a p-value

Three random walks of a p-value as each data point is added to the data set from N = 1 to N = 6000, where the null hypothesis is true. The red horizontal line indicates p = .05. Note that in each case the p-value dips below .05 numerous times, and in some cases even stays below .05 for a long stretch, despite the null hypothesis being true. Each different simulated example has a different random trajectory.

4.1 Simulate Data

Nsim = 6000 # max number of participants tested 
set.seed(1)
set.seed(12345678)
# create raw data for all three walks at once (to make sure set.seed works for reproducible results)
data = rnorm(Nsim*3) 

#create data frame
randomwalk = data_frame(run = 1:Nsim,
                        data_1 = data[1:Nsim], # first random walk
                  data_2 = data[(Nsim+1):(Nsim*2)], # second random walk
                  data_3 = data[(Nsim*2+1):(Nsim*3)], #third random walk
                  t_1 = rep(NA, Nsim),
                  t_2 = rep(NA, Nsim),
                  t_3 = rep(NA, Nsim),
                  p_1 = rep(NA, Nsim),
                  p_2 = rep(NA, Nsim),
                  p_3 = rep(NA, Nsim))

#get p-values and t-values after each new participant
for (run_id in 2:Nsim) {
  randomwalk$t_1[run_id] <- t.test(randomwalk$data_1[1:run_id])$stat
  randomwalk$p_1[run_id] <- t.test(randomwalk$data_1[1:run_id])$p.value
  
  randomwalk$t_2[run_id] <- t.test(randomwalk$data_2[1:run_id])$stat
  randomwalk$p_2[run_id] <- t.test(randomwalk$data_2[1:run_id])$p.value
  
  randomwalk$t_3[run_id] <- t.test(randomwalk$data_3[1:run_id])$stat
  randomwalk$p_3[run_id] <- t.test(randomwalk$data_3[1:run_id])$p.value
}

# change from wide to long format (for plotting)
randomwalk.long = randomwalk %>% gather(cond,value,data_1:p_3 ) %>%

  separate(cond, c("parameter","walk"), sep="_",
           extra = "drop", fill = "right") %>%
  spread(parameter, value) %>%
  mutate(walk.label = paste("Random Walk", walk))

4.2 Plot Random Walks

ggplot(randomwalk.long, aes(x=run, y=p)) +
  geom_line()+
  geom_hline(yintercept =.05, colour="red")+
  facet_wrap(~walk.label, ncol=1) + 
  xlab(expression(italic("N")))+
  ylab(expression(italic("p")))+

  theme_bw()+
  scale_x_continuous(breaks=c(0, 1000, 2000,3000,4000,5000,6000))+
  theme(text = element_text(size=20),
        panel.grid.major.x = element_blank(), 
        panel.grid.minor = element_blank(),
        axis.text.x = element_text(size=18))

## Warning: Removed 1 rows containing missing values (geom_path).

ggsave("Fig3.png", width= 12, height=8)

## Warning: Removed 1 rows containing missing values (geom_path).

5 Figure 4: Pilot Dropping (Dr. Headcount)

5.1 Pilot Dropping when Null Hypothesis is True

Figure 4. Pilot dropping when the null hypothesis is true. A. The black curve shows the distribution of test statistics when N = 16 infants are randomly sampled without inspecting the results of the first six participants; in red, the distribution of test statistics when the first six participants are kept only if at least four of them have a result in the anticipated direction. B. Distribution of the number of participants collected to achieve a single complete dataset of N = 16 when the pilot dropping procedure is followed. The x-axis was truncated at N = 70.

5.1.1 Simulate Data

# Simulate pilot-dropping procedure when H0 is true
# procedure: collect N = 6, look at individual data points. If at least 4 observations are greater than 0, keep going. stop at N = 16.

# paper used 1000000 simulations, but could lower this number to make the script run faster
Nsim = 1000000

pilot_size = 6 # test 6 participants in each pilot
crit_success = 4 # 4 need to show effect in expected direction
final_sample = 16 # target sample size

data0 <- rt(Nsim, df = (15))
tvec <- NULL
pvec <- NULL
nfailedpilotvec <- NULL

# run simulations
for (i in 1:Nsim) {
  nfailedpilot = 0 # so far, no pilot failure
  x <- rnorm(n = pilot_size, mean = 0, sd = 2) # test pilots: draw values from normal distribution
  while((sum(x > 0) >= crit_success) == FALSE){# continue until 4 show effect greater than 0 
    x <- rnorm(n = pilot_size, mean = 0, sd = 2)# test new pilots
    nfailedpilot <- nfailedpilot + pilot_size # keep record of failed pilots
  }
  # now that we observed a successful pilot,test until the final sample size
  x <- c(x, rnorm(n = (final_sample-pilot_size), mean = 0, sd = 2))
  t <- t.test(x, alternative = "greater")$stat # get t-value
  p <- t.test(x, alternative = "greater")$p.value # get p-value
  tvec[i] <- t
  pvec[i] <- p
  nfailedpilotvec[i] <- nfailedpilot
}
# save results of simulation in data frame
sim.Fig4PD = data.frame(t= tvec, 
                        p = pvec, 
                        z_Ndropped = nfailedpilotvec,
                        Nfinal = final_sample)


# compute some extra variables, useful for plotting and summarizing
sim.Fig4PD = sim.Fig4PD %>%
  mutate(Ncollected = Nfinal + z_Ndropped,
         sim = seq_along(Nfinal),
         NpilotRounds = z_Ndropped/pilot_size,
         Npilot.binned = factor(ifelse(NpilotRounds >2,"3+", as.character(NpilotRounds))))




# median number of participants tested + Type I error for Dr. Headcount's procedure under the null hypothesis
sim.Fig4PD %>% 
  summarize(median.N = median(Ncollected),
            Nsig = mean(p < 0.05))

median.N	Nsig
22	0.114227

# Type I error is 11.4 %

5.1.2 Plot T-values distribution

# colour scheme for t-values
n <- final_sample
t0bins = qt(1-p0bins, df= (n*2-2))
sim.Fig4PD = sim.Fig4PD %>% 
  mutate(bins=cut(t, breaks=unique(t0bins)))



#t density curve to overlay: 
df = data.frame(x = seq(-4, 4, by = .001),
                dx = NA)
df = df %>%
  mutate(dx =dt(x, df = 15))
# create plot 4a:
plot.t.sim4 =ggplot() +
      geom_histogram(data=sim.Fig4PD,
                     aes(x=t, 
                         fill="lightgrey"),
        binwidth=0.1)+
      scale_x_continuous(limits =c(-4,4)) +
      geom_line(data=df,aes(y=dx*Nsim/9.3,
                            x=x),
                colour="black",
                size=1.1)+
  geom_vline(xintercept=qt(.95, df = 15))+guides(fill=F)+
    all.figs +ylab("")+xlab(expression(italic("t")))
# need plot with N's

5.1.3 Number of Participants to get to N=16

sim.Fig4PD.byPart.summary = sim.Fig4PD %>%
  group_by(Ncollected) %>%
  summarize(N=n(),
            percent = N/Nsim)


Fig4B.xlabel = expression("Number of participants tested \nto complete a sample of "~
                                italic("N")~"= 16")
sim.Fig4PD.plotB = sim.Fig4PD.byPart.summary %>%
  filter(Ncollected < 71) %>%
{ggplot(., aes(x=factor(Ncollected),y=percent)) + 
  geom_bar(stat="identity")+ scale_y_continuous(limits = c(0,.65),labels=scales::percent) +  
    labs(x="Number of participants tested \nto complete a sample of N= 16", y ="% of simulations showing each outcome")+
    theme(text = element_text(size=18),
          axis.text.x = element_text(size=18))}

5.1.4 Plot and Save Figure 4

(sim4 = plot_grid(plot.t.sim4,sim.Fig4PD.plotB, labels = c("A", "B"), align="h"))

## Warning: Removed 1578 rows containing non-finite values (stat_bin).

## Warning: Removed 1 rows containing missing values (geom_bar).

ggsave("Fig4-pilotDropping.jpg",width = 12, height=6)

5.2 Pilot dropping when H1 is true (Dr. Relentless)

# Simulate pilot-dropping procedure when H1 is true (delta = .05 )
# procedure: collect N = 6, look at individual data points. If at least 4 observations are greater than 0, keep going. stop at N = 16.

# paper used 1000000 simulations, but could lower this number to make the script run faster
Nsim = 1000000

pilot_size = 6
crit_success = 4
final_sample =16

tvec <- NULL
pvec <- NULL
nfailedpilotvec <- NULL

mean.sim = 1
sd.sim = 2


for (i in 1:Nsim) {
  nfailedpilot = 0 # so far, no pilot failure
  x <- rnorm(n = pilot_size, mean = mean.sim, sd = sd.sim) # test pilots: draw values from normal distribution
  while((sum(x > 0) >= crit_success) == FALSE){# continue until 4 show effect greater than 0 
    x <- rnorm(n = pilot_size, mean = mean.sim, sd = sd.sim)# test new pilots
    nfailedpilot <- nfailedpilot + pilot_size # keep record of failed pilots
  }
  # now that we observed a successful pilot,test until the final sample size
  x <- c(x, rnorm(n = (final_sample-pilot_size), mean = mean.sim, sd = sd.sim))
  t <- t.test(x, alternative = "greater")$stat # get t-value
  p <- t.test(x, alternative = "greater")$p.value # get p-value
  tvec[i] <- t
  pvec[i] <- p
  nfailedpilotvec[i] <- nfailedpilot
}
# save results of simulation in data frame
sim.DrRel = data.frame(t= tvec, 
                        p = pvec, 
                        z_Ndropped = nfailedpilotvec)
# create some summary variables
sim.DrRel = sim.DrRel %>%
  mutate(Nfinal = final_sample,
         Ncollected = Nfinal + z_Ndropped,
         sim = seq_along(Nfinal),
         NpilotRounds = z_Ndropped/pilot_size,
         Npilot.binned = factor(ifelse(NpilotRounds >2,"3+",NpilotRounds))
  )



# calculate number of participants tested on average and Power
sim.DrRel %>% 
  summarize(average.N = mean(Ncollected),
           Power = mean(p < 0.05))

average.N	Power
18.22917	0.704084

# average N tested: 18.22, round up to 19 (since you can't test .2 participants)
# power .70
# power for a one-sided t-test with 19 participants
pwr.t.test(n=19, d=0.5,type="one.sample", alternative="greater")

## 
##      One-sample t test power calculation 
## 
##               n = 19
##               d = 0.5
##       sig.level = 0.05
##           power = 0.6742101
##     alternative = greater

# power for running 19 participants without pilot dropping .67

# how often does Dr. Relentless run 0 dropped pilots, 1 set of dropped pilots, 2 etc..
sim.DrRel.summary = sim.DrRel %>%
  group_by(Npilot.binned) %>%
  summarize(N=n(),
            percent = N/Nsim) %>%
  mutate(EffectSize = "Null hypothesis false")
# 73% no dropped pilots... 27% of simulations she runs one or more sets of dropped pilots

6 Figure 6: Pre-registered data peeking plan, adjusted nominal alpha for Strategy 1 and 2

6.1 Generate adjusted nominal alpha for examples of pre-registered data peeking plan

Figure 6. Illustrations of different approaches to correcting the significance threshold under the pre-registered data peeking plan. Using Strategy 1 (left panel), significance thresholds are distributed proportionally to the number of participants tested since the last peek. Using Strategy 2 (right panel), adjusted alpha levels are set very low at early peeks, and increase when the researcher gets closer to the maximum sample size. In both strategies, increasing the number of peeks lowers the alpha levels at each peek as well as at the final analysis, and decreases power.

# Using lbbounds package instead of groupseq(), easier to extract the p-values for plotting

# information to be extracted from each example
cols <- c("time",  "lower.bounds", "upper.bounds", "exit.pr", "diff.pr", "spending.type")

###############
### Example 1: No Peek
# Strategy 1
ex1.lin = as.data.frame(bounds(t = c(1), # one analysis at 100% of sample
                               iuse = c(3,3), # 3= linear spending functions
                               alpha = c(0.025,0.025)# two-tailed
                               )[cols])
#Strategy 2
ex1.obf = as.data.frame(bounds(t = c(1), 
                               iuse = c(1,1), # 1 = O'Brien-Fleming
                               alpha = c(0.025,0.025))[cols])

# combine both into one df
example1 = rbind(ex1.lin,
                 ex1.obf)
example1$example =1

###############
### Example 2
# Strategy 1
ex2.lin = as.data.frame(bounds(t = c(2/3,1), 
                               iuse = c(3,3), 
                               alpha = c(0.025,0.025))[cols])
# Strategy 2
ex2.obf = as.data.frame(bounds(t = c(2/3,1), 
                               iuse = c(1,1), 
                               alpha = c(0.025,0.025))[cols])

# combine both into one df
example2 = rbind(ex2.obf,
                   ex2.lin)
example2$example = 2

###############
### Example 3
# Strategy 1
ex3.lin = as.data.frame(bounds(t = c(.5,.75,1),
                               iuse = c(3,3),
                               alpha = c(0.025,0.025))[cols])
# Strategy 2
ex3.obf = as.data.frame(bounds(t = c(.5,.75,1), 
                               iuse = c(1,1), 
                               alpha = c(0.025,0.025))[cols])

# combine both into one df
example3 = rbind(ex3.lin,
                 ex3.obf)
example3$example =3

###############
### Example 4
# Strategy 1
ex4.lin = as.data.frame(bounds(t = seq(from=1/6, to =1,by=1/6), 
                               iuse = c(3,3), 
                               alpha = c(0.025,0.025))[cols])
# Strategy 2
ex4.obf = as.data.frame(bounds(t = seq(from=1/6, to =1,by=1/6), 
                               iuse = c(1,1), 
                               alpha = c(0.025,0.025))[cols])
# combine both into one df
example4 = rbind(ex4.obf,
                 ex4.lin)
example4$example = 4

6.2 Combine all examples into one graph

## Combine all examples into one graph

plot = rbind(example1,example2, example3, example4)

# create a more legible label for Strategy 1 & 2 in Graph
plot$method = ifelse(plot$spending.type=="O'Brien-Fleming", "Strategy 2 : Planning for a \nSmall Effect", "Strategy 1 : Planning for a \nLarge Effect")

# Convert % of maximum sample to actual # of participants
plot$N48 = plot$time*48


# compute the nominal alpha at each point
plot =  plot %>% 
  mutate(nominal = 2*pnorm(-abs(upper.bounds)))

# Create legend for plot
legend = data.frame(example= sort(unique(plot$example)),
                    SamplingPlan2 = c("N = 48 (0 Peeks)",
                                      "N = 32, again at N = 48 (1 Peek)",
                                      "N = 24, then every 12 (2 Peeks)",
                                      "N = 8, then every 8 (5 Peeks)")
                    )
# Combine plot data and legend info                               
plot = merge(plot,legend)

# Need this to get the legend to show up in the correct order:
order.legend = c("N = 48 (0 Peeks)",
                 "N = 32, again at N = 48 (1 Peek)",
                 "N = 24, then every 12 (2 Peeks)",
                 "N = 8, then every 8 (5 Peeks)")

6.3 Plot Figure 6

Plotting p-values for pre-registered data peeking plan Strategy 1 and 2

## Create plot
# Plotting p-values for pre-registered data peeking plan Strategy 1 and 2

ggplot(plot,aes(x=N48, 
                y=nominal, 
                colour=SamplingPlan2,
                group=SamplingPlan2,
                shape=SamplingPlan2,
                fill=SamplingPlan2)) +
  geom_line(aes(linetype=SamplingPlan2),
            size = 1.1)+
  geom_point(size=4)+
  # assign the linetype and colours, make sure legend is in right order
  scale_x_continuous(breaks=seq(0,48,by=8))+
  scale_colour_manual(name="First Statistical Test After...",
                        breaks=  order.legend, 
                        values = viridis(5)[1:4])+
  scale_fill_manual(name="First Statistical Test After...",
                        breaks=  order.legend, 
                        values = viridis(5)[1:4])+
  scale_linetype_manual(name="First Statistical Test After...",
                        breaks=  order.legend,
                        values= c("dashed",
                                  "solid",
                                  "blank", 
                                  "twodash"))+
  scale_shape_manual(name="First Statistical Test After...",
                       breaks = order.legend, 
                       values = 21:24)+
  # some formatting
  theme_bw(base_size=18)+
  xlab(expression(italic("N")~" Tested"))+
  ylab(expression(Nominal~alpha~"at Each Peek"))+
  facet_grid(.~method)+
  theme(legend.key.width = unit(15,"mm"),
        panel.grid.minor = element_blank())

# save the graph in working directory
ggplot2::ggsave("Figure6.png", width = 11,height=6)

7 Figure 7

Figure 7. Comparison of the average power and sample size at completion for different sampling plans. A medium effect size indicates the effect size at which a one-sample two-tailed t-test with 48 participants has 80% power (𝛿 = 0.41), and a large effect size is a 50% larger effect size (𝛿 = 0.62). When the effect size is large, power for both Strategy 1 and 2 is close to 100%, but Strategy 1 is slightly more advantageous because it results in a smaller average sample size at completion. With a medium effect size, power decreases more in Strategy 1 than 2, and thus Strategy 2 is more advantageous. ## Compute power & Mean N for each approach

# Function to determine power for each correction strategy: ####
power.sequential <- function(d, nseq, pseq){
  decvec <- NULL
  nvec <- NULL
  for (i in 1:10000){ # to speed up this function, lower the number of cycles here (e.g., to 1000)
    decvec[i] <- 0
    nvec[i] <- 0
    j <- 1
    x <- NULL
    while(decvec[i] == 0 & nvec[i] < sum(nseq)){
      x <- c(x, rnorm(nseq[j], mean = d))
      p <- t.test(x)$p.value
      nvec[i] <- nvec[i] + nseq[j]
      if (p < pseq[j]) decvec[i] <- 1
      j <- j + 1      
    }
  }
  power <- mean(decvec == 1)
  meanN <- mean(nvec)
  return(list("power" = power, "mean_N" = meanN))
}

# nseq is the sequence of ADDITIONAL Ns at which peeks are allowed:
# NOTE that this is not [24, 36, 48], but [24, 12, 12]
# The sum of nseq should equal the max sample size
#nseq <- c(24, 12, 12) 

# pseq is the sequence of alpha-levels at which p will be evaluated:
# it should have the same length as nseq
#pseq <- c(.025, .0125, .0125)

#examples: 

#1. Confirm that when d = 0 (H0 is true), alpha is < .05
#power.sequential(d = 0, nseq = rep(8, 6), pseq = rep(.05/6, 6))

#2. See what power is for H1 (80% power for N = 48) with 5 peeks: 
#power.sequential(d = 0.4128752, nseq = rep(8, 6), pseq = rep(.05/6, 6))

7.1 Define effect sizes to be used

## Define effect sizes to be used

#  find d corresponding to 
#  80% power for n = 48
d1 <- power.t.test(n = 48, delta = NULL, power = .8, 
                  sig.level = .05, type = "one.sample")$delta
# then compute 2nd effect size that is larger than d1
d2 = d1*1.5

7.2 Do simulation for each effect size

################################################################
# for reproducibility
set.seed(1234)

# convert cumulative N from plot dataframe for Figure 6 to difference in N as needed for power formula
plot = plot %>%
  group_by(SamplingPlan2, method) %>% 
  mutate(diff = abs(time - lag(time, default = 0))) %>%
  mutate(N = diff*48)

# Compute power and mean N for medium effect size (d1)
# this code takes a couple of minutes to run usually!
power.d1 = plot %>%
 group_by(SamplingPlan2, method) %>%
 do(data.frame(power.sequential(d=d1,nseq=.$N,pseq=.$nominal)[c(1, 2)])) %>%
 data.frame()

power.d1$d = d1

# Compute power and mean N for medium effect size (d2)
# this code takes a couple of minutes to run usually!
power.d2 = plot %>%
 group_by(SamplingPlan2, method) %>%
 do(data.frame(power.sequential(d=d2,nseq=.$N,pseq=.$nominal)[c(1, 2)])) %>%
 data.frame()

power.d2$d = d2

## Formatting
# Round up the mean final sample size 
power.d1$mean_N = ceiling(as.numeric(power.d1$mean_N))
power.d2$mean_N =ceiling(as.numeric(power.d2$mean_N))

# Join the results for d1 and d2
power.plot = full_join(power.d1, power.d2)

## Joining, by = c("SamplingPlan2", "method", "power", "mean_N", "d")

# Round power to 2 decimals
power.plot$power = round(power.plot$power,2)

# For nicer legends
power.plot$EffectSize = ifelse(power.plot$d < .5,"Medium", "Large")
power.plot$method.short = substr(power.plot$method,1,10)

7.3 Plot Figure 7

## Plot power and sample size at completion

# Save all formatting that both graphs have in common
Fig7.layers = list(geom_point(),
                     geom_line(size=1.3),
                     # make legend nicer
                     scale_x_discrete(limits=  order.legend ),
                     scale_linetype_manual(values=c("dotted", "solid"), 
                                           name="Effect Size"),
                     scale_colour_manual(name="Alpha Adjustment",
                                         values=c('#5e3c99','#f4a582')),
                     guides(linetype = guide_legend(reverse=TRUE)),
                     # general formatting
                     theme_bw(base_size=20),
                     expand_limits(y=c(0)),
                     theme(axis.text.x  = element_text(size = 18,
                                                       angle = 45, 
                                                       hjust = 1),
                           panel.grid.major.x = element_blank(),
                                                      panel.grid.minor = element_blank(),

                                                      panel.grid.major.y = element_blank(),

                           legend.position="none"),
                     xlab("")
)
######                  
# Plot mean sample size at end of data collection
Mean_N.graph =ggplot(power.plot, 
                     aes(x=SamplingPlan2, 
                         y=mean_N, 
                         group=interaction(method.short,EffectSize), 
                         colour=method.short, 
                         linetype =EffectSize))+ 
  scale_y_continuous(breaks=seq(0,48,by=8))+
  
  ylab("Average Final Sample Size")+
  Fig7.layers

#########
# Plot power 
power.graph = ggplot(power.plot, aes(x=SamplingPlan2, 
                                     y=power,
                              group=interaction(method.short,EffectSize),
                                     colour=method.short, 
                                     linetype=EffectSize))+ 
  ylab("Average Power")+
  scale_y_continuous(breaks = c(0, .2, .4,.6,.8,1),labels = scales::percent)+
  Fig7.layers

#######
# Get legend to plot on top
legend_b <- get_legend(power.graph + theme(legend.position="top"))

# Combine the two figures without legends
grid = plot_grid(power.graph, Mean_N.graph, labels = c("A", "B"))

# Combine legend and plots
plot_grid( legend_b, grid, ncol = 1, rel_heights = c(.1, 1))

# Save in working directory
ggplot2::ggsave("FigPowerMean.png", width=12, height=8.5)

8 Simulated data for examples in Box 1 & 2

8.1 Box 1: Pre-registered data peeking study

set.seed(1)
set.seed(3333)
# we are creating 32 random values from the outset (to be able to get reproducible random numbers )
example.data = rnorm(32, mean=.3, sd=1.5)

# Strategy 2
ex.obf = as.data.frame(bounds(t = c(.5,.75,1), 
                               iuse = c(1,1), 
                               alpha = c(0.025,0.025))[cols])
ex.obf =  ex.obf %>% 
  mutate(nominal = 2*pnorm(-abs(upper.bounds)))
# extract p-values at each peek
ex.obf$observed.pval = NA
(ex.obf$observed.pval [1] =t.test(example.data[1:16])$p.value)

## [1] 0.08432023

(ex.obf$observed.pval [2] =t.test(example.data[1:24])$p.value)

## [1] 0.01750228

(ex.obf$observed.pval [3] =t.test(example.data[1:32])$p.value)

## [1] 0.002114329

#extract  t-values at each peak
ex.obf$observed.tval = NA
ex.obf$observed.tval [1] =t.test(example.data[1:16])$statistic
ex.obf$observed.tval [2] =t.test(example.data[1:24])$statistic
ex.obf$observed.tval [3] =t.test(example.data[1:32])$statistic
# extract df for each peek
ex.obf$observed.df = NA
ex.obf$observed.df [1] =t.test(example.data[1:16])$parameter
ex.obf$observed.df [2] =t.test(example.data[1:24])$parameter
ex.obf$observed.df [3] =t.test(example.data[1:32])$parameter

# check if stopping criterion fulfilled at each peak
ex.obf$stop = ifelse(ex.obf$observed.pval < ex.obf$nominal, "stop", "continue")

# calculate z-value from observed p-value
ex.obf = ex.obf %>%
  mutate(z.forAdjP = ifelse(stop =="stop",
                             qnorm(1-(observed.pval/2)),
                                   upper.bounds))


# we can stop after the second peek - when the observed p (0.0175) is smaller than the nominal:  0.018325444

# compute the adjusted p-value (given than in this case we did not stop after the first peek)
drift.pr <- drift(zb=ex.obf$z.forAdjP,t=ex.obf$time,drft=0)
# cumulative exit probability is .0185 at the second peek
summary(drift.pr)

## 
## Lan-DeMets method for group sequential boundaries 
##  
## n =  3 
## 
## Boundaries: 
##    Time    Lower   Upper
## 1  0.50  -2.9626  2.9626
## 2  0.75  -2.3760  2.3760
## 3  1.00  -3.0737  3.0737
## 
## Drift parameters:  0 
##    Time  Lower probs  Upper probs   Exit pr.  Cum exit pr.
## 1  0.50    0.0015253    0.0015253  0.0030506     0.0030506
## 2  0.75    0.0077277    0.0077277  0.0154554     0.0185061
## 3  1.00    0.0001736    0.0001736  0.0003472     0.0188533

8.2 Box 2: Post-hoc data peeking

define paugmented (function copied from http://www.paugmented.com/)

paugmented <- function(ns,plargest,pfinal,pcrit=.05,slices=1000,tails=2) {
  slicewidth <- 1/slices
  pslices <- seq(.5*(1/slices),1-.5*(1/slices),length=slices)
  lowboundoldweights <- rep(slicewidth,times=slices)
  highboundoldweights <- rep(slicewidth,times=slices)
  naccumulator <- ns[1]
  lowboundpaccumulator <- 0
  highboundpaccumulator <- 0
  if (length(ns)>2) {
    for (i in 2:(length(ns)-1)) {
      lowboundnewweights <- rep(0,times=slices)
      highboundnewweights <- rep(0,times=slices)
      cat("Calculating augmentation round ", i-1, " of ", length(ns)-1, ":", sep = "")
      for (j in 1:slices) {
        if ((j-1)*10/slices==trunc((j-1)*10/slices)) {
          cat("", 10-(j-1)*10/slices)
        }
        if (((tails==1)&(pslices[j] > (1-pcrit)))|
              ((tails==2)&((pslices[j] > (1-pcrit/2))|(pslices[j] < (pcrit/2))))) {
          lowboundpaccumulator <- lowboundpaccumulator + lowboundoldweights[j]
          highboundpaccumulator <- highboundpaccumulator + highboundoldweights[j]
        } else if (((tails==1)&(pslices[j] > (1-plargest)))|
                     ((tails==2)&((pslices[j] > (1-plargest/2))|(pslices[j] < (plargest/2))))) {
          for (k in 1:slices) {
            pcombined <- pnorm((naccumulator^.5*qnorm(pslices[j])+ns[i]^.5*qnorm(pslices[k]))/(naccumulator+ns[i])^.5)
            lowboundnewweights[pcombined*slices+1] <- lowboundnewweights[pcombined*slices+1] + lowboundoldweights[j]*slicewidth
            highboundnewweights[pcombined*slices+1] <- highboundnewweights[pcombined*slices+1] + highboundoldweights[j]*slicewidth
          }
        } else {
          for (k in 1:slices) {
            pcombined <- pnorm((naccumulator^.5*qnorm(pslices[j])+ns[i]^.5*qnorm(pslices[k]))/(naccumulator+ns[i])^.5)
            highboundnewweights[pcombined*slices+1] <- highboundnewweights[pcombined*slices+1] + highboundoldweights[j]*slicewidth
          }
        }
      }
      naccumulator <- naccumulator + ns[i]
      lowboundoldweights <- lowboundnewweights
      highboundoldweights <- highboundnewweights
      cat("\n")
    }
    cat("Calculating augmentation round ", length(ns)-1, " of ", length(ns)-1, ": 10 9 8 7 6 5 4 3 2 1\n", sep = "")
  }
  if (tails==1) {
    lowboundpaccumulator <- lowboundpaccumulator + sum(lowboundoldweights*((pslices>(1-pcrit))+
                                                     (pslices>(1-plargest))*(pslices<=(1-pcrit))*
                                                     (1-pnorm((qnorm(1-pfinal)*(naccumulator+ns[length(ns)])^.5-naccumulator^.5*qnorm(pslices))/ns[length(ns)]^.5))))
    highboundpaccumulator <- highboundpaccumulator + sum(highboundoldweights*((pslices>(1-pcrit))+
                                                     (pslices>(1-1))*(pslices<=(1-pcrit))*
                                                     (1-pnorm((qnorm(1-pfinal)*(naccumulator+ns[length(ns)])^.5-naccumulator^.5*qnorm(pslices))/ns[length(ns)]^.5))))
  } else {
    lowboundpaccumulator <- lowboundpaccumulator + sum(lowboundoldweights*((pslices>(1-pcrit/2))+(pslices<(pcrit/2))+
                                                     ((pslices>(1-plargest/2))*(pslices<=(1-pcrit/2))+(pslices<(plargest/2))*(pslices>=(pcrit/2)))*
                                                     (1-pnorm((qnorm(1-pfinal/2)*(naccumulator+ns[length(ns)])^.5-naccumulator^.5*qnorm(pslices))/ns[length(ns)]^.5)+
                                                        pnorm((-qnorm(1-pfinal/2)*(naccumulator+ns[length(ns)])^.5-naccumulator^.5*qnorm(pslices))/ns[length(ns)]^.5))))
    highboundpaccumulator <- highboundpaccumulator + sum(highboundoldweights*((pslices>(1-pcrit/2))+(pslices<(pcrit/2))+
                                                                             ((pslices>(1-1/2))*(pslices<=(1-pcrit/2))+(pslices<(1/2))*(pslices>=(pcrit/2)))*
                                                                             (1-pnorm((qnorm(1-pfinal/2)*(naccumulator+ns[length(ns)])^.5-naccumulator^.5*qnorm(pslices))/ns[length(ns)]^.5)+
                                                                                pnorm((-qnorm(1-pfinal/2)*(naccumulator+ns[length(ns)])^.5-naccumulator^.5*qnorm(pslices))/ns[length(ns)]^.5))))
  }
  return(paste("[",lowboundpaccumulator,", ",highboundpaccumulator,"]", sep=""))
}

set.seed(1)
set.seed(12345)
example.data = rnorm(32, mean=.3, sd=1.5)
# first significance test at 24 infants
test.t1 = t.test(example.data[1:24])
test.t1

## 
##  One Sample t-test
## 
## data:  example.data[1:24]
## t = 1.4385, df = 23, p-value = 0.1638
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.1743684  0.9704030
## sample estimates:
## mean of x 
## 0.3980173

# second significance test at 32 infants
test.t2 = t.test(example.data[1:32])
test.t2

## 
##  One Sample t-test
## 
## data:  example.data[1:32]
## t = 2.1052, df = 31, p-value = 0.04348
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.01721362 1.08649735
## sample estimates:
## mean of x 
## 0.5518555

p.largest = max(test.t1$p.value, test.t2$p.value)
# calculated range for p-value adjusted for post-hoc sample size increase
paugmented(c(24,8), pfinal = test.t2$p.value,plargest = p.largest, pcrit = .05,tails = 2)

## [1] "[0.0638623202401946, 0.0689059256392728]"

Simulations and Figures for Manuscript “Should I test more babies? Solutions for transparent data peeking”