---
title: 'Simulation-based power analysis for Hadavi et al. 2023, Cross-cultural relationships between music, emotion, and visual imagery: A comparative study of Iran, Canada, and Japan [Stage 1 Registered Report]'
author:
- affiliation: Human Behaviour and Evolution Lab, Faculty of Psychology, Universidad El Bosque. Bogota, Colombia
  email: jleongomez@unbosque.edu.co
  name: "[Juan David Leongómez](https://jdleongomez.info/)"
date: "`r Sys.setlocale('LC_TIME','English');format(Sys.Date(),'%d %B, %Y')`"
output:
  html_document:
    highlight: textmate
    number_sections: false
    self_contained: true
    theme: cosmo
    toc: true
    toc_depth: 2
    toc_float:
      collapsed: false
      smooth_scroll: false
editor_options:
  chunk_output_type: console
always_allow_html: yes
csl: apa.csl
---

```{r setup, include=FALSE}
library(knitr)
opts_chunk$set(echo = TRUE)
```

Load necessary packages

```{r message = FALSE}
library(tidyverse)  # For data manipulation and visualization
library(lsr)        # For effect size calculation
library(plyr)       # For data manipulation
library(ggpubr)     # For organising plots
library(lmerTest)   # For fitting linear mixed models (LMM)
library(ordinal)    # For fitting cumulative link mixed models (CLMM)
library(emmeans)    # For testing contrasts in CLMMs and obtaining p-values
library(kableExtra) # For tables
```

Set random seed for reproducibility

```{r}
set.seed(1)
```

# Simulate a population

This is a simulated population (N = 60,000) in which the effect size of the (paired) difference between two conditions (A and B) is *d* = 0.4.

Create a simulated dataset with two columns (A and B) representing two conditions with different binomial probabilities

```{r}
dat <- tibble(A = rbinom(60000, size = 4, prob = 0.445) + 1,
              B = rbinom(60000, size = 4, prob = 0.585) + 1,
              Participant = rep(paste0("P", 1:10000), each = 6))

# Alternative with less "normal" looking distributions for A and B
# dat <- tibble(A = rbinom(10000, size = 4, prob = 0.78) + 1,
#               B = rbinom(10000, size = 4, prob = 0.88) + 1)
```

Transform data into long format

```{r}
dat.long <- dat %>%
  pivot_longer(cols = A:B,
               names_to = "Condition",
               values_to = "Value")
```

Calculate the mean and standard deviation of "Value" for each condition

```{r}
mu <- dat.long %>%
  group_by(Condition) %>%  # Group the data by the "Condition" column
  dplyr::summarise(        # Summarise the data by calculating the mean and standard deviation
    Mean = mean(Value),    # Mean of "Value" for each condition
    SD = sd(Value))        # Standard deviation of "Value" for each condition

# Print table
mu %>%
  kable(digits = 2) %>%
  kable_paper("hover", full_width = F)
```

Compute effect size (Cohen's d) between the two conditions

```{r}
general.d <- cohensD(x = dat$A,
                     y = dat$B,
                     method = "paired")
general.d
```
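As a quick sanity check (added here as a sketch; it is not part of the original script), the expected paired *d* can also be derived analytically from the binomial parameters, assuming A and B are simulated independently so that the variance of the paired difference is the sum of the two variances:

```{r}
# Analytical check of the expected effect size (assumes A and B are
# independent, so Var(B - A) = Var(A) + Var(B))
pA <- 0.445
pB <- 0.585
mean_diff <- 4 * pB - 4 * pA                              # Expected mean difference
sd_diff   <- sqrt(4 * pA * (1 - pA) + 4 * pB * (1 - pB))  # SD of the paired difference
mean_diff / sd_diff                                       # Expected Cohen's d (~0.4)
```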
Plot histogram of the data with vertical lines for the mean of each condition and a label for the effect size

```{r fig.cap = "Histogram of the two conditions (A, B). Vertical dashed lines represent the mean of each condition."}
ggplot(dat.long, aes(x = Value, fill = Condition, colour = Condition)) +
  geom_histogram(alpha = 0.3, position = "identity", binwidth = 1) +
  geom_vline(data = mu, aes(xintercept = Mean, color = Condition),
             linetype = "dashed") +
  labs(x = "Score", y = "Count") +
  annotate("text", x = 1, y = 20000,
           label = paste0("Cohen's d = ", round(general.d, 2)))
```

# Power analysis from t-tests, Wilcoxon signed-rank tests, Linear Mixed Models (LMM), and Cumulative Link Mixed Models (CLMM)

## t-tests

Set parameters for simulation

```{r}
# Change these numbers to see their effects on statistical power
num_sims.t = 1000   # Number of simulations to run
sample_size.t = 82  # Sample size for each simulation
alpha.t = 0.05      # Significance level
```

Create simulation-based dataset with defined sample size

```{r cache = TRUE}
samples.t <- map_dfr(seq_len(num_sims.t),
                     ~dat %>%
                       sample_n(sample_size.t) %>%
                       mutate(sample.t = as.factor(.x)))
```

Define function to compute paired t-tests

```{r}
compfun <- function(x, y) {
  t.test(x, y, alternative = "two.sided", paired = TRUE)
}
```

Perform a t-test on each simulated sample and save the p-value and test statistic (t) to a new dataset (comps.t)

```{r}
comps.t <- ddply(samples.t, .(sample.t), summarise,
                 t = round(compfun(A, B)$statistic, 2),
                 p = round(compfun(A, B)$p.value, 3),
                 "Statistical significance" = ifelse(p <= alpha.t,
                                                     "Significant",
                                                     "Non-significant"))
```

Calculate power of the analysis (i.e., the proportion of simulations in which the p-value is less than or equal to alpha)

```{r}
power.t <- sum(comps.t$`Statistical significance` == "Significant") / num_sims.t
```

Plot histogram of p-values with a label for power and sample size

```{r}
p.t <- ggplot(comps.t, aes(x = p, fill = `Statistical significance`)) +
  scale_fill_hue(direction = -1) +
  geom_histogram(bins = 1/alpha.t, breaks = seq(0, 1, alpha.t)) +
  labs(y = "Count", x = "p-value", title = "Paired t-tests") +
  annotate("text", x = 0.5,
           y = sum(comps.t$`Statistical significance` == "Significant") * 0.9,
           label = paste0("Power = ", round(power.t, 3),
                          "\nSample size = ", sample_size.t))
p.t
```
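As a cross-check on the simulation (my addition; not part of the original script), the power of the paired t-test can also be obtained from the closed-form approximation in base R's `power.t.test()`, taking *d* = 0.4 as `delta = 0.4` with `sd = 1` on the standardised scale. The result should broadly agree with the simulation-based estimate above:

```{r}
# Closed-form power for a paired t-test with d = 0.4 and n = 82 pairs
# (delta and sd are on the standardised scale, so delta/sd = Cohen's d)
power.t.test(n = sample_size.t,
             delta = 0.4,
             sd = 1,
             sig.level = alpha.t,
             type = "paired",
             alternative = "two.sided")
```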
## Wilcoxon signed-rank tests

Set parameters for simulation

```{r}
# Change these numbers to see their effects on statistical power
num_sims.wilcox = 1000   # Number of simulations to run
sample_size.wilcox = 83  # Sample size for each simulation
alpha.wilcox = 0.05      # Significance level
```

Create simulation-based dataset with defined sample size

```{r cache = TRUE}
samples.wilcox <- map_dfr(seq_len(num_sims.wilcox),
                          ~dat %>%
                            sample_n(sample_size.wilcox) %>%
                            mutate(sample.wilcox = as.factor(.x)))
```

Define function to compute Wilcoxon signed-rank tests

```{r}
wilcoxfun <- function(x, y) {
  wilcox.test(x, y, alternative = "two.sided", paired = TRUE)
}
```

Perform a Wilcoxon signed-rank test on each simulated sample and save the p-value and test statistic (V) to a new dataset (wilcox.comps)

```{r warning = FALSE}
wilcox.comps <- ddply(samples.wilcox, .(sample.wilcox), summarise,
                      V = round(wilcoxfun(A, B)$statistic, 2),
                      p = round(wilcoxfun(A, B)$p.value, 3),
                      "Statistical significance" = ifelse(p <= alpha.wilcox,
                                                          "Significant",
                                                          "Non-significant"))
```

Calculate power of the analysis (i.e., the proportion of simulations in which the p-value is less than or equal to alpha)

```{r}
power.wilcox <- sum(wilcox.comps$`Statistical significance` == "Significant") / num_sims.wilcox
```

Plot histogram of p-values with a label for power and sample size

```{r}
p.wilcox <- ggplot(wilcox.comps, aes(x = p, fill = `Statistical significance`)) +
  scale_fill_hue(direction = -1) +
  geom_histogram(bins = 1/alpha.wilcox, breaks = seq(0, 1, alpha.wilcox)) +
  labs(y = "Count", x = "p-value", title = "Wilcoxon signed-rank tests") +
  annotate("text", x = 0.5,
           y = sum(wilcox.comps$`Statistical significance` == "Significant") * 0.9,
           label = paste0("Power = ", round(power.wilcox, 3),
                          "\nSample size = ", sample_size.wilcox))
p.wilcox
```

## Linear Mixed Models

Set parameters for simulation

```{r}
# Change these numbers to see their effects on statistical power
num_sims.lmm = 1000   # Number of simulations to run
sample_size.lmm = 14  # Sample size for each simulation (number of participants)
alpha.lmm = 0.05      # Significance level
```

Data frame of participant names

```{r}
part <- tibble(Participant = unique(dat$Participant))
```

Create simulation-based dataset of participants with defined sample size

```{r cache = TRUE}
samples.lmm <- map_dfr(seq_len(num_sims.lmm),
                       ~part %>%
                         sample_n(sample_size.lmm) %>%
                         mutate(sample.lmm = as.factor(.x)))
```

Full simulation-based dataset with all data per participant, in long format

```{r cache = TRUE}
samples.lmm.long <- left_join(samples.lmm, dat,
                              by = "Participant",
                              relationship = "many-to-many") %>%
  pivot_longer(cols = A:B,
               names_to = "Condition",
               values_to = "Value")
```

Define function to compute LMMs

```{r}
lmmfun <- function(y, x, z) {
  anova(lmer(y ~ x + (1 | z)))
}
```

Fit a linear mixed model on each simulated sample and write the p-value to a new dataset (lmm.comps)

```{r cache = TRUE, warning = FALSE, message = FALSE}
lmm.comps <- ddply(samples.lmm.long, .(sample.lmm), summarise,
                   p = round(lmmfun(Value, Condition, Participant)$`Pr(>F)`, 3),
                   "Statistical significance" = ifelse(p <= alpha.lmm,
                                                       "Significant",
                                                       "Non-significant"))
```

Calculate power of the analysis (i.e., the proportion of simulations in which the p-value is less than or equal to alpha)

```{r}
power.lmm <- sum(lmm.comps$`Statistical significance` == "Significant") / num_sims.lmm
```

Plot histogram of p-values with a label for power and sample size

```{r}
p.lmm <- ggplot(lmm.comps, aes(x = p, fill = `Statistical significance`)) +
  scale_fill_hue(direction = -1) +
  geom_histogram(bins = 1/alpha.lmm, breaks = seq(0, 1, alpha.lmm)) +
  labs(y = "Count", x = "p-value", title = "Linear Mixed Models",
       subtitle = "Call: lmer(Value ~ Condition + (1 | Participant))") +
  annotate("text", x = 0.5,
           y = sum(lmm.comps$`Statistical significance` == "Significant") * 0.9,
           label = paste0("Power = ", round(power.lmm, 3),
                          "\nSample size = ", sample_size.lmm,
                          " participants\n(6 paired responses per participant)"))
p.lmm
```
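The same machinery can be reused to trace how power changes with the number of participants. The following is a minimal sketch (not part of the original analysis; the candidate sample sizes and the reduced number of simulations are arbitrary choices to keep run time manageable) that repeats the LMM simulation across several sample sizes:

```{r eval = FALSE}
# Sketch of a power curve over several candidate numbers of participants
# (candidate sizes and the 200 simulations per size are arbitrary choices)
power_curve.lmm <- map_dfr(c(10, 12, 14, 16, 18), function(n_part) {
  sims <- map_dfr(seq_len(200),
                  ~part %>%
                    sample_n(n_part) %>%
                    mutate(sample.lmm = as.factor(.x)))
  sims.long <- left_join(sims, dat,
                         by = "Participant",
                         relationship = "many-to-many") %>%
    pivot_longer(cols = A:B, names_to = "Condition", values_to = "Value")
  ps <- ddply(sims.long, .(sample.lmm), summarise,
              p = lmmfun(Value, Condition, Participant)$`Pr(>F)`)
  tibble(Sample_size = n_part, Power = mean(ps$p <= alpha.lmm))
})
power_curve.lmm
```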
## Cumulative Link Mixed Models (clmm)

Set parameters for simulation

```{r}
# Change these numbers to see their effects on statistical power
num_sims.clmm = 1000   # Number of simulations to run
sample_size.clmm = 14  # Sample size for each simulation (number of participants)
alpha.clmm = 0.05      # Significance level
```

Create simulation-based dataset of participants with defined sample size

```{r cache = TRUE}
samples.clmm <- map_dfr(seq_len(num_sims.clmm),
                        ~part %>%
                          sample_n(sample_size.clmm) %>%
                          mutate(sample.clmm = as.factor(.x)))
```

Full simulation-based dataset with all data per participant, in long format

```{r cache = TRUE}
samples.clmm.long <- left_join(samples.clmm, dat,
                               by = "Participant",
                               relationship = "many-to-many") %>%
  pivot_longer(cols = A:B,
               names_to = "Condition",
               values_to = "Value") %>%
  mutate(Value = as.factor(Value))
```

Define function to compute CLMMs

```{r}
clmmfun <- function(y, x, z) {
  emmeans(clmm(y ~ x + (1 | z)), pairwise ~ x) # Fit model and compute pairwise contrasts
}
```

**Note:** I could not manage to make the simulation work from this point, so the following code is displayed, but not run.

Fit a cumulative link mixed model on each simulated sample and write the p-value to a new dataset (clmm.comps)

```{r eval = FALSE}
clmm.comps <- ddply(samples.clmm.long, .(sample.clmm), here(summarise),
                    p = data.frame(clmmfun(Value, Condition, Participant)$contrasts)$p.value,
                    "Statistical significance" = ifelse(p <= alpha.clmm,
                                                        "Significant",
                                                        "Non-significant"))
```

Calculate power of the analysis (i.e., the proportion of simulations in which the p-value is less than or equal to alpha)

```{r eval = FALSE}
power.clmm <- sum(clmm.comps$`Statistical significance` == "Significant") / num_sims.clmm
```

Plot histogram of p-values with a label for power and sample size

```{r eval = FALSE}
p.clmm <- ggplot(clmm.comps, aes(x = p, fill = `Statistical significance`)) +
  scale_fill_hue(direction = -1) +
  geom_histogram(bins = 1/alpha.clmm, breaks = seq(0, 1, alpha.clmm)) +
  labs(y = "Count", x = "p-value", title = "Cumulative Link Mixed Models",
       subtitle = "Call: clmm(Value ~ Condition + (1 | Participant))") +
  annotate("text", x = 0.5,
           y = sum(clmm.comps$`Statistical significance` == "Significant") * 0.9,
           label = paste0("Power = ", round(power.clmm, 3),
                          "\nSample size = ", sample_size.clmm,
                          " participants\n(6 paired responses per participant)"))
p.clmm
```
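One possible workaround, offered here only as an untested sketch (it is not part of the analysis above), is to skip the `ddply()`/`emmeans()` combination and loop over the simulated samples directly, extracting the p-value for the Condition effect from each fitted model's coefficient table and skipping samples where the model fails:

```{r eval = FALSE}
# Untested sketch of an alternative loop for the CLMM power simulation:
# fits one clmm per simulated sample and pulls the p-value for the
# Condition coefficient from the coefficient table. The row name
# "ConditionB" assumes treatment contrasts with A as the reference level.
clmm.ps <- map_dbl(levels(samples.clmm.long$sample.clmm), function(s) {
  d <- dplyr::filter(samples.clmm.long, sample.clmm == s)
  fit <- try(clmm(Value ~ Condition + (1 | Participant), data = d),
             silent = TRUE)
  if (inherits(fit, "try-error")) return(NA_real_)  # Skip failed fits
  coef(summary(fit))["ConditionB", "Pr(>|z|)"]
})

# Power = proportion of converged simulations with p <= alpha
power.clmm <- mean(clmm.ps <= alpha.clmm, na.rm = TRUE)
power.clmm
```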