---
title: 'Simulation-based power analysis for Hadavi et al. 2023, Cross-cultural relationships between music, emotion, and visual imagery: A comparative study of Iran, Canada, and Japan [Stage 1 Registered Report]'
author:
- affiliation: Human Behaviour and Evolution Lab, Faculty of Psychology, Universidad El Bosque. Bogota, Colombia
  email: jleongomez@unbosque.edu.co
  name: "[Juan David Leongómez](https://jdleongomez.info/)"
date: "`r Sys.setlocale('LC_TIME','English');format(Sys.Date(),'%d %B, %Y')`"
output:
  html_document:
    highlight: textmate
    number_sections: false
    self_contained: true
    theme: cosmo
    toc: true
    toc_depth: 2
    toc_float:
      collapsed: false
      smooth_scroll: false
editor_options:
  chunk_output_type: console
always_allow_html: yes
csl: apa.csl
---

```{r setup, include=FALSE}
library(knitr)
opts_chunk$set(echo = TRUE)
```

Load necessary packages

```{r message = FALSE}
library(tidyverse)  # For data manipulation and visualization
library(lsr)        # For effect size calculation
library(plyr)       # For data manipulation
library(ggpubr)     # For organising plots
library(lmerTest)   # For fitting linear mixed models (LMM)
library(ordinal)    # For fitting cumulative link mixed models (CLMM)
library(emmeans)    # For testing contrasts in CLMMs and obtaining p-values
library(kableExtra) # For tables
```

Set random seed for reproducibility

```{r}
set.seed(1)
```

# Simulate a population

This is a simulated population (N = 60,000) in which the effect size of the (paired) difference between two conditions (A and B) is *d* = 0.4.

Create a simulated dataset with two columns (A and B) representing two conditions with different binomial probabilities

```{r}
dat <- tibble(A = rbinom(60000, size = 4, prob = 0.445) + 1,
              B = rbinom(60000, size = 4, prob = 0.585) + 1,
              Participant = rep(paste0("P", 1:10000), each = 6))

# Alternative with less "normal" looking distributions for A and B
# dat <- tibble(A = rbinom(10000, size = 4, prob = 0.78) + 1,
#               B = rbinom(10000, size = 4, prob = 0.88) + 1)
```

Transform data into long format

```{r}
dat.long <- dat %>%
  pivot_longer(cols = A:B,
               names_to = "Condition",
               values_to = "Value")
```

Calculate the mean and standard deviation of "Value" for each condition

```{r}
mu <- dat.long %>%
  group_by(Condition) %>%  # Group the data by the "Condition" column
  dplyr::summarise(        # Summarise the data by calculating the mean and standard deviation
    Mean = mean(Value),    # Mean of "Value" for each condition
    SD = sd(Value))        # Standard deviation of "Value" for each condition

# Print table
mu %>%
  kable(digits = 2) %>%
  kable_paper("hover", full_width = F)
```

Compute effect size (Cohen's d) between the two conditions

```{r}
general.d <- cohensD(x = dat$A,
                     y = dat$B,
                     method = "paired")
general.d
```
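As a quick sanity check (added here as a sketch; it is not part of the original script), the expected paired *d* can also be derived analytically from the binomial parameters, assuming A and B are simulated independently so that the variance of the paired difference is the sum of the two variances:

```{r}
# Analytical check of the expected effect size (assumes A and B are
# independent, so Var(B - A) = Var(A) + Var(B))
pA <- 0.445
pB <- 0.585
mean_diff <- 4 * pB - 4 * pA                              # Expected mean difference
sd_diff   <- sqrt(4 * pA * (1 - pA) + 4 * pB * (1 - pB))  # SD of the paired difference
mean_diff / sd_diff                                       # Expected Cohen's d (~0.4)
```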
Plot histogram of the data with vertical lines for the mean of each condition and a label for the effect size

```{r fig.cap = "Histogram of the two conditions (A, B). Vertical dashed lines represent the mean of each condition."}
ggplot(dat.long, aes(x = Value, fill = Condition, colour = Condition)) +
  geom_histogram(alpha = 0.3, position = "identity", binwidth = 1) +
  geom_vline(data = mu, aes(xintercept = Mean, color = Condition),
             linetype = "dashed") +
  labs(x = "Score", y = "Count") +
  annotate("text", x = 1, y = 20000,
           label = paste0("Cohen's d = ", round(general.d, 2)))
```

# Power analysis from t-tests, Wilcoxon signed-rank tests, Linear Mixed Models (LMM), and Cumulative Link Mixed Models (CLMM)

## t-tests

Set parameters for simulation

```{r}
# Change these numbers to see their effects on statistical power
num_sims.t = 1000   # Number of simulations to run
sample_size.t = 82  # Sample size for each simulation
alpha.t = 0.05      # Significance level
```

Create simulation-based dataset with defined sample size

```{r cache = TRUE}
samples.t <- map_dfr(seq_len(num_sims.t),
                     ~dat %>%
                       sample_n(sample_size.t) %>%
                       mutate(sample.t = as.factor(.x)))
```

Define function to compute paired t-tests

```{r}
compfun <- function(x, y) {
  t.test(x, y, alternative = "two.sided", paired = TRUE)
}
```

Perform a t-test on each simulated sample and save the p-value and test statistic (t) to a new dataset (comps.t)

```{r}
comps.t <- ddply(samples.t, .(sample.t), summarise,
                 t = round(compfun(A, B)$statistic, 2),
                 p = round(compfun(A, B)$p.value, 3),
                 "Statistical significance" = ifelse(p <= alpha.t,
                                                     "Significant",
                                                     "Non-significant"))
```

Calculate power of the analysis (i.e., the proportion of simulations in which the p-value is less than or equal to alpha)

```{r}
power.t <- sum(comps.t$`Statistical significance` == "Significant") / num_sims.t
```

Plot histogram of p-values with a label for power and sample size

```{r}
p.t <- ggplot(comps.t, aes(x = p, fill = `Statistical significance`)) +
  scale_fill_hue(direction = -1) +
  geom_histogram(bins = 1/alpha.t, breaks = seq(0, 1, alpha.t)) +
  labs(y = "Count", x = "p-value", title = "Paired t-tests") +
  annotate("text", x = 0.5,
           y = sum(comps.t$`Statistical significance` == "Significant") * 0.9,
           label = paste0("Power = ", round(power.t, 3),
                          "\nSample size = ", sample_size.t))
p.t
```
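As a cross-check on the simulation (my addition; not part of the original script), the power of the paired t-test can also be obtained from the closed-form approximation in base R's `power.t.test()`, taking *d* = 0.4 as `delta = 0.4` with `sd = 1` on the standardised scale. The result should broadly agree with the simulation-based estimate above:

```{r}
# Closed-form power for a paired t-test with d = 0.4 and n = 82 pairs
# (delta and sd are on the standardised scale, so delta/sd = Cohen's d)
power.t.test(n = sample_size.t,
             delta = 0.4,
             sd = 1,
             sig.level = alpha.t,
             type = "paired",
             alternative = "two.sided")
```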
## Wilcoxon signed-rank tests

Set parameters for simulation

```{r}
# Change these numbers to see their effects on statistical power
num_sims.wilcox = 1000   # Number of simulations to run
sample_size.wilcox = 83  # Sample size for each simulation
alpha.wilcox = 0.05      # Significance level
```

Create simulation-based dataset with defined sample size

```{r cache = TRUE}
samples.wilcox <- map_dfr(seq_len(num_sims.wilcox),
                          ~dat %>%
                            sample_n(sample_size.wilcox) %>%
                            mutate(sample.wilcox = as.factor(.x)))
```

Define function to compute Wilcoxon signed-rank tests

```{r}
wilcoxfun <- function(x, y) {
  wilcox.test(x, y, alternative = "two.sided", paired = TRUE)
}
```

Perform a Wilcoxon signed-rank test on each simulated sample and save the p-value and test statistic (V) to a new dataset (wilcox.comps)

```{r warning = FALSE}
wilcox.comps <- ddply(samples.wilcox, .(sample.wilcox), summarise,
                      V = round(wilcoxfun(A, B)$statistic, 2),
                      p = round(wilcoxfun(A, B)$p.value, 3),
                      "Statistical significance" = ifelse(p <= alpha.wilcox,
                                                          "Significant",
                                                          "Non-significant"))
```

Calculate power of the analysis (i.e., the proportion of simulations in which the p-value is less than or equal to alpha)

```{r}
power.wilcox <- sum(wilcox.comps$`Statistical significance` == "Significant") / num_sims.wilcox
```

Plot histogram of p-values with a label for power and sample size

```{r}
p.wilcox <- ggplot(wilcox.comps, aes(x = p, fill = `Statistical significance`)) +
  scale_fill_hue(direction = -1) +
  geom_histogram(bins = 1/alpha.wilcox, breaks = seq(0, 1, alpha.wilcox)) +
  labs(y = "Count", x = "p-value", title = "Wilcoxon signed-rank tests") +
  annotate("text", x = 0.5,
           y = sum(wilcox.comps$`Statistical significance` == "Significant") * 0.9,
           label = paste0("Power = ", round(power.wilcox, 3),
                          "\nSample size = ", sample_size.wilcox))
p.wilcox
```

## Linear Mixed Models

Set parameters for simulation

```{r}
# Change these numbers to see their effects on statistical power
num_sims.lmm = 1000   # Number of simulations to run
sample_size.lmm = 14  # Sample size for each simulation (number of participants)
alpha.lmm = 0.05      # Significance level
```

Data frame of participant names

```{r}
part <- tibble(Participant = unique(dat$Participant))
```

Create simulation-based dataset of participants with defined sample size

```{r cache = TRUE}
samples.lmm <- map_dfr(seq_len(num_sims.lmm),
                       ~part %>%
                         sample_n(sample_size.lmm) %>%
                         mutate(sample.lmm = as.factor(.x)))
```

Full simulation-based dataset with all data per participant, in long format

```{r cache = TRUE}
samples.lmm.long <- left_join(samples.lmm, dat,
                              by = "Participant",
                              relationship = "many-to-many") %>%
  pivot_longer(cols = A:B,
               names_to = "Condition",
               values_to = "Value")
```

Define function to compute LMMs

```{r}
lmmfun <- function(y, x, z) {
  anova(lmer(y ~ x + (1 | z)))
}
```

Fit a linear mixed model on each simulated sample and write the p-value to a new dataset (lmm.comps)

```{r cache = TRUE, warning = FALSE, message = FALSE}
lmm.comps <- ddply(samples.lmm.long, .(sample.lmm), summarise,
                   p = round(lmmfun(Value, Condition, Participant)$`Pr(>F)`, 3),
                   "Statistical significance" = ifelse(p <= alpha.lmm,
                                                       "Significant",
                                                       "Non-significant"))
```

Calculate power of the analysis (i.e., the proportion of simulations in which the p-value is less than or equal to alpha)

```{r}
power.lmm <- sum(lmm.comps$`Statistical significance` == "Significant") / num_sims.lmm
```

Plot histogram of p-values with a label for power and sample size

```{r}
p.lmm <- ggplot(lmm.comps, aes(x = p, fill = `Statistical significance`)) +
  scale_fill_hue(direction = -1) +
  geom_histogram(bins = 1/alpha.lmm, breaks = seq(0, 1, alpha.lmm)) +
  labs(y = "Count", x = "p-value", title = "Linear Mixed Models",
       subtitle = "Call: lmer(Value ~ Condition + (1 | Participant))") +
  annotate("text", x = 0.5,
           y = sum(lmm.comps$`Statistical significance` == "Significant") * 0.9,
           label = paste0("Power = ", round(power.lmm, 3),
                          "\nSample size = ", sample_size.lmm,
                          " participants\n(6 paired responses per participant)"))
p.lmm
```
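The same machinery can be reused to trace how power changes with the number of participants. The following is a minimal sketch (not part of the original analysis; the candidate sample sizes and the reduced number of simulations are arbitrary choices to keep run time manageable) that repeats the LMM simulation across several sample sizes:

```{r eval = FALSE}
# Sketch of a power curve over several candidate numbers of participants
# (candidate sizes and the 200 simulations per size are arbitrary choices)
power_curve.lmm <- map_dfr(c(10, 12, 14, 16, 18), function(n_part) {
  sims <- map_dfr(seq_len(200),
                  ~part %>%
                    sample_n(n_part) %>%
                    mutate(sample.lmm = as.factor(.x)))
  sims.long <- left_join(sims, dat,
                         by = "Participant",
                         relationship = "many-to-many") %>%
    pivot_longer(cols = A:B, names_to = "Condition", values_to = "Value")
  ps <- ddply(sims.long, .(sample.lmm), summarise,
              p = lmmfun(Value, Condition, Participant)$`Pr(>F)`)
  tibble(Sample_size = n_part, Power = mean(ps$p <= alpha.lmm))
})
power_curve.lmm
```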
## Cumulative Link Mixed Models (clmm)

Set parameters for simulation

```{r}
# Change these numbers to see their effects on statistical power
num_sims.clmm = 1000   # Number of simulations to run
sample_size.clmm = 14  # Sample size for each simulation (number of participants)
alpha.clmm = 0.05      # Significance level
```

Create simulation-based dataset of participants with defined sample size

```{r cache = TRUE}
samples.clmm <- map_dfr(seq_len(num_sims.clmm),
                        ~part %>%
                          sample_n(sample_size.clmm) %>%
                          mutate(sample.clmm = as.factor(.x)))
```

Full simulation-based dataset with all data per participant, in long format

```{r cache = TRUE}
samples.clmm.long <- left_join(samples.clmm, dat,
                               by = "Participant",
                               relationship = "many-to-many") %>%
  pivot_longer(cols = A:B,
               names_to = "Condition",
               values_to = "Value") %>%
  mutate(Value = as.factor(Value))
```

Define function to compute CLMMs

```{r}
clmmfun <- function(y, x, z) {
  emmeans(clmm(y ~ x + (1 | z)), pairwise ~ x) # Fit model and compute pairwise contrasts
}
```

**Note:** I could not manage to make the simulation work from this point, so the following code is displayed, but not run.

Fit a cumulative link mixed model on each simulated sample and write the p-value to a new dataset (clmm.comps)

```{r eval = FALSE}
clmm.comps <- ddply(samples.clmm.long, .(sample.clmm), here(summarise),
                    p = data.frame(clmmfun(Value, Condition, Participant)$contrasts)$p.value,
                    "Statistical significance" = ifelse(p <= alpha.clmm,
                                                        "Significant",
                                                        "Non-significant"))
```

Calculate power of the analysis (i.e., the proportion of simulations in which the p-value is less than or equal to alpha)

```{r eval = FALSE}
power.clmm <- sum(clmm.comps$`Statistical significance` == "Significant") / num_sims.clmm
```

Plot histogram of p-values with a label for power and sample size

```{r eval = FALSE}
p.clmm <- ggplot(clmm.comps, aes(x = p, fill = `Statistical significance`)) +
  scale_fill_hue(direction = -1) +
  geom_histogram(bins = 1/alpha.clmm, breaks = seq(0, 1, alpha.clmm)) +
  labs(y = "Count", x = "p-value", title = "Cumulative Link Mixed Models",
       subtitle = "Call: clmm(Value ~ Condition + (1 | Participant))") +
  annotate("text", x = 0.5,
           y = sum(clmm.comps$`Statistical significance` == "Significant") * 0.9,
           label = paste0("Power = ", round(power.clmm, 3),
                          "\nSample size = ", sample_size.clmm,
                          " participants\n(6 paired responses per participant)"))
p.clmm
```
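One possible workaround, offered here only as an untested sketch (it is not part of the analysis above), is to skip the `ddply()`/`emmeans()` combination and loop over the simulated samples directly, extracting the p-value for the Condition effect from each fitted model's coefficient table and skipping samples where the model fails:

```{r eval = FALSE}
# Untested sketch of an alternative loop for the CLMM power simulation:
# fits one clmm per simulated sample and pulls the p-value for the
# Condition coefficient from the coefficient table. The row name
# "ConditionB" assumes treatment contrasts with A as the reference level.
clmm.ps <- map_dbl(levels(samples.clmm.long$sample.clmm), function(s) {
  d <- dplyr::filter(samples.clmm.long, sample.clmm == s)
  fit <- try(clmm(Value ~ Condition + (1 | Participant), data = d),
             silent = TRUE)
  if (inherits(fit, "try-error")) return(NA_real_)  # Skip failed fits
  coef(summary(fit))["ConditionB", "Pr(>|z|)"]
})

# Power = proportion of converged simulations with p <= alpha
power.clmm <- mean(clmm.ps <= alpha.clmm, na.rm = TRUE)
power.clmm
```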