From 2b71d531a7204ab4ee7f9d13b76e7c551294c37c Mon Sep 17 00:00:00 2001 From: Levi Waldron Date: Fri, 3 Jul 2026 11:59:16 +0200 Subject: [PATCH 1/6] Modernize day1_intro.Rmd with ggplot2 and SummarizedExperiment --- vignettes/day1_intro.Rmd | 146 ++++++++++++++++++++++++--------------- 1 file changed, 92 insertions(+), 54 deletions(-) diff --git a/vignettes/day1_intro.Rmd b/vignettes/day1_intro.Rmd index 0a78b2d..411ec41 100644 --- a/vignettes/day1_intro.Rmd +++ b/vignettes/day1_intro.Rmd @@ -13,6 +13,7 @@ knitr::opts_chunk$set( comment = "#>" ) suppressPackageStartupMessages(library(AppStatBio)) +suppressPackageStartupMessages(library(ggplot2)) ``` # Front matter @@ -71,13 +72,15 @@ My research interests: Normally distributed random variable with mean $\mu = 0$ / standard deviation $\sigma = 1$, and a sample of $n=100$ -```{r, echo=FALSE} +```{r, echo=FALSE, message=FALSE} set.seed(1) -x = rnorm(100) -res = hist(x, main = "Standard Normal Distribution\n mean 0, std. dev. 1", prob = - TRUE) -xdens = seq(min(res$breaks), max(res$breaks), by = 0.01) -lines(xdens, dnorm(xdens)) +x <- rnorm(100) +df <- data.frame(x = x) +ggplot(df, aes(x = x)) + + geom_histogram(aes(y = after_stat(density)), bins = 15, fill = "lightgrey", color = "black") + + stat_function(fun = dnorm, color = "blue", linewidth = 1) + + labs(title = "Standard Normal Distribution\n mean 0, std. dev. 1", x = "Value", y = "Density") + + theme_minimal() ``` ## Random Variables - examples @@ -85,16 +88,14 @@ lines(xdens, dnorm(xdens)) Poisson distributed random variable ($\lambda = 2$), and a sample of $n=100$. ```{r, echo=FALSE} -x = rpois(100, lambda = 2) -res = hist( - x, - main = "Poisson Distribution", - prob = FALSE, - col = "lightgrey", - breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5) -) -xdens = seq(min(x), max(x), by = 1) -lines(xdens, length(x) * dpois(xdens, lambda = 2), lw = 2) +x <- rpois(100, lambda = 2) +df <- data.frame(x = x) +ggplot(df, aes(x = x)) + + geom_bar(fill = "lightgrey", color = "black") + + geom_point(data = data.frame(x = 0:max(x), y = 100 * dpois(0:max(x), lambda = 2)), aes(x = x, y = y), color = "blue", size = 2) + + geom_line(data = data.frame(x = 0:max(x), y = 100 * dpois(0:max(x), lambda = 2)), aes(x = x, y = y), color = "blue", linewidth = 1) + + labs(title = "Poisson Distribution", x = "Value", y = "Frequency") + + theme_minimal() ``` ## Random Variables - examples @@ -102,16 +103,14 @@ lines(xdens, length(x) * dpois(xdens, lambda = 2), lw = 2) Negative Binomially distributed random variable ($size=30, \mu=2$), and a sample of $n=100$. ```{r, echo=FALSE} -x = rnbinom(100, size = 30, mu = 2) -res = hist( - x, - main = "Negative Binomial Distribution", - prob = FALSE, - col = "lightgrey", - breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5) -) -xdens = seq(min(x), max(x), by = 1) -lines(xdens, length(x) * dnbinom(xdens, size = 30, mu = 2), lw = 2) +x <- rnbinom(100, size = 30, mu = 2) +df <- data.frame(x = x) +ggplot(df, aes(x = x)) + + geom_bar(fill = "lightgrey", color = "black") + + geom_point(data = data.frame(x = 0:max(x), y = 100 * dnbinom(0:max(x), size = 30, mu = 2)), aes(x = x, y = y), color = "blue", size = 2) + + geom_line(data = data.frame(x = 0:max(x), y = 100 * dnbinom(0:max(x), size = 30, mu = 2)), aes(x = x, y = y), color = "blue", linewidth = 1) + + labs(title = "Negative Binomial Distribution", x = "Value", y = "Frequency") + + theme_minimal() ``` ## Random Variables - examples @@ -120,16 +119,14 @@ lines(xdens, length(x) * dnbinom(xdens, size = 30, mu = 2), lw = 2) - use for binary outcomes ```{r, echo=FALSE} -x = rbinom(100, size = 20, prob = 0.25) -res = hist( - x, - main = "Binomial Distribution", - prob = FALSE, - col = "lightgrey", - breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5) -) -xdens = seq(min(x), max(x), by = 1) -lines(xdens, length(x) * dbinom(xdens, size = 20, prob = 0.25), lw = 2) +x <- rbinom(100, size = 20, prob = 0.25) +df <- data.frame(x = x) +ggplot(df, aes(x = x)) + + geom_bar(fill = "lightgrey", color = "black") + + geom_point(data = data.frame(x = 0:max(x), y = 100 * dbinom(0:max(x), size = 20, prob = 0.25)), aes(x = x, y = y), color = "blue", size = 2) + + geom_line(data = data.frame(x = 0:max(x), y = 100 * dbinom(0:max(x), size = 20, prob = 0.25)), aes(x = x, y = y), color = "blue", linewidth = 1) + + labs(title = "Binomial Distribution", x = "Value", y = "Frequency") + + theme_minimal() ``` # Hypothesis Testing @@ -300,22 +297,21 @@ reps <- 100 N <- 30 alpha <- 0.05 my.conf.ints <- replicate(reps, t.test(rnorm(N), conf.level = 1-alpha)$conf.int) -plot(x = rep(range(my.conf.ints), reps / 2), - y = 1:reps, - main = paste(reps, "samples of size N=", N), - type = "n") -abline(v = 0, lw = 2, lty = 3) -for (i in seq(1, reps)) { - acceptH0 <- my.conf.ints[1, i] < 0 & my.conf.ints[2, i] > 0 - segments( - x0 = my.conf.ints[1, i], - y0 = i, - x1 = my.conf.ints[2, i], - y1 = i, - lw = ifelse(acceptH0, 1, 2), - col = ifelse(acceptH0, "black", "red") - ) -} +df <- data.frame( + sample = 1:reps, + lower = my.conf.ints[1, ], + upper = my.conf.ints[2, ] +) +df$contains_zero <- df$lower < 0 & df$upper > 0 + +ggplot(df, aes(y = sample)) + + geom_segment(aes(x = lower, xend = upper, yend = sample, color = contains_zero, linewidth = contains_zero)) + + geom_vline(xintercept = 0, linetype = "dashed", color = "black") + + scale_color_manual(values = c("FALSE" = "red", "TRUE" = "black")) + + scale_linewidth_manual(values = c("FALSE" = 1, "TRUE" = 0.5)) + + labs(title = paste(reps, "samples of size N=", N), x = "Confidence Interval", y = "Sample") + + theme_minimal() + + theme(legend.position = "none") ``` ## Intervals for any confidence level @@ -364,6 +360,24 @@ Which of the following are true? - variable under repeated sampling, and - sensitive to test assumptions +## Distribution of p-values under the null hypothesis + +When the null hypothesis is true, p-values are uniformly distributed between 0 and 1. This means that even if there is no true effect, we will still observe p-values < 0.05 in 5% of our tests purely by chance. + +```{r, echo=FALSE, fig.height=4, fig.width=6} +set.seed(123) +n_tests <- 1000 +# Simulate 1000 t-tests where there is no difference (both groups from standard normal) +p_vals <- replicate(n_tests, t.test(rnorm(10), rnorm(10))$p.value) + +ggplot(data.frame(p = p_vals), aes(x = p)) + + geom_histogram(breaks = seq(0, 1, by = 0.05), fill = "lightgrey", color = "black") + + geom_hline(yintercept = n_tests * 0.05, linetype = "dashed", color = "red") + + labs(title = "Histogram of p-values when Null Hypothesis is True", + x = "p-value", y = "Frequency") + + theme_minimal() +``` + ## Use and mis-use of the p-value (cont'd) * If we fail to reject $H_0$, is the probability that $H_0$ is true equal to ($1-\alpha$)? (Hint: NO NO NO!) @@ -562,9 +576,33 @@ alphabetFrequency(dnastring, baseOnly=TRUE, as.prob=TRUE) ![Summarized Experiment](images/SE.svg) * Source: https://bioconductor.org/packages/SummarizedExperiment/ -## (Ranged)SummarizedExperiment +## (Ranged)SummarizedExperiment in Practice + +* `SummarizedExperiment` is the _de facto_ standard for "rectangular" data with metadata +* Let's build a real `SummarizedExperiment` using the tissue gene expression data: + +```{r, message=FALSE, warning=FALSE} +suppressPackageStartupMessages(library(SummarizedExperiment)) +# We can use the genomicsclass tissuesGeneExpression data +suppressPackageStartupMessages(library(tissuesGeneExpression)) +data(tissuesGeneExpression) + +# Create the SummarizedExperiment object +se <- SummarizedExperiment( + assays = list(counts = e), + colData = DataFrame(tissue = tissue), + rowData = DataFrame(gene_id = rownames(e)) +) +se +``` + +* We can directly access the expression matrix (`assay()`), sample metadata (`colData()`), and feature metadata (`rowData()`): + +```{r} +dim(assay(se)) +head(colData(se), 3) +``` -* `RangedSummarizedExperiment` is the _de facto_ standard for "rectangular" data with metadata * Extended by `SingleCellExperiment`, `DESeqDataSet` * Emulated by `MultiAssayExperiment` From aa80215ab6479279c67d1e1b9fd0beb35602e944 Mon Sep 17 00:00:00 2001 From: Levi Waldron Date: Fri, 3 Jul 2026 13:14:51 +0200 Subject: [PATCH 2/6] Modernize day2_unsupervised.Rmd with scater, ggplot2, and BLAS tips --- vignettes/day2_unsupervised.Rmd | 106 +++++++++++++++----------------- 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/vignettes/day2_unsupervised.Rmd b/vignettes/day2_unsupervised.Rmd index 02ec439..33c7541 100644 --- a/vignettes/day2_unsupervised.Rmd +++ b/vignettes/day2_unsupervised.Rmd @@ -14,7 +14,12 @@ knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) -library(AppStatBio) +suppressPackageStartupMessages({ + library(AppStatBio) + library(SingleCellExperiment) + library(scater) + library(ggplot2) +}) ``` @@ -96,8 +101,14 @@ if(!require(GSE5859)){ library(GSE5859) library(tissuesGeneExpression) data(tissuesGeneExpression) -dim(e) ##gene expression data -table(tissue) ##tissue[i] corresponds to e[,i] + +# Modern approach: store expression and metadata together +sce <- SingleCellExperiment( + assays = list(logcounts = e), + colData = DataFrame(tissue = tissue) +) +dim(sce) ## gene expression data +table(sce$tissue) ## samples per tissue ``` Interested in identifying similar *samples* and similar *genes* @@ -321,11 +332,12 @@ plot(s$u[, 1] ~ p$rotation[, 1]) ## Visualizing PCA loadings ```{r visloadings} -plot(p$rotation[, 1], - xlab = "Index of genes", - ylab = "Loadings of PC1", - main = "PC1 loadings of each gene") #or, predict(p) -abline(h = c(-0.03, 0.03), col = "red") +df_loadings <- data.frame(Index = seq_len(nrow(p$rotation)), Loading = p$rotation[, 1]) +ggplot(df_loadings, aes(x = Index, y = Loading)) + + geom_point(alpha = 0.5, size = 1) + + geom_hline(yintercept = c(-0.03, 0.03), color = "red", linetype = "dashed") + + labs(title = "PC1 loadings of each gene", x = "Index of genes", y = "Loadings of PC1") + + theme_minimal() ``` ## Genes with high PC1 loadings @@ -347,14 +359,16 @@ pheatmap::pheatmap( - $\mathbf{D}$ (**eigenvalues**): standard deviation scaling factor that each decomposed variable is multiplied by. ```{r screeplot, fig.height=3, fig.width=5, echo=TRUE, fig.align='center'} -rafalib::mypar() -plot( - p$sdev ^ 2 / sum(p$sdev ^ 2) * 100, - xlim = c(0, 150), - type = "b", - ylab = "% variance explained", - main = "Screeplot" +df_scree <- data.frame( + PC = seq_along(p$sdev), + Variance = p$sdev^2 / sum(p$sdev^2) * 100, + Cumulative = cumsum(p$sdev^2) / sum(p$sdev^2) * 100 ) + +ggplot(df_scree[1:150, ], aes(x = PC, y = Variance)) + + geom_line() + geom_point(size = 1) + + labs(title = "Screeplot", y = "% variance explained") + + theme_minimal() ``` ## PCA interpretation: eigenvalues @@ -362,14 +376,11 @@ plot( Alternatively as cumulative % variance explained (using `cumsum()` function) ```{r cumscreeplot, fig.height=4, echo=TRUE, fig.align='center'} -rafalib::mypar() -plot( - cumsum(p$sdev ^ 2) / sum(p$sdev ^ 2) * 100, - ylab = "cumulative % variance explained", - ylim = c(0, 100), - type = "b", - main = "Cumulative screeplot" -) +ggplot(df_scree, aes(x = PC, y = Cumulative)) + + geom_line() + + labs(title = "Cumulative screeplot", y = "Cumulative % variance explained") + + coord_cartesian(ylim = c(0, 100)) + + theme_minimal() ``` ## PCA interpretation: scores @@ -384,22 +395,10 @@ plot( ## PCA interpretation: scores ```{r scores, fig.height=5, echo=FALSE} -rafalib::mypar() -plot( - p$x[, 1:2], - xlab = "PC1", - ylab = "PC2", - main = "plot of p$x[, 1:2]", - col = factor(tissue), - pch = as.integer(factor(tissue)) -) -legend( - "topright", - legend = levels(factor(tissue)), - col = 1:length(unique(tissue)), - pch = 1:length(unique(tissue)), - bty = 'n' -) +# Using scater for modern SingleCellExperiment PCA plotting +sce <- runPCA(sce, exprs_values = "logcounts") +plotPCA(sce, colour_by = "tissue") + + ggtitle("PCA plot of tissue data") ``` ## Multi-dimensional Scaling (MDS) @@ -414,15 +413,11 @@ mds <- cmdscale(d) ``` ```{r plotmds, echo=FALSE} -rafalib::mypar() -plot(mds, col = factor(tissue), pch = as.integer(factor(tissue))) -legend( - "topright", - legend = levels(factor(tissue)), - col = 1:length(unique(tissue)), - bty = 'n', - pch = 1:length(unique(tissue)) -) +mds_df <- data.frame(Dim1 = mds[, 1], Dim2 = mds[, 2], tissue = tissue) +ggplot(mds_df, aes(x = Dim1, y = Dim2, color = tissue)) + + geom_point(size = 2) + + labs(title = "MDS Plot of tissues", x = "Coordinate 1", y = "Coordinate 2") + + theme_minimal() ``` ## t-SNE @@ -493,13 +488,9 @@ plotReducedDim(sce.zeisel, dimred="UMAP", colour_by="level1class") Using default parameters and no log transformation ```{r umaptsnezeisel} -sct <- SingleCellExperiment(e) -sct$tissue <- tissue -# fake the log normalization because it is undesirable for microarray data -names(assays(sct)) <- "logcounts" -sct <- fixedPCA(sct, subset.row=NULL) -sct <- runTSNE(sct, dimred="PCA") -plotReducedDim(sct, dimred="TSNE", colour_by="tissue") +# We already set up `sce` with `tissue` earlier +sce <- runTSNE(sce, dimred="PCA") +plotReducedDim(sce, dimred="TSNE", colour_by="tissue") ``` ## UMAP of tissue microarray data @@ -507,8 +498,8 @@ plotReducedDim(sct, dimred="TSNE", colour_by="tissue") Also using default parameters and no log transformation ```{r nologumap} -sct <- runUMAP(sct, dimred="PCA") -plotReducedDim(sct, dimred="UMAP", colour_by="tissue") +sce <- runUMAP(sce, dimred="PCA") +plotReducedDim(sce, dimred="UMAP", colour_by="tissue") ``` ## Summary: distances and dimension reduction @@ -523,6 +514,7 @@ plotReducedDim(sct, dimred="UMAP", colour_by="tissue") ## Lab exercise +* **Check your BLAS**: Matrix multiplication speed relies heavily on your Basic Linear Algebra Subprograms (BLAS). In the lab, verify you are using an optimized BLAS (like OpenBLAS, Intel MKL, or Apple Accelerate). You can check this by running `sessionInfo()` in your R console and looking for the "Matrix products:" and "BLAS:" fields. If it points to `libRblas.so` or `libR.dylib` (instead of e.g. `libopenblas.so` or `vecLib`), you might be using the unoptimized default! * OSCA Basics: [Chapter 4 Dimensionality Reduction](http://bioconductor.org/books/release/OSCA.basic/dimensionality-reduction.html) * Optional if you are interested, OSCA Advanced: [Chapter 4 Dimensionality reduction, redux](http://bioconductor.org/books/release/OSCA.advanced/dimensionality-reduction-redux.html) From 8310cf318375abbea33f7352626fe03dc1417716 Mon Sep 17 00:00:00 2001 From: Levi Waldron Date: Fri, 3 Jul 2026 13:19:58 +0200 Subject: [PATCH 3/6] Modernize day3_linearmodels.Rmd with ggplot2 and modern regression tables --- vignettes/day3_linearmodels.Rmd | 95 +++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/vignettes/day3_linearmodels.Rmd b/vignettes/day3_linearmodels.Rmd index 1bf6531..e5ab416 100644 --- a/vignettes/day3_linearmodels.Rmd +++ b/vignettes/day3_linearmodels.Rmd @@ -228,21 +228,28 @@ broom::tidy(fit) ## Interpretation of spider leg type coefficients -```{r spider_main_coef, fig.cap="Diagram of the estimated coefficients in the linear model. The green arrow indicates the Intercept term, which goes from zero to the mean of the reference group (here the 'pull' samples). The orange arrow indicates the difference between the push group and the pull group, which is negative in this example. The circles show the individual samples, jittered horizontally to avoid overplotting.",echo=FALSE} -set.seed(1) #same jitter in stripchart -stripchart(split(spider.sub$friction, spider.sub$type), - vertical=TRUE, pch=1, method="jitter", las=2, xlim=c(0,3), ylim=c(0,2)) +```{r spider_main_coef, fig.cap="Diagram of the estimated coefficients in the linear model. The green dashed line indicates the Intercept term (mean of the reference group 'pull'). The orange dashed line indicates the push group mean. The solid green/orange segments explicitly show the intercept distance and the effect difference.", echo=FALSE, warning=FALSE} +library(ggplot2) coefs <- coef(fit) -a <- -0.25 -lgth <- .1 -library(RColorBrewer) -cols <- brewer.pal(3,"Dark2") -abline(h=0) -arrows(1+a,0,1+a,coefs[1],lwd=3,col=cols[1],length=lgth) -abline(h=coefs[1],col=cols[1]) -arrows(2+a,coefs[1],2+a,coefs[1]+coefs[2],lwd=3,col=cols[2],length=lgth) -abline(h=coefs[1]+coefs[2],col=cols[2]) -legend("right",names(coefs),fill=cols,cex=.75,bg="white") +spider.sub |> + ggplot(aes(x = type, y = friction)) + + geom_jitter(width = 0.1, color = "grey50", alpha = 0.7) + + # Intercept line and arrow + geom_hline(yintercept = coefs[1], color = "#1B9E77", linetype = "dashed") + + annotate("segment", x = 1, xend = 1, y = 0, yend = coefs[1], + color = "#1B9E77", arrow = arrow(length = unit(0.1, "inches")), linewidth = 1) + + # Difference arrow + geom_hline(yintercept = coefs[1] + coefs[2], color = "#D95F02", linetype = "dashed") + + annotate("segment", x = 2, xend = 2, y = coefs[1], yend = coefs[1] + coefs[2], + color = "#D95F02", arrow = arrow(length = unit(0.1, "inches")), linewidth = 1) + + # Base line + geom_hline(yintercept = 0, color = "black") + + labs( + title = "Estimated coefficients in the linear model", + x = "Type", + y = "Friction" + ) + + theme_minimal() ``` ## regression on spider leg **position** @@ -253,9 +260,8 @@ Remember there are positions 1-4 fit <- lm(friction ~ leg, data=spider) ``` -```{r, results="asis", echo=TRUE, message=FALSE} -fit.table <- xtable::xtable(fit, label=NULL) -print(fit.table, type="html") +```{r, echo=TRUE, message=FALSE} +knitr::kable(broom::tidy(fit), digits = 3) ``` - Interpretation of the dummy variables legL2, legL3, legL4 ? @@ -292,9 +298,8 @@ by: fit <- lm(friction ~ type + leg, data=spider) ``` -```{r, results="asis", echo=TRUE, message=FALSE} -fit.table <- xtable::xtable(fit, label=NULL) -print(fit.table, type="html") +```{r, echo=TRUE, message=FALSE} +gtsummary::tbl_regression(fit) ``` - this model still doesn't represent how the friction differences @@ -318,9 +323,8 @@ Image credit: fit <- lm(friction ~ type * leg, data=spider) ``` -```{r, results="asis", echo=TRUE, message=FALSE} -fit.table <- xtable::xtable(fit, label=NULL) -print(fit.table, type="html") +```{r, echo=TRUE, message=FALSE} +gtsummary::tbl_regression(fit) ``` ## Model formulae (cont'd) @@ -472,18 +476,39 @@ $$ 1. \# of trials n, 2. probability of success p -```{r, echo=FALSE} -plot(x=0:40, y=dnbinom(0:40, size=10, prob=0.5), - type="b", lwd=2, ylim=c(0, 0.15), - xlab="Counts (k)", ylab="Probability density") -lines(x=0:40, y=dnbinom(0:40, size=20, prob=0.5), - type="b", lwd=2, lty=2, pch=2) -lines(x=0:40, y=dnbinom(0:40, size=10, prob=0.3), - type="b", lwd=2, lty=3, pch=3) -lines(x=0:40, y=dpois(0:40, lambda=9), col="red") -lines(x=0:40, y=dpois(0:40, lambda=20), col="red") -legend("topright", lwd=c(2,2,2,1), lty=c(1:3,1), pch=c(1:3,-1), col=c(rep("black", 3), "red"), - legend=c("n=10, p=0.5", "n=20, p=0.5", "n=10, p=0.3", "Poisson")) +```{r, echo=FALSE, message=FALSE, warning=FALSE} +library(ggplot2) +library(tidyr) +library(dplyr) + +x <- 0:40 +df_dist <- data.frame( + Counts = x, + dist1 = dnbinom(x, size = 10, prob = 0.5), + dist2 = dnbinom(x, size = 20, prob = 0.5), + dist3 = dnbinom(x, size = 10, prob = 0.3), + dist4 = dpois(x, lambda = 9) +) + +df_long <- df_dist |> + pivot_longer(-Counts, names_to = "Distribution", values_to = "Density") |> + mutate(Distribution = case_match( + Distribution, + "dist1" ~ "NB (n=10, p=0.5)", + "dist2" ~ "NB (n=20, p=0.5)", + "dist3" ~ "NB (n=10, p=0.3)", + "dist4" ~ "Poisson (\u03bb=9)" + )) + +ggplot(df_long, aes(x = Counts, y = Density, color = Distribution, linetype = Distribution, shape = Distribution)) + + geom_line(linewidth = 1) + + geom_point(size = 2) + + scale_color_manual(values = c("black", "black", "black", "red")) + + scale_linetype_manual(values = c("solid", "dashed", "dotted", "solid")) + + scale_shape_manual(values = c(1, 2, 3, NA)) + + labs(x = "Counts (k)", y = "Probability density") + + theme_minimal() + + theme(legend.position = c(0.8, 0.7)) ``` ## Additive vs. multiplicative models From 7a60f7a7602370b1e48528bb1676c6dc9b3b6bfb Mon Sep 17 00:00:00 2001 From: Levi Waldron Date: Fri, 3 Jul 2026 13:22:17 +0200 Subject: [PATCH 4/6] Modernize day4_batcheffects-vis.Rmd with ggplot2 and patchwork --- vignettes/day4_batcheffects-vis.Rmd | 232 ++++++++++++++++++---------- 1 file changed, 151 insertions(+), 81 deletions(-) diff --git a/vignettes/day4_batcheffects-vis.Rmd b/vignettes/day4_batcheffects-vis.Rmd index 73ceb79..e877a29 100644 --- a/vignettes/day4_batcheffects-vis.Rmd +++ b/vignettes/day4_batcheffects-vis.Rmd @@ -49,35 +49,37 @@ library(AppStatBio) ## Example: Quantile Quantile Plots -```{r qq1, echo=FALSE, fig.width=8} -suppressPackageStartupMessages(library(UsingR)) -suppressPackageStartupMessages(library(rafalib)) +```{r qq1, echo=FALSE, fig.width=10, fig.height=4, warning=FALSE} +suppressPackageStartupMessages({ + library(UsingR) + library(rafalib) + library(ggplot2) + library(patchwork) +}) + # height qq plot -x <- father.son$fheight -ps <- (seq(0, 99) + 0.5) / 100 -qs <- quantile(x, ps) -normalqs <- qnorm(ps, mean(x), popsd(x)) -par(mfrow = c(1, 3)) -plot(normalqs, - qs, - xlab = "Normal Percentiles", - ylab = "Height Percentiles", - main = "Height Q-Q Plot") -abline(0, 1) +p1 <- ggplot(data.frame(x = father.son$fheight), aes(sample = x)) + + stat_qq() + stat_qq_line() + + labs(title = "Height Q-Q Plot", x = "Theoretical Normal Quantiles", y = "Height Quantiles") + + theme_minimal() + # t-distributed for 12 df -x <- rt(1000, 12) -qqnorm(x, - xlab = "t quantiles", - main = "T Quantiles (df=12) Q-Q Plot", - ylim = c(-6, 6)) -qqline(x) +p2 <- ggplot(data.frame(x = rt(1000, 12)), aes(sample = x)) + + stat_qq(distribution = qt, dparams = list(df = 12)) + + stat_qq_line(distribution = qt, dparams = list(df = 12)) + + labs(title = "T Quantiles (df=12) Q-Q Plot", x = "Theoretical t quantiles", y = "Sample Quantiles") + + coord_cartesian(ylim = c(-6, 6)) + + theme_minimal() + # t-distributed for 3 df -x <- rt(1000, 3) -qqnorm(x, - xlab = "t quantiles", - main = "T Quantiles (df=3) Q-Q Plot", - ylim = c(-6, 6)) -qqline(x) +p3 <- ggplot(data.frame(x = rt(1000, 3)), aes(sample = x)) + + stat_qq(distribution = qt, dparams = list(df = 3)) + + stat_qq_line(distribution = qt, dparams = list(df = 3)) + + labs(title = "T Quantiles (df=3) Q-Q Plot", x = "Theoretical t quantiles", y = "Sample Quantiles") + + coord_cartesian(ylim = c(-6, 6)) + + theme_minimal() + +p1 | p2 | p3 ``` ## Boxplots: About @@ -88,11 +90,30 @@ qqline(x) ## Boxplots: Example -```{r bp1, echo=FALSE, fig.width=8, fig.height=4} -par(mfrow=c(1, 3)) -hist(exec.pay, main = "CEO Compensation") -qqnorm(exec.pay, main = "CEO Compensation") -boxplot(exec.pay, ylab="10,000s of dollars", ylim=c(0,400), main = "CEO Compensation") +```{r bp1, echo=FALSE, fig.width=10, fig.height=4, warning=FALSE} +library(ggplot2) +library(patchwork) + +df_exec <- data.frame(pay = exec.pay) + +p1 <- ggplot(df_exec, aes(x = pay)) + + geom_histogram(bins = 30, fill = "gray", color = "black") + + labs(title = "CEO Compensation", x = "10,000s of dollars", y = "Count") + + theme_minimal() + +p2 <- ggplot(df_exec, aes(sample = pay)) + + stat_qq() + stat_qq_line() + + labs(title = "CEO Compensation Q-Q Plot") + + theme_minimal() + +p3 <- ggplot(df_exec, aes(y = pay)) + + geom_boxplot(fill = "lightblue") + + coord_cartesian(ylim = c(0, 400)) + + labs(title = "CEO Compensation", y = "10,000s of dollars") + + theme_minimal() + + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) + +p1 | p2 | p3 ```
Three different views of a continuous variable. The boxplot clearly shows the skew and outliers. @@ -105,35 +126,29 @@ Three different views of a continuous variable. The boxplot clearly shows the sk ## Scatterplots And Correlation: Example -```{r cor1, echo=FALSE, fig.width=8, fig.height=4} -par(mfrow = c(1, 3)) -plot( - father.son$fheight, - father.son$sheight, - xlab = "Father's height in inches", - ylab = "Son's height in inches", - main = paste("correlation =", signif( - cor(father.son$fheight, father.son$sheight), 2 - )) -) -plot( - cars$speed, - cars$dist, - xlab = "Speed", - ylab = "Stopping Distance", - main = paste("correlation =", signif(cor( - cars$speed, cars$dist - ), 2)) -) -plot( - faithful$eruptions, - faithful$waiting, - xlab = "Eruption Duration", - ylab = "Waiting Time", - main = paste("correlation =", signif( - cor(faithful$eruptions, faithful$waiting), 2 - )) -) +```{r cor1, echo=FALSE, fig.width=10, fig.height=4, warning=FALSE} +library(ggplot2) +library(patchwork) + +p1 <- ggplot(father.son, aes(x = fheight, y = sheight)) + + geom_point(alpha = 0.5) + + labs(title = paste("correlation =", signif(cor(father.son$fheight, father.son$sheight), 2)), + x = "Father's height in inches", y = "Son's height in inches") + + theme_minimal() + +p2 <- ggplot(cars, aes(x = speed, y = dist)) + + geom_point(alpha = 0.5) + + labs(title = paste("correlation =", signif(cor(cars$speed, cars$dist), 2)), + x = "Speed", y = "Stopping Distance") + + theme_minimal() + +p3 <- ggplot(faithful, aes(x = eruptions, y = waiting)) + + geom_point(alpha = 0.5) + + labs(title = paste("correlation =", signif(cor(faithful$eruptions, faithful$waiting), 2)), + x = "Eruption Duration", y = "Waiting Time") + + theme_minimal() + +p1 | p2 | p3 ``` ## Exploratory data analysis in high dimensions @@ -163,17 +178,27 @@ Note that these 8,793 tests are done in about 0.01s ## Volcano plots: Example -```{r vp2, echo = FALSE, fig.height=5, fig.width=5} -par(mar = c(4, 4, 0, 0)) -plot(results$dm, - -log10(results$p.value), - xlab = "Effect size (difference in group means)", - ylab = "- log (base 10) p-values") -abline(h = -log10(0.05 / nrow(geneExpression)), col = "red") -legend("bottomright", - lty = 1, - col = "red", - legend = "Bonferroni = 0.05") +```{r vp2, echo = FALSE, fig.height=5, fig.width=5, warning=FALSE} +library(ggplot2) +library(dplyr) + +bonferroni_thresh <- -log10(0.05 / nrow(geneExpression)) + +df_volcano <- data.frame( + EffectSize = results$dm, + NegLogP = -log10(results$p.value) +) %>% + mutate(Significant = NegLogP > bonferroni_thresh) + +ggplot(df_volcano, aes(x = EffectSize, y = NegLogP, color = Significant)) + + geom_point(alpha = 0.5, size = 1) + + geom_hline(yintercept = bonferroni_thresh, color = "red", linetype = "dashed") + + scale_color_manual(values = c("black", "red")) + + labs(x = "Effect size (difference in group means)", y = "-log10(p-value)") + + theme_minimal() + + theme(legend.position = "none") + + annotate("text", x = max(df_volcano$EffectSize), y = bonferroni_thresh + 0.2, + label = "Bonferroni = 0.05", color = "red", hjust = 1) ``` ## Volcano plots: Summary @@ -198,20 +223,42 @@ nullpvals <- rowttests(randomData, g)$p.value ## P-value histograms: Example -```{r pvalhist2, echo=FALSE, fig.height=3} -par(mfrow = c(1, 2)) -hist(pvals, ylim = c(0, 1400)) -hist(nullpvals, ylim = c(0, 1400)) +```{r pvalhist2, echo=FALSE, fig.height=3, fig.width=8, warning=FALSE} +library(ggplot2) +library(patchwork) + +df_pvals <- data.frame( + Original = pvals, + Random = nullpvals +) + +p1 <- ggplot(df_pvals, aes(x = Original)) + + geom_histogram(breaks = seq(0, 1, 0.05), fill = "gray", color = "black") + + coord_cartesian(ylim = c(0, 1400)) + + labs(title = "Original Data p-values", x = "p-value", y = "Frequency") + + theme_minimal() + +p2 <- ggplot(df_pvals, aes(x = Random)) + + geom_histogram(breaks = seq(0, 1, 0.05), fill = "gray", color = "black") + + coord_cartesian(ylim = c(0, 1400)) + + labs(title = "Random Data p-values", x = "p-value", y = "Frequency") + + theme_minimal() + +p1 | p2 ``` ## P-value histograms: Example 2 (permutation) Note that permuting these data doesn't produce an ideal null p-value histogram due to batch effects: -```{r pvalhist3, fig.height=3} +```{r pvalhist3, fig.height=3, fig.width=4, warning=FALSE} permg <- sample(g) permresults <- rowttests(geneExpression, permg) -hist(permresults$p.value) + +ggplot(data.frame(pvalue = permresults$p.value), aes(x = pvalue)) + + geom_histogram(breaks = seq(0, 1, 0.05), fill = "gray", color = "black") + + labs(title = "Permuted Data p-values", x = "p-value", y = "Frequency") + + theme_minimal() ``` ## P-value histograms: Summary @@ -224,11 +271,34 @@ hist(permresults$p.value) - **Why use it?** An MA plot is a clever transformation of a scatter plot, designed to better visualize differences between two samples (or one sample and a reference). It's just a scatterplot rotated 45$^o$. - The rotation helps us see systematic biases. The 'A' (Average) on the x-axis represents overall signal intensity, and the 'M' (Minus, or log-ratio) on the y-axis represents the fold change. This makes it much easier to see if the fold change is dependent on gene intensity. -```{r pvalhist4, fig.height=3} -rafalib::mypar(1, 2) +```{r pvalhist4, fig.height=4, fig.width=8, warning=FALSE} +library(ggplot2) +library(patchwork) +library(dplyr) + pseudo <- apply(geneExpression, 1, median) -plot(geneExpression[, 1], pseudo) # Standard scatter plot -plot((geneExpression[, 1] + pseudo) / 2, (geneExpression[, 1] - pseudo)) # MA plot +df_ma <- data.frame( + Sample = geneExpression[, 1], + Pseudo = pseudo +) %>% + mutate( + A = (Sample + Pseudo) / 2, + M = Sample - Pseudo + ) + +p1 <- ggplot(df_ma, aes(x = Pseudo, y = Sample)) + + geom_point(alpha = 0.3, size = 1) + + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + + labs(title = "Standard scatter plot", x = "Median across samples", y = "Sample 1") + + theme_minimal() + +p2 <- ggplot(df_ma, aes(x = A, y = M)) + + geom_point(alpha = 0.3, size = 1) + + geom_hline(yintercept = 0, color = "red", linetype = "dashed") + + labs(title = "MA plot", x = "Average (A)", y = "Minus (M)") + + theme_minimal() + +p1 | p2 ``` ## MA plot: Summary From 779b6b2a71e2f219facadc61910fd23db5822dd6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 3 Jul 2026 11:29:01 +0000 Subject: [PATCH 5/6] Fix pkgdown workflow: downgrade r-lib/actions from v3 to v2 --- .github/workflows/pkgdown.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 4089fd1..f6eac73 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -25,13 +25,13 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: r-lib/actions/setup-pandoc@v3 + - uses: r-lib/actions/setup-pandoc@v2 - - uses: r-lib/actions/setup-r@v3 + - uses: r-lib/actions/setup-r@v2 with: use-public-rspm: true - - uses: r-lib/actions/setup-r-dependencies@v3 + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::pkgdown, local::. needs: website From eade692beaed4577a9bb2d5ba2611f00cac90677 Mon Sep 17 00:00:00 2001 From: Levi Waldron Date: Fri, 3 Jul 2026 07:33:12 -0400 Subject: [PATCH 6/6] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- vignettes/day2_unsupervised.Rmd | 2 +- vignettes/day3_linearmodels.Rmd | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vignettes/day2_unsupervised.Rmd b/vignettes/day2_unsupervised.Rmd index 33c7541..5224ae4 100644 --- a/vignettes/day2_unsupervised.Rmd +++ b/vignettes/day2_unsupervised.Rmd @@ -105,7 +105,7 @@ data(tissuesGeneExpression) # Modern approach: store expression and metadata together sce <- SingleCellExperiment( assays = list(logcounts = e), - colData = DataFrame(tissue = tissue) + colData = S4Vectors::DataFrame(tissue = tissue) ) dim(sce) ## gene expression data table(sce$tissue) ## samples per tissue diff --git a/vignettes/day3_linearmodels.Rmd b/vignettes/day3_linearmodels.Rmd index e5ab416..d43cbf6 100644 --- a/vignettes/day3_linearmodels.Rmd +++ b/vignettes/day3_linearmodels.Rmd @@ -236,12 +236,12 @@ spider.sub |> geom_jitter(width = 0.1, color = "grey50", alpha = 0.7) + # Intercept line and arrow geom_hline(yintercept = coefs[1], color = "#1B9E77", linetype = "dashed") + - annotate("segment", x = 1, xend = 1, y = 0, yend = coefs[1], - color = "#1B9E77", arrow = arrow(length = unit(0.1, "inches")), linewidth = 1) + +annotate("segment", x = 1, xend = 1, y = 0, yend = coefs[1], + color = "#1B9E77", arrow = grid::arrow(length = grid::unit(0.1, "inches")), linewidth = 1) + # Difference arrow geom_hline(yintercept = coefs[1] + coefs[2], color = "#D95F02", linetype = "dashed") + - annotate("segment", x = 2, xend = 2, y = coefs[1], yend = coefs[1] + coefs[2], - color = "#D95F02", arrow = arrow(length = unit(0.1, "inches")), linewidth = 1) + +annotate("segment", x = 2, xend = 2, y = coefs[1], yend = coefs[1] + coefs[2], + color = "#D95F02", arrow = grid::arrow(length = grid::unit(0.1, "inches")), linewidth = 1) + # Base line geom_hline(yintercept = 0, color = "black") + labs(