waldronlab · lwaldron · Jul 3, 2026 · Jul 3, 2026 · Jul 3, 2026 · Jul 3, 2026
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
@@ -25,13 +25,13 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - uses: r-lib/actions/setup-pandoc@v3
+      - uses: r-lib/actions/setup-pandoc@v2
 
-      - uses: r-lib/actions/setup-r@v3
+      - uses: r-lib/actions/setup-r@v2
         with:
           use-public-rspm: true
 
-      - uses: r-lib/actions/setup-r-dependencies@v3
+      - uses: r-lib/actions/setup-r-dependencies@v2
         with:
           extra-packages: any::pkgdown, local::.
           needs: website

diff --git a/vignettes/day1_intro.Rmd b/vignettes/day1_intro.Rmd
@@ -13,6 +13,7 @@ knitr::opts_chunk$set(
   comment = "#>"
 )
 suppressPackageStartupMessages(library(AppStatBio))
+suppressPackageStartupMessages(library(ggplot2))
 ```
 
 # Front matter
@@ -71,47 +72,45 @@ My research interests:
 
 Normally distributed random variable with mean $\mu = 0$ / standard deviation $\sigma = 1$, and a sample of $n=100$
 
-```{r, echo=FALSE}
+```{r, echo=FALSE, message=FALSE}
 set.seed(1)
-x = rnorm(100)
-res = hist(x, main = "Standard Normal Distribution\n mean 0, std. dev. 1", prob =
-               TRUE)
-xdens = seq(min(res$breaks), max(res$breaks), by = 0.01)
-lines(xdens, dnorm(xdens))
+x <- rnorm(100)
+df <- data.frame(x = x)
+ggplot(df, aes(x = x)) +
+  geom_histogram(aes(y = after_stat(density)), bins = 15, fill = "lightgrey", color = "black") +
+  stat_function(fun = dnorm, color = "blue", linewidth = 1) +
+  labs(title = "Standard Normal Distribution\n mean 0, std. dev. 1", x = "Value", y = "Density") +
+  theme_minimal()
 ```
 
 ## Random Variables - examples
 
 Poisson distributed random variable ($\lambda = 2$), and a sample of $n=100$.
 
 ```{r, echo=FALSE}
-x = rpois(100, lambda = 2)
-res = hist(
-    x,
-    main = "Poisson Distribution",
-    prob = FALSE,
-    col = "lightgrey",
-    breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5)
-)
-xdens = seq(min(x), max(x), by = 1)
-lines(xdens, length(x) * dpois(xdens, lambda = 2), lw = 2)
+x <- rpois(100, lambda = 2)
+df <- data.frame(x = x)
+ggplot(df, aes(x = x)) +
+  geom_bar(fill = "lightgrey", color = "black") +
+  geom_point(data = data.frame(x = 0:max(x), y = 100 * dpois(0:max(x), lambda = 2)), aes(x = x, y = y), color = "blue", size = 2) +
+  geom_line(data = data.frame(x = 0:max(x), y = 100 * dpois(0:max(x), lambda = 2)), aes(x = x, y = y), color = "blue", linewidth = 1) +
+  labs(title = "Poisson Distribution", x = "Value", y = "Frequency") +
+  theme_minimal()
 ```
 
 ## Random Variables - examples
 
 Negative Binomially distributed random variable ($size=30, \mu=2$), and a sample of $n=100$.
 
 ```{r, echo=FALSE}
-x = rnbinom(100, size = 30, mu = 2)
-res = hist(
-    x,
-    main = "Negative Binomial Distribution",
-    prob = FALSE,
-    col = "lightgrey",
-    breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5)
-)
-xdens = seq(min(x), max(x), by = 1)
-lines(xdens, length(x) * dnbinom(xdens, size = 30, mu = 2), lw = 2)
+x <- rnbinom(100, size = 30, mu = 2)
+df <- data.frame(x = x)
+ggplot(df, aes(x = x)) +
+  geom_bar(fill = "lightgrey", color = "black") +
+  geom_point(data = data.frame(x = 0:max(x), y = 100 * dnbinom(0:max(x), size = 30, mu = 2)), aes(x = x, y = y), color = "blue", size = 2) +
+  geom_line(data = data.frame(x = 0:max(x), y = 100 * dnbinom(0:max(x), size = 30, mu = 2)), aes(x = x, y = y), color = "blue", linewidth = 1) +
+  labs(title = "Negative Binomial Distribution", x = "Value", y = "Frequency") +
+  theme_minimal()
 ```
 
 ## Random Variables - examples
@@ -120,16 +119,14 @@ lines(xdens, length(x) * dnbinom(xdens, size = 30, mu = 2), lw = 2)
     - use for binary outcomes
 
 ```{r, echo=FALSE}
-x = rbinom(100, size = 20, prob = 0.25)
-res = hist(
-    x,
-    main = "Binomial Distribution",
-    prob = FALSE,
-    col = "lightgrey",
-    breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5)
-)
-xdens = seq(min(x), max(x), by = 1)
-lines(xdens, length(x) * dbinom(xdens, size = 20, prob = 0.25), lw = 2)
+x <- rbinom(100, size = 20, prob = 0.25)
+df <- data.frame(x = x)
+ggplot(df, aes(x = x)) +
+  geom_bar(fill = "lightgrey", color = "black") +
+  geom_point(data = data.frame(x = 0:max(x), y = 100 * dbinom(0:max(x), size = 20, prob = 0.25)), aes(x = x, y = y), color = "blue", size = 2) +
+  geom_line(data = data.frame(x = 0:max(x), y = 100 * dbinom(0:max(x), size = 20, prob = 0.25)), aes(x = x, y = y), color = "blue", linewidth = 1) +
+  labs(title = "Binomial Distribution", x = "Value", y = "Frequency") +
+  theme_minimal()
 ```
 
 # Hypothesis Testing
@@ -300,22 +297,21 @@ reps <- 100
 N <- 30
 alpha <- 0.05
 my.conf.ints <- replicate(reps, t.test(rnorm(N), conf.level = 1-alpha)$conf.int)
-plot(x = rep(range(my.conf.ints), reps / 2),
-     y = 1:reps,
-     main = paste(reps, "samples of size N=", N),
-     type = "n")
-abline(v = 0, lw = 2, lty = 3)
-for (i in seq(1, reps)) {
-    acceptH0 <- my.conf.ints[1, i] < 0 & my.conf.ints[2, i] > 0
-    segments(
-        x0 = my.conf.ints[1, i],
-        y0 = i,
-        x1 = my.conf.ints[2, i],
-        y1 = i,
-        lw = ifelse(acceptH0, 1, 2),
-        col = ifelse(acceptH0, "black", "red")
-    )
-}
+df <- data.frame(
+  sample = 1:reps,
+  lower = my.conf.ints[1, ],
+  upper = my.conf.ints[2, ]
+)
+df$contains_zero <- df$lower < 0 & df$upper > 0
+
+ggplot(df, aes(y = sample)) +
+  geom_segment(aes(x = lower, xend = upper, yend = sample, color = contains_zero, linewidth = contains_zero)) +
+  geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
+  scale_color_manual(values = c("FALSE" = "red", "TRUE" = "black")) +
+  scale_linewidth_manual(values = c("FALSE" = 1, "TRUE" = 0.5)) +
+  labs(title = paste(reps, "samples of size N=", N), x = "Confidence Interval", y = "Sample") +
+  theme_minimal() +
+  theme(legend.position = "none")
 ```
 
 ## Intervals for any confidence level
@@ -364,6 +360,24 @@ Which of the following are true?
     - variable under repeated sampling, and
     - sensitive to test assumptions
 
+## Distribution of p-values under the null hypothesis
+
+When the null hypothesis is true, p-values are uniformly distributed between 0 and 1. This means that even if there is no true effect, we will still observe p-values < 0.05 in 5% of our tests purely by chance.
+
+```{r, echo=FALSE, fig.height=4, fig.width=6}
+set.seed(123)
+n_tests <- 1000
+# Simulate 1000 t-tests where there is no difference (both groups from standard normal)
+p_vals <- replicate(n_tests, t.test(rnorm(10), rnorm(10))$p.value)
+
+ggplot(data.frame(p = p_vals), aes(x = p)) +
+  geom_histogram(breaks = seq(0, 1, by = 0.05), fill = "lightgrey", color = "black") +
+  geom_hline(yintercept = n_tests * 0.05, linetype = "dashed", color = "red") +
+  labs(title = "Histogram of p-values when Null Hypothesis is True",
+       x = "p-value", y = "Frequency") +
+  theme_minimal()
+```
+
 ## Use and mis-use of the p-value (cont'd)
 
 * If we fail to reject $H_0$, is the probability that $H_0$ is true equal to ($1-\alpha$)?  (Hint: NO NO NO!)
@@ -562,9 +576,33 @@ alphabetFrequency(dnastring, baseOnly=TRUE, as.prob=TRUE)
 ![Summarized Experiment](images/SE.svg)
 * Source: https://bioconductor.org/packages/SummarizedExperiment/
 
-## (Ranged)SummarizedExperiment
+## (Ranged)SummarizedExperiment in Practice
+
+* `SummarizedExperiment` is the _de facto_ standard for "rectangular" data with metadata
+* Let's build a real `SummarizedExperiment` using the tissue gene expression data:
+
+```{r, message=FALSE, warning=FALSE}
+suppressPackageStartupMessages(library(SummarizedExperiment))
+# We can use the genomicsclass tissuesGeneExpression data
+suppressPackageStartupMessages(library(tissuesGeneExpression))
+data(tissuesGeneExpression)
+
+# Create the SummarizedExperiment object
+se <- SummarizedExperiment(
+    assays = list(counts = e),
+    colData = DataFrame(tissue = tissue),
+    rowData = DataFrame(gene_id = rownames(e))
+)
+se
+```
+
+* We can directly access the expression matrix (`assay()`), sample metadata (`colData()`), and feature metadata (`rowData()`):
+
+```{r}
+dim(assay(se))
+head(colData(se), 3)
+```
 
-* `RangedSummarizedExperiment` is the _de facto_ standard for "rectangular" data with metadata
 * Extended by `SingleCellExperiment`, `DESeqDataSet`
 * Emulated by `MultiAssayExperiment`
 

diff --git a/vignettes/day2_unsupervised.Rmd b/vignettes/day2_unsupervised.Rmd
@@ -14,7 +14,12 @@ knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>"
 )
-library(AppStatBio)
+suppressPackageStartupMessages({
+  library(AppStatBio)
+  library(SingleCellExperiment)
+  library(scater)
+  library(ggplot2)
+})
 ```
 
 
@@ -96,8 +101,14 @@ if(!require(GSE5859)){
 library(GSE5859)
 library(tissuesGeneExpression)
 data(tissuesGeneExpression)
-dim(e) ##gene expression data
-table(tissue) ##tissue[i] corresponds to e[,i]
+
+# Modern approach: store expression and metadata together
+sce <- SingleCellExperiment(
+  assays = list(logcounts = e),
+  colData = S4Vectors::DataFrame(tissue = tissue)
+)
+dim(sce) ## gene expression data
+table(sce$tissue) ## samples per tissue
 ```
 
 Interested in identifying similar *samples* and similar *genes*
@@ -321,11 +332,12 @@ plot(s$u[, 1] ~ p$rotation[, 1])
 ## Visualizing PCA loadings
 
 ```{r visloadings}
-plot(p$rotation[, 1],
-     xlab = "Index of genes",
-     ylab = "Loadings of PC1",
-     main = "PC1 loadings of each gene") #or, predict(p)
-abline(h = c(-0.03, 0.03), col = "red")
+df_loadings <- data.frame(Index = seq_len(nrow(p$rotation)), Loading = p$rotation[, 1])
+ggplot(df_loadings, aes(x = Index, y = Loading)) +
+  geom_point(alpha = 0.5, size = 1) +
+  geom_hline(yintercept = c(-0.03, 0.03), color = "red", linetype = "dashed") +
+  labs(title = "PC1 loadings of each gene", x = "Index of genes", y = "Loadings of PC1") +
+  theme_minimal()
 ```
 
 ## Genes with high PC1 loadings
@@ -347,29 +359,28 @@ pheatmap::pheatmap(
 - $\mathbf{D}$ (**eigenvalues**): standard deviation scaling factor that each decomposed variable is multiplied by.
 
 ```{r screeplot, fig.height=3, fig.width=5, echo=TRUE, fig.align='center'}
-rafalib::mypar()
-plot(
-    p$sdev ^ 2 / sum(p$sdev ^ 2) * 100,
-    xlim = c(0, 150),
-    type = "b",
-    ylab = "% variance explained",
-    main = "Screeplot"
+df_scree <- data.frame(
+  PC = seq_along(p$sdev),
+  Variance = p$sdev^2 / sum(p$sdev^2) * 100,
+  Cumulative = cumsum(p$sdev^2) / sum(p$sdev^2) * 100
 )
+
+ggplot(df_scree[1:150, ], aes(x = PC, y = Variance)) +
+  geom_line() + geom_point(size = 1) +
+  labs(title = "Screeplot", y = "% variance explained") +
+  theme_minimal()
 ```
 
 ## PCA interpretation: eigenvalues
 
 Alternatively as cumulative % variance explained (using `cumsum()` function)
 
 ```{r cumscreeplot, fig.height=4, echo=TRUE, fig.align='center'}
-rafalib::mypar()
-plot(
-    cumsum(p$sdev ^ 2) / sum(p$sdev ^ 2) * 100,
-    ylab = "cumulative % variance explained",
-    ylim = c(0, 100),
-    type = "b",
-    main = "Cumulative screeplot"
-)
+ggplot(df_scree, aes(x = PC, y = Cumulative)) +
+  geom_line() +
+  labs(title = "Cumulative screeplot", y = "Cumulative % variance explained") +
+  coord_cartesian(ylim = c(0, 100)) +
+  theme_minimal()
 ```
 
 ## PCA interpretation: scores
@@ -384,22 +395,10 @@ plot(
 ## PCA interpretation: scores
 
 ```{r scores, fig.height=5, echo=FALSE}
-rafalib::mypar()
-plot(
-    p$x[, 1:2],
-    xlab = "PC1",
-    ylab = "PC2",
-    main = "plot of p$x[, 1:2]",
-    col = factor(tissue),
-    pch = as.integer(factor(tissue))
-)
-legend(
-    "topright",
-    legend = levels(factor(tissue)),
-    col = 1:length(unique(tissue)),
-    pch = 1:length(unique(tissue)),
-    bty = 'n'
-)
+# Using scater for modern SingleCellExperiment PCA plotting
+sce <- runPCA(sce, exprs_values = "logcounts")
+plotPCA(sce, colour_by = "tissue") +
+  ggtitle("PCA plot of tissue data")
 ```
 
 ## Multi-dimensional Scaling (MDS)
@@ -414,15 +413,11 @@ mds <- cmdscale(d)
 ```
 
 ```{r plotmds, echo=FALSE}
-rafalib::mypar()
-plot(mds, col = factor(tissue), pch = as.integer(factor(tissue)))
-legend(
-    "topright",
-    legend = levels(factor(tissue)),
-    col = 1:length(unique(tissue)),
-    bty = 'n',
-    pch = 1:length(unique(tissue))
-)
+mds_df <- data.frame(Dim1 = mds[, 1], Dim2 = mds[, 2], tissue = tissue)
+ggplot(mds_df, aes(x = Dim1, y = Dim2, color = tissue)) +
+  geom_point(size = 2) +
+  labs(title = "MDS Plot of tissues", x = "Coordinate 1", y = "Coordinate 2") +
+  theme_minimal()
 ```
 
 ## t-SNE
@@ -493,22 +488,18 @@ plotReducedDim(sce.zeisel, dimred="UMAP", colour_by="level1class")
 Using default parameters and no log transformation
 
 ```{r umaptsnezeisel}
-sct <- SingleCellExperiment(e)
-sct$tissue <- tissue
-# fake the log normalization because it is undesirable for microarray data
-names(assays(sct)) <- "logcounts" 
-sct <- fixedPCA(sct, subset.row=NULL)
-sct <- runTSNE(sct, dimred="PCA")
-plotReducedDim(sct, dimred="TSNE", colour_by="tissue")
+# We already set up `sce` with `tissue` earlier
+sce <- runTSNE(sce, dimred="PCA")
+plotReducedDim(sce, dimred="TSNE", colour_by="tissue")
 ```
 
 ## UMAP of tissue microarray data
 
 Also using default parameters and no log transformation
 
 ```{r nologumap}
-sct <- runUMAP(sct, dimred="PCA")
-plotReducedDim(sct, dimred="UMAP", colour_by="tissue")
+sce <- runUMAP(sce, dimred="PCA")
+plotReducedDim(sce, dimred="UMAP", colour_by="tissue")
 ```
 
 ## Summary: distances and dimension reduction
@@ -523,6 +514,7 @@ plotReducedDim(sct, dimred="UMAP", colour_by="tissue")
 
 ## Lab exercise
 
+* **Check your BLAS**: Matrix multiplication speed relies heavily on your Basic Linear Algebra Subprograms (BLAS). In the lab, verify you are using an optimized BLAS (like OpenBLAS, Intel MKL, or Apple Accelerate). You can check this by running `sessionInfo()` in your R console and looking for the "Matrix products:" and "BLAS:" fields. If it points to `libRblas.so` or `libR.dylib` (instead of e.g. `libopenblas.so` or `vecLib`), you might be using the unoptimized default!
 * OSCA Basics: [Chapter 4 Dimensionality Reduction](http://bioconductor.org/books/release/OSCA.basic/dimensionality-reduction.html)
 * Optional if you are interested, OSCA Advanced: [Chapter 4 Dimensionality reduction, redux](http://bioconductor.org/books/release/OSCA.advanced/dimensionality-reduction-redux.html)