diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
index 4089fd1..f6eac73 100644
--- a/.github/workflows/pkgdown.yaml
+++ b/.github/workflows/pkgdown.yaml
@@ -25,13 +25,13 @@ jobs:
steps:
- uses: actions/checkout@v4
- - uses: r-lib/actions/setup-pandoc@v3
+ - uses: r-lib/actions/setup-pandoc@v2
- - uses: r-lib/actions/setup-r@v3
+ - uses: r-lib/actions/setup-r@v2
with:
use-public-rspm: true
- - uses: r-lib/actions/setup-r-dependencies@v3
+ - uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::pkgdown, local::.
needs: website
diff --git a/vignettes/day1_intro.Rmd b/vignettes/day1_intro.Rmd
index 0a78b2d..411ec41 100644
--- a/vignettes/day1_intro.Rmd
+++ b/vignettes/day1_intro.Rmd
@@ -13,6 +13,7 @@ knitr::opts_chunk$set(
comment = "#>"
)
suppressPackageStartupMessages(library(AppStatBio))
+suppressPackageStartupMessages(library(ggplot2))
```
# Front matter
@@ -71,13 +72,15 @@ My research interests:
Normally distributed random variable with mean $\mu = 0$ / standard deviation $\sigma = 1$, and a sample of $n=100$
-```{r, echo=FALSE}
+```{r, echo=FALSE, message=FALSE}
set.seed(1)
-x = rnorm(100)
-res = hist(x, main = "Standard Normal Distribution\n mean 0, std. dev. 1", prob =
- TRUE)
-xdens = seq(min(res$breaks), max(res$breaks), by = 0.01)
-lines(xdens, dnorm(xdens))
+x <- rnorm(100)
+df <- data.frame(x = x)
+ggplot(df, aes(x = x)) +
+ geom_histogram(aes(y = after_stat(density)), bins = 15, fill = "lightgrey", color = "black") +
+ stat_function(fun = dnorm, color = "blue", linewidth = 1) +
+ labs(title = "Standard Normal Distribution\n mean 0, std. dev. 1", x = "Value", y = "Density") +
+ theme_minimal()
```
## Random Variables - examples
@@ -85,16 +88,14 @@ lines(xdens, dnorm(xdens))
Poisson distributed random variable ($\lambda = 2$), and a sample of $n=100$.
```{r, echo=FALSE}
-x = rpois(100, lambda = 2)
-res = hist(
- x,
- main = "Poisson Distribution",
- prob = FALSE,
- col = "lightgrey",
- breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5)
-)
-xdens = seq(min(x), max(x), by = 1)
-lines(xdens, length(x) * dpois(xdens, lambda = 2), lw = 2)
+x <- rpois(100, lambda = 2)
+df <- data.frame(x = x)
+ggplot(df, aes(x = x)) +
+ geom_bar(fill = "lightgrey", color = "black") +
+ geom_point(data = data.frame(x = 0:max(x), y = 100 * dpois(0:max(x), lambda = 2)), aes(x = x, y = y), color = "blue", size = 2) +
+ geom_line(data = data.frame(x = 0:max(x), y = 100 * dpois(0:max(x), lambda = 2)), aes(x = x, y = y), color = "blue", linewidth = 1) +
+ labs(title = "Poisson Distribution", x = "Value", y = "Frequency") +
+ theme_minimal()
```
## Random Variables - examples
@@ -102,16 +103,14 @@ lines(xdens, length(x) * dpois(xdens, lambda = 2), lw = 2)
Negative Binomially distributed random variable ($size=30, \mu=2$), and a sample of $n=100$.
```{r, echo=FALSE}
-x = rnbinom(100, size = 30, mu = 2)
-res = hist(
- x,
- main = "Negative Binomial Distribution",
- prob = FALSE,
- col = "lightgrey",
- breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5)
-)
-xdens = seq(min(x), max(x), by = 1)
-lines(xdens, length(x) * dnbinom(xdens, size = 30, mu = 2), lw = 2)
+x <- rnbinom(100, size = 30, mu = 2)
+df <- data.frame(x = x)
+ggplot(df, aes(x = x)) +
+ geom_bar(fill = "lightgrey", color = "black") +
+ geom_point(data = data.frame(x = 0:max(x), y = 100 * dnbinom(0:max(x), size = 30, mu = 2)), aes(x = x, y = y), color = "blue", size = 2) +
+ geom_line(data = data.frame(x = 0:max(x), y = 100 * dnbinom(0:max(x), size = 30, mu = 2)), aes(x = x, y = y), color = "blue", linewidth = 1) +
+ labs(title = "Negative Binomial Distribution", x = "Value", y = "Frequency") +
+ theme_minimal()
```
## Random Variables - examples
@@ -120,16 +119,14 @@ lines(xdens, length(x) * dnbinom(xdens, size = 30, mu = 2), lw = 2)
- use for binary outcomes
```{r, echo=FALSE}
-x = rbinom(100, size = 20, prob = 0.25)
-res = hist(
- x,
- main = "Binomial Distribution",
- prob = FALSE,
- col = "lightgrey",
- breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5)
-)
-xdens = seq(min(x), max(x), by = 1)
-lines(xdens, length(x) * dbinom(xdens, size = 20, prob = 0.25), lw = 2)
+x <- rbinom(100, size = 20, prob = 0.25)
+df <- data.frame(x = x)
+ggplot(df, aes(x = x)) +
+ geom_bar(fill = "lightgrey", color = "black") +
+ geom_point(data = data.frame(x = 0:max(x), y = 100 * dbinom(0:max(x), size = 20, prob = 0.25)), aes(x = x, y = y), color = "blue", size = 2) +
+ geom_line(data = data.frame(x = 0:max(x), y = 100 * dbinom(0:max(x), size = 20, prob = 0.25)), aes(x = x, y = y), color = "blue", linewidth = 1) +
+ labs(title = "Binomial Distribution", x = "Value", y = "Frequency") +
+ theme_minimal()
```
# Hypothesis Testing
@@ -300,22 +297,21 @@ reps <- 100
N <- 30
alpha <- 0.05
my.conf.ints <- replicate(reps, t.test(rnorm(N), conf.level = 1-alpha)$conf.int)
-plot(x = rep(range(my.conf.ints), reps / 2),
- y = 1:reps,
- main = paste(reps, "samples of size N=", N),
- type = "n")
-abline(v = 0, lw = 2, lty = 3)
-for (i in seq(1, reps)) {
- acceptH0 <- my.conf.ints[1, i] < 0 & my.conf.ints[2, i] > 0
- segments(
- x0 = my.conf.ints[1, i],
- y0 = i,
- x1 = my.conf.ints[2, i],
- y1 = i,
- lw = ifelse(acceptH0, 1, 2),
- col = ifelse(acceptH0, "black", "red")
- )
-}
+df <- data.frame(
+ sample = 1:reps,
+ lower = my.conf.ints[1, ],
+ upper = my.conf.ints[2, ]
+)
+df$contains_zero <- df$lower < 0 & df$upper > 0
+
+ggplot(df, aes(y = sample)) +
+ geom_segment(aes(x = lower, xend = upper, yend = sample, color = contains_zero, linewidth = contains_zero)) +
+ geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
+ scale_color_manual(values = c("FALSE" = "red", "TRUE" = "black")) +
+ scale_linewidth_manual(values = c("FALSE" = 1, "TRUE" = 0.5)) +
+ labs(title = paste(reps, "samples of size N=", N), x = "Confidence Interval", y = "Sample") +
+ theme_minimal() +
+ theme(legend.position = "none")
```
## Intervals for any confidence level
@@ -364,6 +360,24 @@ Which of the following are true?
- variable under repeated sampling, and
- sensitive to test assumptions
+## Distribution of p-values under the null hypothesis
+
+When the null hypothesis is true, p-values are uniformly distributed between 0 and 1. This means that even if there is no true effect, we will still observe p-values < 0.05 in 5% of our tests purely by chance.
+
+```{r, echo=FALSE, fig.height=4, fig.width=6}
+set.seed(123)
+n_tests <- 1000
+# Simulate 1000 t-tests where there is no difference (both groups from standard normal)
+p_vals <- replicate(n_tests, t.test(rnorm(10), rnorm(10))$p.value)
+
+ggplot(data.frame(p = p_vals), aes(x = p)) +
+ geom_histogram(breaks = seq(0, 1, by = 0.05), fill = "lightgrey", color = "black") +
+ geom_hline(yintercept = n_tests * 0.05, linetype = "dashed", color = "red") +
+ labs(title = "Histogram of p-values when Null Hypothesis is True",
+ x = "p-value", y = "Frequency") +
+ theme_minimal()
+```
+
## Use and mis-use of the p-value (cont'd)
* If we fail to reject $H_0$, is the probability that $H_0$ is true equal to ($1-\alpha$)? (Hint: NO NO NO!)
@@ -562,9 +576,33 @@ alphabetFrequency(dnastring, baseOnly=TRUE, as.prob=TRUE)

* Source: https://bioconductor.org/packages/SummarizedExperiment/
-## (Ranged)SummarizedExperiment
+## (Ranged)SummarizedExperiment in Practice
+
+* `SummarizedExperiment` is the _de facto_ standard for "rectangular" data with metadata
+* Let's build a real `SummarizedExperiment` using the tissue gene expression data:
+
+```{r, message=FALSE, warning=FALSE}
+suppressPackageStartupMessages(library(SummarizedExperiment))
+# We can use the genomicsclass tissuesGeneExpression data
+suppressPackageStartupMessages(library(tissuesGeneExpression))
+data(tissuesGeneExpression)
+
+# Create the SummarizedExperiment object
+se <- SummarizedExperiment(
+ assays = list(counts = e),
+ colData = DataFrame(tissue = tissue),
+ rowData = DataFrame(gene_id = rownames(e))
+)
+se
+```
+
+* We can directly access the expression matrix (`assay()`), sample metadata (`colData()`), and feature metadata (`rowData()`):
+
+```{r}
+dim(assay(se))
+head(colData(se), 3)
+```
-* `RangedSummarizedExperiment` is the _de facto_ standard for "rectangular" data with metadata
* Extended by `SingleCellExperiment`, `DESeqDataSet`
* Emulated by `MultiAssayExperiment`
diff --git a/vignettes/day2_unsupervised.Rmd b/vignettes/day2_unsupervised.Rmd
index 02ec439..5224ae4 100644
--- a/vignettes/day2_unsupervised.Rmd
+++ b/vignettes/day2_unsupervised.Rmd
@@ -14,7 +14,12 @@ knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
-library(AppStatBio)
+suppressPackageStartupMessages({
+ library(AppStatBio)
+ library(SingleCellExperiment)
+ library(scater)
+ library(ggplot2)
+})
```
@@ -96,8 +101,14 @@ if(!require(GSE5859)){
library(GSE5859)
library(tissuesGeneExpression)
data(tissuesGeneExpression)
-dim(e) ##gene expression data
-table(tissue) ##tissue[i] corresponds to e[,i]
+
+# Modern approach: store expression and metadata together
+sce <- SingleCellExperiment(
+ assays = list(logcounts = e),
+ colData = S4Vectors::DataFrame(tissue = tissue)
+)
+dim(sce) ## gene expression data
+table(sce$tissue) ## samples per tissue
```
Interested in identifying similar *samples* and similar *genes*
@@ -321,11 +332,12 @@ plot(s$u[, 1] ~ p$rotation[, 1])
## Visualizing PCA loadings
```{r visloadings}
-plot(p$rotation[, 1],
- xlab = "Index of genes",
- ylab = "Loadings of PC1",
- main = "PC1 loadings of each gene") #or, predict(p)
-abline(h = c(-0.03, 0.03), col = "red")
+df_loadings <- data.frame(Index = seq_len(nrow(p$rotation)), Loading = p$rotation[, 1])
+ggplot(df_loadings, aes(x = Index, y = Loading)) +
+ geom_point(alpha = 0.5, size = 1) +
+ geom_hline(yintercept = c(-0.03, 0.03), color = "red", linetype = "dashed") +
+ labs(title = "PC1 loadings of each gene", x = "Index of genes", y = "Loadings of PC1") +
+ theme_minimal()
```
## Genes with high PC1 loadings
@@ -347,14 +359,16 @@ pheatmap::pheatmap(
- $\mathbf{D}$ (**eigenvalues**): standard deviation scaling factor that each decomposed variable is multiplied by.
```{r screeplot, fig.height=3, fig.width=5, echo=TRUE, fig.align='center'}
-rafalib::mypar()
-plot(
- p$sdev ^ 2 / sum(p$sdev ^ 2) * 100,
- xlim = c(0, 150),
- type = "b",
- ylab = "% variance explained",
- main = "Screeplot"
+df_scree <- data.frame(
+ PC = seq_along(p$sdev),
+ Variance = p$sdev^2 / sum(p$sdev^2) * 100,
+ Cumulative = cumsum(p$sdev^2) / sum(p$sdev^2) * 100
)
+
+ggplot(df_scree[1:150, ], aes(x = PC, y = Variance)) +
+ geom_line() + geom_point(size = 1) +
+ labs(title = "Screeplot", y = "% variance explained") +
+ theme_minimal()
```
## PCA interpretation: eigenvalues
@@ -362,14 +376,11 @@ plot(
Alternatively as cumulative % variance explained (using `cumsum()` function)
```{r cumscreeplot, fig.height=4, echo=TRUE, fig.align='center'}
-rafalib::mypar()
-plot(
- cumsum(p$sdev ^ 2) / sum(p$sdev ^ 2) * 100,
- ylab = "cumulative % variance explained",
- ylim = c(0, 100),
- type = "b",
- main = "Cumulative screeplot"
-)
+ggplot(df_scree, aes(x = PC, y = Cumulative)) +
+ geom_line() +
+ labs(title = "Cumulative screeplot", y = "Cumulative % variance explained") +
+ coord_cartesian(ylim = c(0, 100)) +
+ theme_minimal()
```
## PCA interpretation: scores
@@ -384,22 +395,10 @@ plot(
## PCA interpretation: scores
```{r scores, fig.height=5, echo=FALSE}
-rafalib::mypar()
-plot(
- p$x[, 1:2],
- xlab = "PC1",
- ylab = "PC2",
- main = "plot of p$x[, 1:2]",
- col = factor(tissue),
- pch = as.integer(factor(tissue))
-)
-legend(
- "topright",
- legend = levels(factor(tissue)),
- col = 1:length(unique(tissue)),
- pch = 1:length(unique(tissue)),
- bty = 'n'
-)
+# Using scater for modern SingleCellExperiment PCA plotting
+sce <- runPCA(sce, exprs_values = "logcounts")
+plotPCA(sce, colour_by = "tissue") +
+ ggtitle("PCA plot of tissue data")
```
## Multi-dimensional Scaling (MDS)
@@ -414,15 +413,11 @@ mds <- cmdscale(d)
```
```{r plotmds, echo=FALSE}
-rafalib::mypar()
-plot(mds, col = factor(tissue), pch = as.integer(factor(tissue)))
-legend(
- "topright",
- legend = levels(factor(tissue)),
- col = 1:length(unique(tissue)),
- bty = 'n',
- pch = 1:length(unique(tissue))
-)
+mds_df <- data.frame(Dim1 = mds[, 1], Dim2 = mds[, 2], tissue = tissue)
+ggplot(mds_df, aes(x = Dim1, y = Dim2, color = tissue)) +
+ geom_point(size = 2) +
+ labs(title = "MDS Plot of tissues", x = "Coordinate 1", y = "Coordinate 2") +
+ theme_minimal()
```
## t-SNE
@@ -493,13 +488,9 @@ plotReducedDim(sce.zeisel, dimred="UMAP", colour_by="level1class")
Using default parameters and no log transformation
```{r umaptsnezeisel}
-sct <- SingleCellExperiment(e)
-sct$tissue <- tissue
-# fake the log normalization because it is undesirable for microarray data
-names(assays(sct)) <- "logcounts"
-sct <- fixedPCA(sct, subset.row=NULL)
-sct <- runTSNE(sct, dimred="PCA")
-plotReducedDim(sct, dimred="TSNE", colour_by="tissue")
+# We already set up `sce` with `tissue` earlier
+sce <- runTSNE(sce, dimred="PCA")
+plotReducedDim(sce, dimred="TSNE", colour_by="tissue")
```
## UMAP of tissue microarray data
@@ -507,8 +498,8 @@ plotReducedDim(sct, dimred="TSNE", colour_by="tissue")
Also using default parameters and no log transformation
```{r nologumap}
-sct <- runUMAP(sct, dimred="PCA")
-plotReducedDim(sct, dimred="UMAP", colour_by="tissue")
+sce <- runUMAP(sce, dimred="PCA")
+plotReducedDim(sce, dimred="UMAP", colour_by="tissue")
```
## Summary: distances and dimension reduction
@@ -523,6 +514,7 @@ plotReducedDim(sct, dimred="UMAP", colour_by="tissue")
## Lab exercise
+* **Check your BLAS**: Matrix multiplication speed relies heavily on your Basic Linear Algebra Subprograms (BLAS). In the lab, verify you are using an optimized BLAS (like OpenBLAS, Intel MKL, or Apple Accelerate). You can check this by running `sessionInfo()` in your R console and looking for the "Matrix products:" and "BLAS:" fields. If it points to `libRblas.so` or `libR.dylib` (instead of e.g. `libopenblas.so` or `vecLib`), you might be using the unoptimized default!
* OSCA Basics: [Chapter 4 Dimensionality Reduction](http://bioconductor.org/books/release/OSCA.basic/dimensionality-reduction.html)
* Optional if you are interested, OSCA Advanced: [Chapter 4 Dimensionality reduction, redux](http://bioconductor.org/books/release/OSCA.advanced/dimensionality-reduction-redux.html)
diff --git a/vignettes/day3_linearmodels.Rmd b/vignettes/day3_linearmodels.Rmd
index 1bf6531..d43cbf6 100644
--- a/vignettes/day3_linearmodels.Rmd
+++ b/vignettes/day3_linearmodels.Rmd
@@ -228,21 +228,28 @@ broom::tidy(fit)
## Interpretation of spider leg type coefficients
-```{r spider_main_coef, fig.cap="Diagram of the estimated coefficients in the linear model. The green arrow indicates the Intercept term, which goes from zero to the mean of the reference group (here the 'pull' samples). The orange arrow indicates the difference between the push group and the pull group, which is negative in this example. The circles show the individual samples, jittered horizontally to avoid overplotting.",echo=FALSE}
-set.seed(1) #same jitter in stripchart
-stripchart(split(spider.sub$friction, spider.sub$type),
- vertical=TRUE, pch=1, method="jitter", las=2, xlim=c(0,3), ylim=c(0,2))
+```{r spider_main_coef, fig.cap="Diagram of the estimated coefficients in the linear model. The green dashed line indicates the Intercept term (mean of the reference group 'pull'). The orange dashed line indicates the push group mean. The solid green/orange segments explicitly show the intercept distance and the effect difference.", echo=FALSE, warning=FALSE}
+library(ggplot2)
coefs <- coef(fit)
-a <- -0.25
-lgth <- .1
-library(RColorBrewer)
-cols <- brewer.pal(3,"Dark2")
-abline(h=0)
-arrows(1+a,0,1+a,coefs[1],lwd=3,col=cols[1],length=lgth)
-abline(h=coefs[1],col=cols[1])
-arrows(2+a,coefs[1],2+a,coefs[1]+coefs[2],lwd=3,col=cols[2],length=lgth)
-abline(h=coefs[1]+coefs[2],col=cols[2])
-legend("right",names(coefs),fill=cols,cex=.75,bg="white")
+spider.sub |>
+ ggplot(aes(x = type, y = friction)) +
+ geom_jitter(width = 0.1, color = "grey50", alpha = 0.7) +
+ # Intercept line and arrow
+ geom_hline(yintercept = coefs[1], color = "#1B9E77", linetype = "dashed") +
+annotate("segment", x = 1, xend = 1, y = 0, yend = coefs[1],
+ color = "#1B9E77", arrow = grid::arrow(length = grid::unit(0.1, "inches")), linewidth = 1) +
+ # Difference arrow
+ geom_hline(yintercept = coefs[1] + coefs[2], color = "#D95F02", linetype = "dashed") +
+annotate("segment", x = 2, xend = 2, y = coefs[1], yend = coefs[1] + coefs[2],
+ color = "#D95F02", arrow = grid::arrow(length = grid::unit(0.1, "inches")), linewidth = 1) +
+ # Base line
+ geom_hline(yintercept = 0, color = "black") +
+ labs(
+ title = "Estimated coefficients in the linear model",
+ x = "Type",
+ y = "Friction"
+ ) +
+ theme_minimal()
```
## regression on spider leg **position**
@@ -253,9 +260,8 @@ Remember there are positions 1-4
fit <- lm(friction ~ leg, data=spider)
```
-```{r, results="asis", echo=TRUE, message=FALSE}
-fit.table <- xtable::xtable(fit, label=NULL)
-print(fit.table, type="html")
+```{r, echo=TRUE, message=FALSE}
+knitr::kable(broom::tidy(fit), digits = 3)
```
- Interpretation of the dummy variables legL2, legL3, legL4 ?
@@ -292,9 +298,8 @@ by:
fit <- lm(friction ~ type + leg, data=spider)
```
-```{r, results="asis", echo=TRUE, message=FALSE}
-fit.table <- xtable::xtable(fit, label=NULL)
-print(fit.table, type="html")
+```{r, echo=TRUE, message=FALSE}
+gtsummary::tbl_regression(fit)
```
- this model still doesn't represent how the friction differences
@@ -318,9 +323,8 @@ Image credit: