Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ jobs:
steps:
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-pandoc@v3
- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v3
- uses: r-lib/actions/setup-r@v2
with:
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v3
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::pkgdown, local::.
needs: website
Expand Down
146 changes: 92 additions & 54 deletions vignettes/day1_intro.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ knitr::opts_chunk$set(
comment = "#>"
)
suppressPackageStartupMessages(library(AppStatBio))
suppressPackageStartupMessages(library(ggplot2))
```

# Front matter
Expand Down Expand Up @@ -71,47 +72,45 @@ My research interests:

Normally distributed random variable with mean $\mu = 0$ / standard deviation $\sigma = 1$, and a sample of $n=100$

```{r, echo=FALSE}
```{r, echo=FALSE, message=FALSE}
set.seed(1)
x = rnorm(100)
res = hist(x, main = "Standard Normal Distribution\n mean 0, std. dev. 1", prob =
TRUE)
xdens = seq(min(res$breaks), max(res$breaks), by = 0.01)
lines(xdens, dnorm(xdens))
x <- rnorm(100)
df <- data.frame(x = x)
ggplot(df, aes(x = x)) +
geom_histogram(aes(y = after_stat(density)), bins = 15, fill = "lightgrey", color = "black") +
stat_function(fun = dnorm, color = "blue", linewidth = 1) +
labs(title = "Standard Normal Distribution\n mean 0, std. dev. 1", x = "Value", y = "Density") +
theme_minimal()
```

## Random Variables - examples

Poisson distributed random variable ($\lambda = 2$), and a sample of $n=100$.

```{r, echo=FALSE}
x = rpois(100, lambda = 2)
res = hist(
x,
main = "Poisson Distribution",
prob = FALSE,
col = "lightgrey",
breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5)
)
xdens = seq(min(x), max(x), by = 1)
lines(xdens, length(x) * dpois(xdens, lambda = 2), lw = 2)
x <- rpois(100, lambda = 2)
df <- data.frame(x = x)
ggplot(df, aes(x = x)) +
geom_bar(fill = "lightgrey", color = "black") +
geom_point(data = data.frame(x = 0:max(x), y = 100 * dpois(0:max(x), lambda = 2)), aes(x = x, y = y), color = "blue", size = 2) +
geom_line(data = data.frame(x = 0:max(x), y = 100 * dpois(0:max(x), lambda = 2)), aes(x = x, y = y), color = "blue", linewidth = 1) +
labs(title = "Poisson Distribution", x = "Value", y = "Frequency") +
theme_minimal()
```

## Random Variables - examples

Negative Binomially distributed random variable ($size=30, \mu=2$), and a sample of $n=100$.

```{r, echo=FALSE}
x = rnbinom(100, size = 30, mu = 2)
res = hist(
x,
main = "Negative Binomial Distribution",
prob = FALSE,
col = "lightgrey",
breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5)
)
xdens = seq(min(x), max(x), by = 1)
lines(xdens, length(x) * dnbinom(xdens, size = 30, mu = 2), lw = 2)
x <- rnbinom(100, size = 30, mu = 2)
df <- data.frame(x = x)
ggplot(df, aes(x = x)) +
geom_bar(fill = "lightgrey", color = "black") +
geom_point(data = data.frame(x = 0:max(x), y = 100 * dnbinom(0:max(x), size = 30, mu = 2)), aes(x = x, y = y), color = "blue", size = 2) +
geom_line(data = data.frame(x = 0:max(x), y = 100 * dnbinom(0:max(x), size = 30, mu = 2)), aes(x = x, y = y), color = "blue", linewidth = 1) +
labs(title = "Negative Binomial Distribution", x = "Value", y = "Frequency") +
theme_minimal()
```

## Random Variables - examples
Expand All @@ -120,16 +119,14 @@ lines(xdens, length(x) * dnbinom(xdens, size = 30, mu = 2), lw = 2)
- use for binary outcomes

```{r, echo=FALSE}
x = rbinom(100, size = 20, prob = 0.25)
res = hist(
x,
main = "Binomial Distribution",
prob = FALSE,
col = "lightgrey",
breaks = seq(-0.5, round(max(x)) + 0.5, by = 0.5)
)
xdens = seq(min(x), max(x), by = 1)
lines(xdens, length(x) * dbinom(xdens, size = 20, prob = 0.25), lw = 2)
x <- rbinom(100, size = 20, prob = 0.25)
df <- data.frame(x = x)
ggplot(df, aes(x = x)) +
geom_bar(fill = "lightgrey", color = "black") +
geom_point(data = data.frame(x = 0:max(x), y = 100 * dbinom(0:max(x), size = 20, prob = 0.25)), aes(x = x, y = y), color = "blue", size = 2) +
geom_line(data = data.frame(x = 0:max(x), y = 100 * dbinom(0:max(x), size = 20, prob = 0.25)), aes(x = x, y = y), color = "blue", linewidth = 1) +
labs(title = "Binomial Distribution", x = "Value", y = "Frequency") +
theme_minimal()
```

# Hypothesis Testing
Expand Down Expand Up @@ -300,22 +297,21 @@ reps <- 100
N <- 30
alpha <- 0.05
my.conf.ints <- replicate(reps, t.test(rnorm(N), conf.level = 1-alpha)$conf.int)
plot(x = rep(range(my.conf.ints), reps / 2),
y = 1:reps,
main = paste(reps, "samples of size N=", N),
type = "n")
abline(v = 0, lw = 2, lty = 3)
for (i in seq(1, reps)) {
acceptH0 <- my.conf.ints[1, i] < 0 & my.conf.ints[2, i] > 0
segments(
x0 = my.conf.ints[1, i],
y0 = i,
x1 = my.conf.ints[2, i],
y1 = i,
lw = ifelse(acceptH0, 1, 2),
col = ifelse(acceptH0, "black", "red")
)
}
df <- data.frame(
sample = 1:reps,
lower = my.conf.ints[1, ],
upper = my.conf.ints[2, ]
)
df$contains_zero <- df$lower < 0 & df$upper > 0

ggplot(df, aes(y = sample)) +
geom_segment(aes(x = lower, xend = upper, yend = sample, color = contains_zero, linewidth = contains_zero)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
scale_color_manual(values = c("FALSE" = "red", "TRUE" = "black")) +
scale_linewidth_manual(values = c("FALSE" = 1, "TRUE" = 0.5)) +
labs(title = paste(reps, "samples of size N=", N), x = "Confidence Interval", y = "Sample") +
theme_minimal() +
theme(legend.position = "none")
```

## Intervals for any confidence level
Expand Down Expand Up @@ -364,6 +360,24 @@ Which of the following are true?
- variable under repeated sampling, and
- sensitive to test assumptions

## Distribution of p-values under the null hypothesis

When the null hypothesis is true, p-values are uniformly distributed between 0 and 1. This means that even if there is no true effect, we will still observe p-values < 0.05 in 5% of our tests purely by chance.

```{r, echo=FALSE, fig.height=4, fig.width=6}
set.seed(123)
n_tests <- 1000
# Simulate 1000 t-tests where there is no difference (both groups from standard normal)
p_vals <- replicate(n_tests, t.test(rnorm(10), rnorm(10))$p.value)

ggplot(data.frame(p = p_vals), aes(x = p)) +
geom_histogram(breaks = seq(0, 1, by = 0.05), fill = "lightgrey", color = "black") +
geom_hline(yintercept = n_tests * 0.05, linetype = "dashed", color = "red") +
labs(title = "Histogram of p-values when Null Hypothesis is True",
x = "p-value", y = "Frequency") +
theme_minimal()
```

## Use and mis-use of the p-value (cont'd)

* If we fail to reject $H_0$, is the probability that $H_0$ is true equal to ($1-\alpha$)? (Hint: NO NO NO!)
Expand Down Expand Up @@ -562,9 +576,33 @@ alphabetFrequency(dnastring, baseOnly=TRUE, as.prob=TRUE)
![Summarized Experiment](images/SE.svg)
* Source: https://bioconductor.org/packages/SummarizedExperiment/

## (Ranged)SummarizedExperiment
## (Ranged)SummarizedExperiment in Practice

* `SummarizedExperiment` is the _de facto_ standard for "rectangular" data with metadata
* Let's build a real `SummarizedExperiment` using the tissue gene expression data:

```{r, message=FALSE, warning=FALSE}
suppressPackageStartupMessages(library(SummarizedExperiment))
# We can use the genomicsclass tissuesGeneExpression data
suppressPackageStartupMessages(library(tissuesGeneExpression))
data(tissuesGeneExpression)

# Create the SummarizedExperiment object
se <- SummarizedExperiment(
assays = list(counts = e),
colData = DataFrame(tissue = tissue),
rowData = DataFrame(gene_id = rownames(e))
)
se
```

* We can directly access the expression matrix (`assay()`), sample metadata (`colData()`), and feature metadata (`rowData()`):

```{r}
dim(assay(se))
head(colData(se), 3)
```

* `RangedSummarizedExperiment` is the _de facto_ standard for "rectangular" data with metadata
* Extended by `SingleCellExperiment`, `DESeqDataSet`
* Emulated by `MultiAssayExperiment`

Expand Down
106 changes: 49 additions & 57 deletions vignettes/day2_unsupervised.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
library(AppStatBio)
suppressPackageStartupMessages({
library(AppStatBio)
library(SingleCellExperiment)
library(scater)
library(ggplot2)
})
```


Expand Down Expand Up @@ -96,8 +101,14 @@ if(!require(GSE5859)){
library(GSE5859)
library(tissuesGeneExpression)
data(tissuesGeneExpression)
dim(e) ##gene expression data
table(tissue) ##tissue[i] corresponds to e[,i]

# Modern approach: store expression and metadata together
sce <- SingleCellExperiment(
assays = list(logcounts = e),
colData = S4Vectors::DataFrame(tissue = tissue)
)
Comment thread
lwaldron marked this conversation as resolved.
dim(sce) ## gene expression data
table(sce$tissue) ## samples per tissue
```

Interested in identifying similar *samples* and similar *genes*
Expand Down Expand Up @@ -321,11 +332,12 @@ plot(s$u[, 1] ~ p$rotation[, 1])
## Visualizing PCA loadings

```{r visloadings}
plot(p$rotation[, 1],
xlab = "Index of genes",
ylab = "Loadings of PC1",
main = "PC1 loadings of each gene") #or, predict(p)
abline(h = c(-0.03, 0.03), col = "red")
df_loadings <- data.frame(Index = seq_len(nrow(p$rotation)), Loading = p$rotation[, 1])
ggplot(df_loadings, aes(x = Index, y = Loading)) +
geom_point(alpha = 0.5, size = 1) +
geom_hline(yintercept = c(-0.03, 0.03), color = "red", linetype = "dashed") +
labs(title = "PC1 loadings of each gene", x = "Index of genes", y = "Loadings of PC1") +
theme_minimal()
```

## Genes with high PC1 loadings
Expand All @@ -347,29 +359,28 @@ pheatmap::pheatmap(
- $\mathbf{D}$ (**eigenvalues**): standard deviation scaling factor that each decomposed variable is multiplied by.

```{r screeplot, fig.height=3, fig.width=5, echo=TRUE, fig.align='center'}
rafalib::mypar()
plot(
p$sdev ^ 2 / sum(p$sdev ^ 2) * 100,
xlim = c(0, 150),
type = "b",
ylab = "% variance explained",
main = "Screeplot"
df_scree <- data.frame(
PC = seq_along(p$sdev),
Variance = p$sdev^2 / sum(p$sdev^2) * 100,
Cumulative = cumsum(p$sdev^2) / sum(p$sdev^2) * 100
)

ggplot(df_scree[1:150, ], aes(x = PC, y = Variance)) +
geom_line() + geom_point(size = 1) +
labs(title = "Screeplot", y = "% variance explained") +
theme_minimal()
```

## PCA interpretation: eigenvalues

Alternatively as cumulative % variance explained (using `cumsum()` function)

```{r cumscreeplot, fig.height=4, echo=TRUE, fig.align='center'}
rafalib::mypar()
plot(
cumsum(p$sdev ^ 2) / sum(p$sdev ^ 2) * 100,
ylab = "cumulative % variance explained",
ylim = c(0, 100),
type = "b",
main = "Cumulative screeplot"
)
ggplot(df_scree, aes(x = PC, y = Cumulative)) +
geom_line() +
labs(title = "Cumulative screeplot", y = "Cumulative % variance explained") +
coord_cartesian(ylim = c(0, 100)) +
theme_minimal()
```

## PCA interpretation: scores
Expand All @@ -384,22 +395,10 @@ plot(
## PCA interpretation: scores

```{r scores, fig.height=5, echo=FALSE}
rafalib::mypar()
plot(
p$x[, 1:2],
xlab = "PC1",
ylab = "PC2",
main = "plot of p$x[, 1:2]",
col = factor(tissue),
pch = as.integer(factor(tissue))
)
legend(
"topright",
legend = levels(factor(tissue)),
col = 1:length(unique(tissue)),
pch = 1:length(unique(tissue)),
bty = 'n'
)
# Using scater for modern SingleCellExperiment PCA plotting
sce <- runPCA(sce, exprs_values = "logcounts")
plotPCA(sce, colour_by = "tissue") +
ggtitle("PCA plot of tissue data")
```

## Multi-dimensional Scaling (MDS)
Expand All @@ -414,15 +413,11 @@ mds <- cmdscale(d)
```

```{r plotmds, echo=FALSE}
rafalib::mypar()
plot(mds, col = factor(tissue), pch = as.integer(factor(tissue)))
legend(
"topright",
legend = levels(factor(tissue)),
col = 1:length(unique(tissue)),
bty = 'n',
pch = 1:length(unique(tissue))
)
mds_df <- data.frame(Dim1 = mds[, 1], Dim2 = mds[, 2], tissue = tissue)
ggplot(mds_df, aes(x = Dim1, y = Dim2, color = tissue)) +
geom_point(size = 2) +
labs(title = "MDS Plot of tissues", x = "Coordinate 1", y = "Coordinate 2") +
theme_minimal()
```

## t-SNE
Expand Down Expand Up @@ -493,22 +488,18 @@ plotReducedDim(sce.zeisel, dimred="UMAP", colour_by="level1class")
Using default parameters and no log transformation

```{r umaptsnezeisel}
sct <- SingleCellExperiment(e)
sct$tissue <- tissue
# fake the log normalization because it is undesirable for microarray data
names(assays(sct)) <- "logcounts"
sct <- fixedPCA(sct, subset.row=NULL)
sct <- runTSNE(sct, dimred="PCA")
plotReducedDim(sct, dimred="TSNE", colour_by="tissue")
# We already set up `sce` with `tissue` earlier
sce <- runTSNE(sce, dimred="PCA")
plotReducedDim(sce, dimred="TSNE", colour_by="tissue")
```

## UMAP of tissue microarray data

Also using default parameters and no log transformation

```{r nologumap}
sct <- runUMAP(sct, dimred="PCA")
plotReducedDim(sct, dimred="UMAP", colour_by="tissue")
sce <- runUMAP(sce, dimred="PCA")
plotReducedDim(sce, dimred="UMAP", colour_by="tissue")
```

## Summary: distances and dimension reduction
Expand All @@ -523,6 +514,7 @@ plotReducedDim(sct, dimred="UMAP", colour_by="tissue")

## Lab exercise

* **Check your BLAS**: Matrix multiplication speed relies heavily on your Basic Linear Algebra Subprograms (BLAS). In the lab, verify you are using an optimized BLAS (like OpenBLAS, Intel MKL, or Apple Accelerate). You can check this by running `sessionInfo()` in your R console and looking for the "Matrix products:" and "BLAS:" fields. If it points to `libRblas.so` or `libR.dylib` (instead of e.g. `libopenblas.so` or `vecLib`), you might be using the unoptimized default!
* OSCA Basics: [Chapter 4 Dimensionality Reduction](http://bioconductor.org/books/release/OSCA.basic/dimensionality-reduction.html)
* Optional if you are interested, OSCA Advanced: [Chapter 4 Dimensionality reduction, redux](http://bioconductor.org/books/release/OSCA.advanced/dimensionality-reduction-redux.html)

Expand Down
Loading
Loading