Correlation and Ordination

In this session, we’re going to take a look at beta diversity, and at the microViz package for working with phyloseq objects.

Have this microviz documentation open

First let’s load the libraries:

library(tidyverse)
library(here)
library(phyloseq)
library(microViz)

sample_df <- readRDS("data/instructional/sample_data_instructional.rds")
count_tab <- readRDS("data/instructional/count_table_instructional.rds")
tax_tab <- readRDS("data/instructional/tax_table_instructional.rds")

# make a data frame of only the vaginal samples
vaginal_samples_to_keep <- sample_df %>% 
  filter(sample_type == "vaginal")

asv_df <- tax_tab %>%
  data.frame() %>% 
  rownames_to_column(var = "asv") %>% 
  mutate(asv_len = nchar(asv))

keepers <- asv_df %>% 
  filter(asv_len > 385 & asv_len < 460) %>% 
  # Careful below! Don't drop NA unless you intend to
  filter(Order != "Chloroplast" | is.na(Order)) %>% 
  filter(Family != "Mitochondria" | is.na(Family)) %>%
  pull(asv) %>% # pull() goes from column to vector
  unique()

# filter the count table to only include vaginal samples
# and to only include ASVs who are non-zero in at least one sample

filtered_counts <- count_tab %>%
  # convert to data frame
  as.data.frame() %>%
  # change rownames to a column (https://tibble.tidyverse.org/reference/rownames.html)
  rownames_to_column("amplicon_sample_id") %>%  
  # make the table "longer" to be in tidy format
  # (https://r4ds.hadley.nz/data-tidy.html#sec-tidy-data)
  pivot_longer(-amplicon_sample_id,names_to="asv", values_to = "count") %>%
  # semi_join() returns all rows from x with a match in y.
  semi_join(vaginal_samples_to_keep) %>%
  # get rid of asvs that are zero in all samples
  group_by(asv) %>%
  filter(sum(count > 0) > 0) %>%
  ungroup() %>%
  #get rid of ASVs that aren't in the keepers vector
  filter(asv %in% keepers)

# we need to convert the count table back into a matrix
# to make a phyloseq object
filtered_counts_matrix <- filtered_counts %>%
  pivot_wider(names_from = asv, values_from = count) %>%
  column_to_rownames("amplicon_sample_id") %>%
  as.matrix()


# lastly, remove asv tax assignments if they are
# not present in any of the vaginal samples
# also remove Label because microviz doesn't like it
filtered_taxonomy <- tax_tab %>%
  as.data.frame() %>%
  rownames_to_column("asv") %>%
  as_tibble() %>%
  semi_join(filtered_counts) %>%
  select(-Label) %>%
  column_to_rownames("asv") %>%
  as.matrix()

ps <- phyloseq(sample_data(sample_df %>% 
                             filter(sample_type == "vaginal") %>%
                             column_to_rownames(var = "amplicon_sample_id")),
               otu_table(filtered_counts_matrix, taxa_are_rows = FALSE),
               tax_table(filtered_taxonomy)) %>%
               prune_samples(sample_sums(.) > 100, .) %>%
               tax_fix() %>% # this propogates higher taxonomy when lower is NA
               tax_mutate(genus_species = str_c(Genus, Species, sep=" ")) %>%
               tax_rename(rank = "genus_species")

ps_manual_taxonomy <- ps %>%
  tax_fix() %>%
  tax_mutate(Species = case_when(
    Species ==  "acidophilus/casei/crispatus/gallinarum" ~ "crispatus",
    Species == "crispatus/gasseri/helveticus/johnsonii/kefiranofaciens" ~ "crispatus",
    Species == "animalis/apodemi/crispatus/murinus" ~ "crispatus",
    .default = Species)) %>%
    #also remake genus_species to fix those taxa
    tax_mutate(genus_species = str_c(Genus, Species, sep = " ")) %>%
    tax_rename(rank = "genus_species")

Why manual taxonomy? Well, if we look at the original dataframe we can see these taxa where our taxonomic database SILVA didn’t know which species it was exactly, but instead gave a list of possible species. However, we know these are vaginal samples, and that 99% of the time, these ASVs correspond to L. crispatus because the other possible species are very rare in vaginal samples.

ps %>%
  tax_fix() %>%
  tax_agg("genus_species") %>%
  ps_seriate() %>% # this changes the order of the samples to be sorted by similarity
  comp_barplot(tax_level = "genus_species", sample_order = "asis", n_taxa = 10) +
  facet_wrap(vars(arm, time_point), scales="free_x")

Ok, now we’ll switch to live-coding for the rest of the session.