Pourquoi Phyloseq?

Why Phyloseq?

1. Commençons! / Let’s start!

Charger les librairies / Load packages

library(phyloseq)
library(vegan)
library(ggplot2)

R version 3.4.3
Phyloseq version 1.22.3
Linux machine image 4.4.0-64-generic

Importer les données / Data import

## Sample data
soiltab <- read.table('data/soil.csv', sep = ';', row.names = 1, header = TRUE)

# Taxonomy table
load("data/taxotab.01.05.rdata")

# ASV table
load("data/seqtab.nochim.rdata")


Construire l’objet Phyloseq / Construct the Phyloseq object

ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), 
               sample_data(soiltab), 
               tax_table(as.matrix(taxo$tax)))
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 546 taxa and 74 samples ]
## sample_data() Sample Data:       [ 74 samples by 8 sample variables ]
## tax_table()   Taxonomy Table:    [ 546 taxa by 7 taxonomic ranks ]


Réordonner les facteurs / Reorder factors

sample_data(ps)$horizon = factor(sample_data(ps)$horizon, levels = c('L', 'F', 'H', 'Ae', 'B'))
sample_data(ps)$forest = factor(sample_data(ps)$forest, levels = c('AS', 'Mixed', 'FG'))


Pour commencer directement avec l’objet Phyloseq / To start directly from Phyloseq object

#load(file = "data/ps.rdata")
#ps


2. Filtrage et transformation / Filtering and transformation

Subset of the dataset with only fungi

ps.fungi = subset_taxa(ps, Kingdom=="k__Fungi")
ntaxa(ps)
## [1] 546
ntaxa(ps.fungi)
## [1] 524

Filtered 22 non-fungal taxa

Enlever les taxons avec seulement 1 ou 2 sequences au total / Remove singletons and doubletons

ps.fungi.nosd = filter_taxa(ps.fungi, function(x) sum(x) > 2, TRUE)
ntaxa(ps.fungi.nosd)
## [1] 523


Transformation log + 1 pour normaliser les données / Shifted log transformation to normalize

ps.fungi.nosd.log <- transform_sample_counts(ps.fungi.nosd, function(x) log(x+1))


Agglomerer les taxons au niveau specifique / Agglomerate taxa at the species level

ps.fungi.nosd.log.sp = tax_glom(ps.fungi.nosd.log, "Species", NArm = FALSE) # NArm = FALSE in order to keep ESV not taxonomically assigned at the species level


Exemple de transmformation / Example of other transformation

ps.fungi.nosd.hel = transform_sample_counts(ps.fungi.nosd, function(x) sqrt(x/sum(x))) # Hellinger transformation


3. Exploration des données / Data exploration

Details de l’objet Phyloseq / Details of the object Phyloseq

ntaxa(ps.fungi.nosd.log)
## [1] 523
nsamples(ps.fungi.nosd.log)
## [1] 74
rank_names(ps.fungi.nosd.log)
## [1] "Kingdom" "Phylum"  "Class"   "Order"   "Family"  "Genus"   "Species"
sample_variables(ps.fungi.nosd.log)
## [1] "sampleID" "plot"     "block"    "forest"   "horizon"  "Carbon"  
## [7] "Nitrogen" "pH"
otu_table(ps.fungi.nosd.log)[1:2, 1:2]
## OTU Table:          [2 taxa and 2 samples]
##                      taxa are columns
##     ATGCGATAAGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACCTTGCGCCCTTTGGTATTCCGAAGGGCATGCCTGTTTGAGTGTCATTAAATTATCAACCTTGCTCGCTTTTATTAGCTTGAGTTAGGCTTGGATGTGAGGGTTTTGCTGGCTTCCTTCAGTGGATGGTCTGCTCCCTTTGAATGCATTAGCGGGATCTCTTGTGGACCGTCACTTGGTGTGATAATTATCTATGCCTTGAGACTTTGAAACAAACTTATGAGAATCTGCTTATAACCGTCCTCACGGACAACTTTTGACAATTTGACCTCAAATCAGGTAGGACTACCCGCTGAACTTAA
## S10                                                                                                                                                                                                                                                                                                                                                              0.000000
## S11                                                                                                                                                                                                                                                                                                                                                              1.098612
##     ATGCGATACGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACATTGCACTCCTTGGTATTCCGAGGAGTATGCCTGTTTCAGTATCATGAGCACTCTCACACCTAACCTTTGGGTTTATGGCGTGGAATTGGAATGCGCCGACTGTCATGGTTGGCCCTTCTAAAATGTAGTTCTTGGCTGTCACCTAATACAGCAGTTTGGCCTAATAGTTTTGGCATTCATTGTCAAATCTTTGGCTAACATTTGCTCCAGGAGTCAGTCTTGATAATACAGAAAACTCATTCAAATTTTGATCTGAAATCAGGTAGGGCTACCCGCTGAACTTAA
## S10                                                                                                                                                                                                                                                                                                                                                0.000000
## S11                                                                                                                                                                                                                                                                                                                                                2.639057
tax_table(ps.fungi.nosd.log)[1:2, 1:2]
## Taxonomy Table:     [2 taxa by 2 taxonomic ranks]:
##                                                                                                                                                                                                                                                                                                                                                                       Kingdom   
## ATGCGATAAGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACCTTGCGCCCTTTGGTATTCCGAAGGGCATGCCTGTTTGAGTGTCATTAAATTATCAACCTTGCTCGCTTTTATTAGCTTGAGTTAGGCTTGGATGTGAGGGTTTTGCTGGCTTCCTTCAGTGGATGGTCTGCTCCCTTTGAATGCATTAGCGGGATCTCTTGTGGACCGTCACTTGGTGTGATAATTATCTATGCCTTGAGACTTTGAAACAAACTTATGAGAATCTGCTTATAACCGTCCTCACGGACAACTTTTGACAATTTGACCTCAAATCAGGTAGGACTACCCGCTGAACTTAA "k__Fungi"
## ATGCGATACGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACATTGCACTCCTTGGTATTCCGAGGAGTATGCCTGTTTCAGTATCATGAGCACTCTCACACCTAACCTTTGGGTTTATGGCGTGGAATTGGAATGCGCCGACTGTCATGGTTGGCCCTTCTAAAATGTAGTTCTTGGCTGTCACCTAATACAGCAGTTTGGCCTAATAGTTTTGGCATTCATTGTCAAATCTTTGGCTAACATTTGCTCCAGGAGTCAGTCTTGATAATACAGAAAACTCATTCAAATTTTGATCTGAAATCAGGTAGGGCTACCCGCTGAACTTAA               "k__Fungi"
##                                                                                                                                                                                                                                                                                                                                                                       Phylum            
## ATGCGATAAGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACCTTGCGCCCTTTGGTATTCCGAAGGGCATGCCTGTTTGAGTGTCATTAAATTATCAACCTTGCTCGCTTTTATTAGCTTGAGTTAGGCTTGGATGTGAGGGTTTTGCTGGCTTCCTTCAGTGGATGGTCTGCTCCCTTTGAATGCATTAGCGGGATCTCTTGTGGACCGTCACTTGGTGTGATAATTATCTATGCCTTGAGACTTTGAAACAAACTTATGAGAATCTGCTTATAACCGTCCTCACGGACAACTTTTGACAATTTGACCTCAAATCAGGTAGGACTACCCGCTGAACTTAA "p__Basidiomycota"
## ATGCGATACGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACATTGCACTCCTTGGTATTCCGAGGAGTATGCCTGTTTCAGTATCATGAGCACTCTCACACCTAACCTTTGGGTTTATGGCGTGGAATTGGAATGCGCCGACTGTCATGGTTGGCCCTTCTAAAATGTAGTTCTTGGCTGTCACCTAATACAGCAGTTTGGCCTAATAGTTTTGGCATTCATTGTCAAATCTTTGGCTAACATTTGCTCCAGGAGTCAGTCTTGATAATACAGAAAACTCATTCAAATTTTGATCTGAAATCAGGTAGGGCTACCCGCTGAACTTAA               "p__Mucoromycota"


CHALLENGE

Pouvez-vous trouvez le nombre de phylum de champignons?
Can you find the number of fungal phylum?

Profondeur de sequencage après filtrage de la canalisation Dada2 / Sequencing depth per samples after filtering of Dada2 pipeline

plot_bar(ps, x = "Sample", y = "Abundance", fill = "Kingdom") +
  geom_bar(stat="identity", color = "black") +
  labs(x = "Sample ID", y = "Reads abundance")

plot_bar(ps.fungi,  x = "Sample", y = "Abundance", fill = "Kingdom") +
  geom_bar(stat="identity", color = "black") +
  labs(x = "Sample ID", y = "Reads abundance of fungi")

plot_bar(ps.fungi,  x = "Sample", y = "Abundance", fill = "Phylum") +
  geom_bar(stat="identity") +
  labs(x = "Sample ID", y = "Reads abundance of fungi")



Schéma expérimental / Sampling design

facet_wrap(~forest)

plot_bar(ps.fungi, fill = "Phylum") +
  geom_bar(stat="identity") +
  facet_wrap(~forest, ncol = 3, scales = "free_x") +
  labs(x = "Sample ID", y = "Reads abundance of fungi")



facet_wrap(horizon~forest)

plot_bar(ps.fungi, fill = "Phylum") +
  geom_bar(stat="identity") +
  facet_wrap(horizon~forest, ncol = 3, scales = "free_x") +
  labs(x = "Sample ID", y = "Reads abundance of fungi")

3. Ordination non contrainte / Unconstrained ordination

NMDS avec measure de dissimilarité de Bray-Curtis / NMDS with Bray-Curtis measure dissimilarity (aka Steinhaus)

ord.nmds.bray <- ordinate(ps.fungi.nosd.log.sp, method="NMDS", k = 2, try = 20, distance = "bray") # /!\ number of tries need to be high enough to reach convergence
## Wisconsin double standardization
## Run 0 stress 0.225241 
## Run 1 stress 0.2223462 
## ... New best solution
## ... Procrustes: rmse 0.05440405  max resid 0.306738 
## Run 2 stress 0.2203151 
## ... New best solution
## ... Procrustes: rmse 0.04550081  max resid 0.2865166 
## Run 3 stress 0.2446921 
## Run 4 stress 0.2253402 
## Run 5 stress 0.2207518 
## ... Procrustes: rmse 0.04898002  max resid 0.3214373 
## Run 6 stress 0.2325877 
## Run 7 stress 0.2307777 
## Run 8 stress 0.2425666 
## Run 9 stress 0.2187777 
## ... New best solution
## ... Procrustes: rmse 0.04569631  max resid 0.2879466 
## Run 10 stress 0.2188551 
## ... Procrustes: rmse 0.04600953  max resid 0.2881654 
## Run 11 stress 0.2272405 
## Run 12 stress 0.2310672 
## Run 13 stress 0.2221372 
## Run 14 stress 0.2296952 
## Run 15 stress 0.2181904 
## ... New best solution
## ... Procrustes: rmse 0.04294399  max resid 0.2883536 
## Run 16 stress 0.2148475 
## ... New best solution
## ... Procrustes: rmse 0.02259523  max resid 0.185004 
## Run 17 stress 0.2207538 
## Run 18 stress 0.2355219 
## Run 19 stress 0.2181883 
## Run 20 stress 0.2269349 
## *** No convergence -- monoMDS stopping criteria:
##     20: stress ratio > sratmax


Figure rapide / Quick plot

plot_ordination(ps.fungi.nosd.log.sp, ord.nmds.bray, type="samples", color = "horizon")

Figure detaillée / Full plot

plot_ordination(ps.fungi.nosd.log.sp, ord.nmds.bray, type="samples", color = "horizon") +
  geom_point(size = 6, alpha = .5, show.legend = TRUE) +
  scale_color_manual(name = "Horizon", values = c('darkgreen', 'sienna4', 'grey1', 'grey60', 'darkorange2')) +
  labs(x = "Axis 1", y = "Axis 2", caption = paste("Stress =", round(ord.nmds.bray$stress, 2))) +
  facet_wrap(~forest) +
  stat_ellipse(type = "t", level = 0.8) + # 80% confidence interval ellispes
  coord_fixed() +
  theme(text = element_text(size=20), strip.text = element_text(size=20))

NMDS avec les données binaire / NMDS with binary data

ord.nmds.bin <- ordinate(ps.fungi.nosd.log.sp, method="NMDS", k = 2, try = 20, distance = "bray", binary =TRUE)
## Wisconsin double standardization
## Run 0 stress 0.2034727 
## Run 1 stress 0.2170128 
## Run 2 stress 0.215945 
## Run 3 stress 0.2169219 
## Run 4 stress 0.2156148 
## Run 5 stress 0.2177145 
## Run 6 stress 0.2141023 
## Run 7 stress 0.2170497 
## Run 8 stress 0.2166483 
## Run 9 stress 0.2189259 
## Run 10 stress 0.2133106 
## Run 11 stress 0.2127333 
## Run 12 stress 0.2166672 
## Run 13 stress 0.2034529 
## ... New best solution
## ... Procrustes: rmse 0.01271852  max resid 0.05686887 
## Run 14 stress 0.2178162 
## Run 15 stress 0.2158355 
## Run 16 stress 0.2183089 
## Run 17 stress 0.2261992 
## Run 18 stress 0.2182643 
## Run 19 stress 0.2181284 
## Run 20 stress 0.2202713 
## *** No convergence -- monoMDS stopping criteria:
##     20: stress ratio > sratmax


Figure de la NMDS avec données binaires / NMDS plot with binary data

plot_ordination(ps.fungi.nosd.log.sp, ord.nmds.bin, type="samples", color = "horizon") +
  geom_point(size = 6, alpha = .5, show.legend = TRUE) +
  scale_color_manual(name = "Horizon", values = c('darkgreen', 'sienna4', 'grey1', 'grey60', 'darkorange2')) +
  labs(x = "Axis 1", y = "Axis 2", caption = paste("Stress =", round(ord.nmds.bin$stress, 2))) +
  facet_wrap(~forest) +
  stat_ellipse(type = "t", level = 0.8) + # 80% confidence interval ellispes
  coord_fixed() +
  theme(text = element_text(size=20), strip.text = element_text(size=20))

CHALLENGE

Pouvez-vous faire une analyse en composantes principales?
Can you run a principal component analysis?

4. Analyse de variance: les communautés sont-elles statistiquement differentes? / Variance analysis: Are communities statistically different?

metadata <- as(sample_data(ps.fungi.nosd.log.sp), "data.frame") # Export data


PERMANOVA avec la measure de Bray-Curtis / PERMANOVA with Bray-Curtis measure

adonis(phyloseq::distance(physeq = ps.fungi.nosd.log.sp, method="bray") ~ horizon * forest, strata = metadata$block,# Horizon and forest effect with interaction and block as strata
      data = metadata,
      permutations = 99999)
## 
## Call:
## adonis(formula = phyloseq::distance(physeq = ps.fungi.nosd.log.sp,      method = "bray") ~ horizon * forest, data = metadata, permutations = 99999,      strata = metadata$block) 
## 
## Blocks:  strata 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##                Df SumsOfSqs MeanSqs F.Model      R2  Pr(>F)    
## horizon         4    6.8294 1.70736  7.6311 0.29952   1e-05 ***
## forest          2    0.9236 0.46181  2.0641 0.04051 0.00216 ** 
## horizon:forest  8    1.8480 0.23099  1.0324 0.08105 0.38681    
## Residuals      59   13.2005 0.22374         0.57893            
## Total          73   22.8015                 1.00000            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Test de l’homogénéité de dispersion / Test for the homogeneity of dispersion

betadisp <- betadisper(phyloseq::distance(physeq = ps.fungi.nosd.log.sp, method="bray"), metadata$forest)
permutest(betadisp, permutations = 99999) 
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 99999
## 
## Response: Distances
##           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups     2 0.00882 0.0044116 0.9854  99999 0.3799
## Residuals 71 0.31786 0.0044769
betadisp <- betadisper(phyloseq::distance(physeq = ps.fungi.nosd.log.sp, method="bray"), metadata$horizon)
permutest(betadisp, permutations = 99999)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 99999
## 
## Response: Distances
##           Df  Sum Sq Mean Sq      F N.Perm  Pr(>F)   
## Groups     4 0.13844 0.03461 5.1657  99999 0.00123 **
## Residuals 69 0.46229 0.00670                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


PERMANOVA avec les données binaires / PERMANOVA with binary data

adonis(phyloseq::distance(ps.fungi.nosd.log.sp, method="bray", binary = TRUE) ~ horizon * forest, strata = metadata$block, # Horizon and forest effect with interaction and block as strata
       data = metadata,
       permutations = 99999)
## 
## Call:
## adonis(formula = phyloseq::distance(ps.fungi.nosd.log.sp, method = "bray",      binary = TRUE) ~ horizon * forest, data = metadata, permutations = 99999,      strata = metadata$block) 
## 
## Blocks:  strata 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##                Df SumsOfSqs MeanSqs F.Model      R2  Pr(>F)    
## horizon         4    6.8302 1.70756  8.8627 0.33451   1e-05 ***
## forest          2    0.7049 0.35247  1.8294 0.03452 0.01524 *  
## horizon:forest  8    1.5161 0.18951  0.9836 0.07425 0.53206    
## Residuals      59   11.3674 0.19267         0.55672            
## Total          73   20.4186                 1.00000            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Test de l’homogénéité de dispersion / Test for the homogeneity of dispersion

betadisp <- betadisper(phyloseq::distance(ps.fungi.nosd.log.sp, method="bray", binary = TRUE), metadata$forest)
permutest(betadisp, permutations = 99999) 
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 99999
## 
## Response: Distances
##           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups     2 0.00669 0.0033460 0.5549  99999 0.5757
## Residuals 71 0.42809 0.0060294


betadisp <- betadisper(phyloseq::distance(ps.fungi.nosd.log.sp, method="bray", binary = TRUE), metadata$horizon)
permutest(betadisp, permutations = 99999)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 99999
## 
## Response: Distances
##           Df  Sum Sq  Mean Sq      F N.Perm  Pr(>F)    
## Groups     4 0.17097 0.042743 6.5332  99999 0.00022 ***
## Residuals 69 0.45143 0.006542                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


5. Ordination sous contraites: analyse de redondance / Constrained ordination: Redundancy Analysis (RDA)

ord.rda <- ordinate(ps.fungi.nosd.log.sp, formula = ps.fungi.nosd.log.sp ~ Carbon+Nitrogen+pH, method="CAP", distance = "bray") # Uses Capscale function of Vegan package


Obtenir les statistiques de la RDA / RDA stats

RsquareAdj(ord.rda) 
## $r.squared
## [1] 0.2387887
## 
## $adj.r.squared
## [1] 0.2061653
anova.cca(ord.rda)
## Permutation test for capscale under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: capscale(formula = OTU ~ Carbon + Nitrogen + pH, data = data, distance = distance)
##          Df SumOfSqs      F Pr(>F)    
## Model     3   5.4447 7.1209  0.001 ***
## Residual 70  17.8409                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Verifications des facteurs d’inflation de la variance / Check variance inflation factors (VIF)

vif.cca(ord.rda) # VIF < 10 should be fine
##   Carbon Nitrogen       pH 
## 6.975395 7.163518 1.129545


CHALLENGE

Que feriez-vous pour exclure les données d’azote de la RDA?
What if you want to exclude Nitrogen data of the RDA?

Figure rapide / Quick plot

plot_ordination(ps.fungi.nosd.log.sp, vegan::scores(ord.rda, scaling=1), type="sites",  color = "horizon") 

Enregistrer les valeurs propres / Get eigenvector values

evals <- ord.rda$CCA$eig/sum(ord.rda$CCA$eig)


Figure detaillée / Full plot

arrowmat = vegan::scores(ord.rda, display = "bp", scaling = 1) # Add the environmental variables as arrows
arrowdf <- data.frame(arrowmat) # Add labels 
rownames(arrowdf) <- c("Carbon","Nitrogen","pH") # Add rownames 
arrowdf <- data.frame(labels = c("Carbon","Nitrogen","pH"), arrowmat) # Make a data.frame
plot_ordination(ps.fungi.nosd.log.sp, vegan::scores(ord.rda, scaling = 1), type="sites",  color = "horizon") + 
  coord_fixed(sqrt(evals[2] / evals[1])) +
  geom_point(size = 8, alpha = .5) +
  scale_color_manual(name = "Horizon", values = c('darkgreen', 'sienna4', 'grey1', 'grey60', 'darkorange2')) +
  geom_segment(aes(xend = CAP1, yend = CAP2, x = 0, y = 0), size = .7, data = arrowdf, color = "black", arrow = arrow(length = unit(0.05, "npc"))) +
  geom_text(aes(x = CAP1*1.15, y = CAP2*1.15, color = NULL, label = labels), size = 6, data = arrowdf, show.legend = FALSE) +
  facet_wrap(~forest) +
  theme(text = element_text(size=18), strip.text = element_text(size=20)) +
  labs(x = paste("Constrained axis 1 (",round(evals[1]*100, 0),"%)"), y = paste("Constrained axis 2 (",round(evals[2]*100, 0),"%)"))