Tutorial - Compare results of Dada2 vs Mothur

1 Aim
2 Directory structure
3 Setup
4 Phyloseq
- 4.1 Read phyloseq object for dada2
- 4.2 Create a phyloseq object for the mothur results
5 Comparison between dada2 and mothur
- 5.1 Compare at the division level
6 Conclusion on the dada2 pipeline

1 Aim

Compare the results of the analysis by mothur vs dada2

Prerequisite : the R_dada2_tutorial must have been run before

2 Directory structure

Relative to the main directory from GitHub

../fastq : fastq files
../dada2 : dada2 processed files
../R_dada2 : This tutorial

3 Setup

3.1 Load the necessary libraries

library("phyloseq")
library("Biostrings")

library("ggplot2")
library("stringr")
library("dplyr")
library("tidyr")
library("readxl")
library("tibble")
library("readr")

library("kableExtra")  # necessary for nice table formatting with knitr

3.2 Set up directories

fastq_dir <- "../fastq/"  # fastq directory
dada2_dir <- "../dada2/"  # dada2 results

3.3 PR2 tax levels

PR2_tax_levels <- c("Kingdom", "Supergroup", "Division", "Class", "Order", "Family", 
    "Genus", "Species")

4 Phyloseq

4.1 Read phyloseq object for dada2

The file has been created in the R_dada2_tutorial.Rmd

ps_dada2 <- readRDS(str_c(dada2_dir, "CARBOM_phyloseq.rds"))

4.2 Create a phyloseq object for the mothur results

4.2.1 Sample names

It is assumed that the sample names are at the start of file name and separated by _.

# get a list of all fastq files in the ngs directory and separate R1 and R2
fns <- sort(list.files(fastq_dir, full.names = TRUE))
fns <- fns[str_detect(basename(fns), ".fastq")]
fns_R1 <- fns[str_detect(basename(fns), "R1")]
fns_R2 <- fns[str_detect(basename(fns), "R2")]

# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- str_split(basename(fns_R1), pattern = "_", simplify = TRUE)
sample.names <- sample.names[, 1]

samdf <- data.frame(sample_name = sample.names)
rownames(samdf) <- sample.names

4.2.2 Mothur data from Excel file

# read the mothur database from Excel file
mothur_file <- "../Comparison different methods 1.0.xlsx"
mothur_database <- read_excel(mothur_file, sheet = "mothur")
# need to remove empty rows
mothur_database <- mothur_database %>% filter(!is.na(OTUNumber))

# create the taxonomy matrix
mothur_tax <- str_split(mothur_database$OTUConTaxonomy, ";", 9, simplify = TRUE)
# the last column is empty so remove
mothur_tax <- mothur_tax[, 1:8]

# give row names and column names
colnames(mothur_tax) <- PR2_tax_levels
rownames(mothur_tax) <- mothur_database$OTUNumber

# replace in the database, the unique taxonomy column by 8 columns
mothur_database <- cbind(mothur_database, mothur_tax)
mothur_database <- mothur_database %>% select(-OTUConTaxonomy)

# create the otu_table
mothur_otu <- select(mothur_database, ends_with("p"), -Supergroup)
rownames(mothur_otu) <- mothur_database$OTUNumber
mothur_otu <- as.matrix(mothur_otu)

# create the phyloseq for mothur
ps_mothur <- phyloseq(otu_table(mothur_otu, taxa_are_rows = TRUE), sample_data(samdf), 
    tax_table(mothur_tax))

5 Comparison between dada2 and mothur

5.1 Compare at the division level

# remove Lobosa only found in Mothur
ps_mothur_common <- subset_taxa(ps_mothur, !(Division %in% c("Lobosa")))

plot_bar(ps_mothur_common, fill = "Division") + geom_bar(aes(color = Division, 
    fill = Division), stat = "identity", position = "stack") + ggtitle("Mothur")

plot_bar(ps_dada2, fill = "Division") + geom_bar(aes(color = Division, fill = Division), 
    stat = "identity", position = "stack") + ggtitle("Dada2")

## Compare by aggregation

5.1.1 Transform the database files into the long version

dada2_database <- read_tsv(str_c(dada2_dir, "CARBOM_dada2.database.tsv"))
dada2_long <- dada2_database %>% gather(key = "sample", value = "n_seq", num_range(120:200, 
    "p")) %>% select(-sequence)
knitr::kable(head(dada2_long), "latex") %>% kable_styling(bootstrap_options = "striped", 
    font_size = 7)

mothur_long <- mothur_database %>% gather(key = "sample", value = "n_seq", num_range(120:200, 
    "p")) %>% select(-contains("repSeq"), -nseq)
knitr::kable(head(mothur_long), "latex") %>% kable_styling(bootstrap_options = "striped", 
    font_size = 7)

5.1.2 Aggregate by Division, Class, Genus, Species

dada2_species <- dada2_long %>% group_by(Division, Class, Genus, Species) %>% 
    summarize(n_seq = sum(n_seq))
knitr::kable(head(dada2_species))

Division	Class	Genus	Species	n_seq
Centroheliozoa	Centroheliozoa_X	Pterocystida_XX	Pterocystida_XX_sp.	20
Chlorophyta	Mamiellophyceae	Bathycoccus	Bathycoccus_prasinos	367
Chlorophyta	Mamiellophyceae	Micromonas	Micromonas_Clade-B..4	19
Chlorophyta	Mamiellophyceae	Micromonas	Micromonas_Clade-B.E.3	20
Chlorophyta	Mamiellophyceae	Ostreococcus	Ostreococcus_lucimarinus	36
Chlorophyta	Mamiellophyceae	Ostreococcus	Ostreococcus_tauri	222

mothur_species <- mothur_long %>% group_by(Division, Class, Genus, Species) %>% 
    summarize(n_seq = sum(n_seq))
knitr::kable(head(mothur_species))

Division	Class	Genus	Species	n_seq
Centroheliozoa	Centroheliozoa_X	Pterocystida_XX	Pterocystida_XX_sp.	24
Chlorophyta	Mamiellophyceae	Bathycoccus	Bathycoccus_prasinos	249
Chlorophyta	Mamiellophyceae	Dolichomastix	Dolichomastix_tenuilepis	3
Chlorophyta	Mamiellophyceae	Micromonas	Micromonas_Clade-A.ABC.1-2	21
Chlorophyta	Mamiellophyceae	Micromonas	Micromonas_Clade-B..4	10
Chlorophyta	Mamiellophyceae	Micromonas	Micromonas_Clade-B.E.3	11

5.1.3 Merge the two lists to comute the relation between mothur and dada2 estimates

both_species <- rbind(dada2_species, mothur_species) %>% group_by(Division, 
    Class, Genus, Species) %>% summarize(n_methods = n()) %>% left_join(dada2_species) %>% 
    dplyr::rename(dada2 = n_seq) %>% left_join(mothur_species) %>% dplyr::rename(mothur = n_seq) %>% 
    arrange(desc(dada2))
both_class <- both_species %>% group_by(Division, Class) %>% summarise(dada2 = sum(dada2, 
    na.rm = TRUE), mothur = sum(mothur, na.rm = TRUE)) %>% arrange(desc(dada2))

5.1.4 Plot at Class level

ggplot(both_class) + geom_point(aes(mothur, dada2)) + geom_smooth(aes(mothur, 
    dada2), method = "lm", show.legend = TRUE) + ggtitle("Class level count")

summary(lm(both_class$dada2 ~ both_class$mothur))


Call:
lm(formula = both_class$dada2 ~ both_class$mothur)

Residuals:
    Min      1Q  Median      3Q     Max 
-96.108 -15.934   3.425  11.841 113.903 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)        4.14811   11.64032   0.356    0.726    
both_class$mothur  1.68365    0.05887  28.597   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 39.53 on 18 degrees of freedom
Multiple R-squared:  0.9785,    Adjusted R-squared:  0.9773 
F-statistic: 817.8 on 1 and 18 DF,  p-value: < 2.2e-16

5.1.5 Plot at Species level

ggplot(both_species) + geom_point(aes(mothur, dada2)) + geom_smooth(aes(mothur, 
    dada2), method = "lm", show.legend = TRUE) + ggtitle("Species level count")

summary(lm(both_species$dada2 ~ both_species$mothur))


Call:
lm(formula = both_species$dada2 ~ both_species$mothur)

Residuals:
    Min      1Q  Median      3Q     Max 
-63.624  -6.747   2.490  13.762  42.152 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)          1.47566    5.57181   0.265    0.793    
both_species$mothur  1.72349    0.04935  34.922   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 22.23 on 26 degrees of freedom
  (36 observations deleted due to missingness)
Multiple R-squared:  0.9791,    Adjusted R-squared:  0.9783 
F-statistic:  1220 on 1 and 26 DF,  p-value: < 2.2e-16

species_one_method <- both_species %>% filter(n_methods == 1)
knitr::kable(species_one_method, "latex") %>% kable_styling(bootstrap_options = "striped", 
    font_size = 7)

6 Conclusion on the dada2 pipeline

The dada2 pipeline yieds 1.7 more reads than mothur
The number of reads at the species and class levels are correlated
It is very fast, the longest step is the taxonomy assignement
It offers the advantage of having everything performed under R.