Metabarcoding - Antarctica 2015 - Sorted samples
Initialize knitr
# Set global knitr chunk options for this document
knitr::opts_chunk$set(fig.height = 10, fig.width = 10,
                      eval = TRUE,     # evaluate all chunks
                      cache = TRUE,    # cache chunk results between renders
                      echo = TRUE,     # show the code in the rendered document
                      prompt = FALSE,
                      tidy = TRUE,     # reformat code when displaying it
                      comment = NA,
                      message = FALSE, # hide package start-up messages
                      warning = FALSE) # hide warnings in the output
# 1 Aim
Process Antarctica 2015 sorted samples (18S V4) sequenced on Illumina data with the Dada2 suite as implemented in R (dada2 is also implemented in Qiime). The processing steps are adapted from : https://benjjneb.github.io/dada2/tutorial.html
2 Directory structure
- ../fastq : fastq files
- ../fastq_filtered : fastq files after filtration
- ../qual_pdf : qual pdf files
- ../dada2 : dada2 processed files
- ../databases : PR2 database files (contains the latest PR2 database formatted for dada2 - https://github.com/vaulot/pr2_database/releases/)
- ../blast : BLAST files output
- ../R : R project files
- ../img : Images
3 Downloads
Install the following software :
R studio : https://www.rstudio.com/products/rstudio/download/#download
Download and install the following libraries by running under R studio the following lines
# CRAN packages
install.packages("dplyr")    # To manipulate dataframes
install.packages("stringr")  # To manipulate strings
install.packages("ggplot2")  # To do plots
install.packages("readxl")   # To read excel files
install.packages("tibble")   # To work with data frames
install.packages("tidyr")    # To work with data frames
install.packages("readr")    # To read and write files
install.packages("leaflet")  # For maps
install.packages("mapview")  # To save leaflet maps
webshot::install_phantomjs() # Necessary to save images

# Bioconductor packages
# NOTE(review): biocLite() is deprecated since Bioconductor 3.8;
# on modern R use BiocManager::install() instead.
source("https://bioconductor.org/biocLite.R")
biocLite('dada2')      # metabarcode data analysis
biocLite('phyloseq')   # metabarcode data analysis
biocLite('Biostrings') # needed for fastq.geometry
# 4 Samples
The samples originate from the Antarctic campaign cruise (2015) in Fildes Bay.
Samples have been sorted by flow cytometry and 1 gene has been PCR amplified :
- 18S rRNA - V4 region
The PCR products have been sequenced by 1 run of Illumina 2*250 bp.
5 Initialize
5.1 Load the necessary libraries
# Load the libraries used throughout this workflow
library("dada2")       # ASV inference
library("phyloseq")    # community analysis
library("Biostrings")  # sequence manipulation
library("ggplot2")
library("stringr")
library("dplyr")
library("tidyr")
library("readxl")
library("tibble")
library("readr")
library("leaflet")     # interactive maps
library("mapview")     # to save leaflet maps
library("kableExtra")  # necessary for nice table formatting with knitr
# 5.2 Primers
Note that the primers are degenerate. Dada2 has an option to remove primers (filterAndTrim) but this function will not accept degeneracy.
# Forward primer: the four non-degenerate variants of the degenerate
# 18S V4 forward primer (two degenerate positions expanded)
primer_set_fwd <- c("CCAGCAGCCGCGGTAATTCC", "CCAGCACCCGCGGTAATTCC",
                    "CCAGCAGCTGCGGTAATTCC", "CCAGCACCTGCGGTAATTCC")
# Reverse primer (kept degenerate: Y and R positions)
primer_set_rev <- c("ACTTTCGTTCTTGATYRATGA")
# Primer lengths, used later to trim reads by truncation
# (nchar is the base-R equivalent of stringr::str_length on character vectors)
primer_length_fwd <- nchar(primer_set_fwd[1])
primer_length_rev <- nchar(primer_set_rev[1])

# 5.3 PR2 tax levels ----
# Taxonomic ranks used by the PR2 reference database
PR2_tax_levels <- c("kingdom", "supergroup", "division", "class", "order",
                    "family", "genus", "species")
# 5.4 Set up directories and files
# Directories used throughout the workflow (relative to the R project)
ngs_dir <- "../fastq"           # raw fastq files
qual_dir <- "../qual_pdf"       # quality-profile pdfs
dada2_dir <- "../dada2"         # dada2 output files
databases_dir <- "../databases" # PR2 reference database
blast_dir <- "../blast"         # BLAST output files

# Read metadata: sample and station sheets of the Excel file
metadata_file <- "../Antarctica 2015 fcm sorted samples.xlsx"
samples <- read_xlsx(path = metadata_file, sheet = "samples")
stations <- read_xlsx(path = metadata_file, sheet = "stations")
# 6 Station map
# Keep only stations that have coordinates
stations <- stations %>% filter(!is.na(longitude))

# Detailed map of the sampling area with one circle marker per station
map <- leaflet(width = 800, height = 800) %>%
  addTiles() %>%
  setView(lng = -58.9, lat = -62.2, zoom = 11) %>%
  addCircleMarkers(data = stations, lat = ~latitude, lng = ~longitude,
                   radius = 5, label = ~as.character(station),
                   labelOptions = labelOptions(textsize = "10px", noHide = TRUE),
                   clusterOptions = markerClusterOptions())
map  # display the map
# NOTE(review): the original line read "mapmapshot(map, ...)" — the printed
# object and the mapshot() call had been fused; separated here.
mapshot(map, file = "../img/map_antarctica_01.png")

# Regional map with a rectangle marking the sampling area
map <- leaflet(width = 800, height = 800) %>%
  addTiles() %>%
  setView(lng = -58.9, lat = -62.2, zoom = 5) %>%
  addRectangles(lng1 = -58.5, lat1 = -62.5, lng2 = -59.5, lat2 = -62,
                fillColor = "transparent") %>%
  addProviderTiles(providers$Esri.OceanBasemap)
map  # display the map
mapshot(map, file = "../img/map_antarctica_02.png")
# 7 Examine the fastQ files
7.1 Construct a list of the fastq files
It is assumed that the sample names are at the start of file name and separated by _.
# get a list of all fastq files in the ngs directory and separate R1 and R2
# Get a list of all fastq files in the ngs directory and separate R1 and R2
fns <- sort(list.files(ngs_dir, full.names = TRUE))
# Escape the dot so it matches a literal ".fastq" (an unescaped "." matches
# any character and could catch e.g. "Xfastq")
fns <- fns[str_detect(basename(fns), "\\.fastq")]
fns_R1 <- fns[str_detect(basename(fns), "R1")]
fns_R2 <- fns[str_detect(basename(fns), "R2")]

# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- str_split(basename(fns_R1), pattern = "_", simplify = TRUE)
sample.names <- sample.names[, 1]
# 7.2 Compute number of paired reads
Note: Use the fastq.geometry function (from the ShortRead package, loaded as a dada2 dependency — TODO confirm attribution; the text elsewhere credits dada2).
# Count the reads in each R1 file. There is no need to read the R2 files:
# they are paired and should contain the same number of reads.
# Build one small data frame per file and bind them once at the end instead
# of growing a data frame inside a loop (which copies on every iteration).
in.df <- bind_rows(lapply(seq_along(fns_R1), function(i) {
  # fastq.geometry returns the number of sequences as its first element
  geom <- fastq.geometry(fns_R1[i])
  data.frame(n_seq = geom[1], file_name = basename(fns_R1[i]))
}))
# display number of sequences and write data to small file
knitr::kable(head(in.df))

| n_seq | file_name |
|---|---|
| 25785 | 33N-st6-5_S280_R1.fastq.gz |
| 50940 | 33P-st6-5_S268_R1.fastq.gz |
| 19924 | 34N-st14-5_S209_R1.fastq.gz |
| 21306 | 34N-st6-5_S233_R1.fastq.gz |
| 26613 | 34P-st14-5_S197_R1.fastq.gz |
| 28325 | 34P-st6-5_S221_R1.fastq.gz |
# Save the per-file read counts
write_tsv(in.df, str_c(dada2_dir, "/n_seq.txt"), na = "")
# Report the range of read counts (console output shown as comments)
str_c("Min number of reads: ", min(in.df$n_seq))
## [1] "Min number of reads: 9460"
str_c("Max number of reads: ", max(in.df$n_seq))
## [1] "Max number of reads: 115278"

# Plot the histogram of the number of sequences per file
ggplot(in.df, aes(x = n_seq)) +
  geom_histogram(alpha = 0.5, position = "identity", binwidth = 5000) +
  xlim(0, 120000)
# 7.3 Plot quality for reads
# Save a quality-profile pdf for every fastq file
for (i in seq_along(fns)) {
  # Use dada2 function to plot quality
  p1 <- plotQualityProfile(fns[i])
  # save the file as a pdf file (uncomment to execute)
  p1_file <- paste0(qual_dir, "/", basename(fns[i]), ".pdf")
  ggsave(plot = p1, filename = p1_file, device = "pdf",
         width = 15, height = 15, scale = 1, units = "cm")
}

# Just print on the screen for the first 2 files
for (i in 1:2) {
  p1 <- plotQualityProfile(fns[i])
  print(p1)
}
# 8 Filter and Trim the reads
8.1 Create names for the filtered files in filtered/ subdirectory of the fastq carbom
# Filtered files go to ../fastq_filtered, named <sample>_R{1,2}_filt.fastq.gz
filt_dir <- file.path("../fastq_filtered")
filt_R1 <- file.path(filt_dir, str_c(sample.names, "_R1_filt.fastq.gz"))
filt_R2 <- file.path(filt_dir, str_c(sample.names, "_R2_filt.fastq.gz"))
# 8.2 Remove primers by truncation and filter
The dada2 algorithm requires primers to be removed prior to processing. The dada2 package does not allow for primer degeneracy. Since our forward primer is degenerate at two positions, all four combinations would need to be processed separately, and the 4 resulting fastQ files would then need to be re-assembled.
If you have a run with a mixed set of primers (e.g. 18S plus 16S), one option is to use cutadapt to remove the primers: http://cutadapt.readthedocs.io/en/stable/guide.html#. The program is very powerful, with lots of options.
In our case, since we have only 18S, the best strategy is to remove primers by truncation, using the lengths of the forward and reverse primers.
Filter
- all sequences with N
- truncate R2 to 250 bp (a lower threshold could be chosen if the data were bad)
- truncate the reads if their quality is less than 10
- We could also remove short reads, but this is better done later
[1] "Min number of reads: 5397"
[1] "Max number of reads: 69637"
# Filter and trim; trimLeft removes the primers by truncating primer-length
# bases from the start of R1 and R2 respectively
out <- filterAndTrim(fns_R1, filt_R1, fns_R2, filt_R2,
                     truncLen = c(250, 250),
                     trimLeft = c(primer_length_fwd, primer_length_rev),
                     maxN = 0,        # discard reads containing any N
                     maxEE = c(2, 2), # maximum expected errors per read
                     truncQ = 10,     # truncate at first base with quality < 10
                     rm.phix = TRUE, compress = TRUE, multithread = FALSE)
out.df <- data.frame(out) %>% rownames_to_column(var = "file_R1")
write_tsv(out.df, str_c(dada2_dir, "/n_seq_filtered.txt"), na = "")
str_c("Min number of reads: ", min(out.df$reads.out))
str_c("Max number of reads: ", max(out.df$reads.out))
# 9 Dada2 processing
9.1 Learn error rates
The error rates are plotted.
# R1 files: learn the error model and plot observed vs expected error rates
err_R1 <- learnErrors(filt_R1, multithread = FALSE)
p1 <- plotErrors(err_R1, nominalQ = TRUE)
p1
p_file <- str_c(dada2_dir, "/error_R1.pdf")  # str_c used throughout for consistency
ggsave(plot = p1, filename = p_file, device = "pdf", width = 15, height = 15,
       scale = 1, units = "cm")

# R2 files
err_R2 <- learnErrors(filt_R2, multithread = FALSE)
p2 <- plotErrors(err_R2, nominalQ = TRUE)
p2
p_file <- str_c(dada2_dir, "/error_R2.pdf")
ggsave(plot = p2, filename = p_file, device = "pdf", width = 15, height = 15,
       scale = 1, units = "cm")

# Save R objects
saveRDS(err_R1, str_c(dada2_dir, "/err_R1.rds"))
saveRDS(err_R2, str_c(dada2_dir, "/err_R2.rds"))

# Include the saved error-rate plots in the rendered document
knitr::include_graphics(path = str_c(dada2_dir, "/error_R1.png"))  # R1 error
knitr::include_graphics(path = str_c(dada2_dir, "/error_R2.png"))  # R2 error
9.2 Dereplicate the reads
# Dereplicate identical reads (speeds up the dada inference step)
derep_R1 <- derepFastq(filt_R1, verbose = FALSE)
derep_R2 <- derepFastq(filt_R2, verbose = FALSE)

# Name the derep-class objects by the sample names
names(derep_R1) <- sample.names
names(derep_R2) <- sample.names

# Save the objects
saveRDS(derep_R1, str_c(dada2_dir, "/derep_R1.rds"))
saveRDS(derep_R2, str_c(dada2_dir, "/derep_R2.rds"))
# 9.3 Sequence-variant inference algorithm to the dereplicated data
# Core dada2 sequence-variant inference, run separately on each direction
dada_R1 <- dada(derep_R1, err = err_R1, multithread = FALSE)
dada_R2 <- dada(derep_R2, err = err_R2, multithread = FALSE)

# Inspect the result for the first sample
dada_R1[[1]]
dada_R2[[1]]

# Save the objects
saveRDS(dada_R1, str_c(dada2_dir, "/dada_R1.rds"))
saveRDS(dada_R2, str_c(dada2_dir, "/dada_R2.rds"))
# 9.4 Merge sequences
# Merge the denoised forward and reverse reads into full sequences
mergers <- mergePairs(dada_R1, derep_R1, dada_R2, derep_R2, verbose = TRUE)
# Inspect the merger data.frame from the first sample
knitr::kable(head(mergers[[1]]))
saveRDS(mergers, str_c(dada2_dir, "/mergers.rds"))
# 9.5 Make sequence table
mergers <- readRDS(str_c(dada2_dir, "/mergers.rds"))
seqtab <- makeSequenceTable(mergers)
saveRDS(seqtab, str_c(dada2_dir, "/seqtab.rds"))
dim(seqtab)
## [1]   60 2202

# Make a transposed version of seqtab so it resembles a mothur database
t_seqtab <- t(seqtab)

# Inspect distribution of sequence lengths
df <- data.frame(length = nchar(getSequences(seqtab)))
sprintf("Mean sequence length: %5.2f", mean(df$length))
## [1] "Mean sequence length: 379.03"
ggplot(df, aes(x = length)) +
  geom_histogram(alpha = 0.5, position = "identity", binwidth = 5) +
  xlim(250, 440)
# 9.6 Remove chimeras
Note that chimera removal will produce spurious results if primers have not been removed. The method parameter can be pooled or consensus.
[1] "% of non chimeras :92.97 %"
[1] "Total number of sequences : 1,247,403"
# Remove chimeric sequences, flagged per-sample then decided by consensus
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                    multithread = FALSE, verbose = TRUE)
saveRDS(seqtab.nochim, str_c(dada2_dir, "/seqtab.nochim.rds"))

# Compute % of non chimeras
str_c("% of non chimeras :",
      format(sum(seqtab.nochim)/sum(seqtab) * 100, digits = 2, nsmall = 2),
      " %")
str_c("Total number of sequences : ", format(sum(seqtab.nochim), big.mark = ","))
# 9.7 Track number of reads at each step
# Helper: total number of reads represented in a dada/merger object
getN <- function(x) sum(getUniques(x))

# One row per sample, one column per pipeline step
track <- cbind(sample.names, out.df, sapply(dada_R1, getN),
               sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim))
colnames(track) <- c("sample_name", "file_R1", "input", "filtered", "denoised",
                     "merged", "tabled", "nonchim")
knitr::kable(track)
write_tsv(track, str_c(dada2_dir, "/dada2_track.tsv"))
# 9.8 Assigning taxonomy
# Assign taxonomy against the PR2 reference database
pr2_file <- str_c(databases_dir, "/pr2_version_4.10.0_dada2.fasta.gz")
taxa <- assignTaxonomy(seqtab.nochim, refFasta = pr2_file,
                       taxLevels = PR2_tax_levels,
                       # minBoot = 0 keeps every assignment; filtering on the
                       # bootstrap values is done later (section 11.1)
                       minBoot = 0, outputBootstraps = TRUE, verbose = TRUE)
saveRDS(taxa, str_c(dada2_dir, "/taxa.rds"))
# 9.9 Changing OTU names from sequence to Otuxxxx
In the OTU output of dada2, the OTU names are the sequences themselves. We change them to an Otuxxxx name, and the sequences are stored in the taxonomy table.
Save as a text file.
taxa <- readRDS(str_c(dada2_dir, "/taxa.rds"))
seqtab.nochim <- readRDS(str_c(dada2_dir, "/seqtab.nochim.rds"))
taxa_tax <- as.data.frame(taxa$tax)
# Rename the boot columns by appending '_boot'
taxa_boot <- as.data.frame(taxa$boot) %>% select_all(~str_c(., "_boot"))

# Create taxonomic table with sequence, bootstraps and per-sample abundances
dada2_database <- taxa_tax %>%
  rownames_to_column(var = "sequence") %>%
  rowid_to_column(var = "OTUNumber") %>%
  mutate(OTUNumber = sprintf("otu%04d", OTUNumber)) %>%
  bind_cols(taxa_boot) %>%
  bind_cols(as.data.frame(t(seqtab.nochim)))
write_tsv(dada2_database, path = str_c(dada2_dir, "/dada2_database.txt"))

# Transpose matrix of abundance (kept for reference, not executed):
# seqtab.nochim_trans <- as.data.frame(t(seqtab.nochim))
# row.names(seqtab.nochim_trans) <- taxa_tax$OTUNumber
# 10 BLAST results
10.1 Save as a fasta file
Use the Biostrings library
# Remove gap characters ('-' and '.') from the sequences before export
df <- dada2_database %>%
  mutate(sequence = str_replace_all(sequence, "(-|\\.)", ""))
seq_out <- Biostrings::DNAStringSet(df$sequence)
# Fasta headers: OTU number plus the full taxonomy, pipe-separated
names(seq_out) <- str_c(df$OTUNumber, df$supergroup, df$division, df$class,
                        df$order, df$family, df$genus, df$species, sep = "|")
# width = 20000 keeps each sequence on a single line
Biostrings::writeXStringSet(seq_out, str_c(blast_dir, "/Antar_ASV.fasta"),
                            compress = FALSE, width = 20000)
# 10.2 Process BLAST
- BLAST is done on the server using the file qsub_blast_antar.sh found in the .. directory
- The BLAST file is then processed with a special routine that I wrote
# Reformat the BLAST output table (custom routine from the dvutils package)
library("pr2database")
dvutils::blast_18S_reformat(str_c(blast_dir, "/Antar_ASV.blast.tsv"))
# 11 Phyloseq
11.1 Filter for 18S
We remove the sequences that are probably not 18S by keeping only those whose bootstrap value for supergroup is at least 80.
# Minimum supergroup bootstrap to consider an ASV a genuine 18S sequence
bootstrap_min <- 80
# Filter based on the bootstrap
dada2_database_18S <- dada2_database %>% filter(supergroup_boot >= bootstrap_min)

# Previous matrix-based implementation (kept for reference, not executed):
# [taxa_boot$Supergroup >= bootstrap_min,]
# Filter matrix of abundance by removing rows for which Supergroup bootstrap < min
# seqtab.nochim_18S <- seqtab.nochim_trans[taxa_boot$Supergroup >= bootstrap_min,]
# Create a database-like file for dada2
# dada2_database <- cbind(taxa_tax_18S, seqtab.nochim_18S)
# 11.2 Create a phyloseq object for dada2
# NOTE(review): taxa_tax has no OTUNumber column (OTUNumber was created on
# dada2_database only), so taxa_tax$OTUNumber is NULL and this assignment
# resets row names rather than setting them — confirm intent; taxa_tax is
# not used below, so this line appears to be dead code.
row.names(taxa_tax) <- taxa_tax$OTUNumber

# Abundance table: sample columns match names like "33N-st6-5..." / "34P-..."
otu_table <- dada2_database_18S %>%
  column_to_rownames(var = "OTUNumber") %>%
  select(matches("\\d\\d(N|P).+"))
sample_data <- samples %>% column_to_rownames(var = "sample_label")
tax_table <- dada2_database_18S %>%
  column_to_rownames(var = "OTUNumber") %>%
  select(kingdom:species)

# Build the three phyloseq components
OTU <- phyloseq::otu_table(as.matrix(otu_table), taxa_are_rows = TRUE)
TAX <- phyloseq::tax_table(as.matrix(tax_table))
# NOTE(review): this overwrites the `samples` metadata tibble read earlier;
# consider a distinct name (e.g. samples_ps) if the tibble is needed later.
samples <- phyloseq::sample_data(sample_data)
# samdf <- data.frame(sample_name=sample.names) rownames(samdf) <- sample.names
ps_dada2 <- phyloseq(OTU, TAX, samples)
saveRDS(ps_dada2, str_c(dada2_dir, "/ps_dada2.rds"))
# 11.3 Quick plot of data at Division level
# Barplot of per-sample abundances, coloured by taxonomic division
p_division <- plot_bar(ps_dada2, fill = "division") +
  geom_bar(aes(color = division, fill = division),
           stat = "identity", position = "stack") +
  ggtitle("Antarctica sorted - Dada2")
print(p_division)