Metabarcoding - Antarctica 2015 - Sorted samples
Initialize knitr
# Set global knitr chunk options for this document
knitr::opts_chunk$set(fig.height = 10, fig.width = 10,
                      eval = TRUE,     # evaluate all chunks
                      cache = TRUE,    # cache chunk results between renders
                      echo = TRUE,     # show the code in the rendered document
                      prompt = FALSE,
                      tidy = TRUE,     # reformat code when displaying it
                      comment = NA,
                      message = FALSE, # hide package start-up messages
                      warning = FALSE) # hide warnings in the output
# 1 Aim
Process Antarctica 2015 sorted samples (18S V4) sequenced on Illumina data with the Dada2 suite as implemented in R (dada2 is also implemented in Qiime). The processing steps are adapted from : https://benjjneb.github.io/dada2/tutorial.html
2 Directory structure
- ../fastq : fastq files
- ../fastq_filtered : fastq files after filtration
- ../qual_pdf : qual pdf files
- ../dada2 : dada2 processed files
- ../databases : PR2 database files (contains the latest PR2 database formatted for dada2 - https://github.com/vaulot/pr2_database/releases/)
- ../blast : BLAST files output
- ../R : R project files
- ../img : Images
3 Downloads
Install the following software :
R studio : https://www.rstudio.com/products/rstudio/download/#download
Download and install the following libraries by running under R studio the following lines
# CRAN packages
install.packages("dplyr")    # To manipulate dataframes
install.packages("stringr")  # To manipulate strings
install.packages("ggplot2")  # To do plots
install.packages("readxl")   # To read excel files
install.packages("tibble")   # To work with data frames
install.packages("tidyr")    # To work with data frames
install.packages("readr")    # To read and write files
install.packages("leaflet")  # For maps
install.packages("mapview")  # To save leaflet maps
webshot::install_phantomjs() # Necessary to save images

# Bioconductor packages
# NOTE(review): biocLite() is deprecated since Bioconductor 3.8;
# on modern R use BiocManager::install() instead.
source("https://bioconductor.org/biocLite.R")
biocLite('dada2')      # metabarcode data analysis
biocLite('phyloseq')   # metabarcode data analysis
biocLite('Biostrings') # needed for fastq.geometry
# 4 Samples
The samples originate from the Antarctic campaign cruise (2015) in Fildes Bay.
Samples have been sorted by flow cytometry and 1 gene has been PCR amplified :
- 18S rRNA - V4 region
The PCR products have been sequenced by 1 run of Illumina 2*250 bp.
5 Initialize
5.1 Load the necessary libraries
# Load the libraries used throughout this workflow
library("dada2")       # ASV inference
library("phyloseq")    # community analysis
library("Biostrings")  # sequence manipulation
library("ggplot2")
library("stringr")
library("dplyr")
library("tidyr")
library("readxl")
library("tibble")
library("readr")
library("leaflet")     # interactive maps
library("mapview")     # to save leaflet maps
library("kableExtra")  # necessary for nice table formatting with knitr
# 5.2 Primers
Note that the primers are degenerate. Dada2 has an option to remove primers (filterAndTrim) but this function will not accept degeneracy.
# Forward primer: the four non-degenerate variants of the degenerate
# 18S V4 forward primer (two degenerate positions expanded)
primer_set_fwd <- c("CCAGCAGCCGCGGTAATTCC", "CCAGCACCCGCGGTAATTCC",
                    "CCAGCAGCTGCGGTAATTCC", "CCAGCACCTGCGGTAATTCC")
# Reverse primer (kept degenerate: Y and R positions)
primer_set_rev <- c("ACTTTCGTTCTTGATYRATGA")
# Primer lengths, used later to trim reads by truncation
# (nchar is the base-R equivalent of stringr::str_length on character vectors)
primer_length_fwd <- nchar(primer_set_fwd[1])
primer_length_rev <- nchar(primer_set_rev[1])

# 5.3 PR2 tax levels ----
# Taxonomic ranks used by the PR2 reference database
PR2_tax_levels <- c("kingdom", "supergroup", "division", "class", "order",
                    "family", "genus", "species")
# 5.4 Set up directories and files
# Directories used throughout the workflow (relative to the R project)
ngs_dir <- "../fastq"           # raw fastq files
qual_dir <- "../qual_pdf"       # quality-profile pdfs
dada2_dir <- "../dada2"         # dada2 output files
databases_dir <- "../databases" # PR2 reference database
blast_dir <- "../blast"         # BLAST output files

# Read metadata: sample and station sheets of the Excel file
metadata_file <- "../Antarctica 2015 fcm sorted samples.xlsx"
samples <- read_xlsx(path = metadata_file, sheet = "samples")
stations <- read_xlsx(path = metadata_file, sheet = "stations")
# 6 Station map
# Keep only stations that have coordinates
stations <- stations %>% filter(!is.na(longitude))

# Detailed map of the sampling area with one circle marker per station
map <- leaflet(width = 800, height = 800) %>%
  addTiles() %>%
  setView(lng = -58.9, lat = -62.2, zoom = 11) %>%
  addCircleMarkers(data = stations, lat = ~latitude, lng = ~longitude,
                   radius = 5, label = ~as.character(station),
                   labelOptions = labelOptions(textsize = "10px", noHide = TRUE),
                   clusterOptions = markerClusterOptions())
map  # display the map
# NOTE(review): the original line read "mapmapshot(map, ...)" — the printed
# object and the mapshot() call had been fused; separated here.
mapshot(map, file = "../img/map_antarctica_01.png")

# Regional map with a rectangle marking the sampling area
map <- leaflet(width = 800, height = 800) %>%
  addTiles() %>%
  setView(lng = -58.9, lat = -62.2, zoom = 5) %>%
  addRectangles(lng1 = -58.5, lat1 = -62.5, lng2 = -59.5, lat2 = -62,
                fillColor = "transparent") %>%
  addProviderTiles(providers$Esri.OceanBasemap)
map  # display the map
mapshot(map, file = "../img/map_antarctica_02.png")
# 7 Examine the fastQ files
7.1 Construct a list of the fastq files
It is assumed that the sample names are at the start of file name and separated by _.
# get a list of all fastq files in the ngs directory and separate R1 and R2
# Get a list of all fastq files in the ngs directory and separate R1 and R2
fns <- sort(list.files(ngs_dir, full.names = TRUE))
# Escape the dot so it matches a literal ".fastq" (an unescaped "." matches
# any character and could catch e.g. "Xfastq")
fns <- fns[str_detect(basename(fns), "\\.fastq")]
fns_R1 <- fns[str_detect(basename(fns), "R1")]
fns_R2 <- fns[str_detect(basename(fns), "R2")]

# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- str_split(basename(fns_R1), pattern = "_", simplify = TRUE)
sample.names <- sample.names[, 1]
# 7.2 Compute number of paired reads
Note: Use the fastq.geometry function (from the ShortRead package, loaded as a dada2 dependency — TODO confirm attribution; the text elsewhere credits dada2).
# Count the reads in each R1 file. There is no need to read the R2 files:
# they are paired and should contain the same number of reads.
# Build one small data frame per file and bind them once at the end instead
# of growing a data frame inside a loop (which copies on every iteration).
in.df <- bind_rows(lapply(seq_along(fns_R1), function(i) {
  # fastq.geometry returns the number of sequences as its first element
  geom <- fastq.geometry(fns_R1[i])
  data.frame(n_seq = geom[1], file_name = basename(fns_R1[i]))
}))
# display number of sequences and write data to small file
knitr::kable(head(in.df))

| n_seq | file_name |
|---|---|
| 25785 | 33N-st6-5_S280_R1.fastq.gz |
| 50940 | 33P-st6-5_S268_R1.fastq.gz |
| 19924 | 34N-st14-5_S209_R1.fastq.gz |
| 21306 | 34N-st6-5_S233_R1.fastq.gz |
| 26613 | 34P-st14-5_S197_R1.fastq.gz |
| 28325 | 34P-st6-5_S221_R1.fastq.gz |
# Save the per-file read counts
write_tsv(in.df, str_c(dada2_dir, "/n_seq.txt"), na = "")
# Report the range of read counts (console output shown as comments)
str_c("Min number of reads: ", min(in.df$n_seq))
## [1] "Min number of reads: 9460"
str_c("Max number of reads: ", max(in.df$n_seq))
## [1] "Max number of reads: 115278"

# Plot the histogram of the number of sequences per file
ggplot(in.df, aes(x = n_seq)) +
  geom_histogram(alpha = 0.5, position = "identity", binwidth = 5000) +
  xlim(0, 120000)
# 7.3 Plot quality for reads
# Save a quality-profile pdf for every fastq file
for (i in seq_along(fns)) {
  # Use dada2 function to plot quality
  p1 <- plotQualityProfile(fns[i])
  # save the file as a pdf file (uncomment to execute)
  p1_file <- paste0(qual_dir, "/", basename(fns[i]), ".pdf")
  ggsave(plot = p1, filename = p1_file, device = "pdf",
         width = 15, height = 15, scale = 1, units = "cm")
}

# Just print on the screen for the first 2 files
for (i in 1:2) {
  p1 <- plotQualityProfile(fns[i])
  print(p1)
}
# 8 Filter and Trim the reads
8.1 Create names for the filtered files in filtered/ subdirectory of the fastq carbom
# Filtered files go to ../fastq_filtered, named <sample>_R{1,2}_filt.fastq.gz
filt_dir <- file.path("../fastq_filtered")
filt_R1 <- file.path(filt_dir, str_c(sample.names, "_R1_filt.fastq.gz"))
filt_R2 <- file.path(filt_dir, str_c(sample.names, "_R2_filt.fastq.gz"))
# 8.2 Remove primers by truncation and filter
The dada2 algorithm requires primers to be removed prior to processing. The dada2 package does not allow for primer degeneracy. Since our forward primer is degenerate at two positions, all four combinations would need to be processed separately, and the 4 resulting fastQ files would then need to be re-assembled.
If you have a run with a mixed set of primers (e.g. 18S plus 16S), one option is to use cutadapt to remove the primers: http://cutadapt.readthedocs.io/en/stable/guide.html#. The program is very powerful, with lots of options.
In our case, since we have only 18S, the best strategy is to remove primers by truncation, using the lengths of the forward and reverse primers.
Filter
- all sequences with N
- truncate R2 to 250 bp (a lower threshold could be chosen if the data were bad)
- truncate the reads if their quality is less than 10
- We could also remove short reads, but this is better done later
[1] "Min number of reads: 5397"
[1] "Max number of reads: 69637"
# Filter and trim; trimLeft removes the primers by truncating primer-length
# bases from the start of R1 and R2 respectively
out <- filterAndTrim(fns_R1, filt_R1, fns_R2, filt_R2,
                     truncLen = c(250, 250),
                     trimLeft = c(primer_length_fwd, primer_length_rev),
                     maxN = 0,        # discard reads containing any N
                     maxEE = c(2, 2), # maximum expected errors per read
                     truncQ = 10,     # truncate at first base with quality < 10
                     rm.phix = TRUE, compress = TRUE, multithread = FALSE)
out.df <- data.frame(out) %>% rownames_to_column(var = "file_R1")
write_tsv(out.df, str_c(dada2_dir, "/n_seq_filtered.txt"), na = "")
str_c("Min number of reads: ", min(out.df$reads.out))
str_c("Max number of reads: ", max(out.df$reads.out))
# 9 Dada2 processing
9.1 Learn error rates
The error rates are plotted.
# R1 files: learn the error model and plot observed vs expected error rates
err_R1 <- learnErrors(filt_R1, multithread = FALSE)
p1 <- plotErrors(err_R1, nominalQ = TRUE)
p1
p_file <- str_c(dada2_dir, "/error_R1.pdf")  # str_c used throughout for consistency
ggsave(plot = p1, filename = p_file, device = "pdf", width = 15, height = 15,
       scale = 1, units = "cm")

# R2 files
err_R2 <- learnErrors(filt_R2, multithread = FALSE)
p2 <- plotErrors(err_R2, nominalQ = TRUE)
p2
p_file <- str_c(dada2_dir, "/error_R2.pdf")
ggsave(plot = p2, filename = p_file, device = "pdf", width = 15, height = 15,
       scale = 1, units = "cm")

# Save R objects
saveRDS(err_R1, str_c(dada2_dir, "/err_R1.rds"))
saveRDS(err_R2, str_c(dada2_dir, "/err_R2.rds"))

# Include the saved error-rate plots in the rendered document
knitr::include_graphics(path = str_c(dada2_dir, "/error_R1.png"))  # R1 error
knitr::include_graphics(path = str_c(dada2_dir, "/error_R2.png"))  # R2 error
9.2 Dereplicate the reads
# Dereplicate identical reads (speeds up the dada inference step)
derep_R1 <- derepFastq(filt_R1, verbose = FALSE)
derep_R2 <- derepFastq(filt_R2, verbose = FALSE)

# Name the derep-class objects by the sample names
names(derep_R1) <- sample.names
names(derep_R2) <- sample.names

# Save the objects
saveRDS(derep_R1, str_c(dada2_dir, "/derep_R1.rds"))
saveRDS(derep_R2, str_c(dada2_dir, "/derep_R2.rds"))
# 9.3 Sequence-variant inference algorithm to the dereplicated data
# Core dada2 sequence-variant inference, run separately on each direction
dada_R1 <- dada(derep_R1, err = err_R1, multithread = FALSE)
dada_R2 <- dada(derep_R2, err = err_R2, multithread = FALSE)

# Inspect the result for the first sample
dada_R1[[1]]
dada_R2[[1]]

# Save the objects
saveRDS(dada_R1, str_c(dada2_dir, "/dada_R1.rds"))
saveRDS(dada_R2, str_c(dada2_dir, "/dada_R2.rds"))
# 9.4 Merge sequences
# Merge the denoised forward and reverse reads into full sequences
mergers <- mergePairs(dada_R1, derep_R1, dada_R2, derep_R2, verbose = TRUE)
# Inspect the merger data.frame from the first sample
knitr::kable(head(mergers[[1]]))
saveRDS(mergers, str_c(dada2_dir, "/mergers.rds"))
# 9.5 Make sequence table
mergers <- readRDS(str_c(dada2_dir, "/mergers.rds"))
seqtab <- makeSequenceTable(mergers)
saveRDS(seqtab, str_c(dada2_dir, "/seqtab.rds"))
dim(seqtab)
## [1]   60 2202

# Make a transposed version of seqtab so it resembles a mothur database
t_seqtab <- t(seqtab)

# Inspect distribution of sequence lengths
df <- data.frame(length = nchar(getSequences(seqtab)))
sprintf("Mean sequence length: %5.2f", mean(df$length))
## [1] "Mean sequence length: 379.03"
ggplot(df, aes(x = length)) +
  geom_histogram(alpha = 0.5, position = "identity", binwidth = 5) +
  xlim(250, 440)
# 9.6 Remove chimeras
Note that chimera removal will produce spurious results if primers have not been removed. The method parameter can be pooled or consensus.
[1] "% of non chimeras :92.97 %"
[1] "Total number of sequences : 1,247,403"
# Remove chimeric sequences, flagged per-sample then decided by consensus
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                    multithread = FALSE, verbose = TRUE)
saveRDS(seqtab.nochim, str_c(dada2_dir, "/seqtab.nochim.rds"))

# Compute % of non chimeras
str_c("% of non chimeras :",
      format(sum(seqtab.nochim)/sum(seqtab) * 100, digits = 2, nsmall = 2),
      " %")
str_c("Total number of sequences : ", format(sum(seqtab.nochim), big.mark = ","))
# 9.7 Track number of reads at each step
# Helper: total number of reads represented in a dada/merger object
getN <- function(x) sum(getUniques(x))

# One row per sample, one column per pipeline step
track <- cbind(sample.names, out.df, sapply(dada_R1, getN),
               sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim))
colnames(track) <- c("sample_name", "file_R1", "input", "filtered", "denoised",
                     "merged", "tabled", "nonchim")
knitr::kable(track)
write_tsv(track, str_c(dada2_dir, "/dada2_track.tsv"))
# 9.8 Assigning taxonomy
# Assign taxonomy against the PR2 reference database
pr2_file <- str_c(databases_dir, "/pr2_version_4.10.0_dada2.fasta.gz")
taxa <- assignTaxonomy(seqtab.nochim, refFasta = pr2_file,
                       taxLevels = PR2_tax_levels,
                       # minBoot = 0 keeps every assignment; filtering on the
                       # bootstrap values is done later (section 11.1)
                       minBoot = 0, outputBootstraps = TRUE, verbose = TRUE)
saveRDS(taxa, str_c(dada2_dir, "/taxa.rds"))
# 9.9 Changing OTU names from sequence to Otuxxxx
In the OTU output of dada2, the OTU names are the sequences themselves. We change them to an Otuxxxx name, and the sequences are stored in the taxonomy table.
Save as a text file.
taxa <- readRDS(str_c(dada2_dir, "/taxa.rds"))
seqtab.nochim <- readRDS(str_c(dada2_dir, "/seqtab.nochim.rds"))
taxa_tax <- as.data.frame(taxa$tax)
# Rename the boot columns by appending '_boot'
taxa_boot <- as.data.frame(taxa$boot) %>% select_all(~str_c(., "_boot"))

# Create taxonomic table with sequence, bootstraps and per-sample abundances
dada2_database <- taxa_tax %>%
  rownames_to_column(var = "sequence") %>%
  rowid_to_column(var = "OTUNumber") %>%
  mutate(OTUNumber = sprintf("otu%04d", OTUNumber)) %>%
  bind_cols(taxa_boot) %>%
  bind_cols(as.data.frame(t(seqtab.nochim)))
write_tsv(dada2_database, path = str_c(dada2_dir, "/dada2_database.txt"))

# Transpose matrix of abundance (kept for reference, not executed):
# seqtab.nochim_trans <- as.data.frame(t(seqtab.nochim))
# row.names(seqtab.nochim_trans) <- taxa_tax$OTUNumber
# 10 BLAST results
10.1 Save as a fasta file
Use the Biostrings library
# Remove gap characters ('-' and '.') from the sequences before export
df <- dada2_database %>%
  mutate(sequence = str_replace_all(sequence, "(-|\\.)", ""))
seq_out <- Biostrings::DNAStringSet(df$sequence)
# Fasta headers: OTU number plus the full taxonomy, pipe-separated
names(seq_out) <- str_c(df$OTUNumber, df$supergroup, df$division, df$class,
                        df$order, df$family, df$genus, df$species, sep = "|")
# width = 20000 keeps each sequence on a single line
Biostrings::writeXStringSet(seq_out, str_c(blast_dir, "/Antar_ASV.fasta"),
                            compress = FALSE, width = 20000)
# 10.2 Process BLAST
- BLAST is done on the server using the file qsub_blast_antar.sh found in the .. directory
- The BLAST file is then processed with a special routine that I wrote
# Reformat the BLAST output table (custom routine from the dvutils package)
library("pr2database")
dvutils::blast_18S_reformat(str_c(blast_dir, "/Antar_ASV.blast.tsv"))
# 11 Phyloseq
11.1 Filter for 18S
We remove the sequences that are probably not 18S by keeping only those whose bootstrap value for supergroup is at least 80.
# Minimum supergroup bootstrap to consider an ASV a genuine 18S sequence
bootstrap_min <- 80
# Filter based on the bootstrap
dada2_database_18S <- dada2_database %>% filter(supergroup_boot >= bootstrap_min)

# Previous matrix-based implementation (kept for reference, not executed):
# [taxa_boot$Supergroup >= bootstrap_min,]
# Filter matrix of abundance by removing rows for which Supergroup bootstrap < min
# seqtab.nochim_18S <- seqtab.nochim_trans[taxa_boot$Supergroup >= bootstrap_min,]
# Create a database-like file for dada2
# dada2_database <- cbind(taxa_tax_18S, seqtab.nochim_18S)
# 11.2 Create a phyloseq object for dada2
# NOTE(review): taxa_tax has no OTUNumber column (OTUNumber was created on
# dada2_database only), so taxa_tax$OTUNumber is NULL and this assignment
# resets row names rather than setting them — confirm intent; taxa_tax is
# not used below, so this line appears to be dead code.
row.names(taxa_tax) <- taxa_tax$OTUNumber

# Abundance table: sample columns match names like "33N-st6-5..." / "34P-..."
otu_table <- dada2_database_18S %>%
  column_to_rownames(var = "OTUNumber") %>%
  select(matches("\\d\\d(N|P).+"))
sample_data <- samples %>% column_to_rownames(var = "sample_label")
tax_table <- dada2_database_18S %>%
  column_to_rownames(var = "OTUNumber") %>%
  select(kingdom:species)

# Build the three phyloseq components
OTU <- phyloseq::otu_table(as.matrix(otu_table), taxa_are_rows = TRUE)
TAX <- phyloseq::tax_table(as.matrix(tax_table))
# NOTE(review): this overwrites the `samples` metadata tibble read earlier;
# consider a distinct name (e.g. samples_ps) if the tibble is needed later.
samples <- phyloseq::sample_data(sample_data)
# samdf <- data.frame(sample_name=sample.names) rownames(samdf) <- sample.names
ps_dada2 <- phyloseq(OTU, TAX, samples)
saveRDS(ps_dada2, str_c(dada2_dir, "/ps_dada2.rds"))
# 11.3 Quick plot of data at Division level
# Barplot of per-sample abundances, coloured by taxonomic division
p_division <- plot_bar(ps_dada2, fill = "division") +
  geom_bar(aes(color = division, fill = division),
           stat = "identity", position = "stack") +
  ggtitle("Antarctica sorted - Dada2")
print(p_division)