BIOSOPE - step 3 - sequence processing (dada2) and phyloseq file building
## Warning: package 'rmdformats' was built under R version 3.4.4
Aim
Processing the BIOSOPE sequences with Dada2 (running on the server)
Data
- 32 samples amplified for V4 using forward universal primer and reverse Chlo2 (Chloroophyta specific, but amplifies Radiolaria as well)
- fastq files have been cleaned by cutadapt
Directory structure
- /cutadapt : fastq files treated by cutadapt
- /dada2 : dada2 files
- /mothur : mothur files
Analysis
Read BIOSOPE_init.R
This file defines all the necessary libraries and variables
source('BIOSOPE_init.R', echo=TRUE)
> library("dada2")
> library("phyloseq")
> library("Biostrings")
> library("ggplot2")
> library("stringr")
> library("dplyr")
> library("tidyr")
> library("readxl")
> library("readr")
> library("tibble")
> options(stringsAsFactors = FALSE, digits = 2)
> ngs_dir <- "C:/Data Biomol/RNA/Tags/BIOSOPE/"
> fastq_dir <- paste0(ngs_dir, "fastq/")
> mothur_dir <- paste0(ngs_dir, "mothur/")
> cutadapt_dir <- paste0(ngs_dir, "cutadapt/")
> dada2_dir <- paste0(ngs_dir, "dada2/")
> PR2_tax_levels <- c("Kingdom", "Supergroup", "Division",
+ "Class", "Order", "Family", "Genus", "Species")
Get the list of fastQ files
It is assumed that the sample names are at the start of file name
# get a list of all fastq files in the cutadapt directory
fns <- sort(list.files(cutadapt_dir, full.names = TRUE))
fns <- fns[str_detect( basename(fns),".fastq.gz")]
# Extract sample names, assuming filenames have format: SAMPLENAME.fastq
sample.names <- str_replace(basename(fns), ".fastq.gz", "")
# Get the number of sequences in each file
nseq_init <- data.frame()
for(i in 1:length(fns)) {
geom <- fastq.geometry(fns[i])
one_row <- data.frame (n_seq=geom[1])
nseq_init <- bind_rows(nseq_init, one_row)
}
rownames(nseq_init) <- sample.namesDada2 processing
Learn and plot error rates
err <- learnErrors(fns, multithread=TRUE)
plotErrors(err, nominalQ=TRUE)
save(err, file=paste0(dada2_dir,"err.Rdata"))Dereplicate the reads
derep <- derepFastq(fns, verbose=TRUE)
# Name the derep-class objects by the sample names
names(derep) <- sample.names
save(derep, file=paste0(dada2_dir,"derep.Rdata"))Sequence-variant inference algorithm to the dereplicated data
The options HOMOPOLYMER_GAP_PENALTY=-1, BAND_SIZE=32 are recommended for 454 data
dada <- dada(derep, err=err, multithread=FALSE,
HOMOPOLYMER_GAP_PENALTY=-1, BAND_SIZE=32)
dada[[1]]
save(dada, file=paste0(dada2_dir,"dada.Rdata"))Make sequence table
Number of sequences found : 2147 ranging from 371 to 460 bp.
seqtab <- makeSequenceTable(dada)
dim(seqtab)
# Make a transposed of the seqtab to make it be similar to mothur database
t_seqtab <- t(seqtab)
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab))) Remove chimeras
The parameter methods can be pooled or consensus
Identified 38 bimeras out of 2147 input sequences. 1] “% of non chimeras : 99.3699906591675” [1] “total number of sequences : 349998
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus",
multithread=FALSE, verbose=TRUE)
# Compute % of non chimeras
paste0("% of non chimeras : ",sum(seqtab.nochim)/sum(seqtab)*100)
paste0("total number of sequences : ",sum(seqtab.nochim))Track number of reads at each step
track <- cbind(nseq_init,rowSums(seqtab.nochim))
colnames(track) <- c("initial", "nonchim")
rownames(track) <- sample.names
knitr::kable(track) Assigning taxonomy
pr2_file <- "C:/Data Biomol/RNA/_PR2/versions/4.10.0/pr2_version_4.10.0_dada2.fasta"
taxa <- assignTaxonomy(seqtab.nochim, refFasta=pr2_file,
taxLevels = PR2_tax_levels,
minBoot = 0, outputBootstraps = TRUE,
verbose = TRUE)
nrow(taxa$tax)[1] 2057
Export data as produced by Dada2
write.table(taxa$tax, file = paste0(dada2_dir,"taxa.txt"), sep="\t",
row.names = TRUE, na="", quote=FALSE, append=FALSE)
write.table(taxa$boot, file = paste0(dada2_dir,"taxa_boot.txt"), sep="\t",
row.names = TRUE, na="", quote=FALSE, append=FALSE)
write.table(seqtab.nochim, file = paste0(dada2_dir,"seqtab.txt"), sep="\t",
row.names = TRUE, na="", quote=FALSE, append=FALSE) Changing OTU names from sequence to Otuxxxx
In the OTU put of dada2, otu names are the sequences. We change to give a Otuxxx name and the sequences are stored in the taxonomy table.
taxa_tax <- as.data.frame(taxa$tax)
taxa_boot <- as.data.frame(taxa$boot)
taxa_tax <- taxa_tax %>% rownames_to_column(var = "sequence") %>%
rowid_to_column(var = "OTUNumber") %>%
mutate(OTUNumber = sprintf("otu%04d", OTUNumber))
row.names(taxa_tax) <- taxa_tax$OTUNumber
row.names(taxa_boot) <- taxa_tax$OTUNumber
# Transpose matrix of abundance
seqtab.nochim_trans <- as.data.frame(t(seqtab.nochim))
row.names(seqtab.nochim_trans) <- taxa_tax$OTUNumberDatabase like file
dada2_database <- cbind(taxa_tax, taxa_boot, seqtab.nochim_trans)
write.table(dada2_database, file = paste0(dada2_dir,"dada2_database.txt"), sep="\t",
row.names = FALSE, na="", quote=FALSE, append=FALSE) Filter for 18S
We remove the sequences are not 18S by selecting only bootstrap value for Supergroup in excess of 80.
Look up Powerpoint for analysis
bootstrap_min <- 80
# Filter based on the bootstrap
taxa_tax_18S <- taxa_tax[taxa_boot$Supergroup >= bootstrap_min,]
taxa_boot_18S <- taxa_boot[taxa_boot$Supergroup >= bootstrap_min,]
# Filter matrix of abundance by removing row for which Supergroup bootstrap < min
seqtab.nochim_18S <- seqtab.nochim_trans[taxa_boot$Supergroup >= bootstrap_min,]
# Create a database like file for dada2
dada2_database_18S <- cbind(taxa_tax_18S, taxa_boot_18S, seqtab.nochim_18S)
write.table(dada2_database_18S, file = paste0(dada2_dir,"dada2_database_18S.txt"),
sep="\t", row.names = FALSE, na="", quote=FALSE, append=FALSE)
# Write up a fasta file with the sequences
seq_out <- DNAStringSet(taxa_tax_18S$sequence)
names(seq_out) <- str_c(taxa_tax_18S$OTUNumber,
taxa_tax_18S$Supergroup,
taxa_tax_18S$Division,
taxa_tax_18S$Class,
taxa_tax_18S$Order,
taxa_tax_18S$Family,
taxa_tax_18S$Genus,
taxa_tax_18S$Species,
sep="|")
writeXStringSet(seq_out, str_c(dada2_dir,"dada2_seq_18S.fas"),
compress=FALSE, width = 20000)Phyloseq
Read the metadata
Merge the data with the existing sample.names and create new sample_id which will be used for phyloseq
metadata <- read_excel(str_c(ngs_dir,"BIOSOPE sample list 02.xlsx"), sheet="metadata")
df.samples <- data.frame(sample_name=sample.names)
df.samples <- left_join(df.samples,metadata)
# create a new sample id with the ctd number and the depth
df.samples$sample_id = sprintf("%03d_%03d",df.samples$ctd,df.samples$depth )
rownames(df.samples) <- df.samples$sample_id
# change the name of the columns for the OTU table
colnames(seqtab.nochim_18S) <- df.samples$sample_id Create a phyloseq object for dada2
ps_biosope_dada2 <- phyloseq(otu_table(as.matrix(seqtab.nochim_18S), taxa_are_rows=TRUE),
sample_data(df.samples),
tax_table(as.matrix(select(taxa_tax_18S, - sequence, -OTUNumber))))
saveRDS(ps_biosope_dada2, file = str_c(dada2_dir,"ps_biosope_dada2.rds"))
# Can be loaded with readRDS(file = "xxx")Create a phyloseq object for the mothur results
Read the mothur database from Excel file
mothur_file <- paste0( ngs_dir, "")
mothur_database <- read_excel(str_c(ngs_dir,"BIOSOPE sample list 02.xlsx"),
sheet="Mothur database")
# need to remove empty rows
mothur_database <- mothur_database %>% filter(!is.na(OTUNumber))Create the taxonomy matrix. Bootstrap values (xxx) needs to be removed
# removing the bootstrap
remove_bootstrap <- function(value) {str_replace_all(value,"\\(\\d+\\)","")}
mothur_tax <- mothur_database %>% select(Kingdom:Species) %>%
mutate_all(remove_bootstrap)
# give row names with the OTU number
rownames(mothur_tax) <- mothur_database$OTUNumber Create the otu_table
mothur_otu <- select(mothur_database, starts_with("Bio"))
# rename the columns to remove the end
rename_columns <- function(value) {str_replace_all(value,".18S_Chlo_fwd", "")}
mothur_otu <- rename_all(mothur_otu, rename_columns)Create the mothur_samples data frame
mothur_samples <- data.frame(sample_name = colnames(mothur_otu))
# merge mothur_samples with sample table
mothur_samples<- left_join(mothur_samples,df.samples)
rownames(mothur_samples) <- mothur_samples$sample_idChange the name of the columns of the otu_table to sample_id and name the rows with otu name
colnames(mothur_otu) <- mothur_samples$sample_id
rownames(mothur_otu) <- mothur_database$OTUNumber
mothur_otu <- as.matrix(mothur_otu)Create the phyloseq for mothur
ps_biosope_mothur <- phyloseq(otu_table(mothur_otu, taxa_are_rows=TRUE),
sample_data(mothur_samples),
tax_table(as.matrix(mothur_tax)))
saveRDS(ps_biosope_mothur, file = str_c(mothur_dir,"ps_biosope_mothur.rds"))
# Can be loaded with readRDS(file = "xxx")