BIOSOPE - step 3 - sequence processing (dada2) and phyloseq file building

Aim
Data
Directory structure
Analysis

## Warning: package 'rmdformats' was built under R version 3.4.4

Aim

Processing the BIOSOPE sequences with Dada2 (running on the server)

Data

32 samples amplified for V4 using forward universal primer and reverse Chlo2 (Chloroophyta specific, but amplifies Radiolaria as well)
fastq files have been cleaned by cutadapt

Directory structure

/cutadapt : fastq files treated by cutadapt
/dada2 : dada2 files
/mothur : mothur files

Analysis

Read BIOSOPE_init.R

This file defines all the necessary libraries and variables

source('BIOSOPE_init.R', echo=TRUE)


> library("dada2")

> library("phyloseq")

> library("Biostrings")

> library("ggplot2")

> library("stringr")

> library("dplyr")

> library("tidyr")

> library("readxl")

> library("readr")

> library("tibble")

> options(stringsAsFactors = FALSE, digits = 2)

> ngs_dir <- "C:/Data Biomol/RNA/Tags/BIOSOPE/"

> fastq_dir <- paste0(ngs_dir, "fastq/")

> mothur_dir <- paste0(ngs_dir, "mothur/")

> cutadapt_dir <- paste0(ngs_dir, "cutadapt/")

> dada2_dir <- paste0(ngs_dir, "dada2/")

> PR2_tax_levels <- c("Kingdom", "Supergroup", "Division", 
+     "Class", "Order", "Family", "Genus", "Species")

Get the list of fastQ files

It is assumed that the sample names are at the start of file name

# get a list of all fastq files in the cutadapt directory
  fns <- sort(list.files(cutadapt_dir, full.names = TRUE)) 
  fns <- fns[str_detect( basename(fns),".fastq.gz")]

# Extract sample names, assuming filenames have format: SAMPLENAME.fastq
  sample.names <- str_replace(basename(fns), ".fastq.gz", "") 

# Get the number of sequences in each file
 nseq_init <- data.frame()
 for(i in 1:length(fns)) { 
   geom <- fastq.geometry(fns[i])
   one_row <- data.frame (n_seq=geom[1])
   nseq_init <- bind_rows(nseq_init, one_row)
  } 
  rownames(nseq_init) <- sample.names

Dada2 processing

Learn and plot error rates

  err <- learnErrors(fns, multithread=TRUE)
  plotErrors(err, nominalQ=TRUE)
  
  save(err, file=paste0(dada2_dir,"err.Rdata"))

Dereplicate the reads

  derep <- derepFastq(fns, verbose=TRUE)

# Name the derep-class objects by the sample names
  names(derep) <- sample.names
  save(derep, file=paste0(dada2_dir,"derep.Rdata"))

Sequence-variant inference algorithm to the dereplicated data

The options HOMOPOLYMER_GAP_PENALTY=-1, BAND_SIZE=32 are recommended for 454 data

    dada <- dada(derep, err=err, multithread=FALSE, 
                 HOMOPOLYMER_GAP_PENALTY=-1, BAND_SIZE=32)

    dada[[1]]
    save(dada, file=paste0(dada2_dir,"dada.Rdata"))

Make sequence table

Number of sequences found : 2147 ranging from 371 to 460 bp.

    seqtab <- makeSequenceTable(dada)
    
    dim(seqtab)
    
  # Make a transposed of the seqtab to make it be similar to mothur database
    t_seqtab <- t(seqtab)

  # Inspect distribution of sequence lengths
    table(nchar(getSequences(seqtab)))

Remove chimeras

The parameter methods can be pooled or consensus

Identified 38 bimeras out of 2147 input sequences. 1] “% of non chimeras : 99.3699906591675” [1] “total number of sequences : 349998

    seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", 
                                        multithread=FALSE, verbose=TRUE)
  
  # Compute % of non chimeras
    paste0("% of non chimeras : ",sum(seqtab.nochim)/sum(seqtab)*100)
    paste0("total number of sequences : ",sum(seqtab.nochim))

Track number of reads at each step

    track <- cbind(nseq_init,rowSums(seqtab.nochim))

    colnames(track) <- c("initial", "nonchim")
    rownames(track) <- sample.names
    
    knitr::kable(track)

Assigning taxonomy

    pr2_file <- "C:/Data Biomol/RNA/_PR2/versions/4.10.0/pr2_version_4.10.0_dada2.fasta"
    taxa <- assignTaxonomy(seqtab.nochim, refFasta=pr2_file,  
                           taxLevels = PR2_tax_levels,
                           minBoot = 0, outputBootstraps = TRUE,
                           verbose = TRUE)
    nrow(taxa$tax)

[1] 2057

Export data as produced by Dada2

    write.table(taxa$tax, file = paste0(dada2_dir,"taxa.txt"), sep="\t", 
                row.names = TRUE, na="", quote=FALSE, append=FALSE)   
    write.table(taxa$boot, file = paste0(dada2_dir,"taxa_boot.txt"), sep="\t", 
                row.names = TRUE, na="", quote=FALSE, append=FALSE) 
    write.table(seqtab.nochim, file = paste0(dada2_dir,"seqtab.txt"), sep="\t", 
                row.names = TRUE, na="", quote=FALSE, append=FALSE)

Changing OTU names from sequence to Otuxxxx

In the OTU put of dada2, otu names are the sequences. We change to give a Otuxxx name and the sequences are stored in the taxonomy table.

    taxa_tax <- as.data.frame(taxa$tax)
    taxa_boot <- as.data.frame(taxa$boot)

    taxa_tax <- taxa_tax %>%  rownames_to_column(var = "sequence") %>%
                              rowid_to_column(var = "OTUNumber") %>%
                              mutate(OTUNumber = sprintf("otu%04d", OTUNumber))
    row.names(taxa_tax) <- taxa_tax$OTUNumber
    row.names(taxa_boot) <- taxa_tax$OTUNumber    

  # Transpose matrix of abundance
    seqtab.nochim_trans <- as.data.frame(t(seqtab.nochim))
    row.names(seqtab.nochim_trans) <- taxa_tax$OTUNumber

Database like file

    dada2_database <-   cbind(taxa_tax, taxa_boot, seqtab.nochim_trans)
    write.table(dada2_database, file = paste0(dada2_dir,"dada2_database.txt"), sep="\t", 
                row.names = FALSE, na="", quote=FALSE, append=FALSE)

Filter for 18S

We remove the sequences are not 18S by selecting only bootstrap value for Supergroup in excess of 80.
Look up Powerpoint for analysis

    bootstrap_min <- 80

  # Filter based on the bootstrap
    taxa_tax_18S <- taxa_tax[taxa_boot$Supergroup >= bootstrap_min,]
    taxa_boot_18S <- taxa_boot[taxa_boot$Supergroup >= bootstrap_min,]


  # Filter matrix of abundance by removing row for which Supergroup bootstrap < min
    seqtab.nochim_18S <- seqtab.nochim_trans[taxa_boot$Supergroup >= bootstrap_min,]
  
  # Create a database like file for dada2
    dada2_database_18S <-   cbind(taxa_tax_18S, taxa_boot_18S, seqtab.nochim_18S)
    write.table(dada2_database_18S, file = paste0(dada2_dir,"dada2_database_18S.txt"), 
                sep="\t", row.names = FALSE, na="", quote=FALSE, append=FALSE) 
    
  # Write up a fasta file with the sequences
    seq_out <- DNAStringSet(taxa_tax_18S$sequence)
    names(seq_out) <- str_c(taxa_tax_18S$OTUNumber,
                              taxa_tax_18S$Supergroup,
                              taxa_tax_18S$Division,
                              taxa_tax_18S$Class, 
                              taxa_tax_18S$Order, 
                              taxa_tax_18S$Family,
                              taxa_tax_18S$Genus,
                              taxa_tax_18S$Species,
                              sep="|")
    writeXStringSet(seq_out, str_c(dada2_dir,"dada2_seq_18S.fas"), 
                    compress=FALSE, width = 20000)

Phyloseq

Read the metadata

Merge the data with the existing sample.names and create new sample_id which will be used for phyloseq

  metadata <- read_excel(str_c(ngs_dir,"BIOSOPE sample list 02.xlsx"), sheet="metadata")
  df.samples <- data.frame(sample_name=sample.names)
  df.samples <- left_join(df.samples,metadata)
  
# create a new sample id with the ctd number and the depth
  df.samples$sample_id = sprintf("%03d_%03d",df.samples$ctd,df.samples$depth )
  rownames(df.samples) <- df.samples$sample_id
  
# change the name of the columns for the OTU table
  colnames(seqtab.nochim_18S) <- df.samples$sample_id

Create a phyloseq object for dada2

  ps_biosope_dada2 <- phyloseq(otu_table(as.matrix(seqtab.nochim_18S), taxa_are_rows=TRUE), 
                      sample_data(df.samples), 
                      tax_table(as.matrix(select(taxa_tax_18S, - sequence, -OTUNumber))))
  saveRDS(ps_biosope_dada2, file = str_c(dada2_dir,"ps_biosope_dada2.rds"))     
  # Can be loaded with readRDS(file = "xxx")

Create a phyloseq object for the mothur results

Read the mothur database from Excel file

  mothur_file <- paste0( ngs_dir, "")
  mothur_database <- read_excel(str_c(ngs_dir,"BIOSOPE sample list 02.xlsx"), 
                                sheet="Mothur database")
# need to remove empty rows
  mothur_database <- mothur_database %>% filter(!is.na(OTUNumber))

Create the taxonomy matrix. Bootstrap values (xxx) needs to be removed

#  removing the bootstrap  
  remove_bootstrap <- function(value) {str_replace_all(value,"\\(\\d+\\)","")}
  mothur_tax <- mothur_database %>% select(Kingdom:Species) %>%
                                    mutate_all(remove_bootstrap)
  
# give row names with the OTU number
  rownames(mothur_tax) <- mothur_database$OTUNumber

Create the otu_table

  mothur_otu <- select(mothur_database, starts_with("Bio"))

# rename the columns to remove the end
  rename_columns <- function(value) {str_replace_all(value,".18S_Chlo_fwd", "")}
  mothur_otu <- rename_all(mothur_otu, rename_columns)

Create the mothur_samples data frame

  mothur_samples <- data.frame(sample_name = colnames(mothur_otu))

# merge mothur_samples with sample table
  mothur_samples<- left_join(mothur_samples,df.samples)
  rownames(mothur_samples) <- mothur_samples$sample_id

Change the name of the columns of the otu_table to sample_id and name the rows with otu name

  colnames(mothur_otu) <- mothur_samples$sample_id
  rownames(mothur_otu) <- mothur_database$OTUNumber
  mothur_otu <- as.matrix(mothur_otu)

Create the phyloseq for mothur

  ps_biosope_mothur <- phyloseq(otu_table(mothur_otu, taxa_are_rows=TRUE), 
               sample_data(mothur_samples), 
               tax_table(as.matrix(mothur_tax)))
  saveRDS(ps_biosope_mothur, file = str_c(mothur_dir,"ps_biosope_mothur.rds"))     
  # Can be loaded with readRDS(file = "xxx")