BIOSOPE - step 3 - sequence processing (dada2) and phyloseq file building

## Warning: package 'rmdformats' was built under R version 3.4.4

Aim

Processing the BIOSOPE sequences with Dada2 (running on the server)

Data

  • 32 samples amplified for V4 using forward universal primer and reverse Chlo2 (Chloroophyta specific, but amplifies Radiolaria as well)
  • fastq files have been cleaned by cutadapt

Directory structure

  • /cutadapt : fastq files treated by cutadapt
  • /dada2 : dada2 files
  • /mothur : mothur files

Analysis

Read BIOSOPE_init.R

This file defines all the necessary libraries and variables

source('BIOSOPE_init.R', echo=TRUE)

> library("dada2")

> library("phyloseq")

> library("Biostrings")

> library("ggplot2")

> library("stringr")

> library("dplyr")

> library("tidyr")

> library("readxl")

> library("readr")

> library("tibble")

> options(stringsAsFactors = FALSE, digits = 2)

> ngs_dir <- "C:/Data Biomol/RNA/Tags/BIOSOPE/"

> fastq_dir <- paste0(ngs_dir, "fastq/")

> mothur_dir <- paste0(ngs_dir, "mothur/")

> cutadapt_dir <- paste0(ngs_dir, "cutadapt/")

> dada2_dir <- paste0(ngs_dir, "dada2/")

> PR2_tax_levels <- c("Kingdom", "Supergroup", "Division", 
+     "Class", "Order", "Family", "Genus", "Species")

Get the list of fastQ files

It is assumed that the sample names are at the start of file name

# get a list of all fastq files in the cutadapt directory
  fns <- sort(list.files(cutadapt_dir, full.names = TRUE)) 
  fns <- fns[str_detect( basename(fns),".fastq.gz")]

# Extract sample names, assuming filenames have format: SAMPLENAME.fastq
  sample.names <- str_replace(basename(fns), ".fastq.gz", "") 

# Get the number of sequences in each file
 nseq_init <- data.frame()
 for(i in 1:length(fns)) { 
   geom <- fastq.geometry(fns[i])
   one_row <- data.frame (n_seq=geom[1])
   nseq_init <- bind_rows(nseq_init, one_row)
  } 
  rownames(nseq_init) <- sample.names

Dada2 processing

Learn and plot error rates

  err <- learnErrors(fns, multithread=TRUE)
  plotErrors(err, nominalQ=TRUE)
  
  save(err, file=paste0(dada2_dir,"err.Rdata"))

Dereplicate the reads

  derep <- derepFastq(fns, verbose=TRUE)

# Name the derep-class objects by the sample names
  names(derep) <- sample.names
  save(derep, file=paste0(dada2_dir,"derep.Rdata"))

Sequence-variant inference algorithm to the dereplicated data

The options HOMOPOLYMER_GAP_PENALTY=-1, BAND_SIZE=32 are recommended for 454 data

    dada <- dada(derep, err=err, multithread=FALSE, 
                 HOMOPOLYMER_GAP_PENALTY=-1, BAND_SIZE=32)

    dada[[1]]
    save(dada, file=paste0(dada2_dir,"dada.Rdata"))

Make sequence table

Number of sequences found : 2147 ranging from 371 to 460 bp.

    seqtab <- makeSequenceTable(dada)
    
    dim(seqtab)
    
  # Make a transposed of the seqtab to make it be similar to mothur database
    t_seqtab <- t(seqtab)

  # Inspect distribution of sequence lengths
    table(nchar(getSequences(seqtab))) 

Remove chimeras

The parameter methods can be pooled or consensus

Identified 38 bimeras out of 2147 input sequences. 1] “% of non chimeras : 99.3699906591675” [1] “total number of sequences : 349998

    seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", 
                                        multithread=FALSE, verbose=TRUE)
  
  # Compute % of non chimeras
    paste0("% of non chimeras : ",sum(seqtab.nochim)/sum(seqtab)*100)
    paste0("total number of sequences : ",sum(seqtab.nochim))

Track number of reads at each step

    track <- cbind(nseq_init,rowSums(seqtab.nochim))

    colnames(track) <- c("initial", "nonchim")
    rownames(track) <- sample.names
    
    knitr::kable(track)  

Assigning taxonomy

    pr2_file <- "C:/Data Biomol/RNA/_PR2/versions/4.10.0/pr2_version_4.10.0_dada2.fasta"
    taxa <- assignTaxonomy(seqtab.nochim, refFasta=pr2_file,  
                           taxLevels = PR2_tax_levels,
                           minBoot = 0, outputBootstraps = TRUE,
                           verbose = TRUE)
    nrow(taxa$tax)

[1] 2057

Export data as produced by Dada2

    write.table(taxa$tax, file = paste0(dada2_dir,"taxa.txt"), sep="\t", 
                row.names = TRUE, na="", quote=FALSE, append=FALSE)   
    write.table(taxa$boot, file = paste0(dada2_dir,"taxa_boot.txt"), sep="\t", 
                row.names = TRUE, na="", quote=FALSE, append=FALSE) 
    write.table(seqtab.nochim, file = paste0(dada2_dir,"seqtab.txt"), sep="\t", 
                row.names = TRUE, na="", quote=FALSE, append=FALSE) 

Changing OTU names from sequence to Otuxxxx

In the OTU put of dada2, otu names are the sequences. We change to give a Otuxxx name and the sequences are stored in the taxonomy table.

    taxa_tax <- as.data.frame(taxa$tax)
    taxa_boot <- as.data.frame(taxa$boot)

    taxa_tax <- taxa_tax %>%  rownames_to_column(var = "sequence") %>%
                              rowid_to_column(var = "OTUNumber") %>%
                              mutate(OTUNumber = sprintf("otu%04d", OTUNumber))
    row.names(taxa_tax) <- taxa_tax$OTUNumber
    row.names(taxa_boot) <- taxa_tax$OTUNumber    

  # Transpose matrix of abundance
    seqtab.nochim_trans <- as.data.frame(t(seqtab.nochim))
    row.names(seqtab.nochim_trans) <- taxa_tax$OTUNumber

Database like file

    dada2_database <-   cbind(taxa_tax, taxa_boot, seqtab.nochim_trans)
    write.table(dada2_database, file = paste0(dada2_dir,"dada2_database.txt"), sep="\t", 
                row.names = FALSE, na="", quote=FALSE, append=FALSE) 

Filter for 18S

We remove the sequences are not 18S by selecting only bootstrap value for Supergroup in excess of 80.
Look up Powerpoint for analysis

    bootstrap_min <- 80

  # Filter based on the bootstrap
    taxa_tax_18S <- taxa_tax[taxa_boot$Supergroup >= bootstrap_min,]
    taxa_boot_18S <- taxa_boot[taxa_boot$Supergroup >= bootstrap_min,]


  # Filter matrix of abundance by removing row for which Supergroup bootstrap < min
    seqtab.nochim_18S <- seqtab.nochim_trans[taxa_boot$Supergroup >= bootstrap_min,]
  
  # Create a database like file for dada2
    dada2_database_18S <-   cbind(taxa_tax_18S, taxa_boot_18S, seqtab.nochim_18S)
    write.table(dada2_database_18S, file = paste0(dada2_dir,"dada2_database_18S.txt"), 
                sep="\t", row.names = FALSE, na="", quote=FALSE, append=FALSE) 
    
  # Write up a fasta file with the sequences
    seq_out <- DNAStringSet(taxa_tax_18S$sequence)
    names(seq_out) <- str_c(taxa_tax_18S$OTUNumber,
                              taxa_tax_18S$Supergroup,
                              taxa_tax_18S$Division,
                              taxa_tax_18S$Class, 
                              taxa_tax_18S$Order, 
                              taxa_tax_18S$Family,
                              taxa_tax_18S$Genus,
                              taxa_tax_18S$Species,
                              sep="|")
    writeXStringSet(seq_out, str_c(dada2_dir,"dada2_seq_18S.fas"), 
                    compress=FALSE, width = 20000)

Phyloseq

Read the metadata

Merge the data with the existing sample.names and create new sample_id which will be used for phyloseq

  metadata <- read_excel(str_c(ngs_dir,"BIOSOPE sample list 02.xlsx"), sheet="metadata")
  df.samples <- data.frame(sample_name=sample.names)
  df.samples <- left_join(df.samples,metadata)
  
# create a new sample id with the ctd number and the depth
  df.samples$sample_id = sprintf("%03d_%03d",df.samples$ctd,df.samples$depth )
  rownames(df.samples) <- df.samples$sample_id
  
# change the name of the columns for the OTU table
  colnames(seqtab.nochim_18S) <- df.samples$sample_id  

Create a phyloseq object for dada2

  ps_biosope_dada2 <- phyloseq(otu_table(as.matrix(seqtab.nochim_18S), taxa_are_rows=TRUE), 
                      sample_data(df.samples), 
                      tax_table(as.matrix(select(taxa_tax_18S, - sequence, -OTUNumber))))
  saveRDS(ps_biosope_dada2, file = str_c(dada2_dir,"ps_biosope_dada2.rds"))     
  # Can be loaded with readRDS(file = "xxx")

Create a phyloseq object for the mothur results

Read the mothur database from Excel file

  mothur_file <- paste0( ngs_dir, "")
  mothur_database <- read_excel(str_c(ngs_dir,"BIOSOPE sample list 02.xlsx"), 
                                sheet="Mothur database")
# need to remove empty rows
  mothur_database <- mothur_database %>% filter(!is.na(OTUNumber))

Create the taxonomy matrix. Bootstrap values (xxx) needs to be removed

#  removing the bootstrap  
  remove_bootstrap <- function(value) {str_replace_all(value,"\\(\\d+\\)","")}
  mothur_tax <- mothur_database %>% select(Kingdom:Species) %>%
                                    mutate_all(remove_bootstrap)
  
# give row names with the OTU number
  rownames(mothur_tax) <- mothur_database$OTUNumber 

Create the otu_table

  mothur_otu <- select(mothur_database, starts_with("Bio"))

# rename the columns to remove the end
  rename_columns <- function(value) {str_replace_all(value,".18S_Chlo_fwd", "")}
  mothur_otu <- rename_all(mothur_otu, rename_columns)

Create the mothur_samples data frame

  mothur_samples <- data.frame(sample_name = colnames(mothur_otu))

# merge mothur_samples with sample table
  mothur_samples<- left_join(mothur_samples,df.samples)
  rownames(mothur_samples) <- mothur_samples$sample_id

Change the name of the columns of the otu_table to sample_id and name the rows with otu name

  colnames(mothur_otu) <- mothur_samples$sample_id
  rownames(mothur_otu) <- mothur_database$OTUNumber
  mothur_otu <- as.matrix(mothur_otu)

Create the phyloseq for mothur

  ps_biosope_mothur <- phyloseq(otu_table(mothur_otu, taxa_are_rows=TRUE), 
               sample_data(mothur_samples), 
               tax_table(as.matrix(mothur_tax)))
  saveRDS(ps_biosope_mothur, file = str_c(mothur_dir,"ps_biosope_mothur.rds"))     
  # Can be loaded with readRDS(file = "xxx")

Daniel Vaulot

20 04 2018