Bolido review Frontiers 2018

  library(knitr)
  library(rmdformats)
  library(kableExtra) 

  knitr::opts_chunk$set(fig.width=8, 
                        fig.height=6, 
                        eval=TRUE, 
                        cache=TRUE,
                        echo=TRUE,
                        prompt=FALSE,
                        tidy=TRUE,
                        comment=NA,
                        message=FALSE,
                        warning=FALSE)
  opts_knit$set(width=90)
  options(max.print="500")  

To do

1 Goal

Plot the distribution of Parmales using both metabarcodes and litterature data

2 Load necessary libraries

library("RMySQL")

library("stringr")  # To do replacement
library("dplyr")  # To manipulate dataframes
library("tidyr")  # To manipulate dataframes
library("readxl")
library("readr")

library("treemap")

library("ggplot2")
library("gridExtra")  # necessary to arrange graphics into grid

# Libraries dvutils -------------------------------------------------------
if (any(grepl("package:dvutils", search()))) detach("package:dvutils", unload = TRUE)
library("dvutils")

3 Data sets used

  • dataset_id dataset_code dataset_name marker region
  • 5 MALINA_Monier_2014 MALINA - Monier 2014 18S_V4 Arctic Ocean
  • 8 Arctic_Comeau_2011 ACME - Comeau 2011 18S_V4 Arctic Ocean
  • 9 Nansen_Metfies_2016 Nansen Basin - Metfies 2016 18S_V4 Arctic Ocean
  • 10 Southern_Ocean_Wolf_2014 Southern Ocean - Wolf 2014 18S_V4 Southern Ocean
  • 11 Fieldes_Bay_Luo_2016 Fieldes Bay - Luo 2016 18S_V4 Southern Ocean
  • 13 Fram_strait_Kilias_2013 Fram Strait Kilias - 2013 18S_V4 Arctic Ocean
  • 14 OSD_LGC_2014 OSD LGC 2014 18S_V4 Ocean

4 Sequence processing


#==========================================================================================================================
#      Usearch to process metabarcodes
#==========================================================================================================================


usearch -makeudb_usearch 18S_V4.fasta -output 18S_V4.udb

usearch -makeudb_utax PR2_version_4.6_UTAX_all.fasta -output PR2_version_4.6_UTAX_all.udb -report PR2_version_4.6_UTAX_all.txt

usearch -makeudb_utax PR2_version_4.6_UTAX_Bolido.fasta -output PR2_version_4.6_UTAX_Bolido.udb -report PR2_version_4.6_UTAX_Bolido.txt

# usearch -makeudb_usearch OSD_18S_unique.fasta -output OSD_18S_unique.udb

# Note the Fragment must be smaller than the sequences in the database....

# ============ Assignments of data sets =====================


usearch -utax 18S_V4_Metfies_Kilian.fasta -db PR2_version_4.6_UTAX_Bolido.udb -utaxout 18S_V4_Metfies_Kilian.utax -strand both

# ============ Bolido =====================
#  USEARCH
usearch -usearch_global Bolidophyceae_V4_Genbank.fasta -db 18S_V4.udb -id 0.95 -alnout Bolidophyceae_V4_Genbank_hits.aln -strand both -dbmatched Bolidophyceae_V4_Genbank_hits.fasta -userout Bolidophyceae_V4_Genbank_hits.m8 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits -maxaccepts 300


# UCLUST
usearch -cluster_fast Bolidophyceae_V4_all.fasta -id 0.99 -sort other -centroids Bolidophyceae_V4_all_centroids.fasta -uc Bolidophyceae_V4_all_clusters.uc

usearch -cluster_fast Bolido_pr2_metfies_kilian.fasta -id 0.99 -sort length -centroids Bolido_pr2_metfies_kilian_centroids.fasta -uc BBolido_pr2_metfies_kilian_clusters.uc

usearch -cluster_fast 18S_V4_Bolido.fasta -id 0.99 -sort other -centroids 18S_V4_Bolido_centroids.fasta -uc 18S_V4_Bolido_clusters.uc -msaout 18S_V4_Bolido_align.fasta

# add this to get the alignements : -msaout Bolidophyceae_V4_all_alignments.fasta




#==========================================================================================================================
#      Mothur - Extract V4 region from PR2 Bolido and Tetraparma (unpublished) sequences 
#==========================================================================================================================

set.dir(input=/projet/umr7144/dipo/vaulot/mothur/bolido/)

/usr/local/genome2/mothur-1.39.5/mothur "#pcr.seqs(fasta=pr2_version_4.6_Bolidophyceae.fasta, oligos=oligos18s_V4_Zingone.oligos, pdiffs=2, rdiffs=2, keepprimer=TRUE, keepdots=F, processors=32)"
/usr/local/genome2/mothur-1.39.5/mothur "#pcr.seqs(fasta=Tetraparma.fasta, oligos=oligos18s_V4_Zingone.oligos, pdiffs=2, rdiffs=2, keepprimer=TRUE, keepdots=F, processors=32)"

#  Remove part after PCR for Metabarcodes

/usr/local/genome2/mothur-1.39.5/mothur "#pcr.seqs(fasta=18S_V4_Bolido.fasta, oligos=oligos18s_V4_Zingone_reverse.oligos, rdiffs=2, keepprimer=TRUE, keepdots=F, processors=32, nomatch=keep)"

#   Dereplicate metabarcodes  = Do not use use Vsearch....
#/usr/local/genome2/mothur-1.39.5/mothur "#unique.seqs(fasta=18S_V4_Bolido.scrap.pcr.fasta)"
#/usr/local/genome2/mothur-1.39.5/mothur "#unique.seqs(fasta=18S_V4_Bolido.pcr.fasta)"
/usr/local/genome2/mothur-1.39.5/mothur "#unique.seqs(fasta=pr2_version_4.6_Bolidophyceae.pcr_clones.fasta)"

#==========================================================================================================================
#      Vsearch - Dereplicate metabarcodes and clones
#==========================================================================================================================
vsearch --derep_prefix 18S_V4_Bolido.pcr.fasta --output 18S_V4_Bolido.pcr.dereplicated.fasta

vsearch --derep_prefix pr2_version_4.6_Bolidophyceae.pcr_clones.fasta --output pr2_version_4.6_Bolidophyceae.pcr_clones.dereplicated.fasta

# Reading file 18S_V4_Bolido.pcr.fasta 100%
# 70525 nt in 177 seqs, min 270, max 417, avg 398
# Sorting by length 100%
# Dereplicating 100%
# Sorting 100%
# 129 unique sequences, avg cluster 1.4, median 1, max 29
# Writing output file 100%


#==========================================================================================================================
#     Vsearch - Cluster env + barcodes used for the tree
#==========================================================================================================================
vsearch --cluster_fast Bolido_V4_pr2_and_meta_final.fasta --id 0.97 --iddef 2 --clusterout_sort --uc Bolido_V4_pr2_and_meta_final.uclust.txt

5 Initialize variables and read text data

dataset_astan <- 1  # SOMLIT Astan
dataset_map <- c(5, 8:11, 13:14)  # Do not use OSD 2014-2015, OSD LW, Tara, Oslofjord

df.metabarcode_species_color <- read_tsv("Bolido colors metabarcodes.txt")
df.morpho_species_color <- read_tsv("Bolido colors morpho species.txt")
morpho_species <- read_excel("C:/Users/vaulot/Google Drive/Papers/2017 Kuwata Bolido Frontiers/Supplementary material/Sup_Table_1_Parmales_distribution.xls", 
    sheet = "Table for R 3.0")

6 Read metabarcode database

# Connect to metabarcodes database
db_metabarcodes <- db_connect(db_info("metapr2"))

# Read database tables and filters according to taxonomy and type of samples

otus <- tbl(db_metabarcodes, "otus") %>% collect()
otus_select <- otus %>% filter(str_detect(class, "Bolido"))

samples <- tbl(db_metabarcodes, "samples") %>% collect()
# samples <- dbGetQuery(db_metabarcodes, 'select * from samples')
samples_select <- samples %>% filter(dataset_id %in% dataset_map) %>% filter(depth_level == 
    "surface") %>% filter(DNA_RNA == "DNA")

read_abundance <- tbl(db_metabarcodes, "read_abundance") %>% collect()

# Disonnect from metabarcodes database
db_disconnect(db_metabarcodes)
[1] TRUE

7 Summarize information about metabarcode sequences

# Join the tables otu and read abundance
otus_select <- inner_join(read_abundance, otus_select)


# Only keep samples that have been selected
otus_select <- otus_select %>% filter((dataset_id %in% samples_select$dataset_id) & 
    (sample_code %in% samples_select$sample_code))

# Summarize per family, species at each station
sample_species <- otus_select %>% group_by(dataset_id, sample_code, family, 
    species) %>% summarize(reads_total_species = sum(read_number))

# Spread with one column per species
sample_species_wide <- sample_species %>% ungroup() %>% select(-family) %>% 
    spread(key = species, value = reads_total_species) %>% right_join(select(samples_select, 
    dataset_id, sample_code))

sample_species_wide[is.na(sample_species_wide)] <- 0

# Summarize overall by species
total_species <- sample_species %>% group_by(family, species) %>% summarize(reads_total_species_overall = sum(reads_total_species))

# Total Bolido at each station
sample_class <- sample_species %>% group_by(dataset_id, sample_code) %>% summarize(reads_total_class = sum(reads_total_species))

# Reads Dominant species at each station and remove duplicate
sample_species_dominant <- sample_species %>% group_by(dataset_id, sample_code) %>% 
    top_n(1, reads_total_species) %>% distinct(dataset_id, sample_code, .keep_all = TRUE) %>% 
    rename(species_dominant = species, family_dominant = family, reads_total_species_dominant = reads_total_species)

# Assemble everything in the big table
metabarcodes_species <- samples_select %>% left_join(sample_class) %>% left_join(sample_species_dominant) %>% 
    mutate(reads_total_class_pct = reads_total_class/reads_total * 100) %>% 
    left_join(sample_species_wide)
metabarcodes_species$reads_total_class_pct[is.na(metabarcodes_species$reads_total_class_pct)] <- 0

write_tsv(metabarcodes_species, "Metabarcodes_Bolido.txt", na = "")

8 Treemap of metabarcodes

# Merge colors
total_species <- left_join(total_species, select(df.metabarcode_species_color, 
    -family))

treemap(total_species, index = c("family", "species"), vSize = "reads_total_species_overall", 
    vColor = "species_color_hex", type = "color", title = "", asp = 1, lowerbound.cex.labels = 1, 
    fontsize.labels = 12)

9 Boxplot for % of total metabarcodes

plot1 <- ggplot(data = filter(metabarcodes_species, reads_total_class_pct > 
    0), aes(DNA_RNA, reads_total_class_pct)) + theme_bw() + geom_boxplot() + 
    scale_y_log10(breaks = c(0.01, 0.1, 1, 10)) + annotation_logticks(sides = "l") + 
    labs(x = "", y = "Bolidophyceae (% of total eukaryotes)")
# coord_flip() # coord_flip and annotation_loticks do not work together....

plot1

# ggsave(plot=plot1, filename=paste('Fig Bolido boxplot 1.0.pdf',sep=''),
# width = 4, height = 10, scale=1, units='cm', useDingbats=FALSE)

summary(metabarcodes_species$reads_total_class_pct)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.00000  0.01070  0.03529  0.29380  0.11468 12.24143 

10 Maps

10.1 Background maps

# Define constants for the map
size_points <- 2.5
size_cross <- 1
scaling_factor <- 30
color_ice <- "lightslateblue"
color_water <- "red"
color_cultures <- "green"
color_not_present <- "blue"
color_morpho <- "brown"

# colors for metabarcode species
species_color <- as.vector(df.metabarcode_species_color$species_color_hex)
names(species_color) <- df.metabarcode_species_color$species

# colors for morpho species
morpho_species_color <- as.vector(df.morpho_species_color$species_color_hex)
names(morpho_species_color) <- df.morpho_species_color$species

# shape for morpho genus
morpho_genus_shape <- as.vector(df.morpho_species_color$genus_shape)
names(morpho_genus_shape) <- df.morpho_species_color$genus


# Background maps
background_map <- map_world()

background_north_america <- map_US()

10.2 Global Maps

  • Dominant group = Color
  • % of total metabarcodes = Size of circle
# Add points where not detected
df_map <- dplyr::filter(metabarcodes_species, reads_total_class_pct == 0)
species_map <- background_map + geom_point(data = df_map, aes(x = longitude, 
    y = latitude), color = color_not_present, size = size_cross, shape = 3)


# Add the water metabarcodes
df_map <- dplyr::filter(metabarcodes_species, reads_total_class > 0)
species_map <- species_map + geom_point(data = df_map, aes(x = longitude, y = latitude, 
    size = reads_total_class_pct), color = color_water) + scale_size("% of barcodes", 
    range = c(0, 10), limits = c(0, 10), breaks = c(1, 2.5, 5, 10))
# range gives maximum and minimum size of symbols, limits the extent of the
# scale

species_map

# ggsave(plot=species_map, filename=paste('Fig Bolido map world
# 1.1.pdf',sep=''), width = 15, height = 15, scale=1.25, units='cm',
# useDingbats=FALSE)

# Zoom on Europe

species_map_europe <- species_map + coord_map("mercator", xlim = c(-30, 60), 
    ylim = c(30, 60)) + scale_x_continuous(breaks = (-3:6) * 10) + scale_y_continuous(breaks = (6:12) * 
    5) + scale_size("% of barcodes", range = c(0, 15), limits = c(0, 0.5), breaks = c(0.1, 
    0.25, 0.5))
species_map_europe

# Zoom on poles

species_map_pole <- species_map + coord_map("ortho", orientation = c(75, -20, 
    0))

species_map_pole

# ggsave(plot=species_map, filename=paste('Fig Bolido map pole
# 1.0.pdf',sep=''), width = 10, height = 10, scale=1.25, units='cm',
# useDingbats=FALSE)

10.3 Maps for each metabarcode group at species level

map_array <- list()  # to store the maps to do tiled graph

for (one_species in df.metabarcode_species_color$species) {
    
    df_map <- metabarcodes_species %>% mutate_(read_my_species = one_species) %>% 
        dplyr::filter(read_my_species == 0)
    
    species_title <- str_replace_all(one_species, "_", " ")
    species_title <- str_replace_all(species_title, " sp.", "")
    
    one_species_map <- background_map + geom_point(data = df_map, aes(x = longitude, 
        y = latitude), color = color_not_present, size = size_cross, shape = 3) + 
        ggtitle(species_title)
    
    if (str_detect(species_title, "Tri")) {
        one_species_map <- one_species_map + theme(plot.title = element_text(face = "italic"))
    }
    
    df_map <- metabarcodes_species %>% mutate_(read_my_species = one_species) %>% 
        dplyr::filter(read_my_species > 0)
    
    one_species_map <- one_species_map + geom_point(data = df_map, aes(x = longitude, 
        y = latitude, size = read_my_species/reads_total_class * 100), color = color_water) + 
        theme(legend.position = "top") + guides(color = guide_legend(override.aes = list(size = 5))) + 
        scale_size("% of Bolido barcodes", range = c(0, 5), limits = c(0, 100), 
            breaks = c(10, 25, 50, 100), guide = FALSE)
    # range gives maximum and minimum size of symbols, limits the extent of the
    # scale replace guide = 'legend' by guide=FALSE to remove the legend....
    
    print(one_species_map)
    map_array[[one_species]] <- one_species_map
}

grid_map <- grid.arrange(grobs = map_array, ncol = 2, nrow = 5, clip = FALSE)

# ggsave(plot=grid_map, filename=paste('Fig Bolido maps barcodes
# 1.0.pdf',sep=''), width = 20 , height = 35, scale=1.2, units='cm',
# useDingbats=FALSE)

10.4 Map individual morphospecies in a grid array

map_array <- list()  # to store the maps to do tiled graph

for (one_species in df.morpho_species_color$species) {
    
    df_map <- morpho_species %>% filter(species == one_species)
    
    species_title <- one_species
    
    one_species_map <- background_map + geom_point(data = df_map, aes(x = longitude, 
        y = latitude), color = color_morpho, size = size_cross, shape = 19) + 
        ggtitle(species_title) + theme(plot.title = element_text(face = "italic"))
    
    print(one_species_map)
    map_array[[one_species]] <- one_species_map
}

grid_map <- grid.arrange(grobs = map_array, ncol = 3, nrow = 4, clip = FALSE)

# ggsave(plot=grid_map, filename=paste('Fig Bolido maps morpho species
# 3.42.pdf',sep=''), width = 30 , height = 30, scale=1.2, units='cm',
# useDingbats=FALSE)

10.5 Map morphospecies on global maps

  • World
  • Arctic
  • Japan
  • Alaska
morpho_species_map <- background_map + geom_jitter(data = morpho_species, aes(x = longitude, 
    y = latitude, color = species, shape = genus), size = size_cross * 3, width = size_cross, 
    height = size_cross) + scale_color_manual(values = morpho_species_color) + 
    scale_shape_manual(values = morpho_genus_shape) + guides(color = guide_legend(override.aes = list(size = 5)))
morpho_species_map

morpho_species_map_pole <- morpho_species_map + coord_map("ortho", orientation = c(60, 
    -180, 0))
morpho_species_map_pole

xmin = 143
xmax = 148
ymin = 37
ymax = 45
xbreak = 1
ybreak = 1
morpho_species_map_japan <- background_map + geom_jitter(data = morpho_species, 
    aes(x = longitude, y = latitude, color = species, shape = genus), size = size_cross * 
        3, width = 0.2 * size_cross, height = 0.2 * size_cross) + scale_color_manual(values = morpho_species_color) + 
    scale_shape_manual(values = morpho_genus_shape) + guides(color = guide_legend(override.aes = list(size = 5))) + 
    scale_x_continuous(breaks = (as.integer(xmin/xbreak):as.integer(xmax/xbreak)) * 
        xbreak) + scale_y_continuous(breaks = (as.integer(ymin/ybreak):as.integer(ymax/ybreak)) * 
    ybreak) + coord_map("mercator", xlim = c(xmin, xmax), ylim = c(ymin, ymax))
morpho_species_map_japan

xmin = -150
xmax = -135
xbreak = 5
ymin = 55
ymax = 61
ybreak = 1
morpho_species_map_alaska <- background_north_america + geom_jitter(data = morpho_species, 
    aes(x = longitude, y = latitude, color = species, shape = genus), size = size_cross * 
        5, width = 0.2 * size_cross, height = 0.2 * size_cross) + scale_color_manual(values = morpho_species_color) + 
    scale_shape_manual(values = morpho_genus_shape) + guides(color = guide_legend(override.aes = list(size = 5))) + 
    scale_x_continuous(breaks = (as.integer(xmin/xbreak):as.integer(xmax/xbreak)) * 
        xbreak) + scale_y_continuous(breaks = (as.integer(ymin/ybreak):as.integer(ymax/ybreak)) * 
    ybreak) + coord_map("mercator", xlim = c(xmin, xmax), ylim = c(ymin, ymax))
morpho_species_map_alaska

Daniel Vaulot

15 05 2018