Bolido review Frontiers 2018
Bolido review Frontiers 2018
library(knitr)
library(rmdformats)
library(kableExtra)
knitr::opts_chunk$set(fig.width=8,
fig.height=6,
eval=TRUE,
cache=TRUE,
echo=TRUE,
prompt=FALSE,
tidy=TRUE,
comment=NA,
message=FALSE,
warning=FALSE)
opts_knit$set(width=90)
options(max.print="500") To do
- Encoding problem in reading the local database… Very strange…
- Can use iconv()… to convert the columns but not very elegant…
- See : https://stackoverflow.com/questions/38872770/character-encoding-dplyr-with-database-postgresql
1 Goal
Plot the distribution of Parmales using both metabarcodes and litterature data
2 Load necessary libraries
library("RMySQL")
library("stringr") # To do replacement
library("dplyr") # To manipulate dataframes
library("tidyr") # To manipulate dataframes
library("readxl")
library("readr")
library("treemap")
library("ggplot2")
library("gridExtra") # necessary to arrange graphics into grid
# Libraries dvutils -------------------------------------------------------
if (any(grepl("package:dvutils", search()))) detach("package:dvutils", unload = TRUE)
library("dvutils")3 Data sets used
- dataset_id dataset_code dataset_name marker region
- 5 MALINA_Monier_2014 MALINA - Monier 2014 18S_V4 Arctic Ocean
- 8 Arctic_Comeau_2011 ACME - Comeau 2011 18S_V4 Arctic Ocean
- 9 Nansen_Metfies_2016 Nansen Basin - Metfies 2016 18S_V4 Arctic Ocean
- 10 Southern_Ocean_Wolf_2014 Southern Ocean - Wolf 2014 18S_V4 Southern Ocean
- 11 Fieldes_Bay_Luo_2016 Fieldes Bay - Luo 2016 18S_V4 Southern Ocean
- 13 Fram_strait_Kilias_2013 Fram Strait Kilias - 2013 18S_V4 Arctic Ocean
- 14 OSD_LGC_2014 OSD LGC 2014 18S_V4 Ocean
4 Sequence processing
#==========================================================================================================================
# Usearch to process metabarcodes
#==========================================================================================================================
usearch -makeudb_usearch 18S_V4.fasta -output 18S_V4.udb
usearch -makeudb_utax PR2_version_4.6_UTAX_all.fasta -output PR2_version_4.6_UTAX_all.udb -report PR2_version_4.6_UTAX_all.txt
usearch -makeudb_utax PR2_version_4.6_UTAX_Bolido.fasta -output PR2_version_4.6_UTAX_Bolido.udb -report PR2_version_4.6_UTAX_Bolido.txt
# usearch -makeudb_usearch OSD_18S_unique.fasta -output OSD_18S_unique.udb
# Note the Fragment must be smaller than the sequences in the database....
# ============ Assignments of data sets =====================
usearch -utax 18S_V4_Metfies_Kilian.fasta -db PR2_version_4.6_UTAX_Bolido.udb -utaxout 18S_V4_Metfies_Kilian.utax -strand both
# ============ Bolido =====================
# USEARCH
usearch -usearch_global Bolidophyceae_V4_Genbank.fasta -db 18S_V4.udb -id 0.95 -alnout Bolidophyceae_V4_Genbank_hits.aln -strand both -dbmatched Bolidophyceae_V4_Genbank_hits.fasta -userout Bolidophyceae_V4_Genbank_hits.m8 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits -maxaccepts 300
# UCLUST
usearch -cluster_fast Bolidophyceae_V4_all.fasta -id 0.99 -sort other -centroids Bolidophyceae_V4_all_centroids.fasta -uc Bolidophyceae_V4_all_clusters.uc
usearch -cluster_fast Bolido_pr2_metfies_kilian.fasta -id 0.99 -sort length -centroids Bolido_pr2_metfies_kilian_centroids.fasta -uc BBolido_pr2_metfies_kilian_clusters.uc
usearch -cluster_fast 18S_V4_Bolido.fasta -id 0.99 -sort other -centroids 18S_V4_Bolido_centroids.fasta -uc 18S_V4_Bolido_clusters.uc -msaout 18S_V4_Bolido_align.fasta
# add this to get the alignements : -msaout Bolidophyceae_V4_all_alignments.fasta
#==========================================================================================================================
# Mothur - Extract V4 region from PR2 Bolido and Tetraparma (unpublished) sequences
#==========================================================================================================================
set.dir(input=/projet/umr7144/dipo/vaulot/mothur/bolido/)
/usr/local/genome2/mothur-1.39.5/mothur "#pcr.seqs(fasta=pr2_version_4.6_Bolidophyceae.fasta, oligos=oligos18s_V4_Zingone.oligos, pdiffs=2, rdiffs=2, keepprimer=TRUE, keepdots=F, processors=32)"
/usr/local/genome2/mothur-1.39.5/mothur "#pcr.seqs(fasta=Tetraparma.fasta, oligos=oligos18s_V4_Zingone.oligos, pdiffs=2, rdiffs=2, keepprimer=TRUE, keepdots=F, processors=32)"
# Remove part after PCR for Metabarcodes
/usr/local/genome2/mothur-1.39.5/mothur "#pcr.seqs(fasta=18S_V4_Bolido.fasta, oligos=oligos18s_V4_Zingone_reverse.oligos, rdiffs=2, keepprimer=TRUE, keepdots=F, processors=32, nomatch=keep)"
# Dereplicate metabarcodes = Do not use use Vsearch....
#/usr/local/genome2/mothur-1.39.5/mothur "#unique.seqs(fasta=18S_V4_Bolido.scrap.pcr.fasta)"
#/usr/local/genome2/mothur-1.39.5/mothur "#unique.seqs(fasta=18S_V4_Bolido.pcr.fasta)"
/usr/local/genome2/mothur-1.39.5/mothur "#unique.seqs(fasta=pr2_version_4.6_Bolidophyceae.pcr_clones.fasta)"
#==========================================================================================================================
# Vsearch - Dereplicate metabarcodes and clones
#==========================================================================================================================
vsearch --derep_prefix 18S_V4_Bolido.pcr.fasta --output 18S_V4_Bolido.pcr.dereplicated.fasta
vsearch --derep_prefix pr2_version_4.6_Bolidophyceae.pcr_clones.fasta --output pr2_version_4.6_Bolidophyceae.pcr_clones.dereplicated.fasta
# Reading file 18S_V4_Bolido.pcr.fasta 100%
# 70525 nt in 177 seqs, min 270, max 417, avg 398
# Sorting by length 100%
# Dereplicating 100%
# Sorting 100%
# 129 unique sequences, avg cluster 1.4, median 1, max 29
# Writing output file 100%
#==========================================================================================================================
# Vsearch - Cluster env + barcodes used for the tree
#==========================================================================================================================
vsearch --cluster_fast Bolido_V4_pr2_and_meta_final.fasta --id 0.97 --iddef 2 --clusterout_sort --uc Bolido_V4_pr2_and_meta_final.uclust.txt5 Initialize variables and read text data
dataset_astan <- 1 # SOMLIT Astan
dataset_map <- c(5, 8:11, 13:14) # Do not use OSD 2014-2015, OSD LW, Tara, Oslofjord
df.metabarcode_species_color <- read_tsv("Bolido colors metabarcodes.txt")
df.morpho_species_color <- read_tsv("Bolido colors morpho species.txt")
morpho_species <- read_excel("C:/Users/vaulot/Google Drive/Papers/2017 Kuwata Bolido Frontiers/Supplementary material/Sup_Table_1_Parmales_distribution.xls",
sheet = "Table for R 3.0")6 Read metabarcode database
# Connect to metabarcodes database
db_metabarcodes <- db_connect(db_info("metapr2"))
# Read database tables and filters according to taxonomy and type of samples
otus <- tbl(db_metabarcodes, "otus") %>% collect()
otus_select <- otus %>% filter(str_detect(class, "Bolido"))
samples <- tbl(db_metabarcodes, "samples") %>% collect()
# samples <- dbGetQuery(db_metabarcodes, 'select * from samples')
samples_select <- samples %>% filter(dataset_id %in% dataset_map) %>% filter(depth_level ==
"surface") %>% filter(DNA_RNA == "DNA")
read_abundance <- tbl(db_metabarcodes, "read_abundance") %>% collect()
# Disonnect from metabarcodes database
db_disconnect(db_metabarcodes)[1] TRUE
7 Summarize information about metabarcode sequences
# Join the tables otu and read abundance
otus_select <- inner_join(read_abundance, otus_select)
# Only keep samples that have been selected
otus_select <- otus_select %>% filter((dataset_id %in% samples_select$dataset_id) &
(sample_code %in% samples_select$sample_code))
# Summarize per family, species at each station
sample_species <- otus_select %>% group_by(dataset_id, sample_code, family,
species) %>% summarize(reads_total_species = sum(read_number))
# Spread with one column per species
sample_species_wide <- sample_species %>% ungroup() %>% select(-family) %>%
spread(key = species, value = reads_total_species) %>% right_join(select(samples_select,
dataset_id, sample_code))
sample_species_wide[is.na(sample_species_wide)] <- 0
# Summarize overall by species
total_species <- sample_species %>% group_by(family, species) %>% summarize(reads_total_species_overall = sum(reads_total_species))
# Total Bolido at each station
sample_class <- sample_species %>% group_by(dataset_id, sample_code) %>% summarize(reads_total_class = sum(reads_total_species))
# Reads Dominant species at each station and remove duplicate
sample_species_dominant <- sample_species %>% group_by(dataset_id, sample_code) %>%
top_n(1, reads_total_species) %>% distinct(dataset_id, sample_code, .keep_all = TRUE) %>%
rename(species_dominant = species, family_dominant = family, reads_total_species_dominant = reads_total_species)
# Assemble everything in the big table
metabarcodes_species <- samples_select %>% left_join(sample_class) %>% left_join(sample_species_dominant) %>%
mutate(reads_total_class_pct = reads_total_class/reads_total * 100) %>%
left_join(sample_species_wide)
metabarcodes_species$reads_total_class_pct[is.na(metabarcodes_species$reads_total_class_pct)] <- 0
write_tsv(metabarcodes_species, "Metabarcodes_Bolido.txt", na = "")8 Treemap of metabarcodes
# Merge colors
total_species <- left_join(total_species, select(df.metabarcode_species_color,
-family))
treemap(total_species, index = c("family", "species"), vSize = "reads_total_species_overall",
vColor = "species_color_hex", type = "color", title = "", asp = 1, lowerbound.cex.labels = 1,
fontsize.labels = 12)9 Boxplot for % of total metabarcodes
plot1 <- ggplot(data = filter(metabarcodes_species, reads_total_class_pct >
0), aes(DNA_RNA, reads_total_class_pct)) + theme_bw() + geom_boxplot() +
scale_y_log10(breaks = c(0.01, 0.1, 1, 10)) + annotation_logticks(sides = "l") +
labs(x = "", y = "Bolidophyceae (% of total eukaryotes)")
# coord_flip() # coord_flip and annotation_loticks do not work together....
plot1# ggsave(plot=plot1, filename=paste('Fig Bolido boxplot 1.0.pdf',sep=''),
# width = 4, height = 10, scale=1, units='cm', useDingbats=FALSE)
summary(metabarcodes_species$reads_total_class_pct) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.01070 0.03529 0.29380 0.11468 12.24143
10 Maps
10.1 Background maps
# Define constants for the map
size_points <- 2.5
size_cross <- 1
scaling_factor <- 30
color_ice <- "lightslateblue"
color_water <- "red"
color_cultures <- "green"
color_not_present <- "blue"
color_morpho <- "brown"
# colors for metabarcode species
species_color <- as.vector(df.metabarcode_species_color$species_color_hex)
names(species_color) <- df.metabarcode_species_color$species
# colors for morpho species
morpho_species_color <- as.vector(df.morpho_species_color$species_color_hex)
names(morpho_species_color) <- df.morpho_species_color$species
# shape for morpho genus
morpho_genus_shape <- as.vector(df.morpho_species_color$genus_shape)
names(morpho_genus_shape) <- df.morpho_species_color$genus
# Background maps
background_map <- map_world()
background_north_america <- map_US()10.2 Global Maps
- Dominant group = Color
- % of total metabarcodes = Size of circle
# Add points where not detected
df_map <- dplyr::filter(metabarcodes_species, reads_total_class_pct == 0)
species_map <- background_map + geom_point(data = df_map, aes(x = longitude,
y = latitude), color = color_not_present, size = size_cross, shape = 3)
# Add the water metabarcodes
df_map <- dplyr::filter(metabarcodes_species, reads_total_class > 0)
species_map <- species_map + geom_point(data = df_map, aes(x = longitude, y = latitude,
size = reads_total_class_pct), color = color_water) + scale_size("% of barcodes",
range = c(0, 10), limits = c(0, 10), breaks = c(1, 2.5, 5, 10))
# range gives maximum and minimum size of symbols, limits the extent of the
# scale
species_map# ggsave(plot=species_map, filename=paste('Fig Bolido map world
# 1.1.pdf',sep=''), width = 15, height = 15, scale=1.25, units='cm',
# useDingbats=FALSE)
# Zoom on Europe
species_map_europe <- species_map + coord_map("mercator", xlim = c(-30, 60),
ylim = c(30, 60)) + scale_x_continuous(breaks = (-3:6) * 10) + scale_y_continuous(breaks = (6:12) *
5) + scale_size("% of barcodes", range = c(0, 15), limits = c(0, 0.5), breaks = c(0.1,
0.25, 0.5))
species_map_europe# Zoom on poles
species_map_pole <- species_map + coord_map("ortho", orientation = c(75, -20,
0))
species_map_pole# ggsave(plot=species_map, filename=paste('Fig Bolido map pole
# 1.0.pdf',sep=''), width = 10, height = 10, scale=1.25, units='cm',
# useDingbats=FALSE)10.3 Maps for each metabarcode group at species level
map_array <- list() # to store the maps to do tiled graph
for (one_species in df.metabarcode_species_color$species) {
df_map <- metabarcodes_species %>% mutate_(read_my_species = one_species) %>%
dplyr::filter(read_my_species == 0)
species_title <- str_replace_all(one_species, "_", " ")
species_title <- str_replace_all(species_title, " sp.", "")
one_species_map <- background_map + geom_point(data = df_map, aes(x = longitude,
y = latitude), color = color_not_present, size = size_cross, shape = 3) +
ggtitle(species_title)
if (str_detect(species_title, "Tri")) {
one_species_map <- one_species_map + theme(plot.title = element_text(face = "italic"))
}
df_map <- metabarcodes_species %>% mutate_(read_my_species = one_species) %>%
dplyr::filter(read_my_species > 0)
one_species_map <- one_species_map + geom_point(data = df_map, aes(x = longitude,
y = latitude, size = read_my_species/reads_total_class * 100), color = color_water) +
theme(legend.position = "top") + guides(color = guide_legend(override.aes = list(size = 5))) +
scale_size("% of Bolido barcodes", range = c(0, 5), limits = c(0, 100),
breaks = c(10, 25, 50, 100), guide = FALSE)
# range gives maximum and minimum size of symbols, limits the extent of the
# scale replace guide = 'legend' by guide=FALSE to remove the legend....
print(one_species_map)
map_array[[one_species]] <- one_species_map
}grid_map <- grid.arrange(grobs = map_array, ncol = 2, nrow = 5, clip = FALSE)# ggsave(plot=grid_map, filename=paste('Fig Bolido maps barcodes
# 1.0.pdf',sep=''), width = 20 , height = 35, scale=1.2, units='cm',
# useDingbats=FALSE)10.4 Map individual morphospecies in a grid array
map_array <- list() # to store the maps to do tiled graph
for (one_species in df.morpho_species_color$species) {
df_map <- morpho_species %>% filter(species == one_species)
species_title <- one_species
one_species_map <- background_map + geom_point(data = df_map, aes(x = longitude,
y = latitude), color = color_morpho, size = size_cross, shape = 19) +
ggtitle(species_title) + theme(plot.title = element_text(face = "italic"))
print(one_species_map)
map_array[[one_species]] <- one_species_map
}grid_map <- grid.arrange(grobs = map_array, ncol = 3, nrow = 4, clip = FALSE)# ggsave(plot=grid_map, filename=paste('Fig Bolido maps morpho species
# 3.42.pdf',sep=''), width = 30 , height = 30, scale=1.2, units='cm',
# useDingbats=FALSE)10.5 Map morphospecies on global maps
- World
- Arctic
- Japan
- Alaska
morpho_species_map <- background_map + geom_jitter(data = morpho_species, aes(x = longitude,
y = latitude, color = species, shape = genus), size = size_cross * 3, width = size_cross,
height = size_cross) + scale_color_manual(values = morpho_species_color) +
scale_shape_manual(values = morpho_genus_shape) + guides(color = guide_legend(override.aes = list(size = 5)))
morpho_species_mapmorpho_species_map_pole <- morpho_species_map + coord_map("ortho", orientation = c(60,
-180, 0))
morpho_species_map_polexmin = 143
xmax = 148
ymin = 37
ymax = 45
xbreak = 1
ybreak = 1
morpho_species_map_japan <- background_map + geom_jitter(data = morpho_species,
aes(x = longitude, y = latitude, color = species, shape = genus), size = size_cross *
3, width = 0.2 * size_cross, height = 0.2 * size_cross) + scale_color_manual(values = morpho_species_color) +
scale_shape_manual(values = morpho_genus_shape) + guides(color = guide_legend(override.aes = list(size = 5))) +
scale_x_continuous(breaks = (as.integer(xmin/xbreak):as.integer(xmax/xbreak)) *
xbreak) + scale_y_continuous(breaks = (as.integer(ymin/ybreak):as.integer(ymax/ybreak)) *
ybreak) + coord_map("mercator", xlim = c(xmin, xmax), ylim = c(ymin, ymax))
morpho_species_map_japanxmin = -150
xmax = -135
xbreak = 5
ymin = 55
ymax = 61
ybreak = 1
morpho_species_map_alaska <- background_north_america + geom_jitter(data = morpho_species,
aes(x = longitude, y = latitude, color = species, shape = genus), size = size_cross *
5, width = 0.2 * size_cross, height = 0.2 * size_cross) + scale_color_manual(values = morpho_species_color) +
scale_shape_manual(values = morpho_genus_shape) + guides(color = guide_legend(override.aes = list(size = 5))) +
scale_x_continuous(breaks = (as.integer(xmin/xbreak):as.integer(xmax/xbreak)) *
xbreak) + scale_y_continuous(breaks = (as.integer(ymin/ybreak):as.integer(ymax/ybreak)) *
ybreak) + coord_map("mercator", xlim = c(xmin, xmax), ylim = c(ymin, ymax))
morpho_species_map_alaska