Metabarcoding Singapore
ASV from Dada2

1 Aim
2 Initialize
3 Read the data
4 Assignment of eukaryotic ASVs based on PR2 database
5 Read reassignments for some prokaryotes
- 5.1 Synechococcus
6 Process BLAST file
7 Map
8 Environmental data
- 8.1 Per station
- 8.2 Average
  - 8.2.1 Rain over 7 days
9 Phyloseq analysis

1 Aim

Assign and analyze eukaryotes for Singapore metabarcoding data (ASV assigned with dada2 as implemented on Mothur).
Do some analyzes with the prokaryotes too…

2 Initialize

This file defines all the necessary libraries and variables

  source('Metabarcoding Singapore_init.R', echo=FALSE)

3 Read the data

3.1 File names

full_path_data <- function(file_name) {
    str_c("../qiime/2018-09-06_dada2/", file_name)
}

taxo_file <- full_path_data("taxonomy.tsv")
otu_file <- full_path_data("feature-table_unrarefied.tsv")
otu_excel_file <- "../qiime/Singapore ASV_table.xlsx"
sequence_file <- full_path_data("ref_sequences_unrarefied.fasta")

metadata_xlsx <- "../metadata/Singapore_metadata.xlsx"
metadata_csv <- "../metadata/monsoonpaper_env_data.csv"

sequence_file_euk <- full_path_data("ASV_unrarefied_euk.fasta")
sequence_file_mamiello <- full_path_data("ASV_Mamiello.fasta")

dada2_taxo_file_euk <- full_path_data("ASV_unrarefied_euk.dada2.taxo")
dada2_boot_file_euk <- full_path_data("ASV_unrarefied_euk.dada2.boot")
otu_table_final_file <- full_path_data("ASV_final.tsv")

blast_file <- full_path_data("ASV_unrarefied_euk.blast.tsv")

3.2 Read the files

The dada2 treatment has already removed the forward and reverse primers, so no need to remove them
Work with the unrarefied data

# Read the sample and metadata tables
  sample_table <- read_excel(metadata_xlsx, sheet="samples", range="A1:D89") 

  metadata_table  <- read_csv(metadata_csv, na=c("ND", "")) %>% 
    dplyr::rename(sample_code=Sample, day_number=Day_number, date=Date, 
                  location=Location, monsoon=`Monsoon period`) %>% 
    select(-Strait) %>% 
    mutate(date = lubridate::parse_date_time(date,"dmy"),
           monsoon = forcats::fct_relevel(monsoon, "NE", "IM-1", "SW", "IM-2"))
  
  station_table  <- read_excel(metadata_xlsx, sheet="stations", na=c("ND", ""))

  sample_table <- sample_table %>% 
    left_join(metadata_table) %>%
    left_join(station_table) %>% 
    mutate(sample_label = str_c(strait_label,location_label,
                                monsoon,sprintf("%03d",day_number), 
                                sep="_"))

# Read the taxonomy table
  taxo_table <- read_tsv(taxo_file)

# Clean up the taxonomy
  taxo_table <- taxo_table %>% 
    mutate(taxo_clean = str_replace_all(Taxon, "D_[0-9]+__","")) %>% 
    separate(col=taxo_clean, into=str_c("taxo", c(0:6)), sep=";") %>% 
    rename(otu_name = `Feature ID`)
  
# Remove duplicate entries for bacteria
  pattern_taxa_removed <- "marine metagenome|uncultured"
 
  taxo_table <- taxo_table %>% 
    mutate(taxo2 = case_when (
                      str_detect(taxo2,pattern_taxa_removed) ~ NA_character_,
                      TRUE ~ taxo2),
           taxo3 = case_when (
                      str_detect(taxo3,pattern_taxa_removed) ~ NA_character_,
                      TRUE ~ taxo3),
           taxo4 = case_when (
                      str_detect(taxo4,pattern_taxa_removed) ~ NA_character_,
                      TRUE ~ taxo4),
           taxo5 = case_when (
                      str_detect(taxo5,pattern_taxa_removed) ~ NA_character_,
                      TRUE ~ taxo5),
           taxo6 = case_when (
                      str_detect(taxo6,pattern_taxa_removed) ~ NA_character_,
                      TRUE ~ taxo6))
  
# If taxo_i is empty, fill with taxo_i-1
  taxo_levels <- str_c("taxo", c(0:6))
  taxo_levels_number <- length(taxo_levels)

  for (i in 1:nrow(taxo_table)) {
    for (j in 1:(taxo_levels_number) ){
      if (is.na(taxo_table[i,taxo_levels[j]])) {
              taxo_table[i,taxo_levels[j]] <-   taxo_table[i,taxo_levels[j-1]]
          }
    }    
  }

# Read the otu table
  otu_table <- read_tsv(otu_file, skip=1) %>%  # Jump the first line
    rename(otu_name = `#OTU ID`) %>% 
    mutate(otu_id = str_c("otu_", sprintf("%04d",row_number())))

# Read the sequences
  otu_sequences <- readAAStringSet(sequence_file)
  otu_sequences.df <- data.frame (otu_name=names(otu_sequences),sequence=as.character(otu_sequences))

# Remove the primers - Not necessary because the primers have been removed  
  # fwd_length = 20
  # rev_length = 15
  # otu_sequences.df <- otu_sequences.df %>% 
  #   separate (col=names, into=c("otu_id_qiime", "otu_rep_seq"), sep=" ") %>%
  #   mutate (sequence = str_sub(sequence, start=fwd_length+1, end = - rev_length - 1))

  otu_table <- taxo_table %>%  
    left_join(otu_table) %>% 
    left_join(otu_sequences.df) %>% 
    arrange(otu_id)
  
# Write a fasta file for blast with all taxonomy roups
 # otu_sequences <- otu_table %>% transmute(sequence=sequence, seq_name=otu_id)
 # fasta_write(otu_sequences, file_name="../qiime/otu_rep_98_all.fasta", compress = FALSE, taxo_include = FALSE)  
# write_tsv(otu_table, full_path_data("otu_table.tsv"), na="")

3.3 Only keep the eukaryotes in the OTU file

otu_table_euk <- otu_table %>% filter(str_detect(Taxon, "Eukaryota"))

# Write the fasta file file
otu_sequences_euk <- otu_table_euk %>% transmute(sequence = sequence, seq_name = otu_id)
fasta_write(otu_sequences_euk, file_name = sequence_file_euk, compress = FALSE, 
    taxo_include = FALSE)

[1] TRUE

4 Assignment of eukaryotic ASVs based on PR2 database

4.1 Use dada2 to reassign to PR2

dada2_assign(seq_file_name = sequence_file_euk, ref_file_name = "C:/daniel.vaulot@gmail.com/Databases/_PR2/versions/4.11.0/pr2_version_4.11.0_dada2.fasta.gz")

4.2 Read the PR2 assignement and merge with initial otu table

otu_euk_pr2 <- read_tsv(dada2_taxo_file_euk) %>% rename(otu_id = seq_name)
# otu_euk_pr2_boot <- read_tsv(dada2_boot_file_euk) %>%
# rename_all(funs(str_c(.,'_boot'))) %>% rename(seq_name = seq_name_boot)
# otu_euk_pr2 <- left_join(otu_euk_pr2, otu_euk_pr2_boot) %>% rename(otu_id=
# seq_name)
otu_table_final <- left_join(otu_table, otu_euk_pr2) %>% select(otu_id, otu_name, 
    taxo0:taxo6, Taxon, kingdom:species_boot, matches("EC|PR|RM|SBW|STJ"), sequence)

# write_tsv(otu_table_final, otu_table_final_file, na = '')

4.3 Export fasta file with taxonomy for Mamiellophyceae

otu_mamiello <- otu_table_final %>% filter(class == "Mamiellophyceae") %>% select(sequence = sequence, 
    seq_name = otu_id, supergroup:species)
fasta_write(otu_mamiello, file_name = sequence_file_mamiello, compress = FALSE, 
    taxo_include = TRUE)

[1] TRUE

5 Read reassignments for some prokaryotes

5.1 Synechococcus

otu_table_syn <- read_excel(otu_excel_file, sheet = "syn_reassigned")
otu_table_final <- otu_table_final %>% filter(!str_detect(taxo5, "Synechococcus")) %>% 
    bind_rows(otu_table_syn)

6 Process BLAST file

BLAST is performed on Roscoff ABIMS server

blast_18S_reformat(blast_file)

7 Map

7.1 Leaflet map

Visualize different layers: https://leaflet-extras.github.io/leaflet-providers/preview/

  lng_center = mean(station_table$longitude)
  lat_center = mean(station_table$latitude)

  map <- leaflet(width = 1000, height = 1000) %>%
   addTiles() %>%  # Default
#    addTiles(urlTemplate = 'https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}' ) %>%  # Satellite
#    addTiles(urlTemplate = 'https://server.arcgisonline.com/ArcGIS/rest/services/Canvas/World_Light_Gray_Base/MapServer/tile/{z}/{y}/{x}' ) %>%  # Grey background
    setView(lng=lng_center, lat=lat_center, zoom=11) %>% 
    addCircleMarkers(data = station_table, lat = ~ latitude, lng = ~ longitude, 
                     radius = 5,
               label = ~ location, 
               labelOptions = labelOptions(textsize = "10px", noHide = T),
               clusterOptions = markerClusterOptions()) 

  map

7.2 Use world map package

Resolution is very bad

worldMap <- rworldmap::getMap(resolution = "high")
world.points <- fortify(worldMap)
world.points$region <- world.points$id
world.df <- world.points[, c("long", "lat", "group", "region")]
singapore.df <- world.df %>% filter(region %in% c("Singapore"))

map <- ggplot() + geom_polygon(data = singapore.df, aes(x = long, y = lat, group = group), 
    fill = "black", color = "black")

map

7.3 Use R naturalearth package

Resolution is bad also…

singapore.sf <- rnaturalearth::ne_countries(country = "singapore", scale = "large")

tmap::tm_shape(singapore.sf) + tmap::tm_fill()

singapore.sf <- rnaturalearth::ne_countries(country = "singapore", scale = "large", 
    returnclass = "sf")
map <- ggplot() + geom_sf(data = singapore.sf, fill = "black", color = "black")

map

# Mapping of small countries
sf <- rnaturalearth::ne_download(scale = 110, type = "tiny_countries", category = "cultural", 
    returnclass = "sf")
map <- ggplot() + geom_sf(data = filter(sf, NAME == "Singapore"), fill = "black", 
    color = "black")

map

8 Environmental data

8.1 Per station

8.1.1 Temp

p <- ggplot(filter(sample_table, location_label != "BL")) + geom_line(aes(x = date, 
    y = Temperature), na.rm = TRUE) + geom_point(aes(x = date, y = Temperature, 
    color = monsoon), size = 5) + facet_grid(rows = vars(location)) + ylim(26, 
    34) + xlim(as.POSIXct(as.Date(c("2017-01-01", "2019-01-01")))) + geom_vline(xintercept = as.POSIXct(as.Date(c("2017-01-01", 
    "2018-01-01", "2019-01-01")))) + scale_color_viridis(discrete = TRUE)
p

### Salinity

p <- ggplot(filter(sample_table, location_label != "BL")) + geom_line(aes(x = date, 
    y = Salinity), na.rm = TRUE) + geom_point(aes(x = date, y = Salinity, color = monsoon), 
    size = 5) + facet_grid(rows = vars(location)) + ylim(15, 35) + xlim(as.POSIXct(as.Date(c("2017-01-01", 
    "2019-01-01")))) + geom_vline(xintercept = as.POSIXct(as.Date(c("2017-01-01", 
    "2018-01-01", "2019-01-01")))) + scale_color_viridis(discrete = TRUE)
p

8.1.2 Chlorophyll

p <- ggplot(filter(sample_table, location_label != "BL")) + geom_line(aes(x = date, 
    y = Chl), na.rm = TRUE) + geom_point(aes(x = date, y = Chl, color = monsoon), 
    size = 5) + facet_grid(rows = vars(location)) + xlim(as.POSIXct(as.Date(c("2017-01-01", 
    "2019-01-01")))) + geom_vline(xintercept = as.POSIXct(as.Date(c("2017-01-01", 
    "2018-01-01", "2019-01-01")))) + scale_color_viridis(discrete = TRUE)
p

8.1.3 Rain over 7 days

p <- ggplot(filter(sample_table, location_label != "BL")) + geom_col(aes(x = date, 
    y = Rain7, fill = monsoon)) + facet_grid(rows = vars(location)) + xlim(as.POSIXct(as.Date(c("2017-01-01", 
    "2019-01-01")))) + geom_vline(xintercept = as.POSIXct(as.Date(c("2017-01-01", 
    "2018-01-01", "2019-01-01")))) + scale_fill_viridis(discrete = TRUE)
p

8.2 Average

8.2.1 Rain over 7 days

rain <- sample_table %>% group_by(date, monsoon) %>% summarise(Rain7_mean = mean(Rain7, 
    na.rm = TRUE))

p <- ggplot(rain) + geom_point(aes(x = date, y = Rain7_mean, color = monsoon), 
    size = 5) + xlim(as.POSIXct(as.Date(c("2017-01-01", "2019-01-01")))) + geom_vline(xintercept = as.POSIXct(as.Date(c("2017-01-01", 
    "2018-01-01", "2019-01-01")))) + scale_color_viridis(discrete = TRUE)
p

9 Phyloseq analysis

9.1 Create phyloseq files for all taxa after filtering the data

Filter the euk data to remove the low bootstraps values (threshold : bootstrap > 90% at the supergroup level) and create a phyloseq file

Note the bootstrap threshold had to be higher for 98% compared to 97% (90% vs 65%). For ASV the same bootstrap has been used

9.1.1 Write the final ASV table

otu_table_euk_final <- otu_table_final %>% filter(supergroup_boot > 90)
otu_table_prok_final <- otu_table_final %>% filter(taxo0 %in% c("Bacteria", 
    "Archaea")) %>% select(-kingdom, -supergroup, -division, -class, -order, 
    -family, -genus) %>% rename(kingdom = taxo0, supergroup = taxo1, division = taxo2, 
    class = taxo3, order = taxo4, family = taxo5, genus = taxo6)

otu_table_ps <- bind_rows(otu_table_euk_final, otu_table_prok_final) %>% select_at(vars(-matches("taxo\\d"))) %>% 
    arrange(otu_id)

write_tsv(otu_table_ps, otu_table_final_file, na = "")

9.1.2 Create the phyloseq file

otu_mat <- otu_table_ps %>% select(otu = otu_id, matches("EC|PR|RM|SBW|STJ"), 
    -species, -species_boot)
tax_mat <- otu_table_ps %>% select(otu = otu_id, kingdom:species)
samples_df <- sample_table %>% rename(sample = sample_id)

row.names(otu_mat) <- otu_mat$otu
otu_mat <- otu_mat %>% select(-otu)
row.names(tax_mat) <- tax_mat$otu
tax_mat <- tax_mat %>% select(-otu)
row.names(samples_df) <- samples_df$sample
samples_df <- samples_df %>% select(-sample)
otu_mat <- as.matrix(otu_mat)
tax_mat <- as.matrix(tax_mat)

OTU = otu_table(otu_mat, taxa_are_rows = TRUE)
TAX = tax_table(tax_mat)
samples = sample_data(samples_df)

ps_all <- phyloseq(OTU, TAX, samples)
ps_all <- subset_samples(ps_all, sequence_quality == "good")

9.2 Break up into photosynthetic and non-photosynthetic

Opisthokonta (Metazoa, Fungi) are removed

cat("\nPhyloseq All \n========== \n")


Phyloseq All 
==========

ps_all

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 3000 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 3000 taxa by 8 taxonomic ranks ]

ps_euk <- subset_taxa(ps_all, (kingdom %in% c("Eukaryota")))
ps_euk <- subset_taxa(ps_euk, !(supergroup %in% c("Opisthokonta")))
cat("\nPhyloseq Eukaryotes \n========== \n")


Phyloseq Eukaryotes 
==========

ps_euk

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 668 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 668 taxa by 8 taxonomic ranks ]

ps_photo <- subset_taxa(ps_euk, (division %in% c("Chlorophyta", "Cryptophyta", 
    "Rhodophyta", "Haptophyta", "Ochrophyta")) | ((division == "Dinoflagellata") & 
    (class != "Syndiniales")) | (class == "Filosa-Chlorarachnea"))
cat("\nPhyloseq Photosynthetic Eukaryotes \n========== \n")


Phyloseq Photosynthetic Eukaryotes 
==========

ps_photo

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 271 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 271 taxa by 8 taxonomic ranks ]

ps_hetero <- subset_taxa(ps_euk, !(division %in% c("Chlorophyta", "Cryptophyta", 
    "Rhodophyta", "Haptophyta", "Ochrophyta")) & !((division == "Dinoflagellata") & 
    !(class == "Syndiniales")) & !(class == "Filosa-Chlorarachnea"))
cat("\nPhyloseq Heterotrophic Eukaryotes \n========== \n")


Phyloseq Heterotrophic Eukaryotes 
==========

ps_hetero

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 397 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 397 taxa by 8 taxonomic ranks ]

ps_prok <- subset_taxa(ps_all, (kingdom %in% c("Bacteria", "Archaea")))
cat("\nPhyloseq Prokaryotes \n========== \n")


Phyloseq Prokaryotes 
==========

ps_prok

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 2077 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 2077 taxa by 8 taxonomic ranks ]

9.3 Normalize number of reads in each sample using median sequencing depth.

! If there no cells do not transform, just set column to 0 function(x, t=total_hetero) (if(sum(x) > 0){ t * (x / sum(x))} else {x})

# First define a function to normalize

ps_normalize_median <- function(ps, title) {
    ps_median = median(sample_sums(ps))
    cat(sprintf("\nThe median number of reads used for normalization of %s is  %.0f", 
        title, ps_median))
    normalize_median = function(x, t = ps_median) (if (sum(x) > 0) {
        t * (x/sum(x))
    } else {
        x
    })
    ps = transform_sample_counts(ps, normalize_median)
    cat(str_c("\nPhyloseq ", title, "\n========== \n"))
    print(ps)
}

# Apply to all the phyloseq files
ps_all = ps_normalize_median(ps_all, "all")


The median number of reads used for normalization of all is  60533
Phyloseq all
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 3000 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 3000 taxa by 8 taxonomic ranks ]

ps_euk = ps_normalize_median(ps_euk, "eukaryotes (auto+hetero)")


The median number of reads used for normalization of eukaryotes (auto+hetero) is  4735
Phyloseq eukaryotes (auto+hetero)
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 668 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 668 taxa by 8 taxonomic ranks ]

ps_photo = ps_normalize_median(ps_photo, "eukaryotes autotrophs")


The median number of reads used for normalization of eukaryotes autotrophs is  3038
Phyloseq eukaryotes autotrophs
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 271 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 271 taxa by 8 taxonomic ranks ]

ps_hetero = ps_normalize_median(ps_hetero, "eukaryotes heterotrophs")


The median number of reads used for normalization of eukaryotes heterotrophs is  983
Phyloseq eukaryotes heterotrophs
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 397 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 397 taxa by 8 taxonomic ranks ]

ps_prok = ps_normalize_median(ps_prok, "prokaryotes")


The median number of reads used for normalization of prokaryotes is  54273
Phyloseq prokaryotes
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 2077 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 2077 taxa by 8 taxonomic ranks ]

9.4 Phyloseq files for abundant taxa

Remove taxa that are < 0.10 (euks) and <0.05 (proks) in any given sample
Normalize again…

ps_abundant <- function(ps, contrib_min = 0.1, title) {
    total_per_sample <- max(sample_sums(ps))
    ps <- filter_taxa(ps, function(x) sum(x > total_per_sample * contrib_min) > 
        0, TRUE)
    ps <- ps_normalize_median(ps, title)
}

cat("Remove taxa in low abundance \n\n")

Remove taxa in low abundance

ps_all_abundant = ps_abundant(ps_all, contrib_min = 0.05, "All")


The median number of reads used for normalization of All is  24700
Phyloseq All
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 49 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 49 taxa by 8 taxonomic ranks ]

ps_euk_abundant = ps_abundant(ps_euk, contrib_min = 0.1, "eukaryotes (auto+hetero)")


The median number of reads used for normalization of eukaryotes (auto+hetero) is  3359
Phyloseq eukaryotes (auto+hetero)
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 60 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 60 taxa by 8 taxonomic ranks ]

ps_photo_abundant = ps_abundant(ps_photo, contrib_min = 0.1, "eukaryotes autotrophs")


The median number of reads used for normalization of eukaryotes autotrophs is  2767
Phyloseq eukaryotes autotrophs
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 60 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 60 taxa by 8 taxonomic ranks ]

ps_hetero_abundant = ps_abundant(ps_hetero, contrib_min = 0.1, "eukaryotes heterotrophs")


The median number of reads used for normalization of eukaryotes heterotrophs is  694
Phyloseq eukaryotes heterotrophs
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 83 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 83 taxa by 8 taxonomic ranks ]

ps_prok_abundant = ps_abundant(ps_prok, contrib_min = 0.05, "prokaryotes")


The median number of reads used for normalization of prokaryotes is  24353
Phyloseq prokaryotes
========== 
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 44 taxa and 81 samples ]
sample_data() Sample Data:       [ 81 samples by 26 sample variables ]
tax_table()   Taxonomy Table:    [ 44 taxa by 8 taxonomic ranks ]

9.5 Create a list for the auto and hetero phyloseq files

ps_list <- list(ps = c(ps_prok, ps_euk, ps_photo), title = c("Prokaryotes - all OTUs", 
    "Eukaryotes - Auto and Hetero - all OTUs", "Eukaryotes - Autotrophs - all OTUs"))
ps_list_abundant <- list(ps = c(ps_prok_abundant, ps_euk_abundant, ps_photo_abundant), 
    title = c("Prokaryotes - abundant OTUs (> 5%)", "Eukaryotes - Auto + Hetero - abundant OTUs (> 10%)", 
        "Eukaryotes - Autotrophs - abundant OTUs (> 10%)"))

9.6 Create tabular files for other plots

ps_to_long <- function(ps) {
    otu_df <- data.frame(otu_table(ps)) %>% rownames_to_column(var = "otu_id")
    taxo_df <- data.frame(tax_table(ps)) %>% rownames_to_column(var = "otu_id")
    otu_df <- left_join(taxo_df, otu_df)
    otu_df <- gather(otu_df, "sample", "n_seq", contains("X"))  # All samples contain X
    metadata_df <- data.frame(sample_data(ps)) %>% rownames_to_column(var = "sample")
    otu_df <- left_join(otu_df, metadata_df)
}

long_all <- ps_to_long(ps_all)
long_euk <- ps_to_long(ps_euk)
long_photo <- ps_to_long(ps_photo)
long_euk_abundant <- ps_to_long(ps_euk_abundant)
long_photo_abundant <- ps_to_long(ps_photo_abundant)
long_prok <- ps_to_long(ps_prok)

9.7 Treemaps at division and class levels

9.7.1 Define function to draw treemaps

treemap_gg_dv <- function(df, group1, group2, title) {
    group1 <- enquo(group1)
    group2 <- enquo(group2)
    df <- df %>% group_by(!!group1, !!group2) %>% summarise(n_seq = sum(n_seq))
    g_treemap <- ggplot(df, aes(area = n_seq, fill = !!group1, label = !!group2, 
        subgroup = !!group1)) + ggtitle(title) + treemapify::geom_treemap() + 
        treemapify::geom_treemap_subgroup_border() + treemapify::geom_treemap_text(colour = "black", 
        place = "topleft", reflow = T, padding.x = grid::unit(3, "mm"), padding.y = grid::unit(3, 
            "mm")) + treemapify::geom_treemap_subgroup_text(place = "centre", 
        grow = T, alpha = 0.5, colour = "white", fontface = "italic", min.size = 0) + 
        scale_fill_viridis(discrete = TRUE) + theme(legend.position = "none")
    print(g_treemap)
    return(g_treemap)
}

9.7.2 Do the individual treemaps

array_treemap <- list()

for (one_strait in c("Singapore", "Johor")) {
    
    label <- str_c(one_strait, "-all")
    
    array_treemap[[label]] <- treemap_gg_dv(filter(long_all, strait == one_strait), 
        kingdom, supergroup, str_c("A - All - ", one_strait))
    
    label <- str_c(one_strait, "-arch")
    array_treemap[[label]] <- treemap_gg_dv(filter(long_prok, strait == one_strait & 
        kingdom == "Archaea"), division, class, str_c("B - Archaea - ", one_strait))
    
    label <- str_c(one_strait, "-bact")
    array_treemap[[label]] <- treemap_gg_dv(filter(long_prok, strait == one_strait & 
        kingdom == "Bacteria"), division, class, str_c("C - Bacteria - ", one_strait))
    
    
    label <- str_c(one_strait, "-euks")
    array_treemap[[label]] <- treemap_gg_dv(filter(long_euk, strait == one_strait), 
        division, class, str_c("D - All Eulkaryotes - ", one_strait))
    
    
    label <- str_c(one_strait, "-photeuks")
    array_treemap[[label]] <- treemap_gg_dv(filter(long_photo, strait == one_strait), 
        division, class, str_c("E - Photosynthetic Eukaryotes - ", one_strait))
    
    
    
    # treemap_dv(filter(long_euk, strait == one_strait), c('division',
    # 'class'),'n_seq', str_c('All euks - ', one_strait ))
    # treemap_dv(filter(long_photo, strait == one_strait), c('division',
    # 'class'),'n_seq',str_c('Photo euks', one_strait ))
    # treemap_dv(filter(long_prok, strait == one_strait & kingdom=='Bacteria'),
    # c('division', 'class'),'n_seq',str_c('Bacteria - ', one_strait ))
    # treemap_dv(filter(long_prok, strait == one_strait & kingdom=='Archaea'),
    # c('division', 'class'),'n_seq',str_c('Archaea - ', one_strait ))
}

9.7.3 Arrange the different treemaps in a grid and save

grid_treemap_fig <- gridExtra::grid.arrange(grobs = array_treemap, ncol = 2, 
    nrow = 5, clip = FALSE, padding = unit(0, "line"), as.table = FALSE)

ggsave("../fig/Fig_Treemaps.png", grid_treemap_fig, height = 25, width = 10, 
    dpi = 300)

9.8 Most abundant species (photosynthetic)

array_species <- list()

for (one_strait in c("Singapore", "Johor")) {
    
    long_euk_species <- filter(long_photo, strait == one_strait) %>% mutate(species_label = str_c(class, 
        species, sep = "-")) %>% group_by(division, class, species, species_label) %>% 
        summarize(n_seq = sum(n_seq)) %>% arrange(desc(n_seq)) %>% ungroup()
    
    array_species[[one_strait]] <- ggplot(top_n(long_euk_species, 20, n_seq)) + 
        geom_col(aes(x = reorder(species, n_seq), y = n_seq, fill = division)) + 
        coord_flip() + xlab("") + ylab("Number of reads") + scale_fill_manual(values = division_colors) + 
        ggtitle(one_strait)
    
    print(array_species[[one_strait]])
    
}

grid_fig <- cowplot::plot_grid(array_species[["Singapore"]], array_species[["Johor"]], 
    labels = c("A", "B"), align = "h", ncol = 2, label_size = 20)

ggsave("../fig/Fig_Abundant_Species.png", grid_fig, height = 10, width = 20, 
    dpi = 300)

9.9 Most abundant prokaryotes taxo6 (~ genus)

Taxo6 (Family) corresponds to the genus for bacteria… A bit confusing, will need to be fixed

array_species <- list()

for (one_strait in c("Singapore", "Johor")) {
    
    long_prok_genus <- filter(long_prok, strait == one_strait) %>% mutate(genus_label = str_c(division, 
        family, sep = "-")) %>% group_by(supergroup, division, class, family, 
        genus_label) %>% summarize(n_seq = sum(n_seq)) %>% arrange(desc(n_seq)) %>% 
        ungroup()
    
    array_species[[one_strait]] <- ggplot(top_n(long_prok_genus, 20, n_seq)) + 
        geom_col(aes(x = reorder(genus_label, n_seq), y = n_seq, fill = supergroup)) + 
        coord_flip() + xlab("") + ylab("Number of reads") + scale_fill_manual(values = supergroup_colors) + 
        ggtitle(one_strait)
    
    print(array_species[[one_strait]])
    
}

grid_fig <- cowplot::plot_grid(array_species[["Singapore"]], array_species[["Johor"]], 
    labels = c("A", "B"), align = "h", ncol = 2, label_size = 20)

ggsave("../fig/Fig_Abundant_Genus_Prok.png", grid_fig, height = 10, width = 20, 
    dpi = 300)

9.10 Bar plot of divisions per station

Note: some stations are completely missing heterotrophs (Only Opistokonta)

for (i in 1:3) {
    p <- plot_bar(ps_list$ps[[i]], x = "sample_label", fill = "division") + 
        geom_bar(aes(color = division, fill = division), stat = "identity", 
            position = "stack") + ggtitle(str_c("Division level - ", ps_list$title[[i]])) + 
        theme(axis.text.y = element_text(size = 10)) + theme(axis.text.x = element_text(angle = 0, 
        hjust = 0.5)) + coord_flip() + scale_fill_viridis(discrete = TRUE) + 
        scale_color_viridis(discrete = TRUE)
    print(p)
}

9.11 Bar plot of class per station

Only consider the abundant taxa

for (i in 1:3) {
    p <- plot_bar(ps_list_abundant$ps[[i]], x = "sample_label", fill = "class") + 
        geom_bar(aes(color = class, fill = class), stat = "identity", position = "stack") + 
        ggtitle(str_c("Class level - ", ps_list_abundant$title[[i]])) + theme(axis.text.y = element_text(size = 10)) + 
        theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) + coord_flip() + 
        scale_fill_viridis(discrete = TRUE) + scale_color_viridis(discrete = TRUE)
    print(p)
}

9.12 Compare by Straight, Site, Moonsoon (abundant OTUs only)

for (factor in c("strait", "location", "monsoon")) {
    for (i in 1:3) {
        ps_aggregate <- merge_samples(ps_list_abundant$ps[[i]], factor)
        ps_aggregate <- transform_sample_counts(ps_aggregate, function(x) 100 * 
            (x/sum(x)))
        p <- plot_bar(ps_aggregate, fill = "division") + geom_col(aes(color = division, 
            fill = division)) + ggtitle(str_c(ps_list_abundant$title[[i]], " - ", 
            factor)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, 
            hjust = 1)) + ylab("%") + scale_fill_viridis(discrete = TRUE) + 
            scale_color_viridis(discrete = TRUE)
        print(p)
    }
}

9.13 Compare by Moonsoon in Singapore straight (abundant OTUs)

for (i in 1:3) {
    ps_aggregate <- subset_samples(ps_list_abundant$ps[[i]], strait == "Singapore")
    ps_aggregate <- merge_samples(ps_aggregate, "monsoon")
    ps_aggregate <- transform_sample_counts(ps_aggregate, function(x) 100 * 
        (x/sum(x)))
    p <- plot_bar(ps_aggregate, fill = "division") + geom_col(aes(color = division, 
        fill = division)) + ggtitle(str_c(ps_list_abundant$title[[i]], " - ", 
        factor)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, 
        hjust = 1)) + ylab("%") + scale_fill_viridis(discrete = TRUE) + scale_color_viridis(discrete = TRUE)
    print(p)
}

9.14 Time series (abundant OTUs) aggregated for Singapore strait only

for (factor in c("date")) {
    for (i in 1:3) {
        ps_aggregate <- subset_samples(ps_list_abundant$ps[[i]], strait == "Singapore")
        ps_aggregate <- merge_samples(ps_aggregate, "date")
        ps_aggregate <- transform_sample_counts(ps_aggregate, function(x) 100 * 
            (x/sum(x)))
        p <- plot_bar(ps_aggregate, fill = "division") + geom_col(aes(color = division, 
            fill = division)) + ggtitle(str_c(ps_list_abundant$title[[i]], " - ", 
            factor)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, 
            hjust = 1)) + ylab("%") + scale_fill_viridis(discrete = TRUE) + 
            scale_color_viridis(discrete = TRUE)
        print(p)
    }
}

9.15 Time series (abundant OTUs) - Division level

for (i in 1:3) {
    ps_plot <- ps_list_abundant$ps[[i]]
    p <- plot_bar(ps_plot, x = "date", fill = "division") + facet_grid(rows = vars(location)) + 
        geom_col(aes(color = division, fill = division)) + ggtitle(str_c(ps_list_abundant$title[[i]], 
        " - Date")) + theme(axis.text.x = element_text(angle = 45, vjust = 0.5, 
        hjust = 1)) + ylab("%") + xlim(as.POSIXct(as.Date(c("2017-01-01", "2019-01-01")))) + 
        geom_vline(xintercept = as.POSIXct(as.Date(c("2017-01-01", "2018-01-01", 
            "2019-01-01")))) + scale_fill_viridis(discrete = TRUE) + scale_color_viridis(discrete = TRUE)
    print(p)
}

9.16 Time series (abundant OTUs) - Genus level for Chlorophyta

for (i in 3:3) {
    ps_plot <- subset_taxa(ps_list_abundant$ps[[i]], division == "Chlorophyta")
    p <- plot_bar(ps_plot, x = "date", fill = "genus") + facet_grid(rows = vars(location)) + 
        geom_col(aes(color = genus, fill = genus)) + ggtitle(str_c(ps_list_abundant$title[[i]], 
        " - Date")) + theme(axis.text.x = element_text(angle = 45, vjust = 0.5, 
        hjust = 1)) + ylab("%") + xlim(as.POSIXct(as.Date(c("2017-01-01", "2019-01-01")))) + 
        geom_vline(xintercept = as.POSIXct(as.Date(c("2017-01-01", "2018-01-01", 
            "2019-01-01")))) + scale_fill_viridis(discrete = TRUE) + scale_color_viridis(discrete = TRUE)
    print(p)
}

9.17 Main species for each division (Eukaryotes - Autrotrophs)

p <- list()
for (one_division in c("Chlorophyta", "Dinoflagellata", "Ochrophyta", "Cryptophyta", 
    "Haptophyta")) {
    ps_subset <- subset_samples(ps_photo_abundant, strait != "Raffles Mari")
    ps_subset <- subset_taxa(ps_subset, division %in% one_division)
    p[[one_division]] <- plot_bar(ps_subset, x = "species") + facet_grid(rows = vars(strait), 
        cols = vars(monsoon)) + geom_col() + theme(axis.text.x = element_text(angle = 90, 
        vjust = 0.5, hjust = 1)) + ggtitle(str_c(one_division, " - Abundant OTUs")) + 
        coord_flip()
    print(p[[one_division]])
}

# grid_array <- list(p[['Chlorophyta']],p[['Ochrophyta']]) grid_layout <-
# rbind(c(NA,1,1,1,1,1), c( 2,2,2,2,2,2)) grid_fig <-
# gridExtra::grid.arrange(grobs=grid_array, layout_matrix=grid_layout,
# clip=FALSE, padding = unit(0, 'line'), as.table = FALSE)
grid_fig <- cowplot::plot_grid(p[["Chlorophyta"]], p[["Ochrophyta"]], labels = c("A", 
    "B"), align = "v", nrow = 2)
ggsave("../fig/Fig_Species_Moonsoon.png", grid_fig, height = 10, width = 8, 
    dpi = 300)

9.18 Heatmaps

9.18.1 Abundant OTUs

Data are agglomarated at the genus level. Use function tax_glom

for (i in c(1)) {
    ps_heat <- tax_glom(ps_list_abundant$ps[[i]], taxrank = "family")
    p <- plot_heatmap(ps_heat, method = "NMDS", distance = "bray", taxa.label = "family", 
        taxa.order = "division", sample.label = "sample_label", sample.order = "sample_label", 
        low = "beige", high = "red", na.value = "beige", title = ps_list_abundant$title[[i]])
    print(p)
}

for (i in 2:3) {
    
    ps_heat <- tax_glom(ps_list_abundant$ps[[i]], taxrank = "genus")
    p <- plot_heatmap(ps_heat, method = "NMDS", distance = "bray", taxa.label = "genus", 
        taxa.order = "division", sample.label = "sample_label", sample.order = "sample_label", 
        low = "beige", high = "red", na.value = "beige", title = ps_list_abundant$title[[i]])
    print(p)
}

9.18.2 Chlorophyta at species level

All ASVs considered (not only abundant)

ps_heat <- subset_taxa(ps_photo, division == "Chlorophyta")

ps_heat <- tax_glom(ps_heat, taxrank = "species")

p <- plot_heatmap(ps_heat, method = "NMDS", distance = "bray", taxa.label = "species", 
    taxa.order = "species", sample.label = "sample_label", sample.order = "sample_label", 
    low = "beige", high = "red", na.value = "beige", trans = scales::log10_trans(), 
    title = "Mamiellophyceae in Singapore")
print(p)

9.18.3 Mamiello (Only Bathy, Ostreo and Micromonas) at genus level

All ASVs considered (not only abundant)

ps_heat <- subset_taxa(ps_photo, genus %in% c("Ostreococcus", "Bathycoccus", 
    "Micromonas"))

ps_heat <- tax_glom(ps_heat, taxrank = "genus")

p <- plot_heatmap(ps_heat, method = "NMDS", distance = "bray", taxa.label = "genus", 
    taxa.order = "genus", sample.label = "sample_label", sample.order = "sample_label", 
    low = "beige", high = "red", na.value = "beige", trans = scales::log_trans(10), 
    title = "Mamiellophyceae in Singapore")
print(p)

9.18.4 Mamiello (Only Bathy, Ostreo and Micromonas) at species level

All ASVs considered (not only abundant)

ps_heat <- subset_taxa(ps_photo, genus %in% c("Ostreococcus", "Bathycoccus", 
    "Micromonas"))

ps_heat <- tax_glom(ps_heat, taxrank = "species")

p <- plot_heatmap(ps_heat, method = "NMDS", distance = "bray", taxa.label = "species", 
    taxa.order = "species", sample.label = "sample_label", sample.order = "sample_label", 
    low = "beige", high = "red", na.value = "beige", trans = scales::log_trans(10), 
    title = "Mamiellophyceae in Singapore")
print(p)

9.19 NMDS

Sample removed because they were pulling the NMDS * PR2X16XS21 it has a single eukaryote (diatom bloom ) * RM13XS36 cause problem for bacteria * PR11XS25 cause problem for hetero euks * SBW11XS26 cause problem for hetero euks * SBW13XS37 cause problem for hetero euks * RM13XS36 cause problem for hetero euks

9.19.1 Define function

See comments inside functions (saved plots are different from displayed plots)

ps_do_nmds <- function(ps_list) {
    
    plot_array <- list()
    
    for (i in 1:3) {
        ps_nmds <- ps_list$ps[[i]]
        
        # Remove samples with no reads
        ps_nmds <- prune_samples(sample_sums(ps_nmds) > 0, ps_nmds)
        
        # Remove samples from Raffles
        ps_nmds <- subset_samples(ps_nmds, !(str_detect(strait, "Raffles Mari|Johor")))
        
        # Remove samples that caused problems (1= prok, 2=euk, 3=euk auto)
        if (i == 1) 
            {
                ps_nmds <- prune_samples(!(sample_names(ps_nmds) %in% c("RM13XS36")), 
                  ps_nmds)
            }  # Prokaryotes
        if (i %in% c(2, 3)) 
            {
                ps_nmds <- prune_samples(!(sample_names(ps_nmds) %in% c("PR2X16SXS21")), 
                  ps_nmds)
            }  # Eukaryotes
        if (i == 4) 
            {
                ps_nmds <- prune_samples(!(sample_names(ps_nmds) %in% c("PR11XS25", 
                  "SBW11XS26", "SBW13XS37")), ps_nmds)
            }  # Heterotrophs not used
        
        singa.ord <- ordinate(ps_nmds, "NMDS", "bray")
        
        # Fit environmental parameters
        env_var <- sample_variables(ps_nmds)
        env_matrix <- get_variable(ps_nmds, c("Chl", "Temperature", "Salinity", 
            "Phosphate", "Silicate", "DIN", "BAC"))
        env_fit <- vegan::envfit(singa.ord, env = env_matrix, perm = 999, na.rm = TRUE)
        env_arrows <- data.frame(env_fit$vectors$arrows * sqrt(env_fit$vectors$r)) %>% 
            rownames_to_column(var = "parameter")
        
        
        nmds_samples <- data.frame(singa.ord[["points"]], get_variable(ps_nmds, 
            c("monsoon", "strait", "location", "location_label"))) %>% rownames_to_column(var = "sample")
        
        # Factor to move the labels
        nudge_x <- max(nmds_samples$MDS1) * 0.08
        nudge_y <- max(nmds_samples$MDS2) * 0.08
        xy_max = max(c(nmds_samples$MDS1, nmds_samples$MDS2)) * 1.5
        xy_min = min(c(nmds_samples$MDS1, nmds_samples$MDS2)) * 1.5
        factor <- 3  # for vectors for euks
        if (i == 1) 
            {
                factor <- 1.5
            }  # for vectors for proks
        print(factor)
        
        p <- plot_ordination(ps_nmds, singa.ord, type = "samples", color = "monsoon", 
            shape = "strait", title = ps_list$title[[i]]) + geom_point(aes(shape = strait, 
            color = monsoon), size = 3.5) + scale_color_viridis(discrete = TRUE) + 
            scale_shape_manual(values = c(15, 16)) + geom_text(aes(label = location_label, 
            color = monsoon), nudge_x = nudge_x, nudge_y = nudge_y, check_overlap = TRUE, 
            size = 2) + theme_bw() + geom_segment(data = env_arrows, aes(x = 0, 
            xend = NMDS1 * factor, y = 0, yend = NMDS2 * factor), inherit.aes = FALSE, 
            arrow = arrow(length = unit(0.5, "cm")), colour = "black") + geom_text(data = env_arrows, 
            aes(x = NMDS1 * factor, y = NMDS2 * factor, label = parameter), 
            inherit.aes = FALSE, hjust = -0.2, vjust = -0.2, size = 3)
        print(singa.ord)
        plot_array[[i]] <- p
        print(p)
        
        # The following lines can be used if you want to avoid using the pjhyloseq
        # functions to plot the data.  Notes : - must use inherit.aes = FALSE to add
        # some extra layers - the saved plots have a different scale for the added
        # layer than the displaued plot can figure out ggplot()+ coord_fixed() +
        # xlim(xy_min, xy_max) + ylim(xy_min, xy_max) +
        # geom_point(data=nmds_samples, aes(x=MDS1, y=MDS2, shape=strait,
        # color=monsoon), size=5) + geom_text(data=nmds_samples, aes(x=MDS1, y=MDS2,
        # label=location_label, color=monsoon), nudge_x=nudge_x, nudge_y=nudge_y,
        # check_overlap = FALSE, size=2) + ggtitle(ps_list$title[[i]]) +
        
        p <- plot_ordination(ps_nmds, singa.ord, type = "taxa", color = "division", 
            title = ps_list$title[[i]]) + scale_color_viridis(discrete = TRUE) + 
            geom_point(size = 3) + theme_bw() + geom_segment(data = env_arrows, 
            aes(x = 0, xend = NMDS1 * factor, y = 0, yend = NMDS2 * factor), 
            inherit.aes = FALSE, arrow = arrow(length = unit(0.5, "cm")), colour = "black") + 
            geom_text(data = env_arrows, aes(x = NMDS1 * factor, y = NMDS2 * 
                factor, label = parameter), inherit.aes = FALSE, hjust = -0.2, 
                vjust = -0.2, size = 3)
        
        print(p)
        plot_array[[i + 3]] <- p
    }
    return(plot_array)
}

9.19.2 All OTUs

plot_array_all <- ps_do_nmds(ps_list)

Square root transformation
Wisconsin double standardization
Run 0 stress 0.12 
Run 1 stress 0.12 
... New best solution
... Procrustes: rmse 0.032  max resid 0.14 
Run 2 stress 0.12 
... Procrustes: rmse 0.00021  max resid 0.00067 
... Similar to previous best
Run 3 stress 0.12 
Run 4 stress 0.12 
... Procrustes: rmse 0.0028  max resid 0.012 
Run 5 stress 0.12 
... Procrustes: rmse 4.9e-05  max resid 0.00016 
... Similar to previous best
Run 6 stress 0.12 
... Procrustes: rmse 0.0029  max resid 0.012 
Run 7 stress 0.12 
... New best solution
... Procrustes: rmse 2.5e-05  max resid 6.7e-05 
... Similar to previous best
Run 8 stress 0.15 
Run 9 stress 0.12 
... Procrustes: rmse 0.0031  max resid 0.013 
Run 10 stress 0.12 
Run 11 stress 0.12 
Run 12 stress 0.12 
... Procrustes: rmse 7.8e-05  max resid 0.00026 
... Similar to previous best
Run 13 stress 0.12 
... Procrustes: rmse 0.0031  max resid 0.013 
Run 14 stress 0.39 
Run 15 stress 0.12 
... Procrustes: rmse 2.2e-05  max resid 6.2e-05 
... Similar to previous best
Run 16 stress 0.12 
... Procrustes: rmse 4.7e-05  max resid 0.00014 
... Similar to previous best
Run 17 stress 0.17 
Run 18 stress 0.12 
... Procrustes: rmse 3.6e-05  max resid 0.00014 
... Similar to previous best
Run 19 stress 0.12 
... Procrustes: rmse 0.00018  max resid 0.00055 
... Similar to previous best
Run 20 stress 0.16 
*** Solution reached
[1] 1.5

Call:
metaMDS(comm = veganifyOTU(physeq), distance = distance) 

global Multidimensional Scaling using monoMDS

Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
Distance: bray 

Dimensions: 2 
Stress:     0.12 
Stress type 1, weak ties
Two convergent solutions found after 20 tries
Scaling: centring, PC rotation, halfchange scaling 
Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))'

Square root transformation
Wisconsin double standardization
Run 0 stress 0.22 
Run 1 stress 0.24 
Run 2 stress 0.23 
Run 3 stress 0.23 
Run 4 stress 0.23 
Run 5 stress 0.24 
Run 6 stress 0.22 
Run 7 stress 0.26 
Run 8 stress 0.24 
Run 9 stress 0.22 
... New best solution
... Procrustes: rmse 0.074  max resid 0.24 
Run 10 stress 0.39 
Run 11 stress 0.23 
Run 12 stress 0.25 
Run 13 stress 0.24 
Run 14 stress 0.24 
Run 15 stress 0.23 
Run 16 stress 0.24 
Run 17 stress 0.25 
Run 18 stress 0.22 
... Procrustes: rmse 0.084  max resid 0.3 
Run 19 stress 0.22 
Run 20 stress 0.28 
*** No convergence -- monoMDS stopping criteria:
    20: stress ratio > sratmax
[1] 3

Call:
metaMDS(comm = veganifyOTU(physeq), distance = distance) 

global Multidimensional Scaling using monoMDS

Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
Distance: bray 

Dimensions: 2 
Stress:     0.22 
Stress type 1, weak ties
No convergent solutions - best solution after 20 tries
Scaling: centring, PC rotation, halfchange scaling 
Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))'

Square root transformation
Wisconsin double standardization
Run 0 stress 0.2 
Run 1 stress 0.18 
... New best solution
... Procrustes: rmse 0.071  max resid 0.31 
Run 2 stress 0.22 
Run 3 stress 0.2 
Run 4 stress 0.19 
Run 5 stress 0.19 
Run 6 stress 0.18 
... Procrustes: rmse 4.1e-05  max resid 0.00012 
... Similar to previous best
Run 7 stress 0.19 
Run 8 stress 0.26 
Run 9 stress 0.18 
... New best solution
... Procrustes: rmse 5.5e-05  max resid 0.00026 
... Similar to previous best
Run 10 stress 0.18 
Run 11 stress 0.21 
Run 12 stress 0.18 
... New best solution
... Procrustes: rmse 7.2e-05  max resid 0.00035 
... Similar to previous best
Run 13 stress 0.18 
... New best solution
... Procrustes: rmse 5e-05  max resid 0.00021 
... Similar to previous best
Run 14 stress 0.2 
Run 15 stress 0.19 
Run 16 stress 0.23 
Run 17 stress 0.21 
Run 18 stress 0.18 
... Procrustes: rmse 0.00017  max resid 0.00081 
... Similar to previous best
Run 19 stress 0.19 
Run 20 stress 0.19 
*** Solution reached
[1] 3

Call:
metaMDS(comm = veganifyOTU(physeq), distance = distance) 

global Multidimensional Scaling using monoMDS

Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
Distance: bray 

Dimensions: 2 
Stress:     0.18 
Stress type 1, weak ties
Two convergent solutions found after 20 tries
Scaling: centring, PC rotation, halfchange scaling 
Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))'

9.19.3 Abundant OTUs

plot_array_abundant <- ps_do_nmds(ps_list_abundant)

Square root transformation
Wisconsin double standardization
Run 0 stress 0.17 
Run 1 stress 0.16 
... New best solution
... Procrustes: rmse 0.046  max resid 0.2 
Run 2 stress 0.17 
Run 3 stress 0.16 
... Procrustes: rmse 0.006  max resid 0.023 
Run 4 stress 0.16 
Run 5 stress 0.2 
Run 6 stress 0.18 
Run 7 stress 0.17 
Run 8 stress 0.17 
Run 9 stress 0.17 
Run 10 stress 0.17 
Run 11 stress 0.17 
Run 12 stress 0.19 
Run 13 stress 0.17 
Run 14 stress 0.17 
Run 15 stress 0.22 
Run 16 stress 0.17 
Run 17 stress 0.18 
Run 18 stress 0.17 
Run 19 stress 0.23 
Run 20 stress 0.18 
*** No convergence -- monoMDS stopping criteria:
    20: stress ratio > sratmax
[1] 1.5

Call:
metaMDS(comm = veganifyOTU(physeq), distance = distance) 

global Multidimensional Scaling using monoMDS

Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
Distance: bray 

Dimensions: 2 
Stress:     0.16 
Stress type 1, weak ties
No convergent solutions - best solution after 20 tries
Scaling: centring, PC rotation, halfchange scaling 
Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))'

Square root transformation
Wisconsin double standardization
Run 0 stress 0.21 
Run 1 stress 0.27 
Run 2 stress 0.21 
... New best solution
... Procrustes: rmse 9.4e-05  max resid 0.00032 
... Similar to previous best
Run 3 stress 0.24 
Run 4 stress 0.25 
Run 5 stress 0.25 
Run 6 stress 0.21 
... Procrustes: rmse 0.019  max resid 0.087 
Run 7 stress 0.21 
... Procrustes: rmse 0.00021  max resid 7e-04 
... Similar to previous best
Run 8 stress 0.21 
... Procrustes: rmse 0.018  max resid 0.086 
Run 9 stress 0.27 
Run 10 stress 0.24 
Run 11 stress 0.24 
Run 12 stress 0.26 
Run 13 stress 0.21 
... Procrustes: rmse 7.8e-05  max resid 0.00028 
... Similar to previous best
Run 14 stress 0.26 
Run 15 stress 0.23 
Run 16 stress 0.25 
Run 17 stress 0.22 
Run 18 stress 0.24 
Run 19 stress 0.24 
Run 20 stress 0.21 
... New best solution
... Procrustes: rmse 3.6e-05  max resid 7.7e-05 
... Similar to previous best
*** Solution reached
[1] 3

Call:
metaMDS(comm = veganifyOTU(physeq), distance = distance) 

global Multidimensional Scaling using monoMDS

Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
Distance: bray 

Dimensions: 2 
Stress:     0.21 
Stress type 1, weak ties
Two convergent solutions found after 20 tries
Scaling: centring, PC rotation, halfchange scaling 
Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))'

Square root transformation
Wisconsin double standardization
Run 0 stress 0.21 
Run 1 stress 0.21 
... New best solution
... Procrustes: rmse 0.08  max resid 0.23 
Run 2 stress 0.2 
... New best solution
... Procrustes: rmse 0.098  max resid 0.27 
Run 3 stress 0.21 
Run 4 stress 0.2 
Run 5 stress 0.23 
Run 6 stress 0.25 
Run 7 stress 0.21 
Run 8 stress 0.25 
Run 9 stress 0.23 
Run 10 stress 0.24 
Run 11 stress 0.2 
Run 12 stress 0.26 
Run 13 stress 0.22 
Run 14 stress 0.21 
Run 15 stress 0.24 
Run 16 stress 0.2 
Run 17 stress 0.2 
Run 18 stress 0.25 
Run 19 stress 0.2 
... Procrustes: rmse 0.00017  max resid 0.00058 
... Similar to previous best
Run 20 stress 0.22 
*** Solution reached
[1] 3

Call:
metaMDS(comm = veganifyOTU(physeq), distance = distance) 

global Multidimensional Scaling using monoMDS

Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
Distance: bray 

Dimensions: 2 
Stress:     0.2 
Stress type 1, weak ties
Two convergent solutions found after 20 tries
Scaling: centring, PC rotation, halfchange scaling 
Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))'

### MSDS graph

grid_fig <- cowplot::plot_grid(plot_array_abundant[[1]], plot_array_abundant[[4]], 
    plot_array_abundant[[3]], plot_array_abundant[[6]], ncol = 2, labels = c("A", 
        "B", "C", "D"), align = "hv", label_size = 20)
grid_fig

ggsave("../fig/Fig_NMDS-Singapore.png", grid_fig, height = 12, width = 18, dpi = 300)

9.20 Network analysis

for (i in 1:3) {
    
    ps_nmds <- ps_list_abundant$ps[[i]]
    
    # Remove samples with no reads
    ps_nmds <- prune_samples(sample_sums(ps_nmds) > 0, ps_nmds)
    
    # Remove samples that caused problems (1= prok, 2=euk, 3=euk auto)
    if (i == 1) 
        {
            ps_nmds <- prune_samples(!(sample_names(ps_nmds) %in% c("RM13XS36")), 
                ps_nmds)
        }  # Prokaryotes
    if (i %in% c(2, 3)) 
        {
            ps_nmds <- prune_samples(!(sample_names(ps_nmds) %in% c("PR2X16SXS21")), 
                ps_nmds)
        }  # Eukaryotes
    if (i == 4) 
        {
            ps_nmds <- prune_samples(!(sample_names(ps_nmds) %in% c("PR11XS25", 
                "SBW11XS26", "SBW13XS37")), ps_nmds)
        }  # Heterotrophs not used
    
    if (i > 1) {
        p <- plot_net(ps_nmds, distance = "(A+B-2*J)/(A+B)", type = "taxa", 
            maxdist = 0.4, color = "class", point_label = "genus") + ggtitle(ps_list_abundant$title[[i]])
    } else {
        p <- plot_net(ps_nmds, distance = "(A+B-2*J)/(A+B)", type = "taxa", 
            maxdist = 0.4, color = "class", point_label = "family") + ggtitle(ps_list_abundant$title[[i]])
    }
    
    print(p)
}

Metabarcoding Singapore ASV from Dada2

Metabarcoding Singapore ASV from Dada2

1 Aim

2 Initialize

3 Read the data

3.1 File names

3.2 Read the files

3.3 Only keep the eukaryotes in the OTU file

4 Assignment of eukaryotic ASVs based on PR2 database

4.1 Use dada2 to reassign to PR2

4.2 Read the PR2 assignement and merge with initial otu table

4.3 Export fasta file with taxonomy for Mamiellophyceae

5 Read reassignments for some prokaryotes

5.1 Synechococcus

6 Process BLAST file

7 Map

7.1 Leaflet map

7.2 Use world map package

7.3 Use R naturalearth package

8 Environmental data

8.1 Per station

8.1.1 Temp

8.1.2 Chlorophyll

8.1.3 Rain over 7 days

8.2 Average

8.2.1 Rain over 7 days

9 Phyloseq analysis

9.1 Create phyloseq files for all taxa after filtering the data

9.1.1 Write the final ASV table

9.1.2 Create the phyloseq file

9.2 Break up into photosynthetic and non-photosynthetic

9.3 Normalize number of reads in each sample using median sequencing depth.

9.4 Phyloseq files for abundant taxa

9.5 Create a list for the auto and hetero phyloseq files

9.6 Create tabular files for other plots

9.7 Treemaps at division and class levels

9.7.1 Define function to draw treemaps

9.7.2 Do the individual treemaps

9.7.3 Arrange the different treemaps in a grid and save

9.8 Most abundant species (photosynthetic)

9.9 Most abundant prokaryotes taxo6 (~ genus)

9.10 Bar plot of divisions per station

9.11 Bar plot of class per station

9.12 Compare by Straight, Site, Moonsoon (abundant OTUs only)

9.13 Compare by Moonsoon in Singapore straight (abundant OTUs)

9.14 Time series (abundant OTUs) aggregated for Singapore strait only

9.15 Time series (abundant OTUs) - Division level

9.16 Time series (abundant OTUs) - Genus level for Chlorophyta

9.17 Main species for each division (Eukaryotes - Autrotrophs)

9.18 Heatmaps

9.18.1 Abundant OTUs

9.18.2 Chlorophyta at species level

9.18.3 Mamiello (Only Bathy, Ostreo and Micromonas) at genus level

9.18.4 Mamiello (Only Bathy, Ostreo and Micromonas) at species level

9.19 NMDS

9.19.1 Define function

9.19.2 All OTUs

9.19.3 Abundant OTUs

9.20 Network analysis

Metabarcoding Singapore
ASV from Dada2

Metabarcoding Singapore
ASV from Dada2