Metabarcoding Singapore
Euks at 97%

Initialize. This file defines all the necessary libraries and variables

  source('Metabarcoding Singapore_init.R', echo=FALSE)

1 Aim

  • Assign and analyze eukaryotes for Singapore metabarcoding data (97%)

2 Analysis

2.1 Assignment of 97% OTUs based on PR2 database

2.1.1 Read the data

# Read the sample table
sample_table <- read_excel("../qiime/Singapore otu_table_w_tax_0.97.xlsx", sheet = "samples")

# Read the otu table
otu_table <- read_excel("../qiime/Singapore otu_table_w_tax_0.97.xlsx", sheet = "otu_table")

# Clean up the taxonomy
otu_table <- otu_table %>% mutate(taxo_clean = str_replace_all(taxonomy, "D_[0-9]+__", 
    "")) %>% separate(col = taxo_clean, into = str_c("taxo", c(1:7)), sep = "; ")

# Read the sequences
otu_sequences <- readAAStringSet("../qiime/otu_rep_97.fna")
otu_sequences.df <- data.frame(names = names(otu_sequences), sequence = as.character(otu_sequences))

# Split the otu sequences name into the otu id given by qiime and the name
# of the representative sequence Remove the primers
fwd_length = 20
rev_length = 15
otu_sequences.df <- otu_sequences.df %>% separate(col = names, into = c("otu_id_qiime", 
    "otu_rep_seq"), sep = " ") %>% mutate(sequence = str_sub(sequence, start = fwd_length + 
    1, end = -rev_length - 1))

2.1.2 Only keep the eukaryotes in the OTU file

otu_table_euk <- otu_table %>% filter(str_detect(taxonomy, "Eukaryota|Unassigned"))
otu_table_euk <- left_join(otu_table_euk, otu_sequences.df)

# Write the fasta file file
otu_sequences_euk <- otu_table_euk %>% transmute(sequence = sequence, seq_name = otu_id)
fasta_write(otu_sequences_euk, file_name = "../qiime/otu_rep_97_euk.fasta", 
    compress = FALSE, taxo_include = FALSE)
[1] TRUE

2.1.3 Use dada2 to reassign to PR2

dada2_assign(seq_file_name = "../qiime/otu_rep_97_euk.fasta")

2.1.4 Read the PR2 assignement and merge with initial otu table

otu_euk_pr2 <- read_tsv("../qiime/otu_rep_97_euk.dada2.taxo")
otu_euk_pr2_boot <- read_tsv("../qiime/otu_rep_97_euk.dada2.boot") %>% rename_all(funs(str_c(., 
    "_boot"))) %>% rename(seq_name = seq_name_boot)
otu_euk_pr2 <- left_join(otu_euk_pr2, otu_euk_pr2_boot) %>% rename(otu_id = seq_name)
otu_table_euk_final <- left_join(otu_table_euk, otu_euk_pr2) %>% select(otu_id, 
    otu_id_qiime, taxo1:taxo7, kingdom:species_boot, nseq:EC11.S24, otu_rep_seq, 
    sequence)

write_tsv(otu_table_euk_final, "../qiime/otu_table_97_euk_final.tsv")

2.2 Phyloseq analysis

2.2.1 Create phyloseq files after filtering the data

Filter the euk data to remove the low bootstraps values (threshold : bootstrap > 65% at the supergroup level) and create a phyloseq file

otu_table_euk_final <- otu_table_euk_final %>% filter(supergroup_boot > 65)
str_c("Number of euk otus : ", nrow(otu_table_euk_final))
[1] "Number of euk otus : 561"
otu_mat <- otu_table_euk_final %>% select(otu = otu_id, BLOOM10.S22:EC11.S24)
tax_mat <- otu_table_euk_final %>% select(otu = otu_id, kingdom:species)
samples_df <- sample_table %>% rename(sample = sample_id)

row.names(otu_mat) <- otu_mat$otu
otu_mat <- otu_mat %>% select(-otu)
row.names(tax_mat) <- tax_mat$otu
tax_mat <- tax_mat %>% select(-otu)
row.names(samples_df) <- samples_df$sample
samples_df <- samples_df %>% select(-sample)
otu_mat <- as.matrix(otu_mat)
tax_mat <- as.matrix(tax_mat)

OTU = otu_table(otu_mat, taxa_are_rows = TRUE)
TAX = tax_table(tax_mat)
samples = sample_data(samples_df)

singa.ps <- phyloseq(OTU, TAX, samples)
singa.ps
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 561 taxa and 64 samples ]
sample_data() Sample Data:       [ 64 samples by 5 sample variables ]
tax_table()   Taxonomy Table:    [ 561 taxa by 8 taxonomic ranks ]

Break up into photosynthetic and non-photosynthetic (Metazoa are removed)

singa_photo <- subset_taxa(singa.ps, (division %in% c("Chlorophyta", "Dinoflagellata", 
    "Cryptophyta", "Rhodophyta", "Haptophyta", "Ochrophyta", "Cercozoa")) & 
    !(class %in% c("Syndiniales", "Sarcomonadea")))
singa_hetero <- subset_taxa(singa.ps, !(division %in% c("Chlorophyta", "Dinoflagellata", 
    "Cryptophyta", "Rhodophyta", "Haptophyta", "Ochrophyta", "Cercozoa", "Metazoa")) | 
    (class %in% c("Syndiniales", "Sarcomonadea")))

Normalize number of reads in each sample using median sequencing depth.

total_photo = median(sample_sums(singa_photo))
sprintf("The number of reads used for normalization of autotrops is  %.0f", 
    total_photo)
[1] "The number of reads used for normalization of autotrops is  658"
standf = function(x, t = total_photo) (t * (x/sum(x)))
singa_photo = transform_sample_counts(singa_photo, standf)

total_hetero = median(sample_sums(singa_hetero))
sprintf("The number of reads used for normalization of heterotrophs is  %.0f", 
    total_hetero)
[1] "The number of reads used for normalization of heterotrophs is  208"
standf = function(x, t = total_hetero) (t * (x/sum(x)))
singa_hetero = transform_sample_counts(singa_hetero, standf)

Remove taxa which are in low abundace (< 0.10 in any given sample)

contrib_min <- 0.1
singa_photo <- filter_taxa(singa_photo, function(x) sum(x > total_photo * contrib_min) > 
    0, TRUE)
singa_hetero <- filter_taxa(singa_hetero, function(x) sum(x > total_hetero * 
    contrib_min) > 0, TRUE)

Create a list for the auto and hetero phyloseq files

ps_list <- list(ps = c(singa_photo, singa_hetero), title = c("Autotrophs", "Heterotrophs"))

2.2.2 Bar plot of classes per station for auto and hetero

for (i in 1:2) {
    p <- plot_bar(ps_list$ps[[i]], fill = "class") + geom_bar(aes(color = class, 
        fill = class), stat = "identity", position = "stack") + ggtitle(ps_list$title[[i]])
    print(p)
}

2.2.3 Compare by Straight, Site, Moonsoon

for (factor in c("strait", "site", "moonsoon")) {
    for (i in 1:2) {
        ps_aggregate <- merge_samples(ps_list$ps[[i]], factor)
        p <- plot_bar(ps_aggregate, fill = "division") + geom_bar(aes(color = division, 
            fill = division), stat = "identity", position = "stack") + ggtitle(str_c(ps_list$title[[i]], 
            " - ", factor))
        print(p)
    }
}

2.2.4 Main genera for different division of Autotrophs

for (one_division in c("Chlorophyta", "Dinoflagellata", "Ochrophyta")) {
    ps_subset <- subset_taxa(singa_photo, division %in% one_division)
    p <- plot_bar(ps_subset, x = "genus", fill = "genus", facet_grid = strait ~ 
        moonsoon) + geom_bar(aes(color = genus, fill = genus), stat = "identity", 
        position = "stack") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, 
        hjust = 1)) + ggtitle(one_division)
    print(p)
}

2.2.5 Main species of Mamiellophyceae and Diatoms

for (one_class in c("Mamiellophyceae", "Bacillariophyta")) {
    ps_subset <- subset_taxa(singa_photo, class %in% one_class)
    p <- plot_bar(ps_subset, x = "species", fill = "species", facet_grid = strait ~ 
        moonsoon) + geom_bar(aes(color = species, fill = species), stat = "identity", 
        position = "stack") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, 
        hjust = 1)) + ggtitle(one_class)
    print(p)
}

2.2.6 Heatmaps

for (i in 1:2) {
    
    p <- plot_heatmap(ps_list$ps[[i]], method = "NMDS", distance = "bray", taxa.label = "species", 
        taxa.order = "division", low = "beige", high = "red", na.value = "beige", 
        title = ps_list$title[[i]])
    print(p)
}

2.2.7 NMDS

singa.ord <- ordinate(singa.ps, "NMDS", "bray")
Square root transformation
Wisconsin double standardization
Run 0 stress 0.14 
Run 1 stress 0.15 
Run 2 stress 0.15 
Run 3 stress 0.15 
Run 4 stress 0.14 
... New best solution
... Procrustes: rmse 0.063  max resid 0.36 
Run 5 stress 0.14 
Run 6 stress 0.15 
Run 7 stress 0.14 
Run 8 stress 0.15 
Run 9 stress 0.14 
Run 10 stress 0.14 
Run 11 stress 0.15 
Run 12 stress 0.15 
Run 13 stress 0.15 
Run 14 stress 0.15 
Run 15 stress 0.14 
Run 16 stress 0.14 
Run 17 stress 0.16 
Run 18 stress 0.14 
Run 19 stress 0.15 
Run 20 stress 0.15 
*** No convergence -- monoMDS stopping criteria:
    20: stress ratio > sratmax
print(singa.ord)

Call:
metaMDS(comm = veganifyOTU(physeq), distance = distance) 

global Multidimensional Scaling using monoMDS

Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
Distance: bray 

Dimensions: 2 
Stress:     0.14 
Stress type 1, weak ties
No convergent solutions - best solution after 20 tries
Scaling: centring, PC rotation, halfchange scaling 
Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))' 
p <- plot_ordination(singa.ps, singa.ord, type = "samples", color = "moonsoon", 
    shape = "strait", title = "All euks OTUs") + geom_point(size = 3)
print(p)

for (i in 1:2) {
    
    singa.ord <- ordinate(ps_list$ps[[i]], "NMDS", "bray")
    print(singa.ord)
    
    p <- plot_ordination(ps_list$ps[[i]], singa.ord, type = "samples", color = "moonsoon", 
        shape = "strait", title = str_c(ps_list$title[[i]], " - Only abundant OTUs (> 10% in at least one sample)")) + 
        geom_point(size = 3)
    print(p)
}
Square root transformation
Wisconsin double standardization
Run 0 stress 0.15 
Run 1 stress 0.15 
Run 2 stress 0.17 
Run 3 stress 0.16 
Run 4 stress 0.16 
Run 5 stress 0.16 
Run 6 stress 0.18 
Run 7 stress 0.17 
Run 8 stress 0.17 
Run 9 stress 0.16 
Run 10 stress 0.18 
Run 11 stress 0.17 
Run 12 stress 0.15 
Run 13 stress 0.15 
Run 14 stress 0.15 
Run 15 stress 0.17 
Run 16 stress 0.15 
Run 17 stress 0.16 
Run 18 stress 0.16 
Run 19 stress 0.16 
Run 20 stress 0.18 
*** No convergence -- monoMDS stopping criteria:
    20: stress ratio > sratmax

Call:
metaMDS(comm = veganifyOTU(physeq), distance = distance) 

global Multidimensional Scaling using monoMDS

Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
Distance: bray 

Dimensions: 2 
Stress:     0.15 
Stress type 1, weak ties
No convergent solutions - best solution after 20 tries
Scaling: centring, PC rotation, halfchange scaling 
Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))' 

Square root transformation
Wisconsin double standardization
Run 0 stress 0.19 
Run 1 stress 0.23 
Run 2 stress 0.2 
Run 3 stress 0.21 
Run 4 stress 0.19 
... New best solution
... Procrustes: rmse 0.048  max resid 0.28 
Run 5 stress 0.21 
Run 6 stress 0.19 
Run 7 stress 0.2 
Run 8 stress 0.22 
Run 9 stress 0.2 
Run 10 stress 0.22 
Run 11 stress 0.21 
Run 12 stress 0.21 
Run 13 stress 0.2 
Run 14 stress 0.22 
Run 15 stress 0.21 
Run 16 stress 0.21 
Run 17 stress 0.23 
Run 18 stress 0.21 
Run 19 stress 0.2 
Run 20 stress 0.21 
*** No convergence -- monoMDS stopping criteria:
    20: stress ratio > sratmax

Call:
metaMDS(comm = veganifyOTU(physeq), distance = distance) 

global Multidimensional Scaling using monoMDS

Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
Distance: bray 

Dimensions: 2 
Stress:     0.19 
Stress type 1, weak ties
No convergent solutions - best solution after 20 tries
Scaling: centring, PC rotation, halfchange scaling 
Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))' 

2.2.8 Network analysis

for (i in 1:2) {
    
    p <- plot_net(ps_list$ps[[i]], distance = "(A+B-2*J)/(A+B)", type = "taxa", 
        maxdist = 0.4, color = "class", point_label = "genus") + ggtitle(ps_list$title[[i]])
    
    print(p)
}

Daniel Vaulot, Adriana Lopes dos Santos

06 07 2018