RCC Database - Export custom lists

1 Aim

Export specific custom RCC lists

2 Initialize.

The sourced file `RCC_init.R` loads the necessary libraries and defines the variables used below.

  source('RCC_init.R')

3 Export part of the database

3.1 Moore Foundation genetic project

Micromonas, Ostreococcus, Bathycoccus, Aureococcus, Pelagomonas

# Genera selected for the Moore Foundation export.
genus_list <- c(
  "Micromonas",
  "Ostreococcus",
  "Bathycoccus",
  "Aureococcus",
  "Pelagomonas"
)

# Keep the strains of these genera whose `transformed` and `not_pure` flags are
# both 0, sort them taxonomically, and retain only the columns to be exported.
cultures_genus <- cultures %>%
  filter(
    genus %in% genus_list,
    transformed == 0,
    not_pure == 0
  ) %>%
  arrange(division, class, genus, species, clade, rcc_id) %>%
  select(
    division, class, genus, species, clade,
    rcc_id, strain_name, strain_name_synonyms,
    clonal, cryopreserved, active_transfer_stopped,
    sampling_ocean, sampling_regional_sea, sampling_station,
    sampling_depth, sampling_date,
    Latitude, Longitude
  )
# Export disabled (machine-specific path):
# write_tsv(cultures_genus, 'C:/Users/vaulot/Google Drive/Projects/2015
# Moore Foundation Banyuls/strains_moore.txt', na='')

3.2 Cultures isolated from Roscoff

# Number of strains for each sampling station / site / regional sea combination.
# Not used further in this section. ungroup() is applied so the table does not
# silently carry the residual grouping left by summarise().
list_sampling_stations <- cultures %>%
  group_by(sampling_station, sampling_site, sampling_regional_sea) %>%
  summarise(n_strains = n()) %>%
  ungroup()

# Strains whose sampling station matches Roscoff, SOMLIT or Santec, or whose
# sampling site matches Penz or Morlaix, sorted taxonomically.
cultures_Roscoff <- cultures %>%
  filter(
    str_detect(sampling_station, "Roscoff|SOMLIT|Santec") |
      str_detect(sampling_site, "Penz|Morlaix")
  ) %>%
  arrange(domain, division, class, genus, species, clade, rcc_id) %>%
  select(
    domain, division, class, genus, species, clade,
    rcc_id, strain_name, strain_name_synonyms,
    sampling_site, sampling_station, sampling_depth, sampling_date,
    Latitude, Longitude
  )
# dv_save(cultures_Roscoff, 'C:/Users/vaulot/Desktop/strains_Roscoff.txt')

# Summarize by species: one row per taxon with its number of strains.
# ungroup() drops the grouping that summarise() would otherwise leave behind.
cultures_Roscoff_summary <- cultures_Roscoff %>%
  group_by(domain, division, class, genus, species, clade) %>%
  summarise(n_strains = n()) %>%
  ungroup()
# write_tsv(cultures_Roscoff_summary,
# 'C:/Users/vaulot/Desktop/strains_Roscoff_summary.txt', na='')

3.3 Pelagophyceae

3.3.1 Export tables

# Strains of class Pelagophyceae, sorted taxonomically, with the columns needed
# for the export.
cultures_Pelago <- cultures %>%
  filter(class == "Pelagophyceae") %>%
  arrange(class, order, family, genus, species, rcc_id) %>%
  select(
    class, order, family, genus, species,
    rcc_id, strain_name, strain_name_synonyms,
    sampling_ocean, sampling_regional_sea, sampling_depth, sampling_date,
    Latitude, Longitude
  )

# All sequences whose gene name contains "18S", with their length.
sequences_18S <- sequences %>%
  filter(str_detect(gene_name, "18S")) %>%
  select(rcc_id, genbank_accession, sequence) %>%
  rename(accession_18S = genbank_accession, sequence_18S = sequence) %>%
  mutate(sequence_18S_length = str_length(sequence_18S))

# One 18S sequence per strain: after sorting by decreasing length, distinct()
# keeps the first (i.e. longest) row of each rcc_id.
sequences_18S_longer <- sequences_18S %>%
  arrange(rcc_id, desc(sequence_18S_length)) %>%
  distinct(rcc_id, .keep_all = TRUE)

# All sequences whose gene name contains "ITS". These are NOT reduced to one
# per strain, so a strain with several ITS sequences yields several rows in
# the joined table below.
sequences_ITS <- sequences %>%
  filter(str_detect(gene_name, "ITS")) %>%
  select(rcc_id, genbank_accession, sequence) %>%
  rename(accession_ITS = genbank_accession, sequence_ITS = sequence)

# Number of DNA samples per strain.
dna_summary <- dna %>%
  group_by(rcc_id) %>%
  summarise(n_dna_samples = n())

# Attach sequences and DNA counts to the strain table. rcc_id is the only
# column shared by these tables, so the key is stated explicitly instead of
# relying on the natural-join guess (and its console message).
cultures_Pelago <- cultures_Pelago %>%
  left_join(sequences_18S_longer, by = "rcc_id") %>%
  left_join(sequences_ITS, by = "rcc_id") %>%
  left_join(dna_summary, by = "rcc_id")

# DNA samples available for the Pelagophyceae strains. The strain keys are
# de-duplicated first because cultures_Pelago can hold several rows per strain
# (one per ITS sequence), which would otherwise repeat every DNA sample.
# The join with `dna` is left as a natural join because the columns of `dna`
# are not visible here -- NOTE(review): confirm the intended key(s).
dna_Pelago <- cultures_Pelago %>%
  select(rcc_id, species, strain_name) %>%
  distinct() %>%
  left_join(dna) %>%
  filter(!is.na(dna_date))

write_tsv(cultures_Pelago, "C:/Users/vaulot/Desktop/RCC_Pelago.txt", na = "")
write_tsv(dna_Pelago, "C:/Users/vaulot/Desktop/RCC_Pelago_DNA.txt", na = "")

3.3.2 Export fasta

# Strains that have an 18S accession, one row per strain, with a FASTA header
# of the form  accession|RCC<id>|strain_name|species.
#  - distinct(): cultures_Pelago can contain several rows per strain (one per
#    ITS sequence); without it the same 18S record would be written repeatedly.
#  - str_replace_na(): str_c() returns NA as soon as one piece is NA, which
#    would turn the whole header into NA; missing fields are left empty instead.
cultures_Pelago_18S <- cultures_Pelago %>%
  filter(!is.na(accession_18S)) %>%
  distinct(rcc_id, .keep_all = TRUE) %>%
  mutate(
    sequence_name = str_c(
      accession_18S,
      str_c("RCC", rcc_id),
      str_replace_na(strain_name, ""),
      str_replace_na(species, ""),
      sep = "|"
    )
  )

# Store the sequences in a DNAStringSet, named with the FASTA headers
seq_out <- DNAStringSet(cultures_Pelago_18S$sequence_18S)
names(seq_out) <- cultures_Pelago_18S$sequence_name

writeXStringSet(seq_out, "C:/Users/vaulot/Desktop/RCC_Pelago_18S.fasta", compress = FALSE)

3.3.3 Graphs

Number of strains per species

# Number of strains per species (and order). Strains are counted with
# n_distinct(rcc_id) rather than n(): cultures_Pelago may contain several rows
# for one strain (one per joined ITS sequence), and counting rows would inflate
# the totals. ungroup() removes the grouping left over by summarise().
df <- cultures_Pelago %>%
  group_by(order, species) %>%
  summarise(n_strains = n_distinct(rcc_id)) %>%
  ungroup()

# Horizontal bar chart, species sorted by number of strains, bars coloured by
# order, with the count printed next to each bar.
ggplot(df, aes(x = reorder(species, n_strains), y = n_strains, fill = order)) +
  geom_col() +
  xlab("species") +
  ylab("Number of strains") +
  geom_text(aes(label = n_strains), vjust = 0.2, hjust = -0.2) +
  coord_flip()

Map of new strains

# Start from the base map returned by map_world() (defined outside this
# section; presumably a ggplot object -- confirm) and overlay one point per row
# of cultures_Pelago, filled according to its order. Shape 21 is a circle with
# a fill colour, so the manual fill scale applies to the points.
map <- map_world() +
  geom_point(
    data = cultures_Pelago,
    mapping = aes(x = Longitude, y = Latitude, fill = order),
    shape = 21,
    size = 2
  ) +
  scale_fill_manual(values = c("blue", "red", "yellow"))

# Display the map
map

Daniel Vaulot

25 07 2018