4 Plotting Abundance

Author

Dalmolin Systems Biology Group

4.1 Importing Packages

library(here)
library(readr)
library(magrittr)
library(ggplot2)
library(hrbrthemes)
library(tinter)
library(dplyr)
library(tidyr)
library(AnnotationHub)

4.2 Defining functions

These functions are intended to organize the annotation of the vertices of our nodelist and calculate the cumulative number of genes per clade and per biological process.

calculate_cumulative_genes <- function(nodelist) {
  
    # Get all possible categories of clade_name
  all_clades <- node_annotation %>%
    arrange(desc(root)) %>%
    dplyr:: select(clade_name) %>%
    unique()
  
  # Define the columns of interest
  process_columns <- c("queryItem", "root", "clade_name", 
                       "Olfactory transduction", 
                       "Taste transduction",           
                       "Phototransduction")
  
  # Calculate the cumulative sum grouping by clade_name
  cumulative_genes <- nodelist %>%
    arrange(desc(root)) %>%
    dplyr::select(all_of(process_columns)) %>%
    group_by(clade_name, root) %>%
    summarise(count_genes = n(), .groups = "drop") %>%
    arrange(desc(root)) %>%
    mutate(cumulative_sum = cumsum(count_genes)) %>%
    right_join(all_clades, by = "clade_name") %>%
    fill(cumulative_sum, .direction = "down")
  
  return(cumulative_genes)
}

calculate_cumulative_bp <- function(nodelist) {
    
  # Get all possible categories of clade_name
  all_clades <- node_annotation %>%
    arrange(desc(root)) %>%
    dplyr:: select(clade_name) %>%
    unique()
  
   # Define the columns of interest
  process_columns <- c("queryItem", "root", "clade_name", 
                       "Olfactory transduction", 
                       "Taste transduction",           
                       "Phototransduction")
  
  # Calculate the cumulative sum for each biological process
  cumulative_bp <- nodelist %>%
    dplyr::select(all_of(process_columns)) %>%
    distinct(root, queryItem, .keep_all = TRUE) %>%
    mutate(across(all_of(process_columns[-c(1:3)]), ~ as.numeric(.))) %>%
    group_by(root, clade_name) %>%
    summarise(across(all_of(process_columns[-c(1:3)]), 
                     ~ sum(. , na.rm = TRUE)),
              .groups = "drop") %>%
    arrange(desc(root)) %>%
    mutate(across(all_of(process_columns[-c(1:3)]), ~ cumsum(.))) %>%
    right_join(all_clades, by = "clade_name") %>%
    fill(everything(), .direction = "down")
  
  return(cumulative_bp)
}

4.3 Defining aesthetic parameters for ggplot

Assigning default colors for the metabolic pathways in the analysis and defining the other aesthetic standards for plotting.

# Plotting colors and labels
annotation_colors <- c(
  "Olfactory transduction"   = "#8dd3c7",
  "Taste transduction"      = "#72874EFF",
  "Phototransduction"       = "#fb8072"
)

annotation_labels <- c(
  "Olfactory transduction"   = "Olfactory transduction",
  "Taste transduction"      = "Taste transduction",
  "Phototransduction"       = "Phototransduction"
)

# This vertical line indicates the first metazoan (Amphimedon queenslandica / Ctenophora)
choanoflagellata_line <- geom_vline(
  xintercept = "Sphaeroforma arctica",
  color      = "#FF0000",
  linetype   = "11",
  alpha      = 1,
  linewidth  = 0.25
)

# Plotting
theme_main <- theme(
  panel.spacing      = unit(2.5, "pt"),
  strip.background   = element_blank(),
  panel.grid.major.x = element_blank(),
  panel.grid.major.y = element_line(linewidth = 0.25, linetype = "dotted", color = "#E0E0E0"),
  strip.text.x       = element_text(size = 9, angle = 90, hjust = 0, vjust = 0.5, color = "#757575"),
  strip.text.y       = element_text(size = 10, angle = 0, hjust = 0, vjust = 0.5, color = "#757575"),
  axis.title         = element_text(size = 15, color = "#424242"),
  axis.ticks.x       = element_blank(),
  axis.text.x        = element_blank(),
  axis.text.y        = element_text(size = 5.5),
  legend.position    = "none"
)

theme_supplementary <- theme(
  panel.grid.major.x = element_line(color = "#E0E0E0", linewidth = 0.25, linetype = "dotted"),
  panel.grid.major.y = element_blank(),
  strip.text.y       = element_text(size = 7, angle = 0, hjust = 0, vjust = 0.5, color = "#757575"),
  strip.text.x       = element_text(size = 7, angle = 90, hjust = 0, vjust = 0.5, color = "#757575"),
  axis.title         = element_text(size = 12, color = "#424242"),
  axis.ticks         = element_line(colour = "grey20"),
  axis.text.y        = element_text(size = 6, angle = 0, hjust = 1, vjust = 0.5, color = "#757575"),
  axis.text.x        = element_text(size = 6)
)

theme_average <- theme(
  panel.spacing      = unit(1, "pt"),
  axis.title         = element_text(color = "#424242"),
  axis.text          = element_text(color = "#757575"),
  axis.text.x        = element_text(size = 7, angle = -45, vjust = 0, hjust = 0),
  axis.text.y        = element_text(size = 5),
  strip.background   = element_blank(),
  strip.text         = element_text(color = "#757575"),
  strip.text.y       = element_text(angle = 0, hjust = 0, vjust = 0.5)
)

theme_big <- theme(
  panel.spacing      = unit(0.5, "pt"),
  panel.grid.major.x = element_line(linewidth = 0.1, linetype = "dashed"),
  panel.grid.major.y = element_blank(),
  strip.background   = element_blank(),
  strip.text.x       = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 0),
  strip.text.y       = element_text(size = 8, angle = 0, hjust = 0, vjust = 0.5),
  axis.text.x        = element_text(size = 6, angle = 90, vjust = 0, hjust = 0),
  axis.text.y        = element_text(size = 4.5),
  axis.ticks         = element_line(size = 0.1)
)

tick_function <- function(x) {
  seq(x[2], 0, length.out = 3) %>% head(-1) %>% tail(-1) %>% { ceiling(./5)*5 }
}

4.4 Load necessary tables

To proceed with the analysis, it is necessary to load a table containing the taxid information for each species. This information will be used later to map the taxid of the species with the taxid of each COG.

4.4.1 Table Description

File Name: string_eukaryotes.rda
Content: Taxonomic information of eukaryotic species.

The table loads the following main fields: - TaxID (taxonomic identifier): A unique identifier associated with each species. - Species name

# Query Phylotree and OG data
ah <- AnnotationHub()
meta <- query(ah, "geneplast")
load(meta[["AH83116"]])

# TODO: save tables from previous qmds
nodelist <- vroom::vroom(file = here("data/nodelist.csv"), delim = ",")
gene_cogs <-  vroom::vroom(file = here("data/gene_cogs.csv"), delim = ",")
sensorial_genes <- read.csv(here("data/sensorial_genes.csv"))
map_ids <- vroom::vroom(here("data/map_ids.csv"))
groot_df <- vroom::vroom(here("data/groot_df.csv"))

ogr <- readRDS(here("data/ogr.RData"))

load(here("data/string_eukaryotes.rda"))
head(string_eukaryotes)

4.5 Visualization: Rooting Pattern

This visualization allows analyzing the cumulative pattern of gene emergence over evolutionary time. In addition, it also shows the growth of biological processes separated by each metabolic pathway.

4.5.1 Graph Characteristics

Bar Chart (Cumulative Pattern):
- Represents the cumulative sum of genes associated with each clade.
- Each bar displays the total accumulated genes in a given clade, allowing the identification of the diversification pattern.
Combined Chart (Bars and Lines):
- The bars indicate the cumulative sum of genes per clade.
- The lines represent the growth of different biological processes (Process), separated by metabolic pathways, allowing the comparison between the cumulative emergence of genes and the development of biological functions.

4.5.2 Interpretation

Cumulative Pattern: The bar chart shows how genes have emerged over time, cumulatively. This view helps to identify moments of greater genetic diversification.
Comparison by Biological Processes: The combined chart allows observing which biological processes stand out at different moments of evolution and how they are related to the emergence of new genes.

# Mapping roots and proteins info
node_annotation <- nodelist %>%
  inner_join(gene_cogs, by = c("node" = "protein_id", "cog_id")) %>%
  inner_join(sensorial_genes, by = c("queryItem" = "gene_symbol")) %>%
  distinct(queryItem, cog_id, pathway_name, root, clade_name)

cumulative_genes <- calculate_cumulative_genes(nodelist) 
cumulative_bp <- calculate_cumulative_bp(nodelist)
  
cumulative_data <- left_join(cumulative_genes, cumulative_bp)
 

long_data <- cumulative_data %>%
  pivot_longer(cols = 5:7, 
               names_to = "Process", 
               values_to = "Value")

#a <-
ggplot() +
  # Bar chart for cumulative_sum
  geom_bar(data = cumulative_data, 
           aes(x = factor(clade_name, levels = clade_name), y = cumulative_sum), 
           stat = "identity", fill = "darkgray", colour = NA) +
  geom_text(data = cumulative_data, 
            aes(x = factor(clade_name, levels = clade_name), y = cumulative_sum, label = cumulative_sum), 
            vjust = -0.5, size = 3, color = "darkgray") +
  scale_color_manual(values = annotation_colors) +
  
  labs(x = "Clade Name", y = "Cumulative Sum", 
       title = "Cumulative Sum and Biological Processes",
       fill = "Cumulative Sum",
       color = "Biological Processes") +
  
  theme_main +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

#b <- 
ggplot() +
  # Bar chart for cumulative_sum
  geom_bar(data = cumulative_data, 
           aes(x = factor(-root), y = cumulative_sum), 
           stat = "identity", fill = "darkgray", colour = NA) +
  geom_text(data = cumulative_data, 
            aes(x = factor(-root), y = cumulative_sum, label = cumulative_sum), 
            vjust = -0.5, size = 3, color = "darkgray") +
  
  # Line chart for biological processes
  geom_line(data = long_data, 
            aes(x = factor(-root), y = Value, color = Process, group = Process), 
            size = 1) +
  
  # Use the defined color palette
  scale_color_manual(values = annotation_colors) +
  
  labs(x = "Clade Name", y = "Cumulative Sum", 
       title = "Cumulative Sum and Biological Processes",
       fill = "Cumulative Sum",
       color = "Biological Processes") +
  
  theme_main +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#ggsave(file = "", plot=a, width=15, height=8)
#ggsave(file = "", plot=b, width=15, height=8)

4.6 Calculation of COG Abundance by Function in Species

This analysis is based on the premise that similar COGs share the same biological function. Thus, it is possible to calculate the average abundance of functions associated with COGs in each species, using the number of COGs present in the species identified by the TaxID.

4.6.1 Calculation Steps

Species Mapping Each species was mapped to its respective clade (hierarchical level) based on the TaxID information.
- The ogr@spbranches table was used to associate species (ssp_id) with their branches (lca).
- The data was organized to include the order of the TaxIDs.
COG Annotation
- The relationship between proteins and genes was enriched with functional information, such as COG identifiers and metabolic pathway names (pathway_name).
- Duplicates were removed to ensure that only unique information was considered.
Integration of Species Information
- A mapping was created between species (TaxID) and their respective clades to add the relevant taxonomic information.
- The species were ordered based on their clades, to facilitate hierarchical analysis.
Calculation of Average Abundance by Function
- For each species and metabolic function (pathway_name), the average abundance of COGs was calculated.
- Additional information, such as species names (ncbi_name) and clades (clade_name), was integrated into the result.
Abundance Adjustment (Capping)
- To avoid extreme values, the average abundance was adjusted:
  - Values above a limit (mean + 3 standard deviations) were truncated to 100.
  - The adjusted values were calculated separately by metabolic function.

lca_spp <- ogr@spbranches %>%
  rename("taxid" = ssp_id, "species" = ssp_name, "lca" = "branch") %>%
  mutate(taxid_order = row_number()) %>%
  dplyr::select(lca, taxid, taxid_order)

clade_taxids <- lca_spp
clade_names <- vroom::vroom("https://raw.githubusercontent.com/dalmolingroup/neurotransmissionevolution/ctenophora_before_porifera/analysis/geneplast_clade_names.tsv")

cog_annotation <- map_ids %>%
  left_join(groot_df, by = c("stringId" = "protein_id")) %>%
  left_join(sensorial_genes, by = c("queryItem" = "gene_symbol")) %>%
  distinct(queryItem, cog_id, pathway_name) %>%
  dplyr::select(cog_id, pathway_name) %>%
  unique() %>%
  na.omit()

cog_abundance_by_taxid <- cogdata %>%
  filter(cog_id %in% nodelist[["cog_id"]]) %>%
  count(ssp_id, cog_id, name = "abundance") %>%
  left_join(cog_annotation, by = "cog_id")

# Mapping species to clade info
ordered_species <- string_eukaryotes %>%
  dplyr::select(taxid, ncbi_name) %>%
  left_join(clade_taxids, by = "taxid") %>%
  left_join(clade_names, by = c("lca" = "root")) %>%
  na.omit() %>% unique() %>%
  arrange(desc(lca)) %>%
  dplyr::select(-taxid_order)
  
avg_abundance_by_function <- cog_abundance_by_taxid %>%
  group_by(ssp_id, pathway_name) %>%
  summarise(avg_abundance = mean(abundance)) %>%
  ungroup() %>%
  # Adding species and clade info
  left_join(ordered_species %>% mutate(taxid = as.double(taxid)), by = c("ssp_id" = "taxid")) %>%
  unique() %>%
  arrange(desc(lca)) %>%
  mutate(ncbi_name = factor(ncbi_name, levels = unique(ncbi_name)),
         clade_name = factor(clade_name, levels = unique(clade_name))) %>%
  na.omit()

capped_abundance_by_function <- avg_abundance_by_function %>%
  # mutate(capped_abundance = ifelse(abundance >= 100, 100, abundance)) %>%
  group_by(pathway_name) %>%
  mutate(
    # max_abundance = max(abundance[lca <= 29])
    max_abundance = avg_abundance[lca <= 29] %>% { mean(.) + 3*sd(.) }
    ,abundance     = ifelse(avg_abundance >= max_abundance, pmin(max_abundance, 100), pmin(avg_abundance, 100)))

# List of signatures
signatures <- unique(node_annotation$pathway_name)

roots_seq <- node_annotation %>%
  arrange(desc(root)) %>%
  dplyr:: select(root, clade_name) %>%
  unique()

roots_seq$clade_name <- factor(roots_seq$clade_name, levels = roots_seq$clade_name)

4.6.2 Visualization

In the following graphs, it is possible to observe the average abundance of COGs in each clade. Each bar represents a species within the respective clade, while the colors indicate the different metabolic functions associated with the orthologous groups.

4.6.2.1 Graph of Average COG Abundance by Clade

The X axis represents the species.
The Y axis indicates the average abundance of proteins in orthologous groups.
The bars are colored according to the metabolic function (pathway_name).

4.6.2.2 Graph with Adjusted Abundance (Capped)

To avoid distortions caused by extreme values, the abundance was adjusted to a maximum limit (mean + 3 standard deviations).
This graph provides a more balanced visualization, especially for metabolic functions with very high values.

# Plotting by species
ggplot(avg_abundance_by_function) +
  # Geoms  ----------------
choanoflagellata_line +
  geom_bar(
    aes(x = ncbi_name, y = avg_abundance, fill = pathway_name, color = after_scale(darken(fill, 0.1)))
    ,stat = "identity"
  ) +
  # Labels  ---------------
 xlab("Species") +
  ylab("Average protein abundance in orthologous groups") +
  #ylab("Average protein abundance in orthologous groups") +
  # Scales ----------------
scale_y_continuous(breaks = tick_function, minor_breaks = NULL) +
  scale_fill_manual(values = annotation_colors %>% darken(0.1)) +
  # Styling ---------------
facet_grid(
  pathway_name ~ clade_name
  ,scales   = "free"
  ,space    = "free"
  ,labeller = labeller(annotation = annotation_labels)
) +
  theme_classic() + 
  theme_main

# Plotting by species capped
ggplot(capped_abundance_by_function) +
  # Geoms  ----------------
choanoflagellata_line +
  geom_bar(
    aes(x = ncbi_name, y = abundance, fill = pathway_name, color = after_scale(darken(fill, 0.1)))
    ,stat = "identity"
  ) +
  # Labels  ---------------
  xlab("Species") +
  ylab("Average protein abundance in orthologous groups") +
  #ylab("Average protein abundance in orthologous groups") +
  # Scales ----------------
  scale_y_continuous(breaks = tick_function, minor_breaks = NULL) +
  scale_fill_manual(values = annotation_colors %>% darken(0.1)) +
  # Styling ---------------
facet_grid(
  pathway_name ~ clade_name
  ,scales   = "free"
  ,space    = "free"
  ,labeller = labeller(annotation = annotation_labels)
) +
  theme_classic() + 
  theme_main

4.6.2.3 Simplified Visualization: Abundance by Clade

For a less detailed analysis, it is possible to visualize the average abundance of COGs only at the clade level, ignoring the differences between individual species. This graph provides a more direct overview, ideal for identifying global trends.

The X axis represents the clades.
The Y axis shows the average abundance of proteins in orthologous groups per clade.
The bars are grouped by clade and colored according to the metabolic functions (pathway_name).

# Ploting by clade
ggplot(avg_abundance_by_function) +
  geom_bar(
    aes(x = clade_name, y = avg_abundance, fill = pathway_name, color = after_scale(darken(fill, 0.1)))
    ,stat = "summary"
    ,fun  = "mean"
  ) +
  scale_y_continuous(breaks = tick_function, minor_breaks = NULL) +
  scale_fill_manual(values = annotation_colors, guide = "none") +
  facet_grid(
    pathway_name ~ .
    ,scales   = "free"
    ,space    = "free_y"
    ,labeller = labeller(annotation = sub("\n", "", annotation_labels))
  ) +
  xlab("Clades") +
  ylab("Average abundance by clade") +
  theme_classic() + 
  theme_average