2 Gene Rooting

Author

Dalmolin Systems Biology Group

2.1 Importing Packages

library(GeneBridge)
library(geneplast.data)
library(readr)
library(dplyr)
library(purrr)
library(biomaRt)
library(magrittr)
library(KEGGREST)
library(ape)
library(tidyverse)
library(data.table)
library(stringi)
library(AnnotationHub)
library(sourcetools)
library(here)

2.2 Defining functions for later use

The first one searches for the respective protein ID for each gene in the input list. The second returns the interactions between these proteins, and the last one filters the interactions by a combined confidence score greater than 0.4.

# get IDs from STRING DB
get_string_ids <- function(genes_hgnc, species_id = "9606") {

    req <- RCurl::postForm(
    "https://string-db.org/api/tsv/get_string_ids",
    identifiers = paste(genes_hgnc, collapse = "%0D"),
    echo_query = "1",
    species = species_id,
    .opts = list(ssl.verifypeer = FALSE)
  )
  
  map_ids <- read.table(text = req, sep = " ", header = TRUE, quote = "") %>%
    dplyr::select(-queryIndex) %>%
    unique()
  
  map_ids$stringId <- substring(map_ids$stringId, 6, 1000)
  
  return(map_ids)
}

 # Get STRING interactions
 get_network_interaction <- function(map_ids, protein_id, species_id = "9606") {
 
   identifiers <- map_ids %>% pull(protein_id) %>% na.omit %>% paste0(collapse="%0d")
 
   req2 <- RCurl::postForm(
     "https://string-db.org/api/tsv/network",
     identifiers = identifiers,
     required_core = "0",
     species = species_id,
     .opts = list(ssl.verifypeer = FALSE)
   )
   
   int_network <- read.table(text = req2, sep = "   ", header = TRUE)
   
   int_network <- unique(int_network)
   
   return(int_network)
 }

 ## Recomputing scores
combine_scores <- function(dat, evidences = "all", confLevel = 0.4) {
  if(evidences[1] == "all"){
    edat<-dat[,-c(1,2,ncol(dat))]
  } else {
    if(!all(evidences%in%colnames(dat))){
      stop("NOTE: one or more 'evidences' not listed in 'dat' colnames!")
    }
    edat<-dat[,evidences]
  }
  if (any(edat > 1)) {
    edat <- edat/1000
  }
  edat<-1-edat
  sc<- apply(X = edat, MARGIN = 1, FUN = function(x) 1-prod(x))
  dat <- cbind(dat[,c(1,2)],combined_score = sc)
  idx <- dat$combined_score >= confLevel
  dat <-dat[idx,]
  return(dat)
}

2.3 Loading gene list and orthology data

We load the orthology data through AnnotationHub, this R package provides a central place where genomic files (VCF, bed, wig) and other resources from standard locations (e.g., UCSC, Ensembl) can be accessed. This way, we have access to the input files for the GeneBridge algorithm.

# Load the Gene Set Table
sensorial_genes <- read.csv(here("data/sensorial_genes.csv"))

# Query Phylotree and OG data
ah <- AnnotationHub()
meta <- query(ah, "geneplast")
load(meta[["AH83116"]])

head(sensorial_genes)
head(cogdata)

2.4 1. Pre-processing

2.4.1 1.1. Mapping

For the next analyses, we need to cross-reference information between our genes of interest (Gene IDs from the sensorial_genes table) and Protein IDs (from the cogdata table). The STRINGdb API is used to map Gene IDs to Protein IDs, allowing the filtering of genes of interest in the cogdata table. The final goal is to obtain a filtered set of sensory genes with their respective pathways and COG IDs.

map_ids <- get_string_ids(sensorial_genes$gene_symbol)

# Subsetting cogs of interest - Sensorial Genes
gene_cogs <- cogdata %>%
  filter(ssp_id %in% map_ids$ncbiTaxonId) %>%
  filter(protein_id %in% map_ids[["stringId"]]) %>%
  group_by(protein_id) %>%
  summarise(n = n(), cog_id = paste(cog_id, collapse = " / "))

head(map_ids)

#map_ids |> 
#  vroom::vroom_write(file = here("data/map_ids.csv"), delim = ",")

2.4.2 1.2. Resolving duplicate COGs

Due to evolutionary events such as gene duplication, some genes may be associated with more than one Cluster of Orthologous Groups (COG). To ensure the functionality of the algorithm, it is necessary to resolve these cases, prioritizing COGs according to the following criteria:

Priority by COG type:

KOGs have higher priority.
COGs have higher priority than NOGs.

Cases with COGs starting with the same letter:

Are resolved manually, based on the annotated function of the COG and the scientific question of the study.

The code below implements this resolution and integrates the corrections into the main table.

gene_cogs %>% filter(n > 1)

# Resolving main proteins
gene_cogs_resolved <- tribble(
  ~protein_id, ~cog_id,
"ENSP00000332500", "NOG274749", #NOG274749 / NOG274749
"ENSP00000409316", "NOG282909", #NOG282909 / NOG282909 / NOG282909
"ENSP00000480090", "KOG3599"    #KOG3599 / KOG3272
)

# Removing unresolved cases and adding manual assignments
gene_cogs %<>%
  filter(n == 1) %>%
 dplyr:: select(-n) %>%
  bind_rows(gene_cogs_resolved)

#gene_cogs |> 
#  vroom::vroom_write(file = here("data/gene_cogs.csv"), delim = ",")

2.5 3. Processing

The objective of this step is to perform the rooting of the genes of interest using the GeneBridge package. For this, we use the newBridge, runBridge, and runPermutation functions, which produce statistical results associated with the selected COGs in a phylogenetic tree.

2.5.1 3.1. Necessary Inputs

ogdata:
- Dataset containing three main columns:
  - Protein ID: Protein identifiers.
  - COG ID: Clusters of interest.
  - Specie ID: Species identifiers.
- In the example, the cogdata object is being used.
phyloTree:
- Phylogenetic tree containing 476 eukaryotes, representing the evolutionary structure among the analyzed species.
ogids:
- List of COGs of interest. This set is derived from the gene_cogs table and includes the COGs associated with the proteins after the previous processing.
refsp:
- Reference species for rooting. In the example, we use 9606 (human).

The getBridge function extracts the results generated by GeneBridge in table format. The res table contains the statistical results of the rooting.

## Run GeneBridge
cogs_of_interest <- gene_cogs %>% pull(cog_id) %>% unique

ogr <- newBridge(ogdata=cogdata, phyloTree=phyloTree, ogids = cogs_of_interest, refsp="9606")

ogr <- runBridge(ogr, penalty = 2, threshold = 0.5, verbose = TRUE)

ogr <- runPermutation(ogr, nPermutations=1000, verbose=FALSE)

res <- getBridge(ogr, what="results")

saveRDS(ogr, file = here("data/ogr.RData"))

2.6 4. Post-processing

After performing the rooting with GeneBridge, it is necessary to adjust the data to improve the visualization and interpretation of the results. In this step, we add the names of the clades to the identified roots, using an external table that relates the root identifiers to the clade names.

# naming the rooted clades
CLADE_NAMES <- "https://raw.githubusercontent.com/dalmolingroup/neurotransmissionevolution/ctenophora_before_porifera/analysis/geneplast_clade_names.tsv"

lca_names <- vroom::vroom(CLADE_NAMES)

groot_df <- res %>%
  tibble::rownames_to_column("cog_id") %>%
  dplyr::select(cog_id, root = Root) %>%
  left_join(lca_names) %>%
  inner_join(gene_cogs)

head(groot_df)

#groot_df |> 
#  vroom::vroom_write(file = here("data/groot_df.csv"), delim = ",")

2.6.1 4.1. Protein-Protein Interaction Network

The construction of a protein-protein interaction (PPI) network is an essential step to identify the functional relationships between proteins. In this process, we use the STRINGdb API, a database that catalogs interactions between proteins based on various sources, including experimental assays, co-expression, and evidence extracted from scientific publications.

The STRINGdb API offers methods to: - Obtain protein interactions for a list of proteins. - Select specific sources of evidence. - Calculate and combine scores based on the selected evidence.

More information about the API can be found in the STRING API documentation.

# Get proteins interaction
string_edgelist <- get_network_interaction(groot_df)

# Recomputing scores
string_edgelist <- combine_scores(string_edgelist,
                                  evidences = c("ascore", "escore", "dscore"),
                                  confLevel = 0.7)

colnames(string_edgelist) <- c("stringId_A", "stringId_B", "combined_score")

# Remove the species id
string_edgelist$stringId_A <- substring(string_edgelist$stringId_A, 6, 1000)
string_edgelist$stringId_B <- substring(string_edgelist$stringId_B, 6, 1000)

# How many edgelist proteins are absent in gene_ids? (should return 0)
setdiff(
  string_edgelist %$% c(stringId_A, stringId_B),
  map_ids %>% pull(stringId)
)

head(string_edgelist)

For the construction of the graph, in addition to the interactions between the proteins, it is necessary that each node is annotated with additional information that will be used in the analysis, such as: - Protein name. - Clade where it is rooted. - Metabolic pathway in which it participates.

## Create anotation table
nodelist <- data.frame(node = unique(c(string_edgelist$stringId_A, string_edgelist$stringId_B)))

merged_paths <- merge(nodelist, groot_df, by.x = "node", by.y = "protein_id")

pivotada <- sensorial_genes %>%
  dplyr::select(gene_symbol, pathway_name) %>%
  dplyr::mutate(n = 1) %>%
  tidyr::pivot_wider(
    id_cols = gene_symbol,
    names_from = pathway_name,
    values_from = n,
    values_fn = list(n = length),
    values_fill = list(n = 0),
  )

source_statements <-
  colnames(pivotada)[2:length(pivotada)]

nodelist <-
  nodelist %>%
  left_join(merged_paths, by = c("node" = "node")) %>%
  left_join(map_ids, by = c("node" = "stringId")) %>%
  left_join(pivotada, by = c("queryItem" = "gene_symbol"))

head(nodelist)

In addition to the graph structure, we can calculate metrics such as the number of connections (degree) of each node.

# Network Metrics
connected_nodes <- rle(sort(c(string_edgelist[,1], string_edgelist[,2])))
connected_nodes <- data.frame(count=connected_nodes$lengths, node=connected_nodes$values)
connected_nodes <- left_join(nodelist, connected_nodes, by = c("node" = "node"))
connected_nodes <- dplyr::select(connected_nodes, queryItem, root, clade_name, count)

head(connected_nodes)

#nodelist |> 
#  vroom::vroom_write(file = here("data/nodelist.csv"), delim = ",")
#string_edgelist |> 
#  vroom::vroom_write(file = here("data/string_edgelist.csv"), delim = ",")
#merged_paths |> 
#  vroom::vroom_write(file = here("data/merged_paths.csv"), delim = ",")