CodeReview.Rmd

---
editor_options: 
  markdown: 
    wrap: 72
---

## Example for Code Review

```{r}
# CRAN dependencies: install whichever of them cannot be loaded yet
invisible(lapply(
  c("tidyverse", "devtools", "BiocManager"),
  function(pkg) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
  }
))

# Bioconductor dependencies: same availability check, installed via BiocManager
invisible(lapply(
  c("STRINGdb", "biomaRt"),
  function(pkg) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      BiocManager::install(pkg)
    }
  }
))

# install_github() is called unconditionally (no requireNamespace() guard)
devtools::install_github("martadelfino/fiveP-R")

library(tidyverse)
library(fiveP)
```

```{r}

# Load the tab-delimited gene class table (column-type message suppressed)
gene_classes <- readr::read_delim(
  file = "CodeReview_data.txt",
  delim = "\t",
  show_col_types = FALSE
)

```

```{r}

# HGNC IDs of the rows whose `ndd_ar_classes` value is "positive"
AR <- dplyr::select(
  dplyr::filter(gene_classes, ndd_ar_classes == "positive"),
  hgnc_id
)
# Optional extra step (disabled): restrict AR to five specific HGNC IDs
# AR <- dplyr::filter(AR, hgnc_id %in% c("HGNC:1020", "HGNC:10451",
#                                        "HGNC:15972", "HGNC:21416",
#                                        "HGNC:10527"))
```

```{r}

# Run fiveP's get_fiveP() on the one-column table of AR-positive HGNC IDs
AR_results <- get_fiveP(AR)


```

```{r}
# Show the object returned by get_fiveP()
print(AR_results)
```

### Checking 5 random genes

```{r}
# The AR gene table is the input for every call below
input_genes <- AR

# Data fetching functions ----------------------------------------------------
# Each fetch_*() call takes the HGNC gene list; the pathway and UniProt fetches
# also take the input genes, and the complex/family fetches take the UniProt
# result.
hgnc_gene_list <- fetch_hgnc_gene_list()
paralogues <- fetch_paralogues(hgnc_gene_list)
pathways <- fetch_pathways(hgnc_gene_list, input_genes)
ppi <- fetch_ppi(hgnc_gene_list)
uniprot <- fetch_uniprot(hgnc_gene_list, input_genes)
protein_complex <- fetch_protein_complex(hgnc_gene_list, uniprot)
protein_families <- fetch_protein_families(hgnc_gene_list, uniprot)

# Data processing functions --------------------------------------------------
# One calculate_*_ratio() call per fetched data source, always passing the
# input genes last.
paralogues_ratio <- calculate_paralogues_ratio(paralogues, input_genes)
pathways_ratio <- calculate_pathways_ratio(
  pathways$input_genes_Uniprot2Reactome,
  pathways$Uniprot2Reactome_final_hgnc_no_na,
  input_genes
)
ppi_ratio <- calculate_ppi_ratio(ppi, input_genes)
protein_complex_ratio <- calculate_protein_complex_ratio(
  protein_complex,
  input_genes
)
protein_families_ratio <- calculate_protein_families_ratio(
  protein_families,
  input_genes
)

```

#### 1 gene from the AR input list: HGNC:1020, 4 genes not from the input list: HGNC:10451, HGNC:15972, HGNC:21416, HGNC:10527

|  |  |  |  |  |
|---------------|---------------|---------------|---------------|---------------|
|  | Function Results | Manual Results | Function Results | Manual Results |
| **Gene** | **Protein complex** | **Protein complex** | **Protein family** | **Protein family** |
| HGNC:1020 | NA | 6 predicted complexes, so not included. | 0 | PTHR23070 - only this one gene in the family. |
| HGNC:10451 | 1 | 9 complexes: 4 curated, 5 predicted. CPX-369 complex is an input gene complex. | NA | PTHR11573 - family not in input genes |
| HGNC:15972 | 0.2857143 | 7 complexes: only 2 curated. CPX-6212 is in two input genes. CPX-6213 is in one input gene. | 1 | PTHR23323 - 2 genes in this family, and HGNC:14583 is an input gene. |
| HGNC:21416 | NA | 9 complexes - all predicted. | 0.5 | PTHR10340 - 3 genes in the family. 1 is an input gene |
| HGNC:10527 | NA | 4 complexes - all predicted. | NA | PTHR23233 - 4 genes in the family. None are input genes. |

|  |  |  |  |  |  |
|------------|------------|------------|------------|------------|------------|
| Function Results | Manual Results | Function Results | Manual Results | Function Results | Manual Results |
| Pathway | Pathway | Paralogue | Paralogue | PPI | PPI |
| 0.13414634 | R-HSA-1268020 - pathway of 10 input genes | NA | NA | 0.25000000 |  |
| 0.07692308 | R-HSA-499943 - pathway of 2 input genes | NA | NA | 0.0483871 |  |
| 0.2 | R-HSA-9705683 - this is the pathway that the function gets from Reactome. 11 input genes are also in this pathway. When searching Reactome manually, the first pathway shown is R-HSA-9754560, which is inside the previous pathway (but not a subpathway). | NA | NA | 0.07407407 |  |
| NA | NA - no annotations | 0 | Has two paralogs. One is an input gene HGNC:11120 but below the cutoff of 30%, so it's not counted. | NA |  |
| NA | NA - no annotations | 0 | Has 14 paralogs, but only 2 above the 30% threshold: HGNC:15924 and HGNC:10524, and neither is an input gene. | NA |  |

```{r}

# Run gene_check() on the fetched gene list, UniProt, paralogue and PPI data
# together with the input genes
checks_all_Ps <- gene_check(
  hgnc_gene_list,
  uniprot,
  paralogues,
  ppi,
  input_genes
)


```

Other code used for the checks:

```{r}

# Distinct values of `hsapiens_paralog_orthology_type` in the paralogues table
unique_categories <- unique(paralogues[["hsapiens_paralog_orthology_type"]])
print(unique_categories)

# Number of rows carrying each of those values
category_counts <- table(paralogues[["hsapiens_paralog_orthology_type"]])
print(category_counts)
```

```{r}

# Recompute, for every gene in a protein-family table, how many of the genes
# sharing a family with it are input genes.
#
# Arguments:
#   panther:     data frame with one row per gene/family pair; the columns
#                `hgnc_id`, `uniprot_ids` and `family_id` are used.
#   input_genes: data frame with an `hgnc_id` column listing the input genes.
#
# Returns an ungrouped, de-duplicated tibble sorted by `hgnc_id` with the
# columns hgnc_id, uniprot_ids, num_families, num_unique_genes_in_families,
# num_input_genes_in_families and ratio_input_genes_in_families.
#
# Prints a progress message with cat() before returning.
check_protein_families_ratio <- function(panther, input_genes) {

  # Input genes ----------------------------------------------------------------

  # pull() fails loudly when `hgnc_id` is missing
  input_ids <- dplyr::pull(input_genes, hgnc_id)

  # Whole-table lookup vectors: inside the grouped mutate() below the bare
  # column names only see the rows of the current gene
  all_gene_ids <- panther$hgnc_id
  all_family_ids <- panther$family_id


  # Calculations ---------------------------------------------------------------

  # NOTE(review): the denominator counts every gene of the gene's families,
  # including the gene itself, while the numerator removes the gene itself when
  # it is an input gene. Confirm this matches calculate_protein_families_ratio()
  # before comparing the two outputs.
  panther_counts_per_gene <- panther %>%
    dplyr::group_by(hgnc_id) %>%
    dplyr::mutate(
      num_families = dplyr::n_distinct(family_id),
      # unique genes found in any family this gene belongs to
      num_unique_genes_in_families =
        length(unique(all_gene_ids[all_family_ids %in% family_id])),
      # input genes among them, not counting the gene itself
      num_input_genes_in_families =
        sum(unique(all_gene_ids[all_family_ids %in% family_id]) %in% input_ids) -
        (hgnc_id %in% input_ids)
    ) %>%
    # drop the grouping so callers receive a plain tibble
    dplyr::ungroup()

  panther_counts_per_gene_final <- panther_counts_per_gene %>%
    dplyr::mutate(
      ratio_input_genes_in_families =
        num_input_genes_in_families / num_unique_genes_in_families,
      # a missing ratio (NA or NaN) is reported as 0
      ratio_input_genes_in_families = ifelse(
        is.na(ratio_input_genes_in_families),
        0,
        ratio_input_genes_in_families
      )
    ) %>%
    dplyr::select(hgnc_id, uniprot_ids, num_families,
                  num_unique_genes_in_families, num_input_genes_in_families,
                  ratio_input_genes_in_families) %>%
    dplyr::arrange(hgnc_id) %>%
    dplyr::distinct()


  cat('\n(12/12) finished running protein_families_ratio.R\n')
  panther_counts_per_gene_final
}


# Apply the local check function to the fetched protein families, using the AR
# table as the input genes
test_family_ratio <- check_protein_families_ratio(protein_families, AR)
```

```{r}

# Query UniProt for the reviewed human entries of every PANTHER family listed
# for the input genes and return one row per family/entry pair.
#
# Arguments:
#   protein_coding_genes: not used in the body; kept so existing calls keep
#     working.
#   uniprot_input_gene_symbol_results_cleaned: data frame with a `PANTHER`
#     column holding ";"-separated family IDs (anything after a ":" in an ID is
#     discarded).
#
# Returns a de-duplicated tibble with the character columns family_id,
# uniprot_ids, hgnc_id and symbol, sorted by family_id. It has zero rows when
# no family ID is present or no batch returned usable data (a warning is
# raised in both cases).
#
# Runs the `curl` command-line tool through the shell and needs network access.
# Prints a progress message with cat() before returning.
test_protein_families <- function(protein_coding_genes,
                                   uniprot_input_gene_symbol_results_cleaned) {

  # Result returned when nothing could be retrieved
  empty_result <- tibble::tibble(
    family_id = character(),
    uniprot_ids = character(),
    hgnc_id = character(),
    symbol = character()
  )

  # Emits the progress message on every exit path
  finish <- function(result) {
    cat('\n(7/12) finished running protein_families.R\n')
    result
  }

  # Creating a vector of protein family IDs from uniprot results ---------------

  vector_family <- uniprot_input_gene_symbol_results_cleaned %>%
    tidyr::separate_rows(PANTHER, sep = ";") %>%
    dplyr::filter(PANTHER != "") %>%
    dplyr::pull(PANTHER)

  # removing extra information after the ':', then duplicates and blanks
  # (sub() always yields a character vector, so no type check is needed)
  vector_family <- unique(trimws(sub(":.*", "", vector_family)))
  vector_family <- vector_family[nzchar(vector_family)]

  if (length(vector_family) == 0) {
    warning("No PANTHER family IDs found in the input; returning an empty table.")
    return(finish(empty_result))
  }


  # Querying Uniprot -----------------------------------------------------------

  batch_size <- 1L

  # The search endpoint is paginated and only returns its default page size
  # unless `size` is given; 500 is the documented maximum. Larger result sets
  # would need cursor-based pagination.
  page_size <- 500L

  # Columns the cleaning step below relies on
  required_cols <- c("Entry", "HGNC", "Gene Names (primary)", "PANTHER")

  # Split families into batches
  batches <- split(vector_family, ceiling(seq_along(vector_family) / batch_size))

  # Pre-sized list; slots of skipped batches stay NULL and are ignored later
  results <- vector("list", length(batches))

  for (i in seq_along(batches)) {
    # Join families into a query string with OR logic for the current batch
    family_query <- paste(paste0("xref:", batches[[i]]), collapse = " OR ")

    # URL-encode the query string to handle special characters
    encoded_query <- URLencode(family_query)

    # Construct the curl command with the specified families and desired fields
    curl_command <- paste0(
      "curl -s -H \"Accept: text/plain; format=tsv\" \"https://rest.uniprot.org/uniprotkb/search?query=reviewed:true+AND+(",
      encoded_query,
      ")+AND+organism_id:9606&fields=accession,xref_hgnc,gene_primary,xref_panther,version&size=",
      page_size,
      "\""
    )

    # Execute the curl command and capture the output
    output <- system(curl_command, intern = TRUE)

    # Combine the output into a single string
    tsv_content <- paste(output, collapse = "\n")

    # Check if output contains valid content
    if (nchar(tsv_content) == 0) {
      warning("No data returned for batch ", i)
      next
    }

    # Convert the TSV content into a data frame by reading from a string
    batch_data <- readr::read_tsv(
      I(tsv_content),
      col_types = readr::cols(.default = "c"),
      show_col_types = FALSE
    )

    # An error body (not TSV) would otherwise be merged as a bogus row
    if (!all(required_cols %in% names(batch_data))) {
      warning("Unexpected response for batch ", i, "; skipping it.")
      next
    }

    if (nrow(batch_data) >= page_size) {
      warning("Batch ", i, " filled a whole page of ", page_size,
              " rows; its results may be truncated.")
    }

    results[[i]] <- batch_data
  }

  # Combine all batch results into a single data frame (NULL slots are dropped)
  uniprot_input_gene_family_results <- dplyr::bind_rows(results)

  if (ncol(uniprot_input_gene_family_results) == 0) {
    warning("No batch returned usable data; returning an empty table.")
    return(finish(empty_result))
  }

  # Cleaning Protein Families result file from Uniprot -------------------------

  proteinfamily_genes_expanded <- uniprot_input_gene_family_results %>%
    # Selecting and renaming required columns
    dplyr::select(
      uniprot_ids = Entry,
      hgnc_id = HGNC,
      symbol = `Gene Names (primary)`,
      family_id = PANTHER
    ) %>%
    # Removing trailing ;
    dplyr::mutate(
      hgnc_id = gsub(";$", "", hgnc_id),
      family_id = gsub(";$", "", family_id)
    ) %>%
    # Separating families into new rows
    tidyr::separate_rows(family_id, sep = ";") %>%
    dplyr::filter(family_id != "") %>%
    # removing extra bits after the ':'
    dplyr::mutate(family_id = trimws(sub(":.*", "", family_id))) %>%
    dplyr::distinct() %>%
    dplyr::select(family_id, uniprot_ids, hgnc_id, symbol) %>% # fixing order of columns
    dplyr::arrange(family_id) # rearranging rows

  finish(proteinfamily_genes_expanded)
}


# Run the local family lookup on the HGNC gene list and the fetched UniProt
# table
test_pf <- test_protein_families(hgnc_gene_list, uniprot)

```

```{r}

# Query term for the single cross-reference PTHR11777
encoded_query <- paste0("xref:", "PTHR11777")

# Assemble the curl call to the UniProt search endpoint around that term
curl_command <- paste(
  c(
    "curl -H \"Accept: text/plain; format=tsv\" \"https://rest.uniprot.org/uniprotkb/search?query=reviewed:true+AND+(",
    encoded_query,
    ")+AND+organism_id:9606&fields=accession,xref_hgnc,gene_primary,xref_panther,version\""
  ),
  collapse = ""
)

# Run the command through the shell, keeping its output lines
output <- system(curl_command, intern = TRUE)

# Join the lines with newlines and print the resulting text as one string
tsv_content <- paste(output, collapse = "\n")
print(tsv_content)

```