perinatal_modeling.qmd

---
title: "Modeling Perinatal Data for Gene1 and Gene2"
format: html
editor: visual
---

**Gene names have been changed because the results have not been published.**

## This script first fits a linear model to the gene expression of several cell types and then performs spline regression.

```{r setup, include= FALSE}
## Hide the code of every chunk in the rendered document.
## `FALSE` is spelled out because `F` is an ordinary variable that can be reassigned.
knitr::opts_chunk$set(echo = FALSE)
library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(splines)
library(DT)
library(purrr)
library(cowplot)
library(splitstackshape)
library(broom)
```

```{r}
##---------
# Functions
##---------
lm_summaries <- function(df, group_col, x, y) {
  ## Fit the linear model `y ~ x` separately within every level of `group_col`
  ## and return one row of model-level statistics (from broom::glance) per group.
  ##
  ## df:        data.frame holding the grouping, predictor and response columns.
  ## group_col: name (string) of the grouping column.
  ## x, y:      names (strings) of the predictor and response columns.
  ##
  ## Returns a tibble with the grouping column, the r.squared columns, p.value,
  ## p.adj, AIC and BIC. All statistics are formatted to 2 significant digits
  ## and are therefore character columns.
  model_formula <- as.formula(paste(y, "~", x))

  lm_summary <- df %>%
    nest(data = -all_of(group_col)) %>%
    mutate(
      fit = map(data, ~ lm(model_formula, data = .x)),
      glance_out = map(fit, glance)
    ) %>%
    ## Select the grouping column by its exact name instead of a hard-coded `X`.
    select(all_of(group_col), glance_out) %>%
    unnest(cols = glance_out)

  ## One model per row, so the number of rows is the number of tests.
  n_tests <- nrow(lm_summary)

  lm_summary %>%
    ## Bonferroni correction; an adjusted p-value cannot exceed 1.
    mutate(p.adj = pmin(1, p.value * n_tests)) %>%
    ## Format every statistic, wherever the grouping column happens to sit.
    mutate(across(-all_of(group_col), ~ format(.x, digits = 2))) %>%
    select(
      all_of(group_col), contains("r.squared"), contains("p."),
      "AIC", "BIC"
    )
}

model_facet_plot <- function(df, x, y, lm_group, title, facet_col = "X") {
  ## Plot `y` against `x` with one panel per level of `facet_col`: a boxplot
  ## at each `x` value, the jittered raw points, and a linear fit per
  ## `lm_group`.
  ##
  ## df:        data.frame to plot.
  ## x, y:      names (strings) of the columns mapped to the axes.
  ## lm_group:  name (string) of the column that groups the linear fits.
  ## title:     plot title.
  ## facet_col: name (string) of the facetting column; defaults to "X", the
  ##            column this function previously hard-coded.
  ##
  ## The plot is drawn as a side effect via plot().
  ## `.data[[...]]` replaces the deprecated aes_string(); column names must
  ## therefore be plain names, not expressions.
  p <- ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
    ## Outliers are hidden because every observation is drawn by geom_point().
    geom_boxplot(aes(group = .data[[x]]), outlier.alpha = 0) +
    ## Jitter horizontally only, so the plotted y values stay exact.
    geom_point(position = position_jitter(width = 0.2, height = 0)) +
    geom_smooth(aes(group = .data[[lm_group]]), method = "lm") +
    facet_wrap(vars(.data[[facet_col]]), scales = "free") +
    ## Keep the plain column names as axis labels.
    labs(x = x, y = y) +
    ggtitle(title) +
    theme_cowplot()
  plot(p)
}

bspline_testing <- function(train_df, test_df, x, y, group_col, dfs, degree) {
  ## Spline regression wrapper. For every level of `group_col` it:
  ##   1. fits `y ~ splines::bs(x, df, degree)` on the training rows for each
  ##      candidate in `dfs` and keeps the df with the highest adjusted R^2,
  ##   2. refits that model and prints its glance() summary,
  ##   3. prints SSE, RMSE and MAE of its predictions on the test rows,
  ##   4. draws the test data with a spline of the selected df, a histogram of
  ##      the test residuals, and residual/observed vs. predicted scatterplots.
  ##
  ## train_df, test_df: data.frames with the columns named below.
  ## x, y:              names (strings) of the predictor and response columns.
  ## group_col:         name (string) of the grouping column.
  ## dfs:               candidate degrees of freedom for splines::bs().
  ## degree:            polynomial degree of the B-spline basis.
  ##
  ## Called for its printed output and plots; returns NULL invisibly.

  ## Build the model formula for a given number of spline degrees of freedom.
  spline_formula <- function(n_df) {
    formula(paste0(
      y, " ~ splines::bs(", x, ", df= ", n_df, ", degree =", degree, ")"
    ))
  }

  groups <- unique(train_df[, group_col, drop = TRUE])

  for (group in groups) {
    print(paste0("Starting ", group))
    group_train <- train_df %>% filter(.data[[group_col]] == group)
    group_test <- test_df %>% filter(.data[[group_col]] == group)

    ## Reset the selection for every group so that a result from a previous
    ## group can never leak into this one, and start from -Inf so that a
    ## model is still selected when every adjusted R^2 is <= 0.
    best_df <- NA
    max_r_sq <- -Inf

    for (i in dfs) {
      model <- lm(spline_formula(i), data = group_train)
      adj_r_sq <- glance(summary(model))$adj.r.squared

      if (!is.na(adj_r_sq) && adj_r_sq > max_r_sq) {
        best_df <- i
        max_r_sq <- adj_r_sq
      }
    }

    if (is.na(best_df)) {
      warning("No spline model could be selected for ", group, "; skipping.",
              call. = FALSE)
      next
    }

    print(paste0("Df that has the highest adjusted R^2 for ", group, " is ", best_df))

    model <- lm(spline_formula(best_df), data = group_train)
    model_sum <- glance(summary(model))
    print(model_sum)

    print(paste0("Model performance on test data for ", group))
    preds <- predict(model, newdata = group_test)
    real <- group_test[, y, drop = TRUE]
    residuals <- real - preds

    SSE <- sum((residuals)^2)
    print(paste0("SSE: ", round(SSE, digits = 2)))

    RMSE <- sqrt(mean((residuals)^2))
    print(paste0("RMSE: ", round(RMSE, digits = 2)))

    MAE <- mean(abs(residuals))
    print(paste0("MAE: ", round(MAE, digits = 2)))

    ## geom_smooth() refits the spline on the test rows; it must use the
    ## selected `best_df`, not the last candidate left over from the search.
    p <- ggplot(group_test, aes(x = .data[[x]], y = .data[[y]])) +
      geom_boxplot(aes(group = .data[[x]])) +
      geom_point() +
      geom_smooth(method = "lm",
                  formula = y ~ splines::bs(x, df = best_df, degree = degree),
                  se = FALSE) +
      labs(x = x, y = y)
    plot(p)

    hist(residuals)
    plot(x = residuals, y = preds)
    plot(x = real, y = preds)
  }

  invisible(NULL)
}
```

## Formatting Data

```{r}
## Gene1
## Read the wide table and reshape it to long form: every column except `X`
## becomes a (day, value) pair.
gene1 <- read.csv(here("modeling_perinatal/data/Gene1_for_John.csv"))
gene1 <- pivot_longer(gene1, !X, names_to = "day")
gene1 <- mutate(
  gene1,
  ## Strip the "X" and any ".<digits>" part from the former column names
  ## (presumably added by read.csv() -- confirm) and convert to a number.
  day = as.numeric(gsub("X|\\.\\d+", "", day)),
  ## Natural log with a small offset so that a value of 0 stays finite.
  ln_value = log(value + 1e-6)
)
gene1 <- as.data.frame(gene1)

## Number of observations for every combination of X and day.
gene1_pheno_day <- ungroup(count(group_by(gene1, X), day))

ggplot(gene1_pheno_day, aes(x = day, y = n)) +
  geom_col() +
  theme_bw() +
  facet_wrap(~X) +
  ggtitle("Gene1 Phenotype & Day Distribution")


## Gene2
## Read the wide table and reshape it to long form: every column except `X`
## becomes a (day, value) pair.
gene2 <- read.csv(here("modeling_perinatal/data/Gene2_RFI_for_John.csv"))
gene2 <- pivot_longer(gene2, !X, names_to = "day")
gene2 <- mutate(
  gene2,
  ## Strip the "X" and any ".<digits>" part from the former column names
  ## (presumably added by read.csv() -- confirm) and convert to a number.
  day = as.numeric(gsub("X|\\.\\d+", "", day)),
  ## Natural log with a small offset so that a value of 0 stays finite.
  ln_value = log(value + 1e-6)
)
gene2 <- as.data.frame(gene2)

## Number of observations for every combination of X and day.
gene2_pheno_day <- ungroup(count(group_by(gene2, X), day))

ggplot(gene2_pheno_day, aes(x = day, y = n)) +
  geom_col() +
  theme_bw() +
  facet_wrap(~X) +
  ggtitle("Gene2 Phenotype & Day Distribution")
```

## Upsampling

```{r}
## Gene1
## Attach the per-(X, day) observation count `n` and, for every X, the largest
## of those counts. The object must be named `gene1_upsample` (lower case):
## R is case-sensitive and every later reference uses that spelling.
gene1_upsample <- gene1 %>%
  left_join(gene1_pheno_day, by = c("X", "day")) %>%
  group_by(X) %>%
  mutate(max_counts = max(n)) %>%
  ungroup()
## slice_sample() needs one constant `n`. max() always yields a single value,
## whereas unique() returns several when the per-X maxima differ.
gene1_max_n <- max(gene1_upsample$max_counts)

## Sample with replacement so every (X, day) cell ends up with `gene1_max_n`
## rows. Seeded so the resampled data set is reproducible.
## NOTE(review): sampling with replacement duplicates rows, so a later
## train/test split of this table can place copies of one observation on
## both sides.
set.seed(123)
gene1_upsample <- gene1_upsample %>%
  group_by(X, day) %>%
  slice_sample(n = gene1_max_n, replace = TRUE) %>%
  ungroup()

## Gene2
## Attach the per-(X, day) observation count `n` and, for every X, the largest
## of those counts.
gene2_upsample <- left_join(gene2, gene2_pheno_day, by = c("X", "day")) %>%
  group_by(X) %>%
  mutate(max_counts = max(n)) %>%
  ungroup()
## slice_sample() needs one constant `n`. max() always yields a single value,
## whereas unique() returns several when the per-X maxima differ.
gene2_max_n <- max(gene2_upsample$max_counts)

## Sample with replacement so every (X, day) cell ends up with `gene2_max_n`
## rows. Seeded so the resampled data set is reproducible.
## NOTE(review): sampling with replacement duplicates rows, so a later
## train/test split of this table can place copies of one observation on
## both sides.
set.seed(123)
gene2_upsample <- gene2_upsample %>%
  group_by(X, day) %>%
  slice_sample(n = gene2_max_n, replace = TRUE) %>%
  ungroup()
```

## Gene1 Modeling

### Simple Linear Regression

```{r, message= FALSE}
## Raw gene1 expression against day: faceted plot first, then the table of
## per-group linear-model statistics.
model_facet_plot(
  df = gene1_upsample,
  x = "day",
  y = "value",
  lm_group = "X",
  title = "gene1 - Raw"
)

lm_summaries(gene1_upsample, group_col = "X", x = "day", y = "value") %>%
  datatable()
```

The linear models reach statistical significance, but their R^2^ values are poor.

```{r, message= FALSE}
## Log-transformed gene1 expression against day: faceted plot first, then the
## table of per-group linear-model statistics.
model_facet_plot(
  gene1_upsample,
  x = "day",
  y = "ln_value",
  lm_group = "X",
  title = "gene1 - Log"
)

lm_summaries(gene1_upsample, group_col = "X", x = "day", y = "ln_value") %>%
  datatable()
```

## Gene1 Train Test Split

```{r}
## Fixed seed so the stratified split is repeatable.
set.seed(123)
## Sample 80% of the rows within every (X, day) stratum; bothSets = TRUE
## also keeps the rows that were not sampled.
train_test_gene1 <- stratified(
  gene1_upsample,
  group = c("X", "day"),
  size = 0.8,
  bothSets = TRUE
)
## First element is used for training, second for testing.
train_gene1 <- train_test_gene1[[1]] %>% as.data.frame()
test_gene1 <- train_test_gene1[[2]] %>% as.data.frame()
```

## Spline Regression on raw gene1
```{r, message= FALSE}
## Degree-1 B-spline regression of raw gene1 expression on day, trying 3 to 6
## degrees of freedom per group.
bspline_testing(
  train_df = train_gene1,
  test_df = test_gene1,
  x = "day",
  y = "value",
  group_col = "X",
  dfs = 3:6,
  degree = 1
)
```

## Spline Regression on log gene1
```{r, message= FALSE}
## Degree-1 B-spline regression of log gene1 expression on day, trying 3 to 6
## degrees of freedom per group.
bspline_testing(
  train_df = train_gene1,
  test_df = test_gene1,
  x = "day",
  y = "ln_value",
  group_col = "X",
  dfs = 3:6,
  degree = 1
)
```

## Gene2 Modeling

### Simple Linear Regression

```{r, eval= TRUE}
## Raw Gene2 expression against day: faceted plot first, then the table of
## per-group linear-model statistics.
model_facet_plot(df = gene2, x = "day", y = "value",
                 lm_group = "X", title = "Gene2 - Raw")

## The table must describe the same response as the plot above ("value");
## the log-scale fit is reported separately in the next chunk.
datatable(lm_summaries(df = gene2, group_col = "X",
                       x = "day", y = "value"))
```

```{r}
## Log-transformed Gene2 expression against day: faceted plot first, then the
## table of per-group linear-model statistics.
model_facet_plot(
  df = gene2,
  x = "day",
  y = "ln_value",
  lm_group = "X",
  title = "Gene2 - LN"
)

lm_summaries(df = gene2, group_col = "X", x = "day", y = "ln_value") %>%
  datatable()
```

## Gene2 Train Test Split
```{r}
## Fixed seed so the stratified split is repeatable.
set.seed(123)
## Sample 80% of the rows within every (X, day) stratum; bothSets = TRUE
## also keeps the rows that were not sampled.
train_test_gene2 <- stratified(
  gene2_upsample,
  group = c("X", "day"),
  size = 0.8,
  bothSets = TRUE
)
## First element is used for training, second for testing.
train_gene2 <- train_test_gene2[[1]] %>% as.data.frame()
test_gene2 <- train_test_gene2[[2]] %>% as.data.frame()
```

## Spline Regression on raw Gene2
```{r, message= FALSE}
## Degree-1 B-spline regression of raw Gene2 expression on day, trying 3 to 6
## degrees of freedom per group.
bspline_testing(
  train_df = train_gene2,
  test_df = test_gene2,
  x = "day",
  y = "value",
  group_col = "X",
  dfs = 3:6,
  degree = 1
)
```

## Spline Regression on log gene2
```{r, message= FALSE}
## Degree-1 B-spline regression of log Gene2 expression on day, trying 2 to 6
## degrees of freedom per group.
bspline_testing(
  train_df = train_gene2,
  test_df = test_gene2,
  x = "day",
  y = "ln_value",
  group_col = "X",
  dfs = 2:6,
  degree = 1
)
```