5. GFAP correlation

This script is meant to identify proteins whose expression levels correlate with GFAP, within the plaque near condition.

Libraries

Code
suppressPackageStartupMessages({
library(data.table)
library(qs2)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(ggrepel)
})

Directories

Code
base_dir <- "/nemo/lab/destrooperb/home/shared/zanettc/millie_proteomics"
run_num <- "run5"

results_dir <- file.path(base_dir, "results", run_num)
objects_dir <- file.path(base_dir, "data", "processed", run_num)

dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(objects_dir, recursive = TRUE, showWarnings = FALSE)

Load objects and extract summarised protein levels

Code
#Map protein IDs to readable gene IDs
processed_data <- qs_read(file.path(objects_dir, "processed_msstats_data.qs2"))
protein_dict <- fread(file.path(base_dir, "data/processed/run1/clean_spec_raw.csv")) %>%
  dplyr::select(Protein = PG.ProteinGroups, Gene = PG.Genes) %>%
  distinct()

# Extract Summarized Abundances
# Using ProteinLevelData (summarized by MSstats) for cleaner correlations

#transform to wide format (rows per run, columns per gene) to calculate column-wise correlations

wide_df <- processed_data$ProteinLevelData %>%
  left_join(protein_dict, by = "Protein") %>%
  mutate(Gene = ifelse(is.na(Gene) | Gene == "", Protein, Gene)) %>%
  # Handle duplicates (e.g., multiple Protein IDs mapping to one Gene)
  group_by(originalRUN, Gene) %>%
  summarize(Abundance = mean(LogIntensities, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = Gene, values_from = Abundance) %>%
  column_to_rownames("originalRUN")

Correlate gene (protein) levels to Gfap

Code
final_clean_meta <- read.csv(file.path(base_dir,"data/processed/run1/msstats_annotation.csv"))
plaque_near_runs <- final_clean_meta$Run[final_clean_meta$Condition == "plaque_near"]

# Subset the data to just plaque near samples
pn_data <- wide_df[rownames(wide_df) %in% plaque_near_runs, ]

#calculate R between every protein's abundance and GFAP. 
pn_gfap_cors <- data.frame(Gene = colnames(pn_data)) %>%
  mutate(
    Pearson_R = sapply(Gene, function(x) {
      if(sum(!is.na(pn_data$Gfap) & !is.na(pn_data[[x]])) > 3) {
        cor(pn_data$Gfap, pn_data[[x]], use = "pairwise.complete.obs")
      } else { NA }
    })
  ) %>%
  filter(!is.na(Pearson_R)) %>%
  arrange(desc(Pearson_R))

pn_gfap_cors
Code
write.csv(pn_gfap_cors, file.path(objects_dir, "gfap_correlation.csv"))

View Cd44 correlation with Gfap

Code
pn_gfap_cors %>% filter(Gene == "Cd44")

Proteins correlated to Cd44

Code
pn_cd44_cors <- data.frame(Gene = colnames(pn_data)) %>%
  mutate(
    Pearson_R = sapply(Gene, function(x) {
      if(sum(!is.na(pn_data$Cd44) & !is.na(pn_data[[x]])) > 3) {
        cor(pn_data$Cd44, pn_data[[x]], use = "pairwise.complete.obs")
      } else { NA }
    })
  ) %>%
  filter(!is.na(Pearson_R)) %>%
  arrange(desc(Pearson_R))

pn_cd44_cors
Code
write.csv(pn_cd44_cors, file.path(objects_dir, "cd44_correlation.csv"))