3. Phospho — DE volcanoes (adjusted)

Overview

Volcano plots and per-contrast site-level CSVs from the adjusted PTM contrasts in notebook 02. Adjusted log2FC = “phospho minus protein,” i.e. regulation not explained by total protein abundance.

Sites are labelled in Gene_Sxxx format (e.g. Mapt_S202) when the FASTA mapping in notebook 02 succeeded; otherwise the raw protein/peptide identifier is used.

Libraries

Directories

Code
base_dir <- "/nemo/lab/destrooperb/home/shared/zanettc/giulia_proteomics/phosphoproteomics"
run_num  <- "run3"

results_dir <- file.path(base_dir, "results", run_num)
objects_dir <- file.path(base_dir, "data", "processed", run_num)
dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)

Load adjusted PTM contrast table

Code
res_df <- fread(file.path(results_dir, "PTM_adjusted_GroupComparison.csv"))
cat("Rows:", nrow(res_df), "  Contrasts:",
    paste(unique(res_df$Label), collapse = ", "), "\n")
Rows: 44415   Contrasts: hAppNLGF_vs_hApp, hApp_FIRE_vs_hApp, hAppNLGF_FIRE_vs_hAppNLGF, hAppNLGF_FIRE_vs_hApp_FIRE, Interaction_APP_x_Microglia 
Code
# MSstatsPTM puts site identifier in `Protein` (e.g. UniProt_S202_T205); split out
# the UniProt accession to map to a gene symbol. The exact format depends on the
# FASTA-mapping success in notebook 02 — adjust the regex if your sites look
# different.
res_df[, Accession := str_extract(Protein, "^[^_;]+")]
res_df[, Sites_in_id := str_extract(Protein, "(?<=_)[STY][0-9]+(?:_[STY][0-9]+)*")]

# Map UniProt accession → gene symbol
res_df[, Gene := mapIds(
  org.Mm.eg.db,
  keys      = Accession,
  column    = "SYMBOL",
  keytype   = "UNIPROT",
  multiVals = "first"
)]
res_df[, Gene := ifelse(is.na(Gene) | Gene == "", Accession, Gene)]
res_df[, SiteLabel := ifelse(is.na(Sites_in_id), Gene, paste0(Gene, "_", Sites_in_id))]

Volcano helper (mirrors bulk’s generate_swanky_volcano)

Code
volcano_cols <- c("Upregulated"     = "firebrick3",
                  "Downregulated"   = "navy",
                  "Not Significant" = "grey80")

generate_phospho_volcano <- function(df, comparison_label,
                                     logfc_cutoff = 0.5,
                                     graphs_dir   = results_dir) {
  v <- df %>%
    filter(Label == comparison_label,
           !is.na(adj.pvalue), is.finite(log2FC)) %>%
    mutate(
      Status = case_when(
        adj.pvalue < 0.05 & log2FC >  logfc_cutoff ~ "Upregulated",
        adj.pvalue < 0.05 & log2FC < -logfc_cutoff ~ "Downregulated",
        TRUE ~ "Not Significant"
      )
    ) %>%
    group_by(Status) %>%
    mutate(GroupRank = rank(adj.pvalue, ties.method = "first")) %>%
    ungroup() %>%
    mutate(PlotLabel = ifelse(Status != "Not Significant" & GroupRank <= 20,
                               SiteLabel, NA_character_))

  p <- ggplot(v, aes(x = log2FC, y = -log10(adj.pvalue),
                     fill = Status, color = Status)) +
    geom_point(aes(size = Status, alpha = Status), shape = 21, stroke = 0.2) +
    scale_fill_manual(values  = volcano_cols) +
    scale_color_manual(values = c("Upregulated"     = "black",
                                   "Downregulated"   = "black",
                                   "Not Significant" = "grey90")) +
    scale_size_manual(values  = c("Upregulated" = 3, "Downregulated" = 3,
                                   "Not Significant" = 1.5)) +
    scale_alpha_manual(values = c("Upregulated" = 1, "Downregulated" = 1,
                                   "Not Significant" = 0.3)) +
    geom_hline(yintercept = -log10(0.05), linetype = "dashed",
               color = "grey30", linewidth = 0.8) +
    geom_vline(xintercept = c(-logfc_cutoff, logfc_cutoff), linetype = "dashed",
               color = "grey30", linewidth = 0.8) +
    geom_label_repel(aes(label = PlotLabel),
                     fill = "white", color = "black",
                     fontface = "bold", size = 3.2,
                     box.padding = 0.5, point.padding = 0.5,
                     min.segment.length = 0, segment.color = "grey50",
                     max.overlaps = 50) +
    labs(title    = paste0("Adjusted phospho DE — ", gsub("_", " ", comparison_label)),
         subtitle = paste0("Top 20 sites labelled | FDR < 0.05 | |log2FC| > ", logfc_cutoff,
                           " (phospho minus protein)"),
         x = bquote(bold("Adjusted log"[2] ~ "FC")),
         y = bquote(bold("-Log"[10] ~ "FDR"))) +
    theme_minimal(base_size = 14) +
    theme(plot.title    = element_text(face = "bold", size = 18, hjust = 0),
          plot.subtitle = element_text(size = 12, color = "grey40",
                                       margin = margin(b = 10)),
          axis.title    = element_text(face = "bold", size = 14),
          axis.text     = element_text(face = "bold", color = "black"),
          axis.line     = element_line(color = "black", linewidth = 1),
          axis.ticks    = element_line(color = "black", linewidth = 1),
          legend.position = "top",
          legend.title    = element_blank(),
          legend.text     = element_text(size = 12, face = "bold"),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(color = "grey95"))

  safe <- gsub(" ", "_", comparison_label)
  ggsave(file.path(graphs_dir, paste0("PhosphoVolcano_", safe, ".pdf")),
         p, width = 10, height = 7, dpi = 300, device = cairo_pdf)

  fwrite(v %>% dplyr::select(SiteLabel, Gene, Accession, Sites_in_id,
                      log2FC, pvalue, adj.pvalue, Status) %>%
           arrange(adj.pvalue),
         file.path(graphs_dir, paste0("PhosphoVolcano_Data_", safe, ".csv")))
  p
}

Iterate through contrasts

Code
for (cl in unique(res_df$Label)) {
  print(generate_phospho_volcano(res_df, cl, logfc_cutoff = 0.5,
                                 graphs_dir = results_dir))
}

Quick sanity check — bulk vs phospho concordance

For each contrast, scatter the bulk protein-level log2FC against the unadjusted phospho log2FC at sites whose protein is present in both. Diagonal = abundance-driven. Off-diagonal = phosphorylation-specific regulation.

Points are coloured by adjusted phospho FDR (from the phospho − protein contrast in notebook 02), not by unadjusted FDR. MSstatsPTM gives oneConditionMissing / completeMissing sites a sentinel log2FC = ±Inf and p = 0, which dominate the unadjusted FDR call but carry no real biological signal — they’re filtered out here.

Code
ptm_unadj <- fread(file.path(results_dir, "PTM_unadjusted_GroupComparison.csv"))
prot_res  <- fread(file.path(results_dir, "PROTEIN_GroupComparison.csv"))

ptm_unadj[, Accession := str_extract(Protein, "^[^_;]+")]
prot_res[, Accession := str_extract(Protein, "^[^;]+")]

cmp <- merge(
  ptm_unadj[, .(Label, SiteProtein = Protein, Accession,
                Phospho_log2FC = log2FC, Phospho_issue = issue)],
  prot_res[, .(Label, Accession, Protein_log2FC = log2FC)],
  by = c("Label", "Accession")
)

# Bring in adjusted (phospho − protein) FDR from notebook 02 — `res_df` is already
# loaded above. SiteProtein matches the adjusted table's `Protein` column.
cmp <- merge(
  cmp,
  res_df[, .(Label, SiteProtein = Protein, Adj_padj = adj.pvalue)],
  by = c("Label", "SiteProtein"),
  all.x = TRUE
)

cmp_plot <- cmp[is.finite(Phospho_log2FC) & is.finite(Protein_log2FC)]
cmp_plot[, Significant := !is.na(Adj_padj) & Adj_padj < 0.05]

cat("Sentinel rows dropped:", nrow(cmp) - nrow(cmp_plot),
    " | retained:", nrow(cmp_plot),
    " | significant (adj FDR < 0.05):", sum(cmp_plot$Significant), "\n")
Sentinel rows dropped: 7431  | retained: 35824  | significant (adj FDR < 0.05): 109 
Code
p_scat <- ggplot(cmp_plot,
                 aes(Protein_log2FC, Phospho_log2FC, color = Significant)) +
  geom_point(alpha = 0.5, size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey30") +
  geom_hline(yintercept = 0, linetype = "dotted", color = "grey60") +
  geom_vline(xintercept = 0, linetype = "dotted", color = "grey60") +
  scale_color_manual(values = c("TRUE" = "firebrick3", "FALSE" = "grey60"),
                     labels = c("TRUE" = "Significant", "FALSE" = "n.s."),
                     name   = "Adjusted phospho FDR < 0.05") +
  facet_wrap(~ Label, ncol = 3, scales = "free") +
  labs(title    = "Phospho vs Protein log2FC per contrast",
       subtitle = "Coloured by adjusted (phospho − protein) FDR; MSstats sentinel sites removed",
       x = "Protein log2FC (bulk)",
       y = "Phospho log2FC (unadjusted)") +
  theme_bw(base_size = 11) +
  theme(legend.position = "top", strip.text = element_text(face = "bold"))

print(p_scat)

Code
ggsave(file.path(results_dir, "Phospho_vs_Protein_log2FC_scatter.pdf"),
       p_scat, width = 12, height = 8, device = cairo_pdf)