3. DE plots

Libraries

Directories

Code
base_dir <- "/nemo/lab/destrooperb/home/shared/zanettc/millie_proteomics"
run_num <- "run6"

results_dir <- file.path(base_dir, "results", run_num)
objects_dir <- file.path(base_dir, "data", "processed", run_num)
raw_dir <- file.path(base_dir, "data", "raw")

dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(objects_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(raw_dir, recursive = TRUE, showWarnings = FALSE)

Read in MSstats output

Code
res_df <- fread(file.path(results_dir, "Full_GroupComparison_Results.csv"))

Mapping protein IDs to gene names

Code
clean_spec_raw <- fread(file.path(base_dir, "data/processed/run1/clean_spec_raw.csv"))

protein_dictionary <- clean_spec_raw %>%
  dplyr::select(Protein = PG.ProteinGroups, 
         Gene = PG.Genes, 
         Description = PG.ProteinDescriptions) %>%
  distinct()

Volcano function

Boundary cut-offs of 0.05 FDR and +/- 0.58 Log2FC (1.5x).

Code
# Define colours
volcano_cols <- c("Upregulated" = "firebrick3", 
                  "Downregulated" = "navy", 
                  "Not Significant" = "grey80")

generate_swanky_volcano <- function(msstats_results, dict, comparison_label, logfc_cutoff = 0.5, graphs_dir = "results") {
  
  # --- Data Processing ---
  volcano_data <- msstats_results %>%
    # Isolate the specific comparison and drop missing values
    filter(Label == comparison_label, !is.na(adj.pvalue), is.finite(log2FC)) %>%
    # Map the Gene names from the dictionary
    left_join(dict, by = "Protein") %>%
    mutate(
      # Fallback to Protein ID if Gene name is missing
      Gene = ifelse(is.na(Gene) | Gene == "", Protein, Gene),
      # Define significance categories
      Status = case_when(
        adj.pvalue < 0.05 & log2FC > logfc_cutoff ~ "Upregulated",
        adj.pvalue < 0.05 & log2FC < -logfc_cutoff ~ "Downregulated",
        TRUE ~ "Not Significant"
      )
    ) %>%
    # Group by Status to rank Upregulated and Downregulated separately
    group_by(Status) %>%
    # Rank by FDR (smallest adj.pvalue = 1)
    mutate(GroupRank = rank(adj.pvalue, ties.method = "first")) %>%
    ungroup() %>%
    mutate(
      # Label top 20 for each significant category
      PlotLabel = case_when(
        Status != "Not Significant" & GroupRank <= 20 ~ Gene, 
        TRUE ~ NA_character_
      )
    )
  
  # --- Plotting ---
  p <- ggplot(volcano_data, aes(x = log2FC, y = -log10(adj.pvalue), fill = Status, color = Status)) +
    geom_point(aes(size = Status, alpha = Status), shape = 21, stroke = 0.2) +
    scale_fill_manual(values = volcano_cols) +
    scale_color_manual(values = c("Upregulated" = "black", "Downregulated" = "black", "Not Significant" = "grey90")) +
    scale_size_manual(values = c("Upregulated" = 3, "Downregulated" = 3, "Not Significant" = 1.5)) +
    scale_alpha_manual(values = c("Upregulated" = 1, "Downregulated" = 1, "Not Significant" = 0.3)) +
    geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey30", linewidth = 0.8) +
    geom_vline(xintercept = c(-logfc_cutoff, logfc_cutoff), linetype = "dashed", color = "grey30", linewidth = 0.8) +
    geom_label_repel(
      aes(label = PlotLabel),
      fill = "white", color = "black", fontface = "bold", size = 3.5,
      box.padding = 0.5, point.padding = 0.5,
      min.segment.length = 0, segment.color = "grey50", max.overlaps = 50
    ) +
    labs(
      title = paste0("Differential Abundance: ", gsub("_", " ", comparison_label)),
      subtitle = paste0("Top 20 Genes Labeled | FDR < 0.05 | |log2FC| > ", logfc_cutoff),
      x = bquote(bold("Log"[2] ~ "Fold Change")),
      y = bquote(bold("-Log"[10] ~ "FDR"))
    ) +
    theme_minimal(base_size = 14) +
    theme(
      plot.title = element_text(face = "bold", size = 18, hjust = 0),
      plot.subtitle = element_text(size = 12, color = "grey40", margin = margin(b = 10)),
      axis.title = element_text(face = "bold", size = 14),
      axis.text = element_text(face = "bold", color = "black"),
      axis.line = element_line(color = "black", linewidth = 1),
      axis.ticks = element_line(color = "black", linewidth = 1),
      legend.position = "top",
      legend.title = element_blank(),
      legend.text = element_text(size = 12, face = "bold"),
      panel.grid.minor = element_blank(),
      panel.grid.major = element_line(color = "grey95")
    )
  
  # --- Output Generation ---
  # Ensure directory exists
  dir.create(graphs_dir, showWarnings = FALSE, recursive = TRUE)
  safe_name <- gsub(" ", "_", comparison_label)
  
  # Save Plot
  ggsave(
    filename = file.path(graphs_dir, paste0("Volcano_", safe_name, "_Swanky.pdf")), 
    plot = p, 
    width = 10, height = 7, dpi = 300,
    device = cairo_pdf 
  )
  
  # Save Data Table
  # Select only the most useful columns and sort by significance for easy reading
  export_data <- volcano_data %>%
    dplyr::select(Protein, Gene, Description, log2FC, pvalue, adj.pvalue, Status) %>%
    arrange(adj.pvalue)
  
  fwrite(export_data, file = file.path(graphs_dir, paste0("Volcano_Data_", safe_name, ".csv")))
  
  return(p)
}

Iterate through comparisons to generate plots and useful DE tables

Code
all_comparisons <- unique(res_df$Label)

# 2. Loop through each comparison, generate the plot/data, and store the plots in a list
plot_list <- list()

for (comp in all_comparisons) {
  p <- generate_swanky_volcano(
    msstats_results = res_df, 
    dict = protein_dictionary, 
    comparison_label = comp, 
    logfc_cutoff = 0.5, 
    graphs_dir = results_dir
  )
  
  # Store the plot in our list
  plot_list[[comp]] <- p
  
  # Print the plot so it appears in HTML output
  print(p)
}
Warning: Removed 5533 rows containing missing values or values outside the scale range
(`geom_label_repel()`).
Removed 5533 rows containing missing values or values outside the scale range
(`geom_label_repel()`).
Warning: Removed 5527 rows containing missing values or values outside the scale range
(`geom_label_repel()`).

Warning: Removed 5527 rows containing missing values or values outside the scale range
(`geom_label_repel()`).
Warning: Removed 5543 rows containing missing values or values outside the scale range
(`geom_label_repel()`).

Warning: Removed 5543 rows containing missing values or values outside the scale range
(`geom_label_repel()`).

Interrogation of floor genes from Near vs Far plaque volcano plots

Code
res_df
Code
floor_check <- res_df %>%
  filter(Label == "PlaqueNear_vs_Control", adj.pvalue > 0.99) %>%
  dplyr::select(Protein, log2FC, pvalue, adj.pvalue, SE, DF) %>%
  arrange(desc(abs(log2FC)))

# Look at the top results
head(floor_check, 10)
Code
#filtered for top 4 logFC proteins with >0.99 FDR
target_proteins <- res_df %>%
  filter(Label == "PlaqueNear_vs_PlaqueFar", adj.pvalue > 0.99) %>%
  arrange(desc(abs(log2FC))) %>%
  slice_head(n = 4) %>%
  pull(Protein)

#read in msstats data
processed_data <- qs_read(file.path(objects_dir, "processed_msstats_data.qs2"))

# extract their summarized abundances from the processed MSstats object
abundance_data <- processed_data$ProteinLevelData %>%
  filter(Protein %in% target_proteins) %>%
  # Filter to just the two conditions of interest for clarity
  filter(GROUP %in% c("plaque_near", "plaque_far")) %>%
  group_by(Protein, GROUP, SUBJECT) %>%
  summarise(LogIntensities = mean(LogIntensities, na.rm = TRUE), .groups = "drop")
#plot abundances
#each dot represents the summarised abundance for a biological replicate 
p_abundance <- ggplot(abundance_data, aes(x = GROUP, y = LogIntensities, fill = GROUP)) +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.8) +
  facet_wrap(~ Protein, scales = "free_y") +
  theme_minimal(base_size = 14) +
  scale_fill_manual(values = c("plaque_near" = "#d95f02", "plaque_far" = "#7570b3")) +
  labs(
    title = "Abundance Profile of High-FC 'Floor' Proteins",
    subtitle = "FDR = 1.0 despite high Log2FC",
    x = "Condition",
    y = "Log2 Intensity"
  ) +
  theme(legend.position = "none")

# Print the plot
print(p_abundance)