3. DE plots

Libraries

Directories

Code

base_dir <- "/nemo/lab/destrooperb/home/shared/zanettc/giulia_proteomics/bulk_proteomics"
run_num  <- "run4"

results_dir <- file.path(base_dir, "results", run_num)
objects_dir <- file.path(base_dir, "data", "processed", run_num)
input_dir   <- file.path(base_dir, "input")

dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(objects_dir, recursive = TRUE, showWarnings = FALSE)

Read in MSstats output

Code

res_df <- fread(file.path(results_dir, "Full_GroupComparison_Results.csv"))

Mapping protein IDs to gene names

Code

clean_spec_raw <- fread(file.path(base_dir, "data/processed/run1/clean_spec_raw.csv"))

protein_dictionary <- clean_spec_raw %>%
  dplyr::select(Protein = PG.ProteinGroups, 
                Accession = PG.ProteinAccessions) %>%
  distinct() %>%
  # Spectronaut can list multiple accessions separated by semicolons.
  # We extract the first one to use as our search key.
  mutate(Primary_Accession = str_extract(Accession, "^[^;]+"))

cat("Mapping Genes and Descriptions...\n")

Mapping Genes and Descriptions...

Code

# 2. Map UniProt Accessions to Gene Symbols
protein_dictionary$Gene <- mapIds(
  x = org.Mm.eg.db,
  keys = protein_dictionary$Primary_Accession,
  column = "SYMBOL",
  keytype = "UNIPROT",
  multiVals = "first"
)

# 3. Map UniProt Accessions to Protein Names/Descriptions
protein_dictionary$Description <- mapIds(
  x = org.Mm.eg.db,
  keys = protein_dictionary$Primary_Accession,
  column = "GENENAME", 
  keytype = "UNIPROT",
  multiVals = "first"
)

# 4. Clean up the dictionary (fill NAs and drop the temp column)
protein_dictionary <- protein_dictionary %>%
  mutate(
    Gene = ifelse(is.na(Gene), Primary_Accession, Gene),
    Description = ifelse(is.na(Description), "Uncharacterized protein", Description)
  ) %>%
  dplyr::select(-Primary_Accession)

fwrite(protein_dictionary, file.path(objects_dir,"protein_dictionary.csv" ))

Volcano function

Boundary cut-offs of 0.05 FDR and +/- 0.58 Log2FC (1.5x).

Code

# Define colours
volcano_cols <- c("Upregulated" = "firebrick3", 
                  "Downregulated" = "navy", 
                  "Not Significant" = "grey80")

generate_swanky_volcano <- function(msstats_results, dict, comparison_label, logfc_cutoff = 0.5, graphs_dir = "results") {
  
  # --- Data Processing ---
  volcano_data <- msstats_results %>%
    # Isolate the specific comparison and drop missing values
    filter(Label == comparison_label, !is.na(adj.pvalue), is.finite(log2FC)) %>%
    # Map the Gene names from the dictionary
    left_join(dict, by = "Protein") %>%
    mutate(
      # Fallback to Protein ID if Gene name is missing
      Gene = ifelse(is.na(Gene) | Gene == "", Protein, Gene),
      # Define significance categories
      Status = case_when(
        adj.pvalue < 0.05 & log2FC > logfc_cutoff ~ "Upregulated",
        adj.pvalue < 0.05 & log2FC < -logfc_cutoff ~ "Downregulated",
        TRUE ~ "Not Significant"
      )
    ) %>%
    # Group by Status to rank Upregulated and Downregulated separately
    group_by(Status) %>%
    # Rank by FDR (smallest adj.pvalue = 1)
    mutate(GroupRank = rank(adj.pvalue, ties.method = "first")) %>%
    ungroup() %>%
    mutate(
      # Label top 20 for each significant category
      PlotLabel = case_when(
        Status != "Not Significant" & GroupRank <= 20 ~ Gene, 
        TRUE ~ NA_character_
      )
    )
  
  # --- Plotting ---
  p <- ggplot(volcano_data, aes(x = log2FC, y = -log10(adj.pvalue), fill = Status, color = Status)) +
    geom_point(aes(size = Status, alpha = Status), shape = 21, stroke = 0.2) +
    scale_fill_manual(values = volcano_cols) +
    scale_color_manual(values = c("Upregulated" = "black", "Downregulated" = "black", "Not Significant" = "grey90")) +
    scale_size_manual(values = c("Upregulated" = 3, "Downregulated" = 3, "Not Significant" = 1.5)) +
    scale_alpha_manual(values = c("Upregulated" = 1, "Downregulated" = 1, "Not Significant" = 0.3)) +
    geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey30", linewidth = 0.8) +
    geom_vline(xintercept = c(-logfc_cutoff, logfc_cutoff), linetype = "dashed", color = "grey30", linewidth = 0.8) +
    geom_label_repel(
      aes(label = PlotLabel),
      fill = "white", color = "black", fontface = "bold", size = 3.5,
      box.padding = 0.5, point.padding = 0.5,
      min.segment.length = 0, segment.color = "grey50", max.overlaps = 50
    ) +
    labs(
      title = paste0("Differential Abundance: ", gsub("_", " ", comparison_label)),
      subtitle = paste0("Top 20 Genes Labeled | FDR < 0.05 | |log2FC| > ", logfc_cutoff),
      x = bquote(bold("Log"[2] ~ "Fold Change")),
      y = bquote(bold("-Log"[10] ~ "FDR"))
    ) +
    theme_minimal(base_size = 14) +
    theme(
      plot.title = element_text(face = "bold", size = 18, hjust = 0),
      plot.subtitle = element_text(size = 12, color = "grey40", margin = margin(b = 10)),
      axis.title = element_text(face = "bold", size = 14),
      axis.text = element_text(face = "bold", color = "black"),
      axis.line = element_line(color = "black", linewidth = 1),
      axis.ticks = element_line(color = "black", linewidth = 1),
      legend.position = "top",
      legend.title = element_blank(),
      legend.text = element_text(size = 12, face = "bold"),
      panel.grid.minor = element_blank(),
      panel.grid.major = element_line(color = "grey95")
    )
  
  # --- Output Generation ---
  # Ensure directory exists
  dir.create(graphs_dir, showWarnings = FALSE, recursive = TRUE)
  safe_name <- gsub(" ", "_", comparison_label)
  
  # Save Plot
  ggsave(
    filename = file.path(graphs_dir, paste0("Volcano_", safe_name, "_Swanky.pdf")), 
    plot = p, 
    width = 10, height = 7, dpi = 300,
    device = cairo_pdf 
  )
  
  # Save Data Table
  # Select only the most useful columns and sort by significance for easy reading
  export_data <- volcano_data %>%
    dplyr::select(Protein, Gene, Description, log2FC, pvalue, adj.pvalue, Status) %>%
    arrange(adj.pvalue)
  
  fwrite(export_data, file = file.path(graphs_dir, paste0("Volcano_Data_", safe_name, ".csv")))
  
  return(p)
}

Iterate through comparisons to generate plots and useful DE tables

Code

all_comparisons <- unique(res_df$Label)

# 2. Loop through each comparison, generate the plot/data, and store the plots in a list
plot_list <- list()

for (comp in all_comparisons) {
  p <- generate_swanky_volcano(
    msstats_results = res_df, 
    dict = protein_dictionary, 
    comparison_label = comp, 
    logfc_cutoff = 0.5, 
    graphs_dir = results_dir
  )
  
  # Store the plot in our list
  plot_list[[comp]] <- p
  
  # Print the plot so it appears in HTML output
  print(p)
}

--- title: "3. DE plots" --- ## Libraries ```{r setup, include=FALSE, message=FALSE} library(data.table) library(qs2) library(dplyr) library(ggplot2) library(ggrepel) library(MSstats) library(stringr) library(AnnotationDbi) library(org.Mm.eg.db) ``` ## Directories ```{r} base_dir <- "/nemo/lab/destrooperb/home/shared/zanettc/giulia_proteomics/bulk_proteomics" run_num <- "run4" results_dir <- file.path(base_dir, "results", run_num) objects_dir <- file.path(base_dir, "data", "processed", run_num) input_dir <- file.path(base_dir, "input") dir.create(results_dir, recursive = TRUE, showWarnings = FALSE) dir.create(objects_dir, recursive = TRUE, showWarnings = FALSE) ``` ## Read in MSstats output ```{r} res_df <- fread(file.path(results_dir, "Full_GroupComparison_Results.csv")) ``` ## Mapping protein IDs to gene names ```{r} clean_spec_raw <- fread(file.path(base_dir, "data/processed/run1/clean_spec_raw.csv")) protein_dictionary <- clean_spec_raw %>% dplyr::select(Protein = PG.ProteinGroups, Accession = PG.ProteinAccessions) %>% distinct() %>% # Spectronaut can list multiple accessions separated by semicolons. # We extract the first one to use as our search key. mutate(Primary_Accession = str_extract(Accession, "^[^;]+")) cat("Mapping Genes and Descriptions...\n") # 2. Map UniProt Accessions to Gene Symbols protein_dictionary$Gene <- mapIds( x = org.Mm.eg.db, keys = protein_dictionary$Primary_Accession, column = "SYMBOL", keytype = "UNIPROT", multiVals = "first" ) # 3. Map UniProt Accessions to Protein Names/Descriptions protein_dictionary$Description <- mapIds( x = org.Mm.eg.db, keys = protein_dictionary$Primary_Accession, column = "GENENAME", keytype = "UNIPROT", multiVals = "first" ) # 4. Clean up the dictionary (fill NAs and drop the temp column) protein_dictionary <- protein_dictionary %>% mutate( Gene = ifelse(is.na(Gene), Primary_Accession, Gene), Description = ifelse(is.na(Description), "Uncharacterized protein", Description) ) %>% dplyr::select(-Primary_Accession) fwrite(protein_dictionary, file.path(objects_dir,"protein_dictionary.csv" )) ``` ## Volcano function Boundary cut-offs of 0.05 FDR and +/- 0.58 Log2FC (1.5x). ```{r} # Define colours volcano_cols <- c("Upregulated" = "firebrick3", "Downregulated" = "navy", "Not Significant" = "grey80") generate_swanky_volcano <- function(msstats_results, dict, comparison_label, logfc_cutoff = 0.5, graphs_dir = "results") { # --- Data Processing --- volcano_data <- msstats_results %>% # Isolate the specific comparison and drop missing values filter(Label == comparison_label, !is.na(adj.pvalue), is.finite(log2FC)) %>% # Map the Gene names from the dictionary left_join(dict, by = "Protein") %>% mutate( # Fallback to Protein ID if Gene name is missing Gene = ifelse(is.na(Gene) | Gene == "", Protein, Gene), # Define significance categories Status = case_when( adj.pvalue < 0.05 & log2FC > logfc_cutoff ~ "Upregulated", adj.pvalue < 0.05 & log2FC < -logfc_cutoff ~ "Downregulated", TRUE ~ "Not Significant" ) ) %>% # Group by Status to rank Upregulated and Downregulated separately group_by(Status) %>% # Rank by FDR (smallest adj.pvalue = 1) mutate(GroupRank = rank(adj.pvalue, ties.method = "first")) %>% ungroup() %>% mutate( # Label top 20 for each significant category PlotLabel = case_when( Status != "Not Significant" & GroupRank <= 20 ~ Gene, TRUE ~ NA_character_ ) ) # --- Plotting --- p <- ggplot(volcano_data, aes(x = log2FC, y = -log10(adj.pvalue), fill = Status, color = Status)) + geom_point(aes(size = Status, alpha = Status), shape = 21, stroke = 0.2) + scale_fill_manual(values = volcano_cols) + scale_color_manual(values = c("Upregulated" = "black", "Downregulated" = "black", "Not Significant" = "grey90")) + scale_size_manual(values = c("Upregulated" = 3, "Downregulated" = 3, "Not Significant" = 1.5)) + scale_alpha_manual(values = c("Upregulated" = 1, "Downregulated" = 1, "Not Significant" = 0.3)) + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey30", linewidth = 0.8) + geom_vline(xintercept = c(-logfc_cutoff, logfc_cutoff), linetype = "dashed", color = "grey30", linewidth = 0.8) + geom_label_repel( aes(label = PlotLabel), fill = "white", color = "black", fontface = "bold", size = 3.5, box.padding = 0.5, point.padding = 0.5, min.segment.length = 0, segment.color = "grey50", max.overlaps = 50 ) + labs( title = paste0("Differential Abundance: ", gsub("_", " ", comparison_label)), subtitle = paste0("Top 20 Genes Labeled | FDR < 0.05 | |log2FC| > ", logfc_cutoff), x = bquote(bold("Log"[2] ~ "Fold Change")), y = bquote(bold("-Log"[10] ~ "FDR")) ) + theme_minimal(base_size = 14) + theme( plot.title = element_text(face = "bold", size = 18, hjust = 0), plot.subtitle = element_text(size = 12, color = "grey40", margin = margin(b = 10)), axis.title = element_text(face = "bold", size = 14), axis.text = element_text(face = "bold", color = "black"), axis.line = element_line(color = "black", linewidth = 1), axis.ticks = element_line(color = "black", linewidth = 1), legend.position = "top", legend.title = element_blank(), legend.text = element_text(size = 12, face = "bold"), panel.grid.minor = element_blank(), panel.grid.major = element_line(color = "grey95") ) # --- Output Generation --- # Ensure directory exists dir.create(graphs_dir, showWarnings = FALSE, recursive = TRUE) safe_name <- gsub(" ", "_", comparison_label) # Save Plot ggsave( filename = file.path(graphs_dir, paste0("Volcano_", safe_name, "_Swanky.pdf")), plot = p, width = 10, height = 7, dpi = 300, device = cairo_pdf ) # Save Data Table # Select only the most useful columns and sort by significance for easy reading export_data <- volcano_data %>% dplyr::select(Protein, Gene, Description, log2FC, pvalue, adj.pvalue, Status) %>% arrange(adj.pvalue) fwrite(export_data, file = file.path(graphs_dir, paste0("Volcano_Data_", safe_name, ".csv"))) return(p) } ``` ## Iterate through comparisons to generate plots and useful DE tables ```{r} all_comparisons <- unique(res_df$Label) # 2. Loop through each comparison, generate the plot/data, and store the plots in a list plot_list <- list() for (comp in all_comparisons) { p <- generate_swanky_volcano( msstats_results = res_df, dict = protein_dictionary, comparison_label = comp, logfc_cutoff = 0.5, graphs_dir = results_dir ) # Store the plot in our list plot_list[[comp]] <- p # Print the plot so it appears in HTML output print(p) } ```