Code
suppressPackageStartupMessages({
library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(stringr)
library(STRINGdb)
library(igraph)
library(qs2)
})This script identifies protein-protein interaction networks for upregulated proteins in each MSstats comparison using STRINGdb. It identifies hub proteins by degree centrality and exports summary tables for downstream interpretation.
Workflow overview
suppressPackageStartupMessages({
library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(stringr)
library(STRINGdb)
library(igraph)
library(qs2)
})base_dir <- "/nemo/lab/destrooperb/home/shared/zanettc/millie_proteomics"
run_num <- "run5"
results_dir <- file.path(base_dir, "results", run_num)
objects_dir <- file.path(base_dir, "data", "processed", run_num)
ppi_dir <- file.path(results_dir, "PPI_networks")
dir.create(ppi_dir, recursive = TRUE, showWarnings = FALSE)
# Thresholds
FDR_CUTOFF <- 0.05
LOGFC_CUTOFF <- 0.5
STRING_SCORE <- 700res_df <- fread(file.path(results_dir, "Full_GroupComparison_Results.csv"))
clean_spec_raw <- fread(file.path(base_dir, "data/processed/run1/clean_spec_raw.csv"))
protein_dictionary <- clean_spec_raw %>%
dplyr::select(
Protein = PG.ProteinGroups,
Gene = PG.Genes,
Description = PG.ProteinDescriptions
) %>%
distinct()
# Map gene names and define significance categories
msstats_ready <- res_df %>%
filter(!is.na(adj.pvalue), is.finite(log2FC)) %>%
left_join(protein_dictionary, by = "Protein") %>%
mutate(
Gene = ifelse(is.na(Gene) | Gene == "", Protein, Gene),
Status = case_when(
adj.pvalue < FDR_CUTOFF & log2FC > LOGFC_CUTOFF ~ "Upregulated",
adj.pvalue < FDR_CUTOFF & log2FC < -LOGFC_CUTOFF ~ "Downregulated",
TRUE ~ "Not Significant"
)
)string_db <- STRINGdb$new(
version = "12.0",
species = 10090,
score_threshold = STRING_SCORE,
network_type = "physical"
)comparisons <- c("PlaqueNear_vs_Control", "PlaqueFar_vs_Control", "PlaqueNear_vs_PlaqueFar")
network_results <- list()
for (comp in comparisons) {
cat("\n============================\n")
cat(comp, "\n")
cat("============================\n")
# --- Filter upregulated proteins ---
sig_up <- msstats_ready %>%
filter(Label == comp, Status == "Upregulated") %>%
dplyr::select(Gene, log2FC, adj.pvalue) %>%
distinct(Gene, .keep_all = TRUE) %>%
as.data.frame()
cat("Upregulated proteins:", nrow(sig_up), "\n")
if (nrow(sig_up) < 5) {
cat("Too few proteins, skipping.\n")
next
}
# --- Map to STRING ---
mapped <- string_db$map(sig_up, "Gene", removeUnmappedRows = TRUE)
cat("Mapped to STRING:", nrow(mapped), "\n")
if (nrow(mapped) < 5) {
cat("Too few mapped, skipping.\n")
next
}
# --- Get interactions ---
interactions <- string_db$get_interactions(mapped$STRING_id)
cat("Interactions:", nrow(interactions), "\n")
# --- Build igraph object ---
if (nrow(interactions) > 0) {
g <- graph_from_data_frame(
interactions %>% dplyr::select(from, to, combined_score),
directed = FALSE,
vertices = mapped %>% dplyr::select(STRING_id, Gene, log2FC)
)
g <- simplify(g)
# Degree centrality
V(g)$degree <- degree(g)
cat("Nodes:", vcount(g), "| Edges:", ecount(g), "\n")
} else {
g <- NULL
cat("No interactions found.\n")
}
network_results[[comp]] <- list(
proteins = mapped,
interactions = interactions,
graph = g
)
}
============================
PlaqueNear_vs_Control
============================
Upregulated proteins: 453
Warning: we couldn't map to STRING 0% of your identifiersMapped to STRING: 450
Interactions: 312
Nodes: 450 | Edges: 156
============================
PlaqueFar_vs_Control
============================
Upregulated proteins: 301
Warning: we couldn't map to STRING 0% of your identifiersMapped to STRING: 298
Interactions: 118
Nodes: 298 | Edges: 59
============================
PlaqueNear_vs_PlaqueFar
============================
Upregulated proteins: 73
Mapped to STRING: 73
Interactions: 44
Nodes: 73 | Edges: 22
for (comp in names(network_results)) {
g <- network_results[[comp]]$graph
if (is.null(g) || ecount(g) == 0) next
# Remove isolated nodes (degree 0) for cleaner plot
g_plot <- delete_vertices(g, V(g)[degree(g) == 0])
if (vcount(g_plot) < 3) next
# Style
V(g_plot)$label <- V(g_plot)$Gene
V(g_plot)$size <- scales::rescale(degree(g_plot), to = c(5, 20))
V(g_plot)$label.cex <- scales::rescale(degree(g_plot), to = c(0.6, 1.3))
V(g_plot)$label.font <- 2
V(g_plot)$color <- "steelblue"
V(g_plot)$frame.color <- "grey30"
E(g_plot)$color <- "grey75"
E(g_plot)$width <- 0.8
# Highlight top 10 hubs
top_hubs <- names(sort(degree(g_plot), decreasing = TRUE))[1:min(10, vcount(g_plot))]
V(g_plot)$color[V(g_plot)$name %in% top_hubs] <- "firebrick3"
pdf(file.path(ppi_dir, paste0("network_full_", comp, ".pdf")), width = 14, height = 14)
set.seed(42)
plot(g_plot,
layout = layout_with_fr(g_plot),
vertex.label.color = "black",
main = paste0(comp, "\nUpregulated proteins | STRING physical >= ", STRING_SCORE,
"\nNodes: ", vcount(g_plot), " | Edges: ", ecount(g_plot),
" | Red = top 10 hubs"))
dev.off()
# Also render in notebook
set.seed(42)
plot(g_plot,
layout = layout_with_fr(g_plot),
vertex.label.color = "black",
main = paste0(comp, "\nUpregulated proteins | STRING physical >= ", STRING_SCORE,
"\nNodes: ", vcount(g_plot), " | Edges: ", ecount(g_plot),
" | Red = top 10 hubs"))
}


Focused view of the most connected proteins only.
TOP_N <- 30
for (comp in names(network_results)) {
g <- network_results[[comp]]$graph
if (is.null(g) || ecount(g) == 0) next
# Select top N by degree
top_nodes <- names(sort(degree(g), decreasing = TRUE))[1:min(TOP_N, vcount(g))]
g_top <- induced_subgraph(g, top_nodes)
g_top <- delete_vertices(g_top, V(g_top)[degree(g_top) == 0])
if (vcount(g_top) < 3) next
V(g_top)$label <- V(g_top)$Gene
V(g_top)$size <- scales::rescale(degree(g_top), to = c(8, 22))
V(g_top)$label.cex <- 1.0
V(g_top)$label.font <- 2
V(g_top)$frame.color <- "grey30"
E(g_top)$color <- "grey70"
E(g_top)$width <- 1.2
# Colour by logFC
fc <- V(g_top)$log2FC
fc_pal <- colorRampPalette(c("gold", "orange", "firebrick3"))(100)
fc_idx <- findInterval(fc, seq(min(fc), max(fc), length.out = 100))
fc_idx[fc_idx == 0] <- 1
V(g_top)$color <- fc_pal[fc_idx]
pdf(file.path(ppi_dir, paste0("network_top", TOP_N, "_", comp, ".pdf")),
width = 12, height = 12)
set.seed(42)
plot(g_top,
layout = layout_with_fr(g_top),
vertex.label.color = "black",
main = paste0(comp, " — Top ", vcount(g_top), " by degree",
"\nColoured by log2FC | Edges: ", ecount(g_top)))
legend_vals <- seq(min(fc, na.rm = TRUE), max(fc, na.rm = TRUE), length.out = 5)
legend_colors <- colorRampPalette(c("gold", "orange", "firebrick3"))(5)
# 2. Add the legend
legend("bottomright",
legend = round(legend_vals, 2),
fill = legend_colors,
title = "log2FC",
bty = "n", # No border box
cex = 0.8) # Text size
dev.off()
set.seed(42)
plot(g_top,
layout = layout_with_fr(g_top),
vertex.label.color = "black",
main = paste0(comp, " — Top ", vcount(g_top), " by degree",
"\nColoured by log2FC | Edges: ", ecount(g_top)))
legend_vals <- seq(min(fc, na.rm = TRUE), max(fc, na.rm = TRUE), length.out = 5)
legend_colors <- colorRampPalette(c("gold", "orange", "firebrick3"))(5)
# 2. Add the legend
legend("bottomright",
legend = round(legend_vals, 2),
fill = legend_colors,
title = "log2FC",
bty = "n", # No border box
cex = 0.8) # Text size
}Warning in title(...): for 'PlaqueNear_vs_Control — Top 29 by degree' in
'mbcsToSbcs': - substituted for — (U+2014)
Warning in title(...): for 'PlaqueFar_vs_Control — Top 26 by degree' in
'mbcsToSbcs': - substituted for — (U+2014)

Warning in title(...): for 'PlaqueNear_vs_PlaqueFar — Top 18 by degree' in
'mbcsToSbcs': - substituted for — (U+2014)


hub_list <- list()
for (comp in names(network_results)) {
g <- network_results[[comp]]$graph
if (is.null(g)) next
hub_df <- data.frame(
Gene = V(g)$Gene,
log2FC = V(g)$log2FC,
Degree = degree(g),
Comparison = comp
) %>%
filter(Degree > 0) %>%
arrange(desc(Degree))
hub_list[[comp]] <- hub_df
cat("\n--- Top 20 hubs:", comp, "---\n")
print(head(hub_df, 20))
fwrite(hub_df, file.path(ppi_dir, paste0("hub_proteins_", comp, ".csv")))
}
--- Top 20 hubs: PlaqueNear_vs_Control ---
Gene log2FC Degree Comparison
10090.ENSMUSP00000030698 STX12 0.9066883 11 PlaqueNear_vs_Control
10090.ENSMUSP00000122090 SNAPIN 0.6389746 8 PlaqueNear_vs_Control
10090.ENSMUSP00000112138 SNAP23 0.5609813 7 PlaqueNear_vs_Control
10090.ENSMUSP00000026405 BLOC1S1 0.7382437 7 PlaqueNear_vs_Control
10090.ENSMUSP00000151638 STX7 0.9157072 7 PlaqueNear_vs_Control
10090.ENSMUSP00000057462 VTI1B 0.9011774 7 PlaqueNear_vs_Control
10090.ENSMUSP00000021285 STX8 0.8990744 7 PlaqueNear_vs_Control
10090.ENSMUSP00000005406 APP 0.6018411 7 PlaqueNear_vs_Control
10090.ENSMUSP00000030797 VAMP3 0.5199320 7 PlaqueNear_vs_Control
10090.ENSMUSP00000052262 VAMP7 1.1770267 7 PlaqueNear_vs_Control
10090.ENSMUSP00000088591 RRAGA 0.8747798 7 PlaqueNear_vs_Control
10090.ENSMUSP00000130811 LAMTOR3 0.7049390 6 PlaqueNear_vs_Control
10090.ENSMUSP00000093644 VTI1A 0.7121213 6 PlaqueNear_vs_Control
10090.ENSMUSP00000133302 APOE 1.3629408 6 PlaqueNear_vs_Control
10090.ENSMUSP00000072729 VPS41 0.7493179 6 PlaqueNear_vs_Control
10090.ENSMUSP00000056693 LAMTOR4 1.1679741 6 PlaqueNear_vs_Control
10090.ENSMUSP00000099559 VPS39 0.9382069 6 PlaqueNear_vs_Control
10090.ENSMUSP00000030399 RRAGC 1.0087124 6 PlaqueNear_vs_Control
10090.ENSMUSP00000033131 LAMTOR1 1.2968520 6 PlaqueNear_vs_Control
10090.ENSMUSP00000129012 LAMTOR5 1.0398121 6 PlaqueNear_vs_Control
--- Top 20 hubs: PlaqueFar_vs_Control ---
Gene log2FC Degree Comparison
10090.ENSMUSP00000054634 Cdc42 0.5434135 6 PlaqueFar_vs_Control
10090.ENSMUSP00000130123 Gnb1 0.5000713 6 PlaqueFar_vs_Control
10090.ENSMUSP00000104932 Shank3 0.5371577 6 PlaqueFar_vs_Control
10090.ENSMUSP00000019290 Cacng2 0.5113417 4 PlaqueFar_vs_Control
10090.ENSMUSP00000026572 Hras 0.5107116 4 PlaqueFar_vs_Control
10090.ENSMUSP00000121127 Gnb4 0.6310180 3 PlaqueFar_vs_Control
10090.ENSMUSP00000093978 Gng3 0.5763679 3 PlaqueFar_vs_Control
10090.ENSMUSP00000071486 Actg1 0.5576763 3 PlaqueFar_vs_Control
10090.ENSMUSP00000007130 Ctnnb1 0.5726895 3 PlaqueFar_vs_Control
10090.ENSMUSP00000122896 Dlgap1 0.5007003 3 PlaqueFar_vs_Control
10090.ENSMUSP00000103571 Shank1 0.5285800 2 PlaqueFar_vs_Control
10090.ENSMUSP00000141686 Syngap1 0.5782903 2 PlaqueFar_vs_Control
10090.ENSMUSP00000002029 Emd 0.5941915 2 PlaqueFar_vs_Control
10090.ENSMUSP00000005606 Prkaca 0.6049349 2 PlaqueFar_vs_Control
10090.ENSMUSP00000062392 Prkca 0.5467517 2 PlaqueFar_vs_Control
10090.ENSMUSP00000107830 Prkcd 0.5585430 2 PlaqueFar_vs_Control
10090.ENSMUSP00000070019 Prkcb 0.5312018 2 PlaqueFar_vs_Control
10090.ENSMUSP00000026436 Baiap2 0.5063107 2 PlaqueFar_vs_Control
10090.ENSMUSP00000042785 Ak5 0.6536680 2 PlaqueFar_vs_Control
10090.ENSMUSP00000112221 Nck1 0.5200268 2 PlaqueFar_vs_Control
--- Top 20 hubs: PlaqueNear_vs_PlaqueFar ---
Gene log2FC Degree Comparison
10090.ENSMUSP00000088591 Rraga 0.5653424 5 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000056693 Lamtor4 0.8112023 5 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000030399 Rragc 0.7862667 5 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000033131 Lamtor1 1.0448811 5 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000129012 Lamtor5 0.7495078 5 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000029698 Lamtor2 0.9612483 4 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000057462 Vti1b 0.5585695 2 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000021285 Stx8 0.5823858 2 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000052262 Vamp7 0.8350880 2 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000152169 Ctsl 0.7165960 1 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000006235 Ctsb 0.6849511 1 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000033824 Lamp1 0.7344733 1 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000074448 Lamp2 1.0661647 1 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000022169 Hexb 0.6912477 1 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000026262 Hexa 0.6914866 1 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000113862 Depdc5 0.7191249 1 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000072729 Vps41 0.5873762 1 PlaqueNear_vs_PlaqueFar
10090.ENSMUSP00000099559 Vps39 0.5994513 1 PlaqueNear_vs_PlaqueFar
# Combined table across comparisons
if (length(hub_list) > 0) {
hub_combined <- bind_rows(hub_list)
fwrite(hub_combined, file.path(ppi_dir, "hub_proteins_all_comparisons.csv"))
# Proteins that are hubs in both comparisons
if (length(hub_list) == 2) {
shared_hubs <- inner_join(
hub_list[[1]] %>% dplyr::select(Gene, Degree_Near = Degree, log2FC_Near = log2FC),
hub_list[[2]] %>% dplyr::select(Gene, Degree_Far = Degree, log2FC_Far = log2FC),
by = "Gene"
) %>%
mutate(Total_Degree = Degree_Near + Degree_Far) %>%
arrange(desc(Total_Degree))
cat("\n--- Shared hubs (present in both comparisons) ---\n")
print(head(shared_hubs, 20))
fwrite(shared_hubs, file.path(ppi_dir, "hub_proteins_shared.csv"))
}
}summary_table <- data.frame(
Comparison = character(),
Upregulated_proteins = integer(),
Mapped_to_STRING = integer(),
Nodes_with_edges = integer(),
Edges = integer(),
stringsAsFactors = FALSE
)
for (comp in names(network_results)) {
nr <- network_results[[comp]]
g <- nr$graph
n_with_edges <- if (!is.null(g)) sum(degree(g) > 0) else 0
n_edges <- if (!is.null(g)) ecount(g) else 0
summary_table <- rbind(summary_table, data.frame(
Comparison = comp,
Upregulated_proteins = nrow(msstats_ready %>% filter(Label == comp, Status == "Upregulated") %>% distinct(Gene)),
Mapped_to_STRING = nrow(nr$proteins),
Nodes_with_edges = n_with_edges,
Edges = n_edges
))
}
cat("\n--- Network summary ---\n")
--- Network summary ---
print(summary_table) Comparison Upregulated_proteins Mapped_to_STRING
1 PlaqueNear_vs_Control 453 450
2 PlaqueFar_vs_Control 301 298
3 PlaqueNear_vs_PlaqueFar 73 73
Nodes_with_edges Edges
1 124 156
2 71 59
3 18 22
fwrite(summary_table, file.path(ppi_dir, "network_summary.csv"))sessionInfo()R version 4.5.1 (2025-06-13)
Platform: x86_64-pc-linux-gnu
Running under: Rocky Linux 8.7 (Green Obsidian)
Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/London
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] qs2_0.1.6 igraph_2.2.2 STRINGdb_2.22.0
[4] stringr_1.6.0 ggrepel_0.9.6 ggplot2_4.0.2
[7] tidyr_1.3.2 dplyr_1.2.0 data.table_1.18.2.1
loaded via a namespace (and not attached):
[1] generics_0.1.4 gplots_3.3.0 bitops_1.0-9
[4] KernSmooth_2.23-26 gtools_3.9.5 RSQLite_2.4.5
[7] stringi_1.8.7 digest_0.6.39 magrittr_2.0.4
[10] caTools_1.18.3 evaluate_1.0.5 grid_4.5.1
[13] RColorBrewer_1.1-3 blob_1.2.4 fastmap_1.2.0
[16] plyr_1.8.9 jsonlite_2.0.0 DBI_1.2.3
[19] httr_1.4.8 purrr_1.2.1 scales_1.4.0
[22] cli_3.6.5 rlang_1.1.7 bit64_4.6.0-1
[25] plotrix_3.8-14 gsubfn_0.7 cachem_1.1.0
[28] withr_3.0.2 yaml_2.3.12 otel_0.2.0
[31] proto_1.0.0 tools_4.5.1 memoise_2.0.1
[34] hash_2.2.6.4 vctrs_0.7.1 R6_2.6.1
[37] png_0.1-8 lifecycle_1.0.5 stringfish_0.17.0
[40] bit_4.6.0 htmlwidgets_1.6.4 pkgconfig_2.0.3
[43] RcppParallel_5.1.11-1 pillar_1.11.1 gtable_0.3.6
[46] glue_1.8.0 Rcpp_1.1.1 xfun_0.56
[49] tibble_3.3.1 tidyselect_1.2.1 knitr_1.51
[52] farver_2.1.2 sqldf_0.4-12 htmltools_0.5.9
[55] rmarkdown_2.30 compiler_4.5.1 S7_0.2.1
[58] chron_2.3-62