7. Putative cell-cell interactions

==================================================================

Cell-cell interactions for proteomics data

Author: Carlo Zanetti

This script aims to identify and statistically validate potential intercellular signalling pathways occurring between astrocytes in different plaques states compared to WT.

Workflow overview

  1. OmnipathR package retrieves a database of human ligand-receptor interactions.
  2. Import DE results (derived from MSstats)
  3. Filter for cases were both the ligand and receptor are statistically significant (p < 0.05) and upregulated (either logFC > 0, or stricter 0.5, within the same experimental comparison).
  4. To ensure interactions aren’t random occurrences, 1000 permutations are performed, shuffling the protein names to create a null distribution of random interactions. We can then compare the observed interaction counts to this distribution and derive an empirical p-value.
  5. Interactions are ranked and filtered by both the number of supporting research papers and combined magnitude of upregulation (logFC_Ligand + logFC_Receptor).

==================================================================

Code
suppressPackageStartupMessages({
library(OmnipathR)
library(dplyr)
library(tidyr)
library(data.table)
library(qs2)
library(ggplot2)
library(igraph)
library(ggraph)
library(patchwork)
})

Set directories

Code
base_dir <- "/nemo/lab/destrooperb/home/shared/zanettc/millie_proteomics"
run_num <- "run5"

results_dir <- file.path(base_dir, "results", run_num)
objects_dir <- file.path(base_dir, "data", "processed", run_num)
raw_dir <- file.path(base_dir, "data", "raw")

dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(objects_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(raw_dir, recursive = TRUE, showWarnings = FALSE)

Retrieve intercellular communication database

Code
lr_network <- intercell_network(
  transmitter_param = list(categories = c('ligand')),
  receiver_param = list(categories = c('receptor')),
  resources = NULL,
  references_by_resource = TRUE)

lr_with_evidence <- lr_network %>%
    dplyr::select(
        Ligand = source_genesymbol, 
        Receptor = target_genesymbol, 
        sources, 
        references
    ) %>%
    distinct() %>%
    # Count how many databases confirm this interaction
    mutate(curation_count = nchar(as.character(sources)) - nchar(gsub(";", "", as.character(sources))) + 1) %>%
    # Count how many individual PMIDs (papers) support it
    mutate(paper_count = nchar(as.character(references)) - nchar(gsub(";", "", as.character(references))) + 1) %>%
    # Replace NAs with 0
    mutate(across(c(curation_count, paper_count), ~replace_na(.x, 0)))

Read in MSstats results

Code
res_df <- fread(file.path(results_dir, "Full_GroupComparison_Results.csv"))

clean_spec_raw <- fread(file.path(base_dir, "data/processed/run1/clean_spec_raw.csv"))

protein_dictionary <- clean_spec_raw %>%
  dplyr::select(Protein = PG.ProteinGroups, 
         Gene = PG.Genes, 
         Description = PG.ProteinDescriptions) %>%
  distinct()

msstats_results <- res_df %>%
  # handle NAs
  filter(Label %in% c("PlaqueFar_vs_Control", "PlaqueNear_vs_Control")) %>%
  filter(!is.na(adj.pvalue), is.finite(log2FC)) %>%
  
  # Join with dictionary
  left_join(protein_dictionary, by = "Protein") %>%
  
  # Clean up Gene names and define significance
  mutate(
    # Fallback to Protein ID if Gene name is missing or empty
    #Change to upper case to match with human gene symbols
    Gene = toupper(ifelse(is.na(Gene) | Gene == "", Protein, Gene)),
  )

Create ligand and receptor dataframes

Code
#Ligands data
ligands_data <- msstats_results %>%
  dplyr::select(Label, Ligand = Gene, logFC_Ligand = log2FC, pval_Ligand = adj.pvalue)

#Receptor data
receptors_data <- msstats_results %>%
  dplyr::select(Label, Receptor = Gene, logFC_Receptor = log2FC, pval_Receptor = adj.pvalue)

Multi-comparison merge

Join network with ligands, then join with receptors matching both the receptor name and the label. Ensures evaluating ligand X and receptor Y within the same MSstats comparison.

Code
all_cross_talk <- lr_with_evidence %>%
  #keeps only rows that exist in both
  inner_join(ligands_data, by = "Ligand", relationship = "many-to-many") %>%
  #match both receptor name and the label (condition of interest) - keeps only rows where the recieving recedptor was also detected, in that specific condition
  inner_join(receptors_data, by = c("Receptor", "Label")) %>%
  # Filter to keep only rows where BOTH proteins were detected and statistically significant
  filter(pval_Ligand < 0.05 & pval_Receptor < 0.05) %>%
  arrange(desc(curation_count))
Warning in inner_join(., receptors_data, by = c("Receptor", "Label")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 2011 of `x` matches multiple rows in `y`.
ℹ Row 1931 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
Code
#4839 length when inner joining to ligand
#2135 length when inner joining to receptor and label too.
#147 length when filtering to sig for both. 

Extract co-upregulated pairs per comparison (logFC >0)

logFC filters set to 0 to be permissive.

Code
co_upregulated_all <- all_cross_talk %>%
  filter(logFC_Ligand > 0 & logFC_Receptor > 0) %>%
  group_by(Label) %>%
  arrange(desc(logFC_Ligand + logFC_Receptor)) %>%
  ungroup() %>%
  dplyr::select(Label, Ligand, Receptor, logFC_Ligand, logFC_Receptor, paper_count, references)

co_upregulated_all

Comparison of number of interactions for each comparison

Code
interaction_summary <- co_upregulated_all %>%
  count(Label, name = "Observed_Interactions")
interaction_summary

Permutation test (logFC > 0)

Code
set.seed(42)
n_perms <- 1000
results_list <- vector("list", n_perms)

print(paste("Starting", n_perms, "permutations..."))
[1] "Starting 1000 permutations..."
Code
for (i in 1:n_perms) {
  
  if (i %% 100 == 0) {
    message("Completed ", i, " of ", n_perms, " permutations...")
  }
  
  # Shuffle Gene names to break biological links but keep data structure
  shuffled_msstats <- msstats_results %>%
    group_by(Label) %>%
    mutate(Gene = sample(Gene)) %>% 
    ungroup()
  
  # Rebuild fake ligand/receptor tables
  shuffled_ligands <- shuffled_msstats %>%
    dplyr::select(Label, Ligand = Gene, logFC_Ligand = log2FC, pval_Ligand = adj.pvalue)
  
  shuffled_receptors <- shuffled_msstats %>%
    dplyr::select(Label, Receptor = Gene, logFC_Receptor = log2FC, pval_Receptor = adj.pvalue)
  
  # Re-run the network merge and filtering with fake data
  shuffled_cross_talk <- lr_with_evidence %>%
    inner_join(shuffled_ligands, by = "Ligand", relationship = "many-to-many") %>%
    inner_join(shuffled_receptors, by = c("Receptor", "Label"), relationship = "many-to-many")%>%
    filter(pval_Ligand < 0.05, pval_Receptor < 0.05, 
           logFC_Ligand > 0, logFC_Receptor > 0)
  
  # Count hits
  shuffled_summary <- shuffled_cross_talk %>%
    count(Label, name = "Simulated_Interactions") %>%
    mutate(Permutation = i)
  
  results_list[[i]] <- shuffled_summary
}
Completed 100 of 1000 permutations...
Completed 200 of 1000 permutations...
Completed 300 of 1000 permutations...
Completed 400 of 1000 permutations...
Completed 500 of 1000 permutations...
Completed 600 of 1000 permutations...
Completed 700 of 1000 permutations...
Completed 800 of 1000 permutations...
Completed 900 of 1000 permutations...
Completed 1000 of 1000 permutations...
Code
# Combine simulated results
null_distribution_df <- bind_rows(results_list)

# Ensure cases where 0 interactions were found are explicitly tracked
all_labels <- unique(msstats_results$Label)
grid <- expand_grid(Label = all_labels, Permutation = 1:n_perms)
null_distribution_df <- grid %>%
  left_join(null_distribution_df, by = c("Label", "Permutation")) %>%
  mutate(Simulated_Interactions = replace_na(Simulated_Interactions, 0))

# Calculate Empirical P-values
empirical_results <- interaction_summary %>%
  left_join(null_distribution_df, by = "Label", relationship = "many-to-many") %>%
  group_by(Label, Observed_Interactions) %>%
  summarise(
    Expected_Mean = mean(Simulated_Interactions),
    Permutations_Greater_or_Equal = sum(Simulated_Interactions >= Observed_Interactions),
    Empirical_P_Value = sum(Simulated_Interactions >= Observed_Interactions) / n_perms,
    .groups = "drop"
  )
print(empirical_results)
# A tibble: 2 × 5
  Label               Observed_Interactions Expected_Mean Permutations_Greater…¹
  <chr>                               <int>         <dbl>                  <int>
1 PlaqueFar_vs_Contr…                    69          88.3                    794
2 PlaqueNear_vs_Cont…                    77          43.5                     17
# ℹ abbreviated name: ¹​Permutations_Greater_or_Equal
# ℹ 1 more variable: Empirical_P_Value <dbl>

Visualize the Permutation Results

Code
perm_plot <- ggplot(null_distribution_df, aes(x = Simulated_Interactions)) +
  geom_histogram(binwidth = 1, fill = "lightgray", color = "black", alpha = 0.7) +
  geom_vline(data = interaction_summary, aes(xintercept = Observed_Interactions), 
             color = "red", linetype = "dashed", size = 1.2) +
  facet_wrap(~Label, scales = "free_x") +
  theme_minimal() +
  labs(
    title = "Permutation Test: Observed vs. Random Co-upregulated L-R Pairs",
    subtitle = "Red line = Actual observed pairs. Histogram = What random chance produced.",
    x = "Number of Significant Co-upregulated Pairs",
    y = "Frequency (out of 1000 permutations)"
  )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Code
print(perm_plot)

Extract co-upregulated pairs per comparison (logFC >0.5)

Sorted by decreasing paper count

Code
co_upregulated_all <- all_cross_talk %>%
  filter(logFC_Ligand > 0.5 & logFC_Receptor > 0.5) %>%
  group_by(Label) %>%
  arrange(desc(paper_count)) %>%
  ungroup() %>%
  dplyr::select(Label, Ligand, Receptor, logFC_Ligand, logFC_Receptor, paper_count, references)

co_upregulated_all

Sorted by descending LogFC_Ligand + logFC_Receptor:

Code
co_upregulated_all <- all_cross_talk %>%
  filter(logFC_Ligand > 0.5 & logFC_Receptor > 0.5) %>%
  group_by(Label) %>%
  arrange(desc(logFC_Ligand + logFC_Receptor)) %>%
  ungroup() %>%
  dplyr::select(Label, Ligand, Receptor, logFC_Ligand, logFC_Receptor, paper_count, references)

co_upregulated_all

Comparison of number of interactions for each comparison

Code
interaction_summary <- co_upregulated_all %>%
  count(Label, name = "Observed_Interactions")
interaction_summary

Permutation test (logFC > 0.5)

Code
set.seed(42)
n_perms <- 1000
results_list <- vector("list", n_perms)

print(paste("Starting", n_perms, "permutations..."))
[1] "Starting 1000 permutations..."
Code
for (i in 1:n_perms) {
  
  if (i %% 100 == 0) {
    message("Completed ", i, " of ", n_perms, " permutations...")
  }
  
  # Shuffle Gene names to break biological links but keep data structure
  shuffled_msstats <- msstats_results %>%
    group_by(Label) %>%
    mutate(Gene = sample(Gene)) %>% 
    ungroup()
  
  # Rebuild fake ligand/receptor tables
  shuffled_ligands <- shuffled_msstats %>%
    dplyr::select(Label, Ligand = Gene, logFC_Ligand = log2FC, pval_Ligand = adj.pvalue)
  
  shuffled_receptors <- shuffled_msstats %>%
    dplyr::select(Label, Receptor = Gene, logFC_Receptor = log2FC, pval_Receptor = adj.pvalue)
  
  # Re-run the network merge and filtering with fake data
  shuffled_cross_talk <- lr_with_evidence %>%
    inner_join(shuffled_ligands, by = "Ligand", relationship = "many-to-many") %>%
    inner_join(shuffled_receptors, by = c("Receptor", "Label"), relationship = "many-to-many")%>%
    filter(pval_Ligand < 0.05, pval_Receptor < 0.05, 
           logFC_Ligand > 0.5, logFC_Receptor > 0.5)
  
  # Count hits
  shuffled_summary <- shuffled_cross_talk %>%
    count(Label, name = "Simulated_Interactions") %>%
    mutate(Permutation = i)
  
  results_list[[i]] <- shuffled_summary
}
Completed 100 of 1000 permutations...
Completed 200 of 1000 permutations...
Completed 300 of 1000 permutations...
Completed 400 of 1000 permutations...
Completed 500 of 1000 permutations...
Completed 600 of 1000 permutations...
Completed 700 of 1000 permutations...
Completed 800 of 1000 permutations...
Completed 900 of 1000 permutations...
Completed 1000 of 1000 permutations...
Code
# Combine simulated results
null_distribution_df <- bind_rows(results_list)

# Ensure cases where 0 interactions were found are explicitly tracked
all_labels <- unique(msstats_results$Label)
grid <- expand_grid(Label = all_labels, Permutation = 1:n_perms)
null_distribution_df <- grid %>%
  left_join(null_distribution_df, by = c("Label", "Permutation")) %>%
  mutate(Simulated_Interactions = replace_na(Simulated_Interactions, 0))

# Calculate Empirical P-values
empirical_results <- interaction_summary %>%
  left_join(null_distribution_df, by = "Label", relationship = "many-to-many") %>%
  group_by(Label, Observed_Interactions) %>%
  summarise(
    Expected_Mean = mean(Simulated_Interactions),
    Permutations_Greater_or_Equal = sum(Simulated_Interactions >= Observed_Interactions),
    Empirical_P_Value = sum(Simulated_Interactions >= Observed_Interactions) / n_perms,
    .groups = "drop"
  )
print(empirical_results)
# A tibble: 2 × 5
  Label               Observed_Interactions Expected_Mean Permutations_Greater…¹
  <chr>                               <int>         <dbl>                  <int>
1 PlaqueFar_vs_Contr…                     2          1.94                    482
2 PlaqueNear_vs_Cont…                    13          4.85                     45
# ℹ abbreviated name: ¹​Permutations_Greater_or_Equal
# ℹ 1 more variable: Empirical_P_Value <dbl>

Visualize the Permutation Results

Code
perm_plot <- ggplot(null_distribution_df, aes(x = Simulated_Interactions)) +
  geom_histogram(binwidth = 1, fill = "lightgray", color = "black", alpha = 0.7) +
  geom_vline(data = interaction_summary, aes(xintercept = Observed_Interactions), 
             color = "red", linetype = "dashed", size = 1.2) +
  facet_wrap(~Label, scales = "free_x") +
  theme_minimal() +
  labs(
    title = "Permutation Test: Observed vs. Random Co-upregulated L-R Pairs",
    subtitle = "Red line = Actual observed pairs. Histogram = What random chance produced.",
    x = "Number of Significant Co-upregulated Pairs",
    y = "Frequency (out of 1000 permutations)"
  )

print(perm_plot)

Code
# ==============================================================================
# Ligand-Receptor Interaction Heatmap (logFC > 0.5) - NEAR VS CONTROL ONLY
# Uses all_cross_talk object
# ==============================================================================


# ------------------------------------------------------------------------------
# 1. Prepare data — logFC > 0.5 co-upregulated pairs (PlaqueNear only)
# ------------------------------------------------------------------------------

lr_heatmap_data <- all_cross_talk %>%
  # Isolate only the PlaqueNear_vs_Control condition
  filter(Label == "PlaqueNear_vs_Control") %>%
  filter(logFC_Ligand > 0.5 & logFC_Receptor > 0.5) %>%
  mutate(
    combined_logFC = logFC_Ligand + logFC_Receptor,
    comparison = "Near vs Ctrl", 
    # Create a pair label for ordering
    pair = paste0(Ligand, " \u2192 ", Receptor)
  ) %>%
  dplyr::select(pair, Ligand, Receptor, comparison, 
                combined_logFC, logFC_Ligand, logFC_Receptor, paper_count)

# ------------------------------------------------------------------------------
# 2. Dot plot heatmap — pair x comparison
# ------------------------------------------------------------------------------

# Order pairs by combined logFC (descending)
pair_order <- lr_heatmap_data %>%
  group_by(pair) %>%
  summarise(max_fc = max(combined_logFC), .groups = "drop") %>%
  arrange(desc(max_fc)) %>%
  pull(pair)

lr_heatmap_data$pair <- factor(lr_heatmap_data$pair, levels = rev(pair_order))

p_dot <- ggplot(lr_heatmap_data, 
                aes(x = comparison, y = pair)) +
  geom_point(
    aes(size = paper_count + 1, fill = combined_logFC),
    shape = 21, colour = "black", stroke = 0.4
  ) +
  # Add logFC text inside/beside each dot
  geom_text(
    aes(label = sprintf("%.2f", combined_logFC)),
    size = 2.8, colour = "grey30", nudge_x = 0.3
  ) +
  scale_fill_gradient2(
    low = "lightyellow", mid = "#F0997B", high = "firebrick3",
    midpoint = median(lr_heatmap_data$combined_logFC),
    name = "Combined\nlog2FC"
  ) +
  scale_size_continuous(
    range = c(3, 12),
    name = "Paper\nevidence",
    breaks = c(1, 3, 7, 13),
    labels = c("0", "2", "6", "12")
  ) +
  labs(
    title = "Co-upregulated Ligand-Receptor Pairs",
    subtitle = "Near vs Control | Both log2FC > 0.5, adj.p < 0.05 | Dot size = supporting publications",
    x = NULL, y = NULL
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 10, colour = "grey40", margin = margin(b = 15)),
    axis.text.y = element_text(size = 10, face = "bold"),
    axis.text.x = element_text(size = 12, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "right",
    plot.margin = margin(20, 20, 20, 10)
  )

print(p_dot)

Code
ggsave(
  filename = file.path(results_dir, "LR_dotplot_Near_logFC05.pdf"),
  plot = p_dot,
  width = 8, height = 8, dpi = 300 # Slightly narrowed width for single column
)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'NRXN3 → ADGRL1' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'NRXN3 → LRRTM1' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'NRXN3 → CLSTN1' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'CNTN4 → APP' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'NRXN3 → CLSTN3' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'CNTN4 → APLP1' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'VGF → STAT3' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'PTN → PTPRS' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'PAM → FXYD1' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'C1QTNF4 → STAT3' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'APOE → LRP1' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'APOE → APP' in 'mbcsToSbcs': -> substituted for → (U+2192)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'APLP2 → CD14' in 'mbcsToSbcs': -> substituted for → (U+2192)
Code
# ------------------------------------------------------------------------------
# 3. Tile heatmap — ligand x receptor matrix
# ------------------------------------------------------------------------------

# Create a complete grid for the axes
all_ligands <- sort(unique(lr_heatmap_data$Ligand))
all_receptors <- sort(unique(lr_heatmap_data$Receptor))

tile_data <- lr_heatmap_data %>%
  dplyr::select(Ligand, Receptor, combined_logFC, paper_count) %>%
  # Dropped "comparison" from complete() since there is only one
  complete(Ligand = all_ligands, Receptor = all_receptors, 
           fill = list(combined_logFC = NA, paper_count = NA))

p_tile <- ggplot(tile_data, aes(x = Receptor, y = Ligand)) +
  geom_tile(
    aes(fill = combined_logFC),
    colour = "white", linewidth = 0.8
  ) +
  # Paper count as text overlay
  geom_text(
    aes(label = ifelse(!is.na(paper_count) & paper_count > 0, paper_count, "")),
    size = 3.5, fontface = "bold", colour = "white"
  ) +
  scale_fill_gradient(
    low = "#F5C4B3", high = "firebrick3",
    na.value = "grey95",
    name = "Combined\nlog2FC"
  ) +
  # Removed facet_wrap since we only have one condition to plot
  labs(
    title = "Ligand-Receptor Interaction Heatmap (Near vs Control)",
    subtitle = "log2FC > 0.5, adj.p < 0.05 | Numbers = supporting publications",
    x = "Receptor", y = "Ligand"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 10, colour = "grey40", margin = margin(b = 15)),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9, face = "bold"),
    axis.text.y = element_text(size = 10, face = "bold"),
    axis.title = element_text(face = "bold"),
    panel.grid = element_blank(),
    legend.position = "right",
    plot.margin = margin(20, 20, 20, 10)
  )

print(p_tile)

Code
ggsave(
  filename = file.path(results_dir, "LR_heatmap_tile_Near_logFC05.pdf"),
  plot = p_tile,
  width = 9, height = 7, dpi = 300 # Scaled down width for a single heatmap
)