9. Zemke 2026 Tripartite Astrocyte Signature

This notebook evaluates whether the LCM proteomics data contains an enriched tripartite astrocyte signature, using the 18-gene module from Zemke et al. (2026). The same genes were used in that study with Seurat’s AddModuleScore to identify putative tripartite astrocyte clusters. Here we ask whether these proteins are detected, whether their aggregate abundance is higher than expected by chance, and how the sample cell-type composition varies across conditions.


Libraries

Code
suppressPackageStartupMessages({
  library(data.table)
  library(qs2)
  library(dplyr)
  library(tidyr)
  library(tibble)
  library(ggplot2)
  library(ggrepel)
  library(stringr)
  library(glue)
})

Directories & gene set

Code
base_dir    <- "/nemo/lab/destrooperb/home/shared/zanettc/millie_proteomics"
run_num     <- "run5"
results_dir <- file.path(base_dir, "results", run_num)
objects_dir <- file.path(base_dir, "data", "processed", run_num)

dir.create(file.path(results_dir, "Tripartite_Astrocyte"), recursive = TRUE, showWarnings = FALSE)
out_dir <- file.path(results_dir, "Tripartite_Astrocyte")

# Zemke et al. 2026 — 18 tripartite synaptic genes as reported (human HGNC symbols)
zemke_genes_human <- c(
  "NRXN1", "NLGN3", "ITGB1", "ITGAV", "CDH2",
  "CADM3", "CADM1", "NCAM1", "TGFB2", "ATP1B2",
  "EPHA4", "GJB6",  "BCAN",  "CSPG5", "SPARCL1",
  "SDC2",  "SDC4",  "GPC1"
)

# Convert to mouse MGI symbols (title case; all 18 have verified 1:1 mouse orthologs
# with identical names differing only in capitalisation — confirmed via Ensembl BioMart)
zemke_genes_mouse <- paste0(
  toupper(substr(zemke_genes_human, 1, 1)),
  tolower(substr(zemke_genes_human, 2, nchar(zemke_genes_human)))
)

# Uppercase versions used for case-insensitive matching against the proteomics dictionary
zemke_genes <- toupper(zemke_genes_mouse)

cat("Mouse gene symbols:\n", paste(zemke_genes_mouse, collapse = ", "), "\n")
Mouse gene symbols:
 Nrxn1, Nlgn3, Itgb1, Itgav, Cdh2, Cadm3, Cadm1, Ncam1, Tgfb2, Atp1b2, Epha4, Gjb6, Bcan, Cspg5, Sparcl1, Sdc2, Sdc4, Gpc1 
Code
# Condition display order and colours (consistent with the rest of the project)
cond_levels <- c("control", "plaque_far", "plaque_near")
cond_labels <- c("Control", "Plaque Far", "Plaque Near")
cond_cols   <- c("control" = "#1b9e77", "plaque_far" = "#7570b3", "plaque_near" = "#d95f02")

Load data

Code
# Protein dictionary
protein_dict <- fread(file.path(base_dir, "data/processed/run1/clean_spec_raw.csv")) %>%
  dplyr::select(Protein = PG.ProteinGroups,
                Gene    = PG.Genes,
                Description = PG.ProteinDescriptions) %>%
  distinct()

# MSstats summarised protein-level abundances (ProteinLevelData)
processed_data <- qs_read(file.path(objects_dir, "processed_msstats_data.qs2"))

1. Detection check

How many of the 18 Zemke genes are quantified in the proteome?

Code
# All detected genes (uppercase)
detected_genes <- protein_dict %>%
  filter(!is.na(Gene) & Gene != "") %>%
  mutate(Gene_upper = toupper(Gene)) %>%
  pull(Gene_upper) %>%
  unique()

detected_zemke <- zemke_genes[zemke_genes %in% detected_genes]
missing_zemke  <- zemke_genes[!zemke_genes %in% detected_genes]

cat(glue("Detected : {length(detected_zemke)} / {length(zemke_genes)}\n"))
Detected : 16 / 18
Code
cat("Detected genes :", paste(detected_zemke, collapse = ", "), "\n")
Detected genes : NRXN1, NLGN3, ITGB1, ITGAV, CDH2, CADM3, CADM1, NCAM1, ATP1B2, EPHA4, GJB6, BCAN, CSPG5, SPARCL1, SDC4, GPC1 
Code
cat("Missing genes  :", paste(missing_zemke,  collapse = ", "), "\n")
Missing genes  : TGFB2, SDC2 
Code
det_df <- data.frame(
  Gene    = zemke_genes,
  Status  = ifelse(zemke_genes %in% detected_genes, "Detected", "Not detected")
)

ggplot(det_df, aes(x = reorder(Gene, Status == "Detected"), fill = Status)) +
  geom_bar(width = 0.7) +
  scale_fill_manual(values = c("Detected" = "#2c7bb6", "Not detected" = "grey70")) +
  labs(title = "Zemke 2026 Tripartite Astrocyte Genes — Detection in Proteome",
       x = NULL, y = "Count", fill = NULL) +
  theme_bw(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
        plot.title  = element_text(hjust = 0.5, face = "bold"),
        legend.position = "top")

Code
ggsave(file.path(out_dir, "Detection_Summary.pdf"), width = 8, height = 4)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'Zemke 2026 Tripartite Astrocyte Genes — Detection in Proteome' in
'mbcsToSbcs': - substituted for — (U+2014)

2. Individual protein abundance per condition

Boxplots for each detected tripartite gene, coloured by Condition (as in notebook 01b).

Code
# Build long-format abundance data for detected tripartite genes
tripartite_long <- processed_data$ProteinLevelData %>%
  rename(Condition = GROUP) %>%
  left_join(protein_dict, by = "Protein") %>%
  filter(!is.na(Gene) & Gene != "") %>%
  mutate(Gene_upper = toupper(Gene)) %>%
  filter(Gene_upper %in% detected_zemke) %>%
  mutate(Gene_upper = factor(Gene_upper, levels = sort(detected_zemke))) %>%
  mutate(Condition  = factor(Condition, levels = cond_levels, labels = cond_labels))

p_boxes <- ggplot(tripartite_long,
                  aes(x = Condition, y = LogIntensities, fill = Condition)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.6) +
  geom_jitter(width = 0.2, size = 1.2, alpha = 0.6) +
  facet_wrap(~ Gene_upper, scales = "free_y") +
  scale_fill_manual(values = c(
    "Control"     = cond_cols["control"],
    "Plaque Far"  = cond_cols["plaque_far"],
    "Plaque Near" = cond_cols["plaque_near"]
  )) +
  labs(title = "Tripartite Astrocyte Proteins — Abundance per Condition",
       x = NULL, y = "Log2 Intensity", fill = "Condition") +
  theme_bw(base_size = 12) +
  theme(axis.text.x  = element_text(angle = 45, hjust = 1),
        strip.text   = element_text(face = "bold"),
        plot.title   = element_text(hjust = 0.5, face = "bold"),
        legend.position = "top")

print(p_boxes)
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's fill values.

Code
dyn_h <- 3 * ceiling(length(detected_zemke) / 4)
ggsave(file.path(out_dir, "Individual_Protein_Boxplots.pdf"),
       plot = p_boxes, width = 11, height = dyn_h, limitsize = FALSE)
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'Tripartite Astrocyte Proteins — Abundance per Condition' in 'mbcsToSbcs':
- substituted for — (U+2014)

3. Aggregate module score per sample

Average log-intensity of detected tripartite genes per run, compared across conditions. Runs with fewer than 3 detected genes are excluded.

Code
module_scores <- processed_data$ProteinLevelData %>%
  rename(Condition = GROUP) %>%
  left_join(protein_dict, by = "Protein") %>%
  filter(!is.na(Gene) & Gene != "") %>%
  mutate(Gene_upper = toupper(Gene)) %>%
  filter(Gene_upper %in% detected_zemke) %>%
  group_by(originalRUN, Condition) %>%
  summarise(
    module_score  = mean(LogIntensities, na.rm = TRUE),
    n_genes_used  = sum(!is.na(LogIntensities)),
    .groups = "drop"
  ) %>%
  filter(n_genes_used >= 3) %>%
  mutate(Condition = factor(Condition, levels = cond_levels, labels = cond_labels))

module_scores
Code
p_module <- ggplot(module_scores, aes(x = Condition, y = module_score, fill = Condition)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.6, width = 0.5) +
  geom_jitter(aes(color = Condition), width = 0.15, size = 2, alpha = 0.8) +
  scale_fill_manual(values  = c("Control"     = cond_cols["control"],
                                "Plaque Far"  = cond_cols["plaque_far"],
                                "Plaque Near" = cond_cols["plaque_near"])) +
  scale_color_manual(values = c("Control"     = cond_cols["control"],
                                "Plaque Far"  = cond_cols["plaque_far"],
                                "Plaque Near" = cond_cols["plaque_near"])) +
  labs(title    = "Zemke Tripartite Astrocyte Module Score",
       subtitle = glue("Mean log2 intensity of {length(detected_zemke)} detected genes"),
       x = NULL, y = "Module Score (mean log2 intensity)") +
  theme_bw(base_size = 14) +
  theme(legend.position = "none",
        plot.title    = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, color = "grey40"),
        axis.text.x   = element_text(face = "bold"))

print(p_module)
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.

Code
ggsave(file.path(out_dir, "Module_Score_Boxplot.pdf"), plot = p_module, width = 5, height = 5)
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's colour values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.

4. Permutation test: are tripartite genes more abundant than expected by chance?

Tests whether the tripartite gene set has higher absolute protein levels than would be expected if you picked the same number of proteins at random from the full proteome. For each condition the observed mean log-intensity of the detected tripartite genes is compared to a null distribution from 10,000 random draws of equal size. A complementary two-sample Wilcoxon rank-sum test (tripartite vs. all other proteins) is also reported.

Code
set.seed(42)
n_perm <- 10000

# Condition-level mean abundance per protein (average across runs within condition)
protein_means_per_cond <- processed_data$ProteinLevelData %>%
  rename(Condition = GROUP) %>%
  left_join(protein_dict, by = "Protein") %>%
  filter(!is.na(Gene) & Gene != "") %>%
  mutate(Gene_upper = toupper(Gene)) %>%
  group_by(Condition, Gene_upper) %>%
  summarise(mean_abundance = mean(LogIntensities, na.rm = TRUE), .groups = "drop") %>%
  filter(!is.nan(mean_abundance))

perm_results_abund <- purrr::map_dfr(cond_levels, function(cond) {

  cond_data <- protein_means_per_cond %>% filter(Condition == cond)
  all_abund  <- cond_data$mean_abundance
  gene_names <- cond_data$Gene_upper

  detected_in_cond <- intersect(detected_zemke, gene_names)
  n_detected        <- length(detected_in_cond)

  if (n_detected < 3) {
    message(glue("Skipping {cond}: fewer than 3 tripartite genes detected"))
    return(NULL)
  }

  observed_mean <- mean(all_abund[gene_names %in% detected_in_cond], na.rm = TRUE)

  null_dist <- replicate(n_perm, mean(sample(all_abund, n_detected, replace = FALSE)))
  emp_pval  <- mean(null_dist >= observed_mean)

  # Two-sample Wilcoxon: tripartite vs. rest of proteome
  tripartite_abund <- all_abund[gene_names %in% detected_in_cond]
  rest_abund       <- all_abund[!(gene_names %in% detected_in_cond)]
  wx <- wilcox.test(tripartite_abund, rest_abund, alternative = "greater")

  tibble(
    Condition     = cond,
    n_genes       = n_detected,
    observed_mean = observed_mean,
    null_mean     = mean(null_dist),
    null_sd       = sd(null_dist),
    perm_pval     = emp_pval,
    wilcox_pval   = wx$p.value,
    null_dist     = list(null_dist)
  )
})

perm_results_abund %>%
  dplyr::select(-null_dist) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))
Code
plot_data_abund <- perm_results_abund %>%
  tidyr::unnest(null_dist) %>%
  mutate(Condition = factor(Condition, levels = cond_levels, labels = cond_labels))

obs_data_abund <- perm_results_abund %>%
  mutate(Condition = factor(Condition, levels = cond_levels, labels = cond_labels))

p_perm_abund <- ggplot(plot_data_abund, aes(x = null_dist, fill = Condition)) +
  geom_histogram(bins = 60, alpha = 0.7, color = NA) +
  geom_vline(data = obs_data_abund,
             aes(xintercept = observed_mean, color = Condition),
             linewidth = 1.2, linetype = "dashed") +
  geom_text(data = obs_data_abund,
            aes(x = observed_mean, y = Inf,
                label = glue("p = {signif(perm_pval, 2)}")),
            vjust = 2, hjust = -0.1, size = 4, fontface = "bold") +
  facet_wrap(~ Condition, scales = "free") +
  scale_fill_manual(values  = c("Control"     = cond_cols["control"],
                                "Plaque Far"  = cond_cols["plaque_far"],
                                "Plaque Near" = cond_cols["plaque_near"])) +
  scale_color_manual(values = c("Control"     = cond_cols["control"],
                                "Plaque Far"  = cond_cols["plaque_far"],
                                "Plaque Near" = cond_cols["plaque_near"])) +
  labs(title    = "5a. Permutation Test — Absolute Abundance vs. Null",
       subtitle = glue("Dashed line = observed mean; null = {n_perm} random draws of equal size"),
       x = "Mean log2 intensity (random gene sets)", y = "Count") +
  theme_bw(base_size = 13) +
  theme(legend.position = "none",
        plot.title    = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, color = "grey40"),
        strip.text    = element_text(face = "bold"))

print(p_perm_abund)
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.

Code
ggsave(file.path(out_dir, "5a_Permutation_Absolute_Abundance.pdf"),
       plot = p_perm_abund, width = 10, height = 4)
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for '5a. Permutation Test — Absolute Abundance vs. Null' in 'mbcsToSbcs': -
substituted for — (U+2014)
Code
perm_results_abund %>%
  dplyr::select(-null_dist) %>%
  mutate(
    Condition       = factor(Condition, levels = cond_levels, labels = cond_labels),
    perm_pval_adj   = p.adjust(perm_pval,  method = "BH"),
    wilcox_pval_adj = p.adjust(wilcox_pval, method = "BH")
  ) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

6. Cell-type identity validation: astrocyte vs. non-astrocyte marker enrichment

To validate that the LCM samples are genuinely astrocyte-enriched, the same permutation test from section 5a is run across five cell-type marker sets simultaneously. The prediction is that astrocyte marker sets (canonical astrocyte + Zemke tripartite) should have significantly higher-than-chance abundance in the proteome, while neuronal, microglial, and oligodendrocyte marker sets should not. A significant result for non-astrocyte sets would indicate contamination from those cell types.

Marker sources: - Canonical astrocyte: Sharma et al. 2015 (Nat Neurosci 18:1819) — SILAC-MS cell-type resolved mouse brain proteome; top proteins enriched >2-fold in astrocytes over neurons, oligodendrocytes and microglia. This is the primary proteomic reference for this context. Boisvert et al. 2018 (Cell Reports 22:1964) — Ribo-Tag MS from mouse astrocytes in vivo (translated proteome) provides additional validation for the core set. - Neurons, microglia, oligodendrocytes: Sharma et al. 2015 (same study); consensus from Zeisel et al. 2018 and standard immunomarker literature

Code
cell_type_markers <- list(
  "Zemke_Tripartite_Astrocyte" = toupper(zemke_genes_mouse),

  # Sharma et al. 2015 (Nat Neurosci 18:1819) SILAC-MS; top astrocyte-enriched proteins
  # confirmed in Boisvert et al. 2018 (Cell Reports 22:1964) Ribo-Tag MS
  "Canonical_Astrocyte" = toupper(c(
    "Aldh1l1", "Gfap",   "Aqp4",   "Glul",   "Gja1",
    "Atp1a2",  "Mfge8",  "Ndrg2",  "Clu",    "Slc1a2",
    "Slc1a3",  "Kcnj10", "Slc4a4", "Aldoc",  "Ddah1",
    "Ezr",     "Vim",    "S100b",  "Apoe"
  )),

  "Neuron" = toupper(c(
    "Tubb3", "Snap25", "Stmn2", "Eno2", "Syn1",
    "Rbfox3", "Map2", "Nefh", "Nefl", "Nefm",
    "Syt1", "Sv2b", "Gpm6a", "Dlg4", "Camk2a", "Atp1b1"
  )),

  "Microglia" = toupper(c(
    "Aif1", "Csf1r", "Cx3cr1", "Tmem119", "P2ry12",
    "Trem2", "Tyrobp", "C1qa", "C1qb", "C1qc",
    "Hexb", "Cd68", "Itgam", "Siglech", "Fcrls"
  )),

  "Oligodendrocyte" = toupper(c(
    "Mbp", "Plp1", "Mog", "Cnp", "Mag",
    "Mobp", "Sox10", "Mal", "Cldn11", "Gjb1",
    "Ermn", "Opalin", "Fa2h", "Ndrg1", "Aspa"
  ))
)

# Colour palette for cell types
ct_cols <- c(
  "Zemke_Tripartite_Astrocyte" = "#d95f02",
  "Canonical_Astrocyte"        = "#1b9e77",
  "Neuron"                     = "#377eb8",
  "Microglia"                  = "#984ea3",
  "Oligodendrocyte"            = "#ff7f00"
)

Detection rates per cell type

Code
detection_summary <- purrr::map_dfr(names(cell_type_markers), function(ct) {
  genes    <- cell_type_markers[[ct]]
  detected <- sum(genes %in% detected_genes)
  tibble(
    Cell_type    = ct,
    n_in_set     = length(genes),
    n_detected   = detected,
    pct_detected = round(100 * detected / length(genes), 1)
  )
})

detection_summary

Permutation test across all cell types

Performed on the full proteome (all conditions pooled) to give the most powered estimate of whether each cell-type marker set is broadly enriched in these LCM samples.

Code
# Proteome-wide mean abundance per protein (pooled across all conditions)
protein_means_all <- processed_data$ProteinLevelData %>%
  left_join(protein_dict, by = "Protein") %>%
  filter(!is.na(Gene) & Gene != "") %>%
  mutate(Gene_upper = toupper(Gene)) %>%
  group_by(Gene_upper) %>%
  summarise(mean_abundance = mean(LogIntensities, na.rm = TRUE), .groups = "drop") %>%
  filter(!is.nan(mean_abundance))

all_abund_pooled <- protein_means_all$mean_abundance
all_genes_pooled <- protein_means_all$Gene_upper

set.seed(42)

ct_perm_results <- purrr::map_dfr(names(cell_type_markers), function(ct) {
  genes_in_set    <- cell_type_markers[[ct]]
  detected_in_set <- intersect(genes_in_set, all_genes_pooled)
  n_det           <- length(detected_in_set)

  if (n_det < 3) {
    message(glue("Skipping {ct}: fewer than 3 genes detected"))
    return(NULL)
  }

  observed_mean <- mean(all_abund_pooled[all_genes_pooled %in% detected_in_set])

  null_dist <- replicate(n_perm,
                         mean(sample(all_abund_pooled, n_det, replace = FALSE)))
  emp_pval  <- mean(null_dist >= observed_mean)

  # Effect size: how many SDs above the null mean (z-score equivalent)
  effect_size <- (observed_mean - mean(null_dist)) / sd(null_dist)

  # Two-sample Wilcoxon: this cell type vs. all other proteins
  target_abund <- all_abund_pooled[all_genes_pooled %in% detected_in_set]
  rest_abund   <- all_abund_pooled[!(all_genes_pooled %in% detected_in_set)]
  wx <- wilcox.test(target_abund, rest_abund, alternative = "greater")

  tibble(
    Cell_type     = ct,
    n_detected    = n_det,
    observed_mean = observed_mean,
    null_mean     = mean(null_dist),
    null_sd       = sd(null_dist),
    effect_size_z = effect_size,
    perm_pval     = emp_pval,
    wilcox_pval   = wx$p.value,
    null_dist     = list(null_dist)
  )
})

ct_perm_results %>%
  dplyr::select(-null_dist) %>%
  mutate(
    perm_pval_adj   = p.adjust(perm_pval,  method = "BH"),
    wilcox_pval_adj = p.adjust(wilcox_pval, method = "BH")
  ) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

Summary plot: enrichment z-score by cell type

The effect size (z-score) represents how many standard deviations above the permutation null the observed mean sits. Positive = enriched above chance; negative = depleted.

Code
ct_plot_df <- ct_perm_results %>%
  mutate(
    sig_label = case_when(
      perm_pval < 0.001 ~ "***",
      perm_pval < 0.01  ~ "**",
      perm_pval < 0.05  ~ "*",
      TRUE              ~ "ns"
    ),
    Cell_type = factor(Cell_type, levels = names(cell_type_markers)),
    is_astrocyte = Cell_type %in% c("Zemke_Tripartite_Astrocyte", "Canonical_Astrocyte")
  )

p_ct_summary <- ggplot(ct_plot_df,
                       aes(x = effect_size_z, y = reorder(Cell_type, effect_size_z),
                           fill = Cell_type)) +
  geom_col(width = 0.65, alpha = 0.85) +
  geom_text(aes(label = glue("{sig_label}  (n={n_detected})"),
                hjust = ifelse(effect_size_z >= 0, -0.1, 1.1)),
            size = 4.2, fontface = "bold") +
  geom_vline(xintercept = 0, linewidth = 0.6, color = "grey30") +
  scale_fill_manual(values = ct_cols, guide = "none") +
  scale_y_discrete(labels = function(x) str_replace_all(x, "_", " ")) +
  labs(
    title    = "LCM Sample Cell-Type Identity Validation",
    subtitle = "Enrichment z-score vs. proteome-wide null (higher = more enriched than chance)",
    x        = "Effect size (z-score above permutation null)",
    y        = NULL
  ) +
  theme_bw(base_size = 13) +
  theme(
    plot.title    = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, color = "grey40", size = 10),
    axis.text.y   = element_text(face = "bold")
  ) +
  xlim(c(min(ct_plot_df$effect_size_z) * 1.4,
         max(ct_plot_df$effect_size_z) * 1.6))

print(p_ct_summary)

Code
ggsave(file.path(out_dir, "6_CellType_Identity_Validation.pdf"),
       plot = p_ct_summary, width = 7, height = 5)

Null distribution plots per cell type

Code
null_plot_df <- ct_perm_results %>%
  tidyr::unnest(null_dist) %>%
  mutate(Cell_type = factor(Cell_type, levels = names(cell_type_markers)))

obs_plot_df <- ct_perm_results %>%
  mutate(Cell_type = factor(Cell_type, levels = names(cell_type_markers)))

p_null_all <- ggplot(null_plot_df, aes(x = null_dist, fill = Cell_type)) +
  geom_histogram(bins = 60, alpha = 0.75, color = NA) +
  geom_vline(data = obs_plot_df,
             aes(xintercept = observed_mean, color = Cell_type),
             linewidth = 1.2, linetype = "dashed") +
  geom_text(data = obs_plot_df,
            aes(x = observed_mean, y = Inf,
                label = glue("z={round(effect_size_z,1)}\np={signif(perm_pval,2)}")),
            vjust = 1.5, hjust = -0.1, size = 3.5, fontface = "bold") +
  facet_wrap(~ Cell_type, scales = "free", ncol = 3,
             labeller = labeller(Cell_type = function(x) str_replace_all(x, "_", " "))) +
  scale_fill_manual(values  = ct_cols, guide = "none") +
  scale_color_manual(values = ct_cols, guide = "none") +
  labs(title    = "Permutation Null Distributions — Cell Type Marker Sets",
       subtitle = glue("Dashed line = observed mean; null = {n_perm} random draws"),
       x = "Mean log2 intensity (random gene sets)", y = "Count") +
  theme_bw(base_size = 12) +
  theme(plot.title    = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, color = "grey40"),
        strip.text    = element_text(face = "bold"))

print(p_null_all)

Code
ggsave(file.path(out_dir, "6_CellType_NullDistributions.pdf"),
       plot = p_null_all, width = 12, height = 8)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'Permutation Null Distributions — Cell Type Marker Sets' in 'mbcsToSbcs': -
substituted for — (U+2014)

Permutation test split by condition

The pooled test above asks whether each cell-type marker set is enriched overall. Here the same permutation is run separately within each condition, allowing comparison of enrichment profiles across conditions. Microglia enrichment, for example, may be stronger in the plaque-near condition.

Code
set.seed(42)

ct_perm_by_cond <- purrr::map_dfr(cond_levels, function(cond) {

  cond_means <- processed_data$ProteinLevelData %>%
    rename(Condition = GROUP) %>%
    filter(Condition == cond) %>%
    left_join(protein_dict, by = "Protein") %>%
    filter(!is.na(Gene) & Gene != "") %>%
    mutate(Gene_upper = toupper(Gene)) %>%
    group_by(Gene_upper) %>%
    summarise(mean_abundance = mean(LogIntensities, na.rm = TRUE), .groups = "drop") %>%
    filter(!is.nan(mean_abundance))

  abund_vec <- cond_means$mean_abundance
  gene_vec  <- cond_means$Gene_upper

  purrr::map_dfr(names(cell_type_markers), function(ct) {
    genes_in_set    <- cell_type_markers[[ct]]
    detected_in_set <- intersect(genes_in_set, gene_vec)
    n_det           <- length(detected_in_set)

    if (n_det < 3) return(NULL)

    observed_mean <- mean(abund_vec[gene_vec %in% detected_in_set])
    null_dist     <- replicate(n_perm, mean(sample(abund_vec, n_det, replace = FALSE)))
    emp_pval      <- mean(null_dist >= observed_mean)
    effect_size   <- (observed_mean - mean(null_dist)) / sd(null_dist)

    tibble(
      Condition     = cond,
      Cell_type     = ct,
      n_detected    = n_det,
      observed_mean = observed_mean,
      null_mean     = mean(null_dist),
      effect_size_z = effect_size,
      perm_pval     = emp_pval
    )
  })
})

ct_perm_by_cond %>%
  mutate(
    Condition = factor(Condition, levels = cond_levels, labels = cond_labels),
    across(where(is.numeric), ~ round(.x, 4))
  )
Code
ct_by_cond_plot_df <- ct_perm_by_cond %>%
  mutate(
    sig_label = case_when(
      perm_pval < 0.001 ~ "***",
      perm_pval < 0.01  ~ "**",
      perm_pval < 0.05  ~ "*",
      TRUE              ~ "ns"
    ),
    Condition = factor(Condition, levels = cond_levels, labels = cond_labels),
    Cell_type = factor(Cell_type, levels = names(cell_type_markers))
  )

p_ct_by_cond <- ggplot(ct_by_cond_plot_df,
                       aes(x = effect_size_z, y = reorder(Cell_type, effect_size_z),
                           fill = Cell_type)) +
  geom_col(width = 0.65, alpha = 0.85) +
  geom_text(aes(label = glue("{sig_label}  (n={n_detected})"),
                hjust = ifelse(effect_size_z >= 0, -0.1, 1.1)),
            size = 3.8, fontface = "bold") +
  geom_vline(xintercept = 0, linewidth = 0.6, color = "grey30") +
  scale_fill_manual(values = ct_cols, guide = "none") +
  scale_y_discrete(labels = function(x) str_replace_all(x, "_", " ")) +
  facet_wrap(~ Condition, ncol = 3) +
  labs(
    title    = "Cell-Type Identity Validation — By Condition",
    subtitle = "Enrichment z-score vs. condition-specific null (higher = more enriched than chance)",
    x        = "Effect size (z-score above permutation null)",
    y        = NULL
  ) +
  theme_bw(base_size = 13) +
  theme(
    plot.title    = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, color = "grey40", size = 10),
    axis.text.y   = element_text(face = "bold"),
    strip.text    = element_text(face = "bold")
  )

print(p_ct_by_cond)

Code
ggsave(file.path(out_dir, "6_CellType_Identity_ByCondition.pdf"),
       plot = p_ct_by_cond, width = 14, height = 5)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'Cell-Type Identity Validation — By Condition' in 'mbcsToSbcs': -
substituted for — (U+2014)

7. Cellular composition across conditions

Section 6 showed enrichment of neuronal and oligodendrocyte markers alongside astrocyte markers. The following checks determine whether this co-capture varies with condition and whether astrocyte and non-astrocyte signals are correlated across samples.

7.1 Per-sample module scores — all cell types

Code
all_ct_scores <- purrr::map_dfr(names(cell_type_markers), function(ct) {
  processed_data$ProteinLevelData %>%
    rename(Condition = GROUP) %>%
    left_join(protein_dict, by = "Protein") %>%
    filter(!is.na(Gene), Gene != "") %>%
    mutate(Gene_upper = toupper(Gene)) %>%
    filter(Gene_upper %in% cell_type_markers[[ct]]) %>%
    group_by(originalRUN, Condition) %>%
    summarise(
      module_score = mean(LogIntensities, na.rm = TRUE),
      n_detected   = sum(!is.na(LogIntensities)),
      .groups = "drop"
    ) %>%
    mutate(Cell_type = ct)
}) %>%
  mutate(
    Condition = factor(Condition, levels = cond_levels, labels = cond_labels),
    Cell_type = factor(Cell_type, levels = names(cell_type_markers))
  )

7.2 Step 1 — condition-specificity of non-astrocyte signal

If neuronal or oligodendrocyte scores differ across conditions, co-capture is not uniform and will confound comparisons.

Code
p_ct_cond <- ggplot(all_ct_scores,
                    aes(x = Condition, y = module_score, fill = Condition)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.6, width = 0.5) +
  geom_jitter(aes(color = Condition, size = n_detected), width = 0.15, alpha = 0.7) +
  scale_fill_manual(values  = c("Control"     = cond_cols["control"],
                                "Plaque Far"  = cond_cols["plaque_far"],
                                "Plaque Near" = cond_cols["plaque_near"])) +
  scale_color_manual(values = c("Control"     = cond_cols["control"],
                                "Plaque Far"  = cond_cols["plaque_far"],
                                "Plaque Near" = cond_cols["plaque_near"])) +
  scale_size_continuous(range = c(1, 4), name = "n markers\ndetected") +
  facet_wrap(~ Cell_type, scales = "free_y", ncol = 3,
             labeller = labeller(Cell_type = function(x) str_replace_all(x, "_", " "))) +
  labs(title    = "Cell-Type Module Scores per Condition",
       subtitle = "Point size = number of marker genes detected in that run",
       x = NULL, y = "Mean log2 intensity") +
  theme_bw(base_size = 12) +
  theme(legend.position = "top",
        plot.title    = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, color = "grey40"),
        strip.text    = element_text(face = "bold"),
        axis.text.x   = element_text(angle = 30, hjust = 1))

print(p_ct_cond)
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.

Code
ggsave(file.path(out_dir, "7_AllCellType_Scores_ByCondition.pdf"),
       plot = p_ct_cond, width = 14, height = 8)
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's colour values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's fill values.
No shared levels found between `names(values)` of the manual scale and the
data's fill values.
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.

Pairwise Wilcoxon (BH-corrected) for each cell type across conditions:

Code
wilcox_ct_cond <- purrr::map_dfr(levels(all_ct_scores$Cell_type), function(ct) {
  sub <- filter(all_ct_scores, Cell_type == ct)
  pw  <- pairwise.wilcox.test(sub$module_score, sub$Condition,
                               p.adjust.method = "BH")
  as.data.frame(pw$p.value) %>%
    rownames_to_column("Condition_A") %>%
    pivot_longer(-Condition_A, names_to = "Condition_B", values_to = "padj") %>%
    filter(!is.na(padj)) %>%
    mutate(
      Cell_type = ct,
      sig = case_when(padj < 0.001 ~ "***", padj < 0.01 ~ "**",
                      padj < 0.05  ~ "*",   TRUE         ~ "ns")
    )
})

wilcox_ct_cond %>%
  dplyr::select(Cell_type, Condition_A, Condition_B, padj, sig) %>%
  arrange(Cell_type, padj)
Code
write.csv(wilcox_ct_cond,
          file.path(out_dir, "7_CellType_Condition_Wilcoxon.csv"),
          row.names = FALSE)

7.3 Covariate structure — cell-type score correlations

High correlation between astrocyte and neuronal scores would indicate co-capture that tracks sample quality rather than biology.

Code
ct_z_wide <- all_ct_scores %>%
  group_by(Cell_type) %>%
  mutate(z = as.numeric(scale(module_score))) %>%
  ungroup() %>%
  dplyr::select(originalRUN, Condition, Cell_type, z) %>%
  pivot_wider(names_from = Cell_type, values_from = z)
Code
cor_mat <- ct_z_wide %>%
  dplyr::select(-originalRUN, -Condition) %>%
  cor(method = "spearman", use = "pairwise.complete.obs")

cor_long <- as.data.frame(cor_mat) %>%
  rownames_to_column("A") %>%
  pivot_longer(-A, names_to = "B", values_to = "rho") %>%
  mutate(
    A = str_replace_all(A, "_", "\n"),
    B = str_replace_all(B, "_", "\n")
  )

p_cor <- ggplot(cor_long, aes(x = B, y = A, fill = rho)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(rho, 2)), size = 3.5) +
  scale_fill_gradient2(low = "#2c7bb6", mid = "white", high = "#d7191c",
                       midpoint = 0, limits = c(-1, 1), name = "Spearman ρ") +
  labs(title = "Spearman correlation — cell-type module z-scores", x = NULL, y = NULL) +
  theme_bw(base_size = 11) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8),
        axis.text.y = element_text(size = 8),
        plot.title  = element_text(hjust = 0.5, face = "bold"),
        panel.grid  = element_blank())

print(p_cor)

Code
ggsave(file.path(out_dir, "7_CellType_Correlation.pdf"), plot = p_cor, width = 7, height = 6)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
conversion failure on 'Spearman ρ' in 'mbcsToSbcs': for ρ (U+03C1)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'Spearman correlation — cell-type module z-scores' in 'mbcsToSbcs': -
substituted for — (U+2014)

Session info

Code
sessionInfo()
R version 4.5.1 (2025-06-13)
Platform: x86_64-pc-linux-gnu
Running under: Rocky Linux 8.7 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/London
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] glue_1.8.0          stringr_1.6.0       ggrepel_0.9.6      
[4] ggplot2_4.0.2       tibble_3.3.1        tidyr_1.3.2        
[7] dplyr_1.2.0         qs2_0.1.6           data.table_1.18.2.1

loaded via a namespace (and not attached):
 [1] gtable_0.3.6          jsonlite_2.0.0        compiler_4.5.1       
 [4] tidyselect_1.2.1      Rcpp_1.1.1            textshaping_1.0.3    
 [7] systemfonts_1.3.1     scales_1.4.0          yaml_2.3.12          
[10] fastmap_1.2.0         R6_2.6.1              labeling_0.4.3       
[13] generics_0.1.4        knitr_1.51            htmlwidgets_1.6.4    
[16] stringfish_0.17.0     pillar_1.11.1         RColorBrewer_1.1-3   
[19] rlang_1.1.7           stringi_1.8.7         xfun_0.56            
[22] S7_0.2.1              RcppParallel_5.1.11-1 otel_0.2.0           
[25] cli_3.6.5             withr_3.0.2           magrittr_2.0.4       
[28] digest_0.6.39         grid_4.5.1            lifecycle_1.0.5      
[31] vctrs_0.7.1           evaluate_1.0.5        farver_2.1.2         
[34] ragg_1.5.0            rmarkdown_2.30        purrr_1.2.1          
[37] tools_4.5.1           pkgconfig_2.0.3       htmltools_0.5.9