1. Phosphoproteome — Initial QC

Overview

QC for the phosphoproteome MSstats report from Spectronaut. Mirrors the bulk-proteome QC (notebook 01_initial_qc_spectronaut.qmd in bulk_proteomics/) but operates on phosphopeptide-level data and tracks phosphosite-level metrics.

Important note on this Spectronaut export: the file does not include EG.PTMLocalizationProbabilities (or EG.PTMAssayProbability). All phospho-site information lives in EG.ModifiedSequence (e.g. _AAS[Phospho (STY)]TPGK_). MSstatsPTM downstream needs a mouse FASTA to map sequence positions to protein residues — see notebook 02. For QC here we count phosphosites by parsing EG.ModifiedSequence directly.

Libraries

Directories

Code

base_dir <- "/nemo/lab/destrooperb/home/shared/zanettc/giulia_proteomics/phosphoproteomics"
run_num  <- "run3"

results_dir <- file.path(base_dir, "results", run_num)
objects_dir <- file.path(base_dir, "data", "processed", run_num)
input_dir   <- file.path(base_dir, "input")

dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(objects_dir, recursive = TRUE, showWarnings = FALSE)

Study design

Same 16-sample 2×2 design as the bulk proteome.

Code

meta <- data.table(
  FileName      = paste0("GA_", 1:16),
  Condition     = rep(c("hApp", "hAppNLGF", "hAppNLGF_FIRE", "hApp_FIRE"), each = 4),
  BioReplicate  = rep(1:4, times = 4),
  Has_Microglia = rep(c(TRUE, TRUE, FALSE, FALSE), each = 4)
)
meta

Load the phospho report

Read only the columns needed for QC and phosphosite parsing.

Code

phospho_cols <- c(
  "R.Condition", "R.FileName", "R.Replicate",
  "PG.ProteinGroups", "PG.ProteinAccessions", "PG.Qvalue", "PG.Quantity",
  "PEP.StrippedSequence",
  "EG.ModifiedSequence", "EG.PrecursorId", "EG.Qvalue",
  "FG.Charge", "FG.Id",
  "F.FrgIon", "F.Charge", "F.FrgLossType", "F.FrgNum", "F.FrgType",
  "F.NormalizedPeakArea", "F.PeakArea", "F.ExcludedFromQuantification"
)

cat("Reading phospho MSstats report...\n")

Reading phospho MSstats report...

Code

phospho_raw <- fread(
  file.path(input_dir, "PCF000612_phospho_GA_MSStats_report.csv"),
  select = phospho_cols
)
cat("Rows:", nrow(phospho_raw), "\n")

Rows: 19232048

Code

cat("Unique samples:", uniqueN(phospho_raw$R.FileName), "\n")

Unique samples: 16

Code

cat("Samples:", paste(sort(unique(phospho_raw$R.FileName)), collapse = ", "), "\n")

Samples: GA_P_1, GA_P_10, GA_P_11, GA_P_12, GA_P_13, GA_P_14, GA_P_15, GA_P_16, GA_P_2, GA_P_3, GA_P_4, GA_P_5, GA_P_6, GA_P_7, GA_P_8, GA_P_9

Restrict to confidently-identified phospho-modified precursors

Filter to phospho-containing precursors at EG.Qvalue < 0.01.

Code

is_phospho <- str_detect(phospho_raw$EG.ModifiedSequence, fixed("Phospho"))
cat("Phospho-modified precursor rows: ",
    sum(is_phospho), "/", nrow(phospho_raw), "\n")

Phospho-modified precursor rows:  6488816 / 19232048

Code

phospho_raw <- phospho_raw[is_phospho & EG.Qvalue < 0.01]
cat("After phospho + Qvalue<0.01 filter:", nrow(phospho_raw), "\n")

After phospho + Qvalue<0.01 filter: 5341781

Phosphopeptide and phosphosite metrics per sample

A “phosphosite” here is a (protein, peptide, S/T/Y position-within-peptide) tuple — the peptide-relative position. Absolute protein-residue numbering requires the FASTA mapping, done in notebook 02.

Code

# Helper: extract phospho positions within the modified sequence (Vectorized)
extract_phospho_positions_vec <- function(modseq) {
  
  # 1. Remove leading/trailing underscores
  m <- str_replace_all(modseq, "_", "")
  
  # 2. Strip non-phospho mods entirely. 
  # Use ifelse() so it handles vectors of length > 1 correctly
  m_clean <- str_replace_all(m, "\\[[^\\]]*\\]", function(x) {
    ifelse(str_detect(x, "Phospho"), x, "")
  })
  
  # 3. Split the sequence on the exact Phospho tags
  parts <- str_split(m_clean, "\\[Phospho \\(STY\\)\\]")
  
  # 4. Count the residues and calculate positions
  vapply(parts, function(p) {
    # If the length is 1, it means no split occurred (no phospho present or NA)
    if (length(p) == 1) return(NA_character_)  
    
    # Each part before the last contains only residue chars
    residue_counts <- nchar(p[-length(p)])
    positions <- cumsum(residue_counts)
    
    # Return as a semicolon-separated string (e.g., "3;5")
    paste(positions, collapse = ";")
  }, FUN.VALUE = character(1)) 
}

cat("Parsing phosphosite positions (vectorized)...\n")

Parsing phosphosite positions (vectorized)...

Code

# Vectorized call — completely replaces mapply + unlist
phospho_raw[, PhosphoPos := extract_phospho_positions_vec(EG.ModifiedSequence)]

# Per-sample summary -> need to use correct filenames
phospho_raw[, R.FileName := str_replace(R.FileName, "^GA_P_", "GA_")]

# Then re-run sample_summary as before
sample_summary <- phospho_raw[
  , .(
    n_features        = uniqueN(EG.PrecursorId),
    n_modseq          = uniqueN(EG.ModifiedSequence),
    n_phosphopeptides = uniqueN(paste(PG.ProteinGroups, EG.ModifiedSequence)),
    n_phosphoproteins = uniqueN(PG.ProteinGroups)
  ),
  by = R.FileName
]
sample_summary <- sample_summary[meta, on = c(R.FileName = "FileName")]
sample_summary <- sample_summary[meta, on = c(R.FileName = "FileName")]

print(sample_summary[order(R.FileName)])

    R.FileName n_features n_modseq n_phosphopeptides n_phosphoproteins
        <char>      <int>    <int>             <int>             <int>
 1:       GA_1      58334    49866             49866              4671
 2:      GA_10      54339    46790             46790              4603
 3:      GA_11      55963    48067             48067              4632
 4:      GA_12      54227    46519             46519              4488
 5:      GA_13      60929    52093             52093              4836
 6:      GA_14      51997    44985             44985              4404
 7:      GA_15      45105    39477             39477              4318
 8:      GA_16      62414    53375             53375              4880
 9:       GA_2      59233    50688             50688              4803
10:       GA_3      52757    45384             45384              4480
11:       GA_4      57471    49262             49262              4636
12:       GA_5      60927    51898             51898              4856
13:       GA_6      55498    47638             47638              4634
14:       GA_7      58719    50132             50132              4642
15:       GA_8      62634    53339             53339              4923
16:       GA_9      57535    49320             49320              4568
        Condition BioReplicate Has_Microglia   i.Condition i.BioReplicate
           <char>        <int>        <lgcl>        <char>          <int>
 1:          hApp            1          TRUE          hApp              1
 2: hAppNLGF_FIRE            2         FALSE hAppNLGF_FIRE              2
 3: hAppNLGF_FIRE            3         FALSE hAppNLGF_FIRE              3
 4: hAppNLGF_FIRE            4         FALSE hAppNLGF_FIRE              4
 5:     hApp_FIRE            1         FALSE     hApp_FIRE              1
 6:     hApp_FIRE            2         FALSE     hApp_FIRE              2
 7:     hApp_FIRE            3         FALSE     hApp_FIRE              3
 8:     hApp_FIRE            4         FALSE     hApp_FIRE              4
 9:          hApp            2          TRUE          hApp              2
10:          hApp            3          TRUE          hApp              3
11:          hApp            4          TRUE          hApp              4
12:      hAppNLGF            1          TRUE      hAppNLGF              1
13:      hAppNLGF            2          TRUE      hAppNLGF              2
14:      hAppNLGF            3          TRUE      hAppNLGF              3
15:      hAppNLGF            4          TRUE      hAppNLGF              4
16: hAppNLGF_FIRE            1         FALSE hAppNLGF_FIRE              1
    i.Has_Microglia
             <lgcl>
 1:            TRUE
 2:           FALSE
 3:           FALSE
 4:           FALSE
 5:           FALSE
 6:           FALSE
 7:           FALSE
 8:           FALSE
 9:            TRUE
10:            TRUE
11:            TRUE
12:            TRUE
13:            TRUE
14:            TRUE
15:            TRUE
16:           FALSE

Code

fwrite(sample_summary, file.path(results_dir, "phospho_sample_summary.csv"))

unique(phospho_raw$R.FileName)

 [1] "GA_1"  "GA_2"  "GA_3"  "GA_4"  "GA_5"  "GA_6"  "GA_7"  "GA_8"  "GA_9" 
[10] "GA_10" "GA_11" "GA_12" "GA_13" "GA_14" "GA_15" "GA_16"

Phosphopeptide count per sample (bar plot)

Code

sample_summary[, R.FileName := factor(R.FileName, levels = paste0("GA_", 1:16))]

p_counts <- ggplot(sample_summary,
                   aes(x = R.FileName, y = n_phosphopeptides, fill = Condition)) +
  geom_col() +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Phosphopeptides identified per sample",
       x = NULL, y = "# phosphopeptides (proteinGroup × modSeq)") +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p_counts)

Code

ggsave(file.path(results_dir, "Phosphopeptides_per_sample.pdf"),
       p_counts, width = 10, height = 5)

MAD-based outlier flag on phosphopeptide count

Code

mad_thresh <- 3
med <- median(sample_summary$n_phosphopeptides)
md  <- mad(sample_summary$n_phosphopeptides)

outliers <- sample_summary[abs(n_phosphopeptides - med) > mad_thresh * md]
cat("Median:", med, " MAD:", md, "\n")

Median: 49291  MAD: 3786.56

Code

cat("Outlier samples (>", mad_thresh, "× MAD):\n")

Outlier samples (> 3 × MAD):

Code

print(outliers[, .(R.FileName, Condition, n_phosphopeptides)])

Empty data.table (0 rows and 3 cols): R.FileName,Condition,n_phosphopeptides

PCA on phosphopeptide intensities

Aggregate to one log2 intensity per (modSeq × sample), wide format, log2-transform, replace NAs with column-wise medians for PCA only (this is QC, not stats).

Code

# One value per (proteinGroup, modSeq, sample) — sum fragment areas
phospho_intens <- phospho_raw[
  , .(Intensity = sum(F.PeakArea, na.rm = TRUE)),
  by = .(PG.ProteinGroups, EG.ModifiedSequence, R.FileName)
]

# Drop near-zero intensities
phospho_intens <- phospho_intens[Intensity > 0]
phospho_intens[, log2I := log2(Intensity + 1)]

wide <- dcast(phospho_intens,
              PG.ProteinGroups + EG.ModifiedSequence ~ R.FileName,
              value.var = "log2I")
mat <- as.matrix(wide[, -(1:2)])
rownames(mat) <- paste(wide$PG.ProteinGroups, wide$EG.ModifiedSequence)

# Keep features observed in >= 12 of 16 samples for PCA stability
keep <- rowSums(!is.na(mat)) >= 12
mat <- mat[keep, , drop = FALSE]

# Median-impute NAs by column for PCA only
for (j in seq_len(ncol(mat))) {
  miss <- is.na(mat[, j])
  if (any(miss)) mat[miss, j] <- median(mat[, j], na.rm = TRUE)
}

pca <- prcomp(t(mat), scale. = TRUE)
pca_df <- as.data.frame(pca$x[, 1:3])
pca_df$Sample    <- rownames(pca_df)
pca_df$Condition <- meta$Condition[match(pca_df$Sample, meta$FileName)]

ve <- summary(pca)$importance[2, ] * 100

p_pca <- ggplot(pca_df, aes(PC1, PC2, color = Condition, label = Sample)) +
  geom_point(size = 4) +
  ggrepel::geom_text_repel(size = 3, max.overlaps = 50) +
  scale_color_brewer(palette = "Set2") +
  labs(title = "PCA — phosphopeptide log2 intensity",
       subtitle = paste0("n features = ", nrow(mat)),
       x = sprintf("PC1 (%.1f%%)", ve[1]),
       y = sprintf("PC2 (%.1f%%)", ve[2])) +
  theme_minimal(base_size = 12)

print(p_pca)

Code

ggsave(file.path(results_dir, "Phospho_PCA.pdf"), p_pca, width = 8, height = 6)

Sample-correlation heatmap

Code

cor_mat <- cor(mat, use = "pairwise.complete.obs", method = "spearman")

ann_col <- data.frame(Condition = meta$Condition,
                      row.names = meta$FileName)

ph <- pheatmap(cor_mat,
               annotation_col = ann_col,
               main = "Phospho — Spearman correlation between samples",
               fontsize = 9,
               filename = file.path(results_dir, "Phospho_correlation_heatmap.pdf"),
               width = 8, height = 7)

Save cleaned phospho data for downstream notebooks

Code

qs_save(phospho_raw, file.path(objects_dir, "phospho_raw_filtered.qs2"))
fwrite(sample_summary, file.path(objects_dir, "phospho_sample_summary.csv"))

cat("Saved:\n",
    "  ", file.path(objects_dir, "phospho_raw_filtered.qs2"), "\n",
    "  ", file.path(objects_dir, "phospho_sample_summary.csv"), "\n")

Saved:
    /nemo/lab/destrooperb/home/shared/zanettc/giulia_proteomics/phosphoproteomics/data/processed/run3/phospho_raw_filtered.qs2 
    /nemo/lab/destrooperb/home/shared/zanettc/giulia_proteomics/phosphoproteomics/data/processed/run3/phospho_sample_summary.csv