4. GSEA

==================================================================

GSEA for proteomics data

This script performs GSEA to identify biological pathways enriched in the differential expression lists generated by msstats. It uses the fgsea package.

Workflow overview

1. Data Loading: Imports DE tables (CSV) from the msstats step.

2. Gene Set Prep: Loads pathways from multiple sources (GO / KEGG / cell type signatures (C8)/ DAA signature)

3. Ranking: Ranks proteins by Log2FC for every comparison.

4. Analysis Loop: Runs GSEA for every combination of [Comparison x Gene Set Database].

5. Visualization:

  • Generates summary bar plots of top 10 and bottom 10 NES.
  • Generates specific enrichment plots (mountain plots) for key pathways of choice.

==================================================================

Load libraries

Code
suppressPackageStartupMessages({
library(data.table)
library(qs2)
library(ggrepel)
library(ggplot2)
library(stringr)
library(glue)
library(dplyr)
library(msigdbr)
library(fgsea)
library(viridis)
library(readxl)
})

Set directories

Code
base_dir <- "/nemo/lab/destrooperb/home/shared/zanettc/millie_proteomics"

run_num <- "run5"

results_dir <- file.path(base_dir, "results", run_num)
objects_dir <- file.path(base_dir, "data", "processed", run_num)
raw_dir <- file.path(base_dir, "data", "raw")

dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(objects_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(raw_dir, recursive = TRUE, showWarnings = FALSE)

Read in DE data

Code
#seurat <- qs_read(glue("output/{run_num}/objects/prelabelled_integrated_rpca.qs2"))
res_df <- fread(file.path(results_dir, "Full_GroupComparison_Results.csv"))
clean_spec_raw <- fread(file.path(base_dir, "data/processed/run1/clean_spec_raw.csv"))

protein_dictionary <- clean_spec_raw %>%
  dplyr::select(Protein = PG.ProteinGroups, 
         Gene = PG.Genes, 
         Description = PG.ProteinDescriptions) %>%
  distinct()

res_df

Define helper functions

Code
# ==============================================================================
# 1. HELPER FUNCTIONS
# ==============================================================================

# A. Create Rank Vector from MSstats Data
prep_msstats_ranks <- function(msstats_res, dict, comp_label) {
  df <- msstats_res %>%
    filter(Label == comp_label) %>%
    # Keep only finite log2FC values (removes Inf and -Inf)
    filter(is.finite(log2FC)) %>%
    left_join(dict, by = "Protein") %>%
    filter(!is.na(Gene) & Gene != "") %>%
    mutate(Gene = toupper(Gene)) %>%
    mutate(adj.pvalue = ifelse(adj.pvalue == 0, 1e-300, adj.pvalue)) %>%
    group_by(Gene) %>%
    slice_max(order_by = abs(log2FC), n = 1, with_ties = FALSE) %>%
    ungroup()
  
  #rank by logFC
  ranks <- df$log2FC
  names(ranks) <- df$Gene
  
  # Remove any unexpected NAs in names or values
  ranks <- ranks[!is.na(ranks) & !is.na(names(ranks))]
  
  return(sort(ranks, decreasing = TRUE))
}

# B. Core GSEA & Plotting Function
run_gsea_analysis <- function(ranks, 
                              comparison_name, 
                              geneset_name,    
                              pathways, 
                              base_objects_dir, 
                              base_graphs_dir,  
                              color_A = "firebrick3",
                              color_B = "navy") {
  
  message(glue("Running GSEA | Set: {geneset_name} | Comp: {comparison_name}"))
  
  # --- DYNAMIC LABEL GENERATION ---
  # Split the comparison name to get the two groups (e.g., "PlaqueNear" and "PlaqueFar")
  comp_parts <- str_split(comparison_name, "_vs_")[[1]]
  if(length(comp_parts) == 2) {
    group_A_label <- glue("{comp_parts[1]} Enriched") # Positive NES
    group_B_label <- glue("{comp_parts[2]} Enriched") # Negative NES
  } else {
    group_A_label <- "Group A Enriched"
    group_B_label <- "Group B Enriched"
  }
  
  # Clean title for the plot
  clean_comp_title <- gsub("_", " ", comparison_name)
  plot_title <- glue("{clean_comp_title} ({geneset_name})")

  # --- Directories ---
  current_objects_dir <- file.path(base_objects_dir, "GSEA_Analysis_logfc_x_pval", comparison_name)
  current_graphs_dir  <- file.path(base_graphs_dir, "GSEA_Analysis_logfc_x_pval", comparison_name)
  dir.create(current_objects_dir, recursive = TRUE, showWarnings = FALSE)
  dir.create(current_graphs_dir, recursive = TRUE, showWarnings = FALSE)
  
  # --- Run FGSEA ---
  pathways_upper <- lapply(pathways, toupper)
  set.seed(42)
  fgsea_res <- fgsea(
    pathways = pathways_upper,
    stats    = ranks,
    minSize  = 15,
    maxSize  = 3000,
    nproc    = 4, 
    nPermSimple = 10000 #changed this 
  )
  
  if (nrow(fgsea_res) == 0) {
    warning(glue("No significant pathways for {geneset_name} in {comparison_name}"))
    return(NULL)
  }
  
  # --- Tidy & Save ---
  fgsea_res_tidy <- fgsea_res %>%
    as_tibble() %>%
    mutate(leadingEdge = sapply(leadingEdge, paste, collapse = ",")) %>%
    mutate(pathway = gsub("_geneset", "", pathway)) %>% 
    arrange(padj)
  
  write.csv(fgsea_res_tidy, file.path(current_objects_dir, glue("Table_{geneset_name}.csv")), row.names = FALSE)
  
  # --- Barplot ---
  top_gsea <- fgsea_res_tidy %>%
    arrange(desc(NES)) %>%
    slice_head(n = 10) %>%
    bind_rows(fgsea_res_tidy %>% arrange(NES) %>% slice_head(n = 10)) %>%
    distinct(pathway, .keep_all = TRUE) %>%
    mutate(
      sig_label = case_when(
        padj < 0.001 ~ "***",
        padj < 0.01  ~ "**",
        padj < 0.05  ~ "*",
        TRUE         ~ ""
      ),
      # Apply dynamic labels based on NES direction
      fill_group = ifelse(NES > 0, group_A_label, group_B_label),
      pathway = factor(pathway, levels = unique(pathway[order(NES)]))
    )
  
  if (nrow(top_gsea) == 0) return(fgsea_res_tidy)
  
  axis_zoom <- 3.5 
  
  p_bar <- ggplot(top_gsea, aes(x = NES, y = pathway, fill = fill_group)) +
    geom_col(width = 0.7) +
    geom_text(aes(label = sig_label, hjust = ifelse(NES > 0, 1.2, -0.2)), 
              color = "white", size = 5, vjust = 0.75, fontface = "bold") +
    scale_fill_manual(values = setNames(c(color_A, color_B), c(group_A_label, group_B_label))) +
    scale_x_continuous(breaks = seq(-axis_zoom, axis_zoom, 0.5),
                       labels = function(x) ifelse(x %% 1 == 0, x, "")) + 
    labs(x = "Normalized Enrichment Score (NES)", y = NULL, title = plot_title) +
    theme_bw(base_size = 14) +
    theme(
      panel.grid.major.x = element_line(color = "grey92", linewidth = 0.5),
      panel.grid.major.y = element_blank(),
      panel.grid.minor   = element_blank(),
      panel.border = element_rect(colour = "black", fill = NA, linewidth = 1.5),
      legend.position = "none",
      plot.title = element_text(hjust = 0.5, face = "bold", size = 16, margin = margin(b = 40)),
      plot.margin = margin(t = 40, r = 20, b = 20, l = 20, unit = "pt"),
      #ensure y axis has enough room, and tightens lines together
      axis.text.y = element_text(size = 9, lineheight = 0.8)
    ) +
    scale_y_discrete(labels = function(x) str_wrap(gsub("_", " ", x), width = 30)) +
    # Dynamic text annotations at the top of the plot
    annotate("text", x = -axis_zoom/2, y = Inf, 
             label = glue("^ {group_B_label}"), color = color_B, size = 5, fontface = "bold", vjust = -1.0) +
    annotate("text", x = axis_zoom/2, y = Inf, 
             label = glue("^ {group_A_label}"), color = color_A, size = 5, fontface = "bold", vjust = -1.0) +
    coord_cartesian(xlim = c(-axis_zoom, axis_zoom), clip = "off")
  
  dyn_h <- 2 + (nrow(top_gsea) * 0.3)
  dyn_w <- 5 + (max(nchar(as.character(top_gsea$pathway))) * 0.12)
  
  print(p_bar)
  
  ggsave(filename = file.path(current_graphs_dir, glue("Barplot_{geneset_name}.pdf")), 
         plot = p_bar, width = dyn_w, height = dyn_h, limitsize = FALSE)
  
  # --- Dotplot (Top 10 Highest NES) ---
  top_nes_gsea <- fgsea_res_tidy %>%
    arrange(desc(NES)) %>%   
    slice_head(n = 10) %>%   
    mutate(pathway = factor(pathway, levels = unique(pathway[order(NES)])))

  if (nrow(top_nes_gsea) > 0) {
    # Define our wrap width
    wrap_width <- 45
    
    p_sig <- ggplot(top_nes_gsea, aes(x = NES, y = pathway, color = NES, size = -log10(padj))) +
      geom_vline(xintercept = 0, linetype = "dashed", color = "grey60") +
      geom_point(alpha = 0.8) +
      scale_color_gradient2(low = color_B, mid = "grey90", high = color_A, midpoint = 0, name = "NES") +
      scale_size_continuous(range = c(3, 8), name = "-log10(padj)") +
      labs(x = "Normalized Enrichment Score (NES)", y = NULL, 
           title = glue("Top 10 Highest NES:\n{clean_comp_title} ({geneset_name})")) +
      theme_bw(base_size = 14) +
      theme(
        panel.grid.major.x = element_line(color = "grey92", linewidth = 0.5),
        panel.grid.major.y = element_line(color = "grey92", linewidth = 0.5, linetype = "dotted"),
        panel.grid.minor   = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1.5),
        plot.title = element_text(hjust = 0.5, face = "bold", size = 14, margin = margin(b = 20)),
        plot.margin = margin(t = 20, r = 10, b = 10, l = 10, unit = "pt"),
        axis.text.y = element_text(lineheight = 0.8) 
      ) +
      scale_y_discrete(labels = function(x) str_wrap(gsub("_", " ", x), width = wrap_width))

    # --- DYNAMIC SIZING ---
    # Cap the character count at wrap_width so the plot doesn't stretch endlessly horizontally
    max_chars <- min(max(nchar(as.character(top_nes_gsea$pathway))), wrap_width)
    
    # Calculate dimensions: enough base room for the plot/legends + scaled room for text/points
    dyn_h_sig <- 3 + (nrow(top_nes_gsea) * 0.4)
    dyn_w_sig <- 4.5 + (max_chars * 0.1) 
    
    ggsave(filename = file.path(current_graphs_dir, glue("Dotplot_Top10_NES_{geneset_name}.pdf")), 
           plot = p_sig, width = dyn_w_sig, height = dyn_h_sig, limitsize = FALSE)
  }
  
  
  return(fgsea_res_tidy)
}

Load data

Code
# ==============================================================================
# 2. CONFIG & DATA LOADING
# ==============================================================================

GLOBAL_CONFIG <- list(
  output_dir = file.path(results_dir, "GSEA_Results_Master"), 
  group_A    = "Plaque Enriched",
  group_B    = "Control Enriched",
  col_A      = "firebrick3",
  col_B      = "navy"
)

# Load Gene Sets

#go and kegg sets
go_pathways <- gmtPathways("../../GO_human_all_genesets.gmt")
kegg_df <- msigdbr(species = "Mus musculus", collection = "C2", subcollection = "CP:KEGG_MEDICUS")
Using human MSigDB with ortholog mapping to mouse. Use `db_species = "MM"` for mouse-native gene sets.
This message is displayed once per session.
Code
kegg_list <- split(x = kegg_df$gene_symbol, f = kegg_df$gs_name)
go_kegg_list <- c(go_pathways, kegg_list)

#cell type signature sets
c8_df <- msigdbr(species = "Mus musculus", category = "C8")
Warning: The `category` argument of `msigdbr()` is deprecated as of msigdbr 10.0.0.
ℹ Please use the `collection` argument instead.
Code
c8_list <- split(x = c8_df$gene_symbol, f = c8_df$gs_name)

#DAA signature from from https://www.nature.com/articles/s41593-020-0624-8
daa_df <- read_excel(path = file.path(raw_dir, "DAA_Habib_2020.xlsx"), col_types = "text")

#Extract the genes as a character vector 
daa_vector <- daa_df %>%
  pull(Gene) %>%
  toupper() %>%
  unique()
daa_vector
  [1] "CST3"          "APOE"          "CLU"           "GFAP"         
  [5] "CPE"           "MT1"           "CD81"          "CD9"          
  [9] "MT2"           "ID3"           "CKB"           "FXYD1"        
 [13] "VIM"           "PRDX6"         "CTSB"          "CSMD1"        
 [17] "DBI"           "FTH1"          "HSD17B4"       "ALDOC"        
 [21] "IGFBP5"        "MLC1"          "C4B"           "NTRK2"        
 [25] "GSN"           "KCNIP4"        "GPM6B"         "CNN3"         
 [29] "FTL1"          "ATP1B2"        "ID4"           "PSAP"         
 [33] "PLCE1"         "AQP4"          "TAGLN3"        "SORBS1"       
 [37] "TMEM47"        "ITM2C"         "SCD2"          "ITM2B"        
 [41] "GPC5"          "PRDX1"         "CD63"          "LAPTM4A"      
 [45] "LAMP1"         "GSTM1"         "TRPM3"         "TSPAN3"       
 [49] "GPR37L1"       "S100A6"        "CTSD"          "SPARC"        
 [53] "TMBIM6"        "SCG3"          "B2M"           "NCAM2"        
 [57] "GM14964"       "GGTA1"         "ENOX1"         "GLIS3"        
 [61] "S100A16"       "TUBA1A"        "CTSL"          "S100B"        
 [65] "6330403K07RIK" "CHST2"         "CHL1"          "MATN4"        
 [69] "ANGPT1"        "ADD3"          "SERPINA3N"     "TMEM176B"     
 [73] "ARHGEF4"       "MGST1"         "TUBB2B"        "IFITM3"       
 [77] "GRM5"          "ID2"           "DHRS1"         "SDC4"         
 [81] "OSMR"          "CADPS"         "LSAMP"         "DST"          
 [85] "GADD45G"       "THBS4"         "DDR1"          "SEMA6D"       
 [89] "RCN2"          "CPNE8"         "UBE2E2"        "SELK"         
 [93] "SBNO2"         "FBXO2"         "TSC22D4"       "SARAF"        
 [97] "TMEM176A"      "PRKCA"         "LAMP2"         "ASPG"         
[101] "MAN1A"         "SLC38A1"       "CAV2"          "S100A1"       
[105] "ABCA1"         "CHIL1"         "TST"           "PRUNE2"       
[109] "HIST1H2BC"     "SLC3A2"        "SYT11"         "ERBB2IP"      
[113] "GM10116"       "KIF1A"         "HRSP12"        "SGCD"         
[117] "STAT3"         "LGALS1"        "SLC16A1"       "GPX4"         
[121] "PDPN"          "CD151"         "USP53"         "GRIK2"        
[125] "PARD3B"        "FXYD7"         "FOS"           "ID1"          
[129] "CACNB2"        "UBC"           "PDIA4"         "NKAIN2"       
[133] "NRG2"          "LXN"           "HOPX"          "SREBF1"       
[137] "SLC35F1"       "PDLIM4"        "CEBPB"         "CYR61"        
[141] "SOX9"          "SERPINF1"      "NFKBIA"        "NFE2L2"       
[145] "SLC14A1"       "ARHGAP6"       "HSPA5"         "SULF2"        
[149] "NAALADL2"      "TSPAN4"        "NAV2"          "CTNNBIP1"     
[153] "BCAP31"        "SLC39A14"      "SHISA6"        "APLP2"        
[157] "SERPING1"      "SPCS2"         "EZR"           "SCARB2"       
[161] "RAB18"         "GNAI2"         "JUNB"          "CLIC1"        
[165] "PFKP"          "HSPA2"         "AEBP1"         "IDH2"         
[169] "PPIB"          "ATRAID"        "DGKI"          "ATP6V0E"      
[173] "JUND"          "HNRNPK"        "SULF1"         "1700030F04RIK"
[177] "ITGB8"         "MFAP3L"        "S100A10"       "NPC2"         
[181] "SLC7A10"       "REEP5"         "VWA1"          "SHISA5"       
[185] "GM4876"        "LGMN"          "LGALS3BP"      "GM3448"       
[189] "TMCO1"         "GM3764"        "RAP1B"         "TMEM59"       
[193] "2810459M11RIK" "DEGS1"         "EFCAB14"       "1810058I24RIK"
[197] "GNA13"         "SAMD4"         "S1PR3"         "MMP16"        
[201] "HSD17B12"      "SORBS2"        "TUBB2A"        "ATP6AP2"      
[205] "KCNJ3"         "RHOB"          "NRG1"          "EPDR1"        
[209] "PDZD2"         "TM9SF3"        "SLC20A1"       "CLDND1"       
[213] "HIVEP2"        "PLXDC2"        "SOX2"          "LUZP2"        
[217] "SYNPO2"        "TPRGL"         "H2-K1"         "PKP4"         
[221] "FABP7"         "USP24"         "PDGFD"         "ATP1A1"       
[225] "ENPP5"         "TIMP2"         "HMGCLL1"       "CACNA2D3"     
[229] "DAPK1"         "SEPT15"        "RASSF8"        "TGOLN1"       
[233] "VIMP"          "ZFP706"        "GM2A"          "H2-D1"        
[237] "PCYT2"         "UQCRC2"        "UPF2"          "PPP2CA"       
[241] "TMEM100"       "NNAT"          "LYSMD2"        "C1QA"         
[245] "ST6GALNAC5"    "SMPDL3A"       "GRIA2"         "NDUFV3"       
[249] "NRXN1"         "AGT"           "MGAT4C"        "CDH10"        
[253] "RPL36-PS3"     "KIRREL3"      
Code
# Create master list
gene_sets_master <- list(
  "GO_KEGG" = go_kegg_list,
  "cell_type_sigs_c8" = c8_list,
  "Habib_2020_DAA" = list("Habib_2020_DAA" = daa_vector)
)

#Create Rank Vectors directly from MSstats output
comparisons_master <- list(
  "PlaqueNear_vs_Control" = prep_msstats_ranks(res_df, protein_dictionary, "PlaqueNear_vs_Control"),
  "PlaqueFar_vs_Control"  = prep_msstats_ranks(res_df, protein_dictionary, "PlaqueFar_vs_Control"),
  "PlaqueNear_vs_PlaqueFar" = prep_msstats_ranks(res_df, protein_dictionary, "PlaqueNear_vs_PlaqueFar")
)

Execute the GSEA function

Code
# ==============================================================================
# 3. EXECUTION LOOP
# ==============================================================================

purrr::walk2(names(gene_sets_master), gene_sets_master, function(gset_name, gset_pathways) {
  purrr::walk2(comparisons_master, names(comparisons_master), function(ranks, comp_name) {
    
    run_gsea_analysis(
      ranks            = ranks,
      comparison_name  = comp_name,
      geneset_name     = gset_name,     
      pathways         = gset_pathways, 
      base_objects_dir = objects_dir,   
      base_graphs_dir  = results_dir,    
      color_A          = GLOBAL_CONFIG$col_A,
      color_B          = GLOBAL_CONFIG$col_B
    )
    
    gc() # Memory cleanup
  })
})

Specific term plotting

Code
# ==============================================================================
# SPECIFIC TERM PLOTTING
# ==============================================================================

term_to_plot <- "Habib_2020_DAA"
# 1. Flatten master gene list
all_pathways_flat <- do.call(c, unname(gene_sets_master))

# Check if the term exists before looping - in case typo
if (!term_to_plot %in% names(all_pathways_flat)) {
  stop(glue("Could not find pathway '{term_to_plot}' in gene_sets_master."))
}

pathway_genes <- all_pathways_flat[[term_to_plot]]

# Create a specific output folder
specific_graphs_dir <- file.path(results_dir, "Specific_Pathway_Plots")
dir.create(specific_graphs_dir, showWarnings = FALSE, recursive = TRUE)

# Loop and Plot
purrr::walk2(comparisons_master, names(comparisons_master), function(ranks, comp_name) {
  
  message(glue("Plotting {term_to_plot} for {comp_name}..."))
  
  # Generate the GSEA enrichment plot
  gsea_plot <- plotEnrichment(pathway_genes, ranks) +
    labs(
      title = term_to_plot,
      subtitle = glue("Comparison: {comp_name}"),
      x = "Rank",
      y = "Enrichment Score"
    ) +
    theme_bw(base_size = 14) +
    theme(
      plot.title = element_text(face = "bold", hjust = 0.5),
      plot.subtitle = element_text(hjust = 0.5, color = "grey40"),
      panel.grid = element_blank()
    )
  print(gsea_plot)
  
  # Save
  ggsave(
    filename = file.path(specific_graphs_dir, glue("{term_to_plot}_{comp_name}.pdf")),
    plot = gsea_plot,
    width = 8, 
    height = 5
  )
})
Plotting Habib_2020_DAA for PlaqueNear_vs_Control...

Plotting Habib_2020_DAA for PlaqueFar_vs_Control...

Plotting Habib_2020_DAA for PlaqueNear_vs_PlaqueFar...

Hypergeometric test: Plaque Near vs Plaque Far top 10 vs Habib 2020 top 10

Code
# Get top 10 upregulated proteins from PlaqueNear vs PlaqueFar
top10_near_vs_far <- comparisons_master[["PlaqueNear_vs_PlaqueFar"]] %>%
  sort(decreasing = TRUE) %>%
  head(10) %>%
  names()

# Get top 10 from Habib 2020 DAA signature
top10_habib <- head(daa_vector, 10)

# Define the background population
# Use all genes present in the rank list as the background
background_genes <- names(comparisons_master[["PlaqueNear_vs_PlaqueFar"]])
N <- length(background_genes)

# Calculate overlap
overlap <- intersect(top10_near_vs_far, top10_habib)
k <- length(overlap)  # observed overlap
m <- sum(top10_habib %in% background_genes)  # Habib top10 genes present in background
n <- N - m            # genes NOT in Habib top10
q <- length(top10_near_vs_far)  # size of your top10

# Hypergeometric test
# phyper(k-1) gives P(X >= k) i.e. probability of overlap >= observed by chance
p_val <- phyper(k - 1, m, n, q, lower.tail = FALSE)

cat("=== Hypergeometric Test: PlaqueNear vs PlaqueFar vs Habib 2020 DAA ===\n")
=== Hypergeometric Test: PlaqueNear vs PlaqueFar vs Habib 2020 DAA ===
Code
cat("Background (total detected proteins):", N, "\n")
Background (total detected proteins): 5532 
Code
cat("Habib top 10 genes found in background:", m, "\n")
Habib top 10 genes found in background: 6 
Code
cat("Your top 10 Near vs Far proteins:", q, "\n")
Your top 10 Near vs Far proteins: 10 
Code
cat("Observed overlap:", k, "\n")
Observed overlap: 1 
Code
cat("Overlapping genes:", paste(overlap, collapse = ", "), "\n")
Overlapping genes: GFAP 
Code
cat("P-value:", p_val, "\n\n")
P-value: 0.01080195 
Code
# Visualise the overlap with a simple table
overlap_df <- data.frame(
  Gene = background_genes,
  In_Habib_Top10 = background_genes %in% top10_habib,
  In_NearFar_Top10 = background_genes %in% top10_near_vs_far
) %>%
  filter(In_Habib_Top10 | In_NearFar_Top10) %>%
  mutate(
    Overlap = In_Habib_Top10 & In_NearFar_Top10
  ) %>%
  arrange(desc(Overlap))

print(overlap_df)
       Gene In_Habib_Top10 In_NearFar_Top10 Overlap
1      GFAP           TRUE             TRUE    TRUE
2     NCOR1          FALSE             TRUE   FALSE
3     CLCN6          FALSE             TRUE   FALSE
4     DIDO1          FALSE             TRUE   FALSE
5      DPP7          FALSE             TRUE   FALSE
6   ARFGEF3          FALSE             TRUE   FALSE
7  SERPINC1          FALSE             TRUE   FALSE
8     ARL8B          FALSE             TRUE   FALSE
9     ABCA3          FALSE             TRUE   FALSE
10    PTPN6          FALSE             TRUE   FALSE
11     APOE           TRUE            FALSE   FALSE
12      CLU           TRUE            FALSE   FALSE
13      CPE           TRUE            FALSE   FALSE
14     CST3           TRUE            FALSE   FALSE
15     CD81           TRUE            FALSE   FALSE