diff --git a/dev-notebooks/STRINGdb-Analysis.Rmd b/dev-notebooks/STRINGdb-Analysis.Rmd index eb331a4..92ace44 100644 --- a/dev-notebooks/STRINGdb-Analysis.Rmd +++ b/dev-notebooks/STRINGdb-Analysis.Rmd @@ -3,8 +3,8 @@ title: 'STRINGdb: A Database of Known and Predicted Protein-protein Interactions author: "Co-adapted by Haowen He and Rachael White from the 2015 Vignette by Andrea Franceschini" subtitle: IDEA Alzheimer's Data Analysis Bootcamp - Summer 2022 output: - html_document: default pdf_document: default + html_document: default always_allow_html: true --- @@ -180,12 +180,12 @@ string_db$plot_network(hits, payload_id=payload_id) ## Clustering ```{r, fig.width=10, fig.height=10, fig.fullwidth=TRUE, fig.fullheight=TRUE} -clustersList <- string_db$get_clusters(example1_mapped$STRING_id[1:600]) +#clustersList <- string_db$get_clusters(example1_mapped$STRING_id[1:600]) # plot first 4 clusters -par(mfrow=c(2,2)) -for(i in seq(1:4)){ - string_db$plot_network(clustersList[[i]]) - } +#par(mfrow=c(2,2)) +#for(i in seq(1:4)){ +# string_db$plot_network(clustersList[[i]]) +# } ``` \newpage @@ -237,14 +237,10 @@ dim(seurat_mapped) hits_seurat <- seurat_mapped$STRING_id[1:200] ``` -\newpage - ```{r, fig.width=10, fig.height=10, fig.fullwidth=TRUE, fig.fullheight=TRUE} -string_db$plot_network(hits_seurat) +#string_db$plot_network(hits_seurat) ``` -\newpage - ```{r} # PAYLOAD MECHANISM # filter by p-value and add a color column @@ -272,22 +268,175 @@ payload_id_seurat <- string_db$post_payload( seurat_mapped_pval05$STRING_id, colors = seurat_mapped_pval05$color ) ``` -\newpage - ```{r, fig.width=10, fig.height=10, fig.fullwidth=TRUE, fig.fullheight=TRUE} -string_db$plot_network(hits_seurat) +#string_db$plot_network(hits_seurat) ``` ```{r, , fig.width=10, fig.height=10, fig.fullwidth=TRUE, fig.fullheight=TRUE} # display a STRING network png with the "halo" string_db$plot_network(hits_seurat, payload_id = payload_id_seurat) +title(sub = "STRINGdb: protein-protein interaction of Astrocytes at 2mo", + cex.sub = 1, font.sub = 3, col.sub = "darkgreen" + ) -clustersList_seurat <- string_db$get_clusters(seurat_mapped$STRING_id[1:600]) +# clustersList_seurat <- string_db$get_clusters(seurat_mapped$STRING_id[1:600]) # plot first 4 clusters -par(mfrow=c(2,2)) -for(i in seq(1:4)){ - string_db$plot_network(clustersList_seurat[[i]]) - } +#par(mfrow=c(2,2)) +#for(i in seq(1:4)){ +# string_db$plot_network(clustersList_seurat[[i]]) +#} ``` +\newpage + +```{r, , fig.width=10, fig.height=10, fig.fullwidth=TRUE, fig.fullheight=TRUE} +# specify your cell type +my_celltype <- "Ast" +# specify time point of interest +my_timept_2 <- "4mo" + +# Load Seurat data +# Change the filepath in quotes "" below to match where you +# have your celltype data saved +# The expected filepath is "~/AlzheimersDS/student-notebooks/FinalMergedData-my_celltype.rds" +# (where `my-celltype` is your celltype code) +my_celltype_data <- readRDS("/data/Alzheimers_DS/FinalMergedData-Ast.rds") + +# Subset my celltype-specific Seurat object by time +my_seurat_2 <- subset(x = my_celltype_data, subset = Age == my_timept_2) + +# set active identities of my subset seurat to "Mt" attribute (denoting variant) +# for DE testing +Idents(my_seurat_2) <- "Mt" +summary(as.factor(my_seurat_2$Mt)) + +# Run D.E. test to identify genes differentially expressed between +# between diseased and normal cells, for my celltype and at this timepoint +marker_genes.df_2 <- FindMarkers(my_seurat_2, + ident.1 = "V337M", ident.2 = "V337V") + +# order results by p-value +marker_genes.df_2 <- marker_genes.df_2 %>% arrange(p_val) +head(marker_genes.df_2) + +# format a new dataframe using the contents of +# marker_genes.df, to pass to STRINGdb +stringdb.df_2 <- marker_genes.df_2 %>% + tibble::rownames_to_column("geneIDs") %>% + dplyr::select(geneIDs, avg_log2FC, p_val) +colnames(stringdb.df_2) <- c("geneIDs","logFC","pvalue") + +# mapping +seurat_mapped_2 <- string_db$map(stringdb.df_2, "geneIDs", removeUnmappedRows = TRUE ) +dim(seurat_mapped_2) + +#View(seurat_mapped) +hits_seurat_2 <- seurat_mapped_2$STRING_id[1:200] + +# PAYLOAD MECHANISM +# filter by p-value and add a color column +# (i.e. green down-regulated gened and red for up-regulated genes) +head(subset(seurat_mapped_2, pvalue < 0.05), 20) + +dim(seurat_mapped_2[seurat_mapped_2$pvalue < 0.05,]) + +seurat_mapped_pval05_2 <- string_db$add_diff_exp_color(subset(seurat_mapped_2, pvalue<0.05), + logFcColStr="logFC" ) +head(seurat_mapped_pval05_2) + +table(seurat_mapped_pval05_2$color) + +dim(seurat_mapped_pval05_2) + +seurat_mapped_pval05_2[seurat_mapped_pval05_2$color == '#D7FFD7FF', ] + +seurat_mapped_pval05_2[seurat_mapped_pval05_2$color == '#FFEDEDFF', ] + +seurat_mapped_pval05_2[seurat_mapped_pval05_2$color == '#FFF6F6FF', ] +# post payload information to the STRING server +payload_id_seurat_2 <- string_db$post_payload( seurat_mapped_pval05_2$STRING_id, + colors = seurat_mapped_pval05_2$color ) + +string_db$plot_network(hits_seurat_2, payload_id = payload_id_seurat_2) +title(sub = "STRINGdb: protein-protein interaction of Astrocytes at 4mo", + cex.sub = 1, font.sub = 3, col.sub = "darkgreen" + ) +``` + +\newpage + +```{r, , fig.width=10, fig.height=8, fig.fullwidth=TRUE, fig.fullheight=TRUE} +# specify your cell type +my_celltype <- "Ast" +# specify time point of interest +my_timept_3 <- "6mo" + +# Load Seurat data +# Change the filepath in quotes "" below to match where you +# have your celltype data saved +# The expected filepath is "~/AlzheimersDS/student-notebooks/FinalMergedData-my_celltype.rds" +# (where `my-celltype` is your celltype code) +my_celltype_data <- readRDS("/data/Alzheimers_DS/FinalMergedData-Ast.rds") + +# Subset my celltype-specific Seurat object by time +my_seurat_3 <- subset(x = my_celltype_data, subset = Age == my_timept_3) + +# set active identities of my subset seurat to "Mt" attribute (denoting variant) +# for DE testing +Idents(my_seurat_3) <- "Mt" +summary(as.factor(my_seurat_3$Mt)) + +# Run D.E. test to identify genes differentially expressed between +# between diseased and normal cells, for my celltype and at this timepoint +marker_genes.df_3 <- FindMarkers(my_seurat_3, + ident.1 = "V337M", ident.2 = "V337V") + +# order results by p-value +marker_genes.df_3 <- marker_genes.df_3 %>% arrange(p_val) +head(marker_genes.df_3) + +# format a new dataframe using the contents of +# marker_genes.df, to pass to STRINGdb +stringdb.df_3 <- marker_genes.df_3 %>% + tibble::rownames_to_column("geneIDs") %>% + dplyr::select(geneIDs, avg_log2FC, p_val) +colnames(stringdb.df_3) <- c("geneIDs","logFC","pvalue") + +# mapping +seurat_mapped_3 <- string_db$map(stringdb.df_3, "geneIDs", removeUnmappedRows = TRUE ) +dim(seurat_mapped_3) + +#View(seurat_mapped) +hits_seurat_3 <- seurat_mapped_3$STRING_id[1:200] + +# PAYLOAD MECHANISM +# filter by p-value and add a color column +# (i.e. green down-regulated gened and red for up-regulated genes) +head(subset(seurat_mapped_3, pvalue < 0.05), 20) + +dim(seurat_mapped_3[seurat_mapped_3$pvalue < 0.05,]) + +seurat_mapped_pval05_3 <- string_db$add_diff_exp_color(subset(seurat_mapped_3, pvalue<0.05), + logFcColStr="logFC" ) +head(seurat_mapped_pval05_3) + +table(seurat_mapped_pval05_3$color) + +dim(seurat_mapped_pval05_3) + +seurat_mapped_pval05_3[seurat_mapped_pval05_3$color == '#D7FFD7FF', ] + +seurat_mapped_pval05_3[seurat_mapped_pval05_3$color == '#FFEDEDFF', ] + +seurat_mapped_pval05_3[seurat_mapped_pval05_3$color == '#FFF6F6FF', ] + +# post payload information to the STRING server +payload_id_seurat_3 <- string_db$post_payload( seurat_mapped_pval05_3$STRING_id, + colors = seurat_mapped_pval05_3$color ) + +string_db$plot_network(hits_seurat_3, payload_id = payload_id_seurat_3) +title(sub = "STRINGdb: protein-protein interaction of Astrocytes at 6mo", + cex.sub = 1, font.sub = 3, col.sub = "darkgreen" + ) +``` diff --git a/dev-notebooks/STRINGdb-Analysis.html b/dev-notebooks/STRINGdb-Analysis.html index e0ca2c6..c63f24d 100644 --- a/dev-notebooks/STRINGdb-Analysis.html +++ b/dev-notebooks/STRINGdb-Analysis.html @@ -551,17 +551,12 @@

Mapping

Clustering

-
clustersList <- string_db$get_clusters(example1_mapped$STRING_id[1:600])
+
#clustersList <- string_db$get_clusters(example1_mapped$STRING_id[1:600])
 # plot first 4 clusters
-par(mfrow=c(2,2))
-for(i in seq(1:4)){
-   string_db$plot_network(clustersList[[i]])
-   }
-
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
-
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
-
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
-
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
-

+#par(mfrow=c(2,2)) +#for(i in seq(1:4)){ +# string_db$plot_network(clustersList[[i]]) +# }
@@ -617,11 +612,7 @@

Part II: STRINGdb Analysis with Seurat

## [1] 317   4
#View(seurat_mapped)
 hits_seurat <- seurat_mapped$STRING_id[1:200]
-
-
string_db$plot_network(hits_seurat)
-
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
-

-
+
#string_db$plot_network(hits_seurat)
# PAYLOAD MECHANISM
 # filter by p-value and add a color column
 # (i.e. green down-regulated gened and red for up-regulated genes)
@@ -697,25 +688,286 @@ 

Part II: STRINGdb Analysis with Seurat

# post payload information to the STRING server
 payload_id_seurat <- string_db$post_payload( seurat_mapped_pval05$STRING_id,
                                       colors = seurat_mapped_pval05$color )
-
-
string_db$plot_network(hits_seurat)
-
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
-

+
#string_db$plot_network(hits_seurat)
# display a STRING network png with the "halo"
 string_db$plot_network(hits_seurat, payload_id = payload_id_seurat)
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
-

-
clustersList_seurat <- string_db$get_clusters(seurat_mapped$STRING_id[1:600])
+
title(sub = "STRINGdb: protein-protein interaction of Astrocytes at 2mo", 
+      cex.sub = 1,   font.sub = 3, col.sub = "darkgreen"
+      )
+

+
# clustersList_seurat <- string_db$get_clusters(seurat_mapped$STRING_id[1:600])
 # plot first 4 clusters
-par(mfrow=c(2,2))
-for(i in seq(1:4)){
-   string_db$plot_network(clustersList_seurat[[i]])
-   }
-
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
-
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
+#par(mfrow=c(2,2)) +#for(i in seq(1:4)){ +# string_db$plot_network(clustersList_seurat[[i]]) +#}
+
+
# specify your cell type 
+my_celltype <- "Ast"
+# specify time point of interest
+my_timept_2 <- "4mo"
+
+# Load Seurat data
+# Change the filepath in quotes "" below to match where you 
+# have your celltype data saved
+# The expected filepath is "~/AlzheimersDS/student-notebooks/FinalMergedData-my_celltype.rds"
+# (where `my-celltype` is your celltype code)
+my_celltype_data <- readRDS("/data/Alzheimers_DS/FinalMergedData-Ast.rds")
+
+# Subset my celltype-specific Seurat object by time
+my_seurat_2 <- subset(x = my_celltype_data, subset = Age == my_timept_2)
+
+# set active identities of my subset seurat to "Mt" attribute (denoting variant)
+# for DE testing
+Idents(my_seurat_2) <- "Mt"
+summary(as.factor(my_seurat_2$Mt))
+
## V337M V337V 
+##    83    93
+
# Run D.E. test to identify genes differentially expressed between
+# between diseased and normal cells, for my celltype and at this timepoint
+marker_genes.df_2 <- FindMarkers(my_seurat_2, 
+                               ident.1 = "V337M", ident.2 = "V337V") 
+
+# order results by p-value
+marker_genes.df_2 <- marker_genes.df_2 %>% arrange(p_val) 
+head(marker_genes.df_2)
+
##                 p_val avg_log2FC pct.1 pct.2  p_val_adj
+## RPL9     2.436679e-06 -0.4309564 1.000 1.000 0.09770109
+## CHCHD2   4.873454e-05  0.9258074 0.639 0.452 1.00000000
+## RPS5     6.022260e-05 -0.3538973 1.000 1.000 1.00000000
+## RPL24    6.876765e-05 -0.2768466 1.000 1.000 1.00000000
+## RPS13    8.903810e-05 -0.3510350 1.000 1.000 1.00000000
+## HSP90AB1 1.384171e-04 -0.3997375 1.000 1.000 1.00000000
+
# format a new dataframe using the contents of
+# marker_genes.df, to pass to STRINGdb
+stringdb.df_2 <- marker_genes.df_2 %>% 
+  tibble::rownames_to_column("geneIDs") %>% 
+  dplyr::select(geneIDs, avg_log2FC, p_val)
+colnames(stringdb.df_2) <- c("geneIDs","logFC","pvalue")
+
+# mapping
+seurat_mapped_2 <- string_db$map(stringdb.df_2, "geneIDs", removeUnmappedRows = TRUE )
+
## Warning:  we couldn't map to STRING 7% of your identifiers
+
dim(seurat_mapped_2)
+
## [1] 389   4
+
#View(seurat_mapped)
+hits_seurat_2 <- seurat_mapped_2$STRING_id[1:200]
+
+# PAYLOAD MECHANISM
+# filter by p-value and add a color column
+# (i.e. green down-regulated gened and red for up-regulated genes)
+head(subset(seurat_mapped_2, pvalue < 0.05), 20)
+
##     geneIDs      logFC       pvalue            STRING_id
+## 1      RPL9 -0.4309564 2.436679e-06 9606.ENSP00000400467
+## 2    CHCHD2  0.9258074 4.873454e-05 9606.ENSP00000378812
+## 3      RPS5 -0.3538973 6.022260e-05 9606.ENSP00000472985
+## 4     RPL24 -0.2768466 6.876765e-05 9606.ENSP00000377640
+## 5     RPS13 -0.3510350 8.903810e-05 9606.ENSP00000435777
+## 6  HSP90AB1 -0.3997375 1.384171e-04 9606.ENSP00000360609
+## 7    NUDCD1 -0.3071863 2.337988e-04 9606.ENSP00000239690
+## 8      CTSF  0.2543172 3.056046e-04 9606.ENSP00000310832
+## 9     CNTN1 -0.6736092 3.473495e-04 9606.ENSP00000447006
+## 10   FAM84B  0.4271538 5.004481e-04 9606.ENSP00000302578
+## 11     DTNA  0.4544159 5.387087e-04 9606.ENSP00000470152
+## 12   RPL18A -0.2869659 5.563186e-04 9606.ENSP00000222247
+## 13  ZC3H11A -0.3695626 6.414567e-04 9606.ENSP00000438527
+## 14  ARHGEF3 -0.2590921 6.858667e-04 9606.ENSP00000341071
+## 15   PNPLA4  0.2838586 7.119000e-04 9606.ENSP00000370430
+## 16    SFRP2 -0.3884216 8.468573e-04 9606.ENSP00000274063
+## 17    PDCD6  0.3380133 9.165345e-04 9606.ENSP00000264933
+## 18    EIF3E -0.4086390 9.885559e-04 9606.ENSP00000220849
+## 19    TBPL1 -0.3417715 1.035193e-03 9606.ENSP00000237264
+## 20     KIF9  0.3973186 1.208109e-03 9606.ENSP00000333942
+
dim(seurat_mapped_2[seurat_mapped_2$pvalue < 0.05,])
+
## [1] 225   4
+
seurat_mapped_pval05_2 <- string_db$add_diff_exp_color(subset(seurat_mapped_2, pvalue<0.05),
+                                                        logFcColStr="logFC" )
+head(seurat_mapped_pval05_2)
+
##    geneIDs     logFC       pvalue            STRING_id     color
+## 2   CHCHD2 0.9258074 4.873454e-05 9606.ENSP00000378812 #FF5555FF
+## 8     CTSF 0.2543172 3.056046e-04 9606.ENSP00000310832 #FFFEFEFF
+## 10  FAM84B 0.4271538 5.004481e-04 9606.ENSP00000302578 #FFDDDDFF
+## 11    DTNA 0.4544159 5.387087e-04 9606.ENSP00000470152 #FFD7D7FF
+## 15  PNPLA4 0.2838586 7.119000e-04 9606.ENSP00000370430 #FFF9F9FF
+## 17   PDCD6 0.3380133 9.165345e-04 9606.ENSP00000264933 #FFEFEFFF
+
table(seurat_mapped_pval05_2$color)
+
## 
+## #00FF00FF #A5FFA5FF #C1FFC1FF #C8FFC8FF #DAFFDAFF #E0FFE0FF #E1FFE1FF #E2FFE2FF 
+##         1         1         1         2         1         1         1         1 
+## #E4FFE4FF #E8FFE8FF #EBFFEBFF #ECFFECFF #EDFFEDFF #EFFFEFFF #F0FFF0FF #F1FFF1FF 
+##         1         1         2         3         2         1         1         2 
+## #F2FFF2FF #F3FFF3FF #F4FFF4FF #F5FFF5FF #F7FFF7FF #F8FFF8FF #F9FFF9FF #FAFFFAFF 
+##         3         2         1         2         3         8         7         7 
+## #FBFFFBFF #FCFFFCFF #FDFFFDFF #FEFFFEFF #FF0000FF #FF2121FF #FF5151FF #FF5555FF 
+##         5         7         6        16         1         1         1         1 
+## #FF9E9EFF #FFAEAEFF #FFBEBEFF #FFC7C7FF #FFC8C8FF #FFC9C9FF #FFCACAFF #FFCECEFF 
+##         1         1         1         2         1         2         1         1 
+## #FFD6D6FF #FFD7D7FF #FFD8D8FF #FFDCDCFF #FFDDDDFF #FFDFDFFF #FFE0E0FF #FFE1E1FF 
+##         1         2         1         2         3         3         1         2 
+## #FFE2E2FF #FFE3E3FF #FFE4E4FF #FFE5E5FF #FFE6E6FF #FFE8E8FF #FFE9E9FF #FFEBEBFF 
+##         1         2         2         1         2         2         1         3 
+## #FFECECFF #FFEDEDFF #FFEEEEFF #FFEFEFFF #FFF0F0FF #FFF1F1FF #FFF3F3FF #FFF4F4FF 
+##         3         3         3         2         3         1         5         4 
+## #FFF5F5FF #FFF6F6FF #FFF7F7FF #FFF8F8FF #FFF9F9FF #FFFAFAFF #FFFBFBFF #FFFCFCFF 
+##         2         7         5         5         4         6         8         5 
+## #FFFDFDFF #FFFEFEFF #FFFFFFFF 
+##         6        10        11
+
dim(seurat_mapped_pval05_2)
+
## [1] 225   5
+
seurat_mapped_pval05_2[seurat_mapped_pval05_2$color == '#D7FFD7FF', ]
+
## [1] geneIDs   logFC     pvalue    STRING_id color    
+## <0 rows> (or 0-length row.names)
+
seurat_mapped_pval05_2[seurat_mapped_pval05_2$color == '#FFEDEDFF', ]
+
##     geneIDs     logFC      pvalue            STRING_id     color
+## 46    ALCAM 0.3467108 0.003644027 9606.ENSP00000305988 #FFEDEDFF
+## 87    REXO2 0.3456395 0.009814719 9606.ENSP00000265881 #FFEDEDFF
+## 153  SQSTM1 0.3491756 0.024846155 9606.ENSP00000374455 #FFEDEDFF
+
seurat_mapped_pval05_2[seurat_mapped_pval05_2$color == '#FFF6F6FF', ]
+
##     geneIDs     logFC      pvalue            STRING_id     color
+## 55     NRG1 0.3016229 0.005279101 9606.ENSP00000384620 #FFF6F6FF
+## 56   OGFRL1 0.3016229 0.005280041 9606.ENSP00000359464 #FFF6F6FF
+## 81     CRB2 0.2981367 0.009174040 9606.ENSP00000362734 #FFF6F6FF
+## 86   NDFIP2 0.3016229 0.009663438 9606.ENSP00000480798 #FFF6F6FF
+## 152   HEPN1 0.3008081 0.024386421 9606.ENSP00000386143 #FFF6F6FF
+## 168   PDIA3 0.2988126 0.030245988 9606.ENSP00000300289 #FFF6F6FF
+## 222    CDH2 0.2987689 0.048474986 9606.ENSP00000269141 #FFF6F6FF
+
# post payload information to the STRING server
+payload_id_seurat_2 <- string_db$post_payload( seurat_mapped_pval05_2$STRING_id,
+                                      colors = seurat_mapped_pval05_2$color )
+
+string_db$plot_network(hits_seurat_2, payload_id = payload_id_seurat_2)
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
+
title(sub = "STRINGdb: protein-protein interaction of Astrocytes at 4mo", 
+      cex.sub = 1,   font.sub = 3, col.sub = "darkgreen"
+      )
+

+
+
# specify your cell type 
+my_celltype <- "Ast"
+# specify time point of interest
+my_timept_3 <- "6mo"
+
+# Load Seurat data
+# Change the filepath in quotes "" below to match where you 
+# have your celltype data saved
+# The expected filepath is "~/AlzheimersDS/student-notebooks/FinalMergedData-my_celltype.rds"
+# (where `my-celltype` is your celltype code)
+my_celltype_data <- readRDS("/data/Alzheimers_DS/FinalMergedData-Ast.rds")
+
+# Subset my celltype-specific Seurat object by time
+my_seurat_3 <- subset(x = my_celltype_data, subset = Age == my_timept_3)
+
+# set active identities of my subset seurat to "Mt" attribute (denoting variant)
+# for DE testing
+Idents(my_seurat_3) <- "Mt"
+summary(as.factor(my_seurat_3$Mt))
+
## V337M V337V 
+##   120   206
+
# Run D.E. test to identify genes differentially expressed between
+# between diseased and normal cells, for my celltype and at this timepoint
+marker_genes.df_3 <- FindMarkers(my_seurat_3, 
+                               ident.1 = "V337M", ident.2 = "V337V") 
+
+# order results by p-value
+marker_genes.df_3 <- marker_genes.df_3 %>% arrange(p_val) 
+head(marker_genes.df_3)
+
##                p_val avg_log2FC pct.1 pct.2  p_val_adj
+## FAM183A 1.057416e-06  0.3126794 0.208 0.039 0.04239813
+## RSPH1   6.416327e-06  0.5408231 0.292 0.107 0.25726906
+## SDR39U1 8.923202e-06  0.2776002 0.367 0.146 0.35778471
+## CHCHD2  7.523072e-05  0.5406251 0.742 0.534 1.00000000
+## GSTO1   1.188225e-04  0.2758435 0.483 0.277 1.00000000
+## KIF5B   2.266152e-04 -0.2973771 0.917 0.961 1.00000000
+
# format a new dataframe using the contents of
+# marker_genes.df, to pass to STRINGdb
+stringdb.df_3 <- marker_genes.df_3 %>% 
+  tibble::rownames_to_column("geneIDs") %>% 
+  dplyr::select(geneIDs, avg_log2FC, p_val)
+colnames(stringdb.df_3) <- c("geneIDs","logFC","pvalue")
+
+# mapping
+seurat_mapped_3 <- string_db$map(stringdb.df_3, "geneIDs", removeUnmappedRows = TRUE )
+
## Warning:  we couldn't map to STRING 4% of your identifiers
+
dim(seurat_mapped_3)
+
## [1] 175   4
+
#View(seurat_mapped)
+hits_seurat_3 <- seurat_mapped_3$STRING_id[1:200]
+
+# PAYLOAD MECHANISM
+# filter by p-value and add a color column
+# (i.e. green down-regulated gened and red for up-regulated genes)
+head(subset(seurat_mapped_3, pvalue < 0.05), 20)
+
##    geneIDs      logFC       pvalue            STRING_id
+## 1  FAM183A  0.3126794 1.057416e-06 9606.ENSP00000334415
+## 2    RSPH1  0.5408231 6.416327e-06 9606.ENSP00000291536
+## 3  SDR39U1  0.2776002 8.923202e-06 9606.ENSP00000382327
+## 4   CHCHD2  0.5406251 7.523072e-05 9606.ENSP00000378812
+## 5    GSTO1  0.2758435 1.188225e-04 9606.ENSP00000358727
+## 6    KIF5B -0.2973771 2.266152e-04 9606.ENSP00000307078
+## 7    FOXJ1  0.2568422 2.401047e-04 9606.ENSP00000323880
+## 8    DHX36 -0.2883902 3.209629e-04 9606.ENSP00000417078
+## 9   EIF2S3 -0.3061879 3.441580e-04 9606.ENSP00000253039
+## 10 ANKRD32  0.2505007 3.948505e-04 9606.ENSP00000265140
+## 11  CCSER2  0.2552474 4.439559e-04 9606.ENSP00000361160
+## 12 FAM178A -0.2525705 4.699245e-04 9606.ENSP00000359292
+## 13 HCFC1R1  0.2607998 5.394797e-04 9606.ENSP00000248089
+## 14 ZSCAN18  0.3173666 5.453215e-04 9606.ENSP00000470123
+## 15 ZMYND10  0.2587778 5.474224e-04 9606.ENSP00000231749
+## 16 ARHGEF6 -0.3335041 5.491869e-04 9606.ENSP00000250617
+## 17  TUBA1A  0.3299507 5.518963e-04 9606.ENSP00000301071
+## 18   ZFHX4  0.3446729 6.159439e-04 9606.ENSP00000430497
+## 19  B3GAT2  0.2930511 6.289473e-04 9606.ENSP00000230053
+## 20     FOS  0.5067945 7.649047e-04 9606.ENSP00000306245
+
dim(seurat_mapped_3[seurat_mapped_3$pvalue < 0.05,])
+
## [1] 107   4
+
seurat_mapped_pval05_3 <- string_db$add_diff_exp_color(subset(seurat_mapped_3, pvalue<0.05),
+                                                        logFcColStr="logFC" )
+head(seurat_mapped_pval05_3)
+
##   geneIDs     logFC       pvalue            STRING_id     color
+## 1 FAM183A 0.3126794 1.057416e-06 9606.ENSP00000334415 #FFF7F7FF
+## 2   RSPH1 0.5408231 6.416327e-06 9606.ENSP00000291536 #FFD4D4FF
+## 3 SDR39U1 0.2776002 8.923202e-06 9606.ENSP00000382327 #FFFCFCFF
+## 4  CHCHD2 0.5406251 7.523072e-05 9606.ENSP00000378812 #FFD4D4FF
+## 5   GSTO1 0.2758435 1.188225e-04 9606.ENSP00000358727 #FFFCFCFF
+## 7   FOXJ1 0.2568422 2.401047e-04 9606.ENSP00000323880 #FFFEFEFF
+
table(seurat_mapped_pval05_3$color)
+
## 
+## #00FF00FF #26FF26FF #48FF48FF #95FF95FF #B7FFB7FF #BDFFBDFF #C8FFC8FF #DEFFDEFF 
+##         1         1         1         1         1         1         1         1 
+## #DFFFDFFF #E2FFE2FF #EAFFEAFF #ECFFECFF #EDFFEDFF #F1FFF1FF #F3FFF3FF #F6FFF6FF 
+##         1         1         3         1         1         3         2         2 
+## #F8FFF8FF #F9FFF9FF #FAFFFAFF #FBFFFBFF #FDFFFDFF #FEFFFEFF #FF0000FF #FFAAAAFF 
+##         2         2         1         1         1         1         1         1 
+## #FFC7C7FF #FFD3D3FF #FFD4D4FF #FFD5D5FF #FFD6D6FF #FFD8D8FF #FFDADAFF #FFDDDDFF 
+##         1         1         2         1         1         1         1         2 
+## #FFDFDFFF #FFE2E2FF #FFE5E5FF #FFE7E7FF #FFEBEBFF #FFECECFF #FFEEEEFF #FFF0F0FF 
+##         1         1         3         1         1         1         1         1 
+## #FFF1F1FF #FFF2F2FF #FFF3F3FF #FFF4F4FF #FFF5F5FF #FFF6F6FF #FFF7F7FF #FFF8F8FF 
+##         1         2         2         3         3         1         3         2 
+## #FFF9F9FF #FFFAFAFF #FFFBFBFF #FFFCFCFF #FFFDFDFF #FFFEFEFF #FFFFFFFF 
+##         6         2         4         6         6         7         7
+
dim(seurat_mapped_pval05_3)
+
## [1] 107   5
+
seurat_mapped_pval05_3[seurat_mapped_pval05_3$color == '#D7FFD7FF', ]
+
## [1] geneIDs   logFC     pvalue    STRING_id color    
+## <0 rows> (or 0-length row.names)
+
seurat_mapped_pval05_3[seurat_mapped_pval05_3$color == '#FFEDEDFF', ]
+
## [1] geneIDs   logFC     pvalue    STRING_id color    
+## <0 rows> (or 0-length row.names)
+
seurat_mapped_pval05_3[seurat_mapped_pval05_3$color == '#FFF6F6FF', ]
+
##    geneIDs     logFC       pvalue            STRING_id     color
+## 14 ZSCAN18 0.3173666 0.0005453215 9606.ENSP00000470123 #FFF6F6FF
+
# post payload information to the STRING server
+payload_id_seurat_3 <- string_db$post_payload( seurat_mapped_pval05_3$STRING_id,
+                                      colors = seurat_mapped_pval05_3$color )
+
+string_db$plot_network(hits_seurat_3, payload_id = payload_id_seurat_3)
## [1] "Parameter add_link not available in version 11.0 (please use 11.0b or later)"
-

+
title(sub = "STRINGdb: protein-protein interaction of Astrocytes at 6mo", 
+      cex.sub = 1,   font.sub = 3, col.sub = "darkgreen"
+      )
+

diff --git a/dev-notebooks/STRINGdb-Analysis.pdf b/dev-notebooks/STRINGdb-Analysis.pdf index 44c87cc..961b8b7 100644 Binary files a/dev-notebooks/STRINGdb-Analysis.pdf and b/dev-notebooks/STRINGdb-Analysis.pdf differ diff --git a/dev-notebooks/pathfinderR.Rmd b/dev-notebooks/pathfinderR.Rmd new file mode 100644 index 0000000..5f91150 --- /dev/null +++ b/dev-notebooks/pathfinderR.Rmd @@ -0,0 +1,780 @@ +--- +title: 'pathfindR: A Tool for Enrichment Analysis Utilizing Active Subnetworks' +author: "Adapted by Haowen He" +subtitle: IDEA Alzheimer's Data Analysis Bootcamp - Summer 2022 +output: + html_document: default + pdf_document: default +always_allow_html: true +--- + + +```{r setup1, include=FALSE} +# Required R package installation (run this cell before knitting) +# These will install packages if they are not already installed +if (!require("ggnewscale")) { + install.packages("ggnewscale") + library(ggnewscale) +} +if (!require("gplots")) { + install.packages("gplots") + library(gplots) +} +if (!require("ggrepel")) { + install.packages("ggrepel") + library(ggrepel) +} +if (!require("ggplot2")) { + install.packages("ggplot2") + library(ggplot2) +} +if (!require("knitr")) { + install.packages("knitr") + library(knitr) +} +if (!require("dplyr")) { + install.packages("dplyr") + library(dplyr) +} +if (!require("tidyverse")) { + install.packages("tidyverse") + library(tidyverse) +} +if (!require("plotly")) { + install.packages("plotly") + library(plotly) +} +if (!require("Seurat")) { + install.packages("Seurat") + library(Seurat) +} +if (!require("ggpubr")) { + install.packages("ggpubr") + library(ggpubr) + theme_set(theme_pubr()) +} +if (!require("webshot")) { + install.packages("webshot") + library(webshot) + webshot::install_phantomjs() +} +if (!require('gprofiler2')) { + install.packages("gprofiler2") + library(gprofiler2) + } + if (!require('rrvgo')) { + if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install("rrvgo") + library(rrvgo) + } + if (!require('org.Hs.eg.db')) { + if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install("org.Hs.eg.db") + library(org.Hs.eg.db) + } +if (!require(ReactomeGSA)) { + if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install("ReactomeGSA") + library(ReactomeGSA) +} +if (!require(ReactomeGSA.data)) { + if (!require(devtools)) { + install.packages("devtools")} + install_github("reactome/ReactomeGSA.data") + library(ReactomeGSA.data) +} +if (!require(STRINGdb)) { + if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install("STRINGdb") + library(STRINGdb) +} +if (!require("pathfindR")) { + install.packages("pathfindR") + library(pathfindR) +} +if (!require(ReactomePA)) { + if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install("ReactomePA") + library(ReactomePA) +} +if (!require(DOSE)) { + if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install("DOSE") + library(DOSE) +} +if (!require(enrichplot)) { + if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install("enrichplot") + library(enrichplot) +} +if (!require(clusterProfiler)) { + if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install("clusterProfiler") + library(clusterProfiler) +} +if (!require("GOplot")) { + install.packages("GOplot") + library(GOplot) +} +if (!require("splitstackshape")) { + install.packages("splitstackshape") + library(splitstackshape) +} +if (!require("getmstatistic")) { + install.packages("getmstatistic") + library(getmstatistic) +} +if (!require("ggnetwork")) { + install.packages("ggnetwork") + library(ggnetwork) +} +if (!require(ComplexHeatmap)) { + if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install("ComplexHeatmap") + library(ComplexHeatmap) +} +``` + +# Overview + +`pathfindR` is a tool for enrichment analysis via active subnetworks. The package also offers functionalities to cluster the enriched terms and identify representative terms in each cluster, score the enriched terms per sample, and visualize analysis results. As of the latest version, the package also allows comparison of two pathfindR results. + +The functionalities of pathfindR are described in detail in Ulgen E, Ozisik O, Sezerman OU. 2019. pathfindR: An R Package for Comprehensive Identification of Enriched Pathways in Omics Data Through Active Subnetworks. Front. Genet. + +```{r figurename, echo=FALSE} +knitr::include_graphics("image1.png") +``` + + +## Load `Seurat` data + +```{r} +# specify your cell type +my_celltype <- "Ast" +# specify time point of interest +my_timept <- "2mo" + +# load Seurat data +# change the filepath in quotes "" below to match where you +# have your celltype data saved +# the expected filepath is "~/AlzheimersDS/student-notebooks/FinalMergedData-my_celltype.rds" +# (where `my-celltype` is your celltype code) +my_celltype_data <- readRDS("/data/Alzheimers_DS/FinalMergedData-Ast.rds") + +# subset my celltype-specific Seurat object by time +my_seurat <- subset(x = my_celltype_data, subset = Age == my_timept) + +# set active identities of my subset seurat to "Mt" attribute (denoting variant) +# for DE testing +Idents(my_seurat) <- "Mt" +summary(as.factor(my_seurat$Mt)) + +# run D.E. test to identify genes differentially expressed between +# between diseased and normal cells, for my celltype and at this timepoint +marker_genes.df <- FindMarkers(my_seurat, + ident.1 = "V337M", ident.2 = "V337V") + +# order results by p-value +ordered_marker_genes.df <- marker_genes.df[order(marker_genes.df$p_val),] + +# convert row names to column of gene IDs +# remove the original row names +ordered_marker_genes.df$geneIDs <- rownames(ordered_marker_genes.df) +rownames(ordered_marker_genes.df) <- NULL +head(ordered_marker_genes.df) + +# format a new data frame using the contents of +# ordered_marker_genes.df, to pass to pathfinder +pathfinder.df <- ordered_marker_genes.df %>% + dplyr::select(geneIDs, avg_log2FC, p_val) +colnames(pathfinder.df) <- c("geneIDs","logFC","pvalue") + +knitr::kable(head(pathfinder.df)) +``` + + +## Enrichment Analysis + +After input testing, any gene symbol that is not in the chosen protein-protein interaction network (PIN) is converted to an alias symbol if there is an alias that is found in the PIN. After mapping the input genes with the associated p-values onto the PIN, active subnetwork search is performed. The resulting active subnetworks are then filtered based on their scores and the number of significant genes they contain. + +> An active subnetwork can be defined as a group of interconnected genes in a protein-protein interaction network (PIN) that predominantly consists of significantly altered genes. In other words, active subnetworks define distinct disease-associated sets of interacting genes, whether discovered through the original analysis or discovered because of being in interaction with a significant gene. + +These filtered lists of active subnetworks are then used for enrichment analyses, i.e., using the genes in each of the active subnetworks, the significantly enriched terms (pathways/gene sets) are identified. Enriched terms with adjusted p-values larger than the given threshold are discarded, and the lowest adjusted p-value (among all active subnetworks) for each term is kept. This process of `active subnetwork search + enrichment analyses` is repeated for a selected number of iterations, performed in parallel. Over all iterations, the lowest and the highest adjusted p-values, and the number of occurrences among all iterations are reported for each significantly enriched term. + +```{r, echo = T, results = 'hide', message=FALSE, warning=FALSE} +# change the gene sets used for analysis (default = "KEGG") +# change the PIN for active subnetwork search (default = Biogrid) +# the available PINs are “Biogrid”, “STRING”, “GeneMania”, “IntAct”, “KEGG” and “mmu_STRING”. +# the available gene sets are “KEGG”, “Reactome”, “BioCarta”, “GO-All”, “GO-BP”, “GO-CC”, “GO-MF”, and “mmu_KEGG”. +output_df <- run_pathfindR(pathfinder.df, gene_sets = "Reactome", pin_name_path = "STRING", + p_val_threshold = 0.05, plot_enrichment_chart = FALSE) +``` + + +## Clustering of the Enriched Terms + +```{r fig1, echo=FALSE} +knitr::include_graphics("image2.png") +``` + +This workflow first calculates the pairwise kappa statistics between the enriched terms. The function then performs hierarchical clustering (by default), automatically determines the optimal number of clusters by maximizing the average silhouette width and returns a data frame with cluster assignments. + +```{r} +# display the heatmap of hierarchical clustering +# display the dendrogram and automatically-determined clusters +# change agglomeration method (default = "average") for hierarchical clustering +clustered_df <- cluster_enriched_terms(output_df, plot_hmap = FALSE, plot_dend = FALSE, plot_clusters_graph = FALSE, clu_method = "centroid") + +## The representative terms +knitr::kable(clustered_df[clustered_df$Status == "Representative", ]) + +``` + +## Term-Gene Heatmap (2mo) + +The package `ComplexHeatmap` can be used to visualize the heatmap of genes that are involved in the enriched terms. This heatmap allows visual identification of the input genes involved in the enriched terms, as well as the common or distinct genes between different terms. The heatmap has one row annotation and two column annotations as well as one mark annotation. The row annotation marks up-regulated and down-regulated genes. The two column annotations (one at top, and the other at bottom) display the enrichment term score and the -log10(P-value). The mark annotation shows genes that appear in three or more enrichment terms. + +```{r, , fig.width = 8, fig.height = 6, message=FALSE, warning=FALSE} +# format a new data frame from representative pathways only +heatmap.df <- clustered_df[clustered_df$Status == "Representative", ] +heatmap.df <- heatmap.df %>% + dplyr::select(-Status, -lowest_p, -support, -occurrence, -Cluster) +colnames(heatmap.df) <- c("ID", "Term", "Enrichment_Score", "P_value", "Up_regulated", "Down_regulated") + +# filter by P-value +heatmap.df <- heatmap.df %>% + filter(P_value <= 0.01) + +# normalize the P-value +heatmap.df$P_value <- -log10(heatmap.df$P_value) + +# pathways with up-regulated genes +up.df <- heatmap.df %>% + dplyr::select(-Down_regulated) +up_tidy.df <- cSplit(up.df, 'Up_regulated', ', ', 'long') +up_tidy.df$logFC <- "Up_regulated" +colnames(up_tidy.df) <- c("ID", "Term", "Enrichment_Score", "P_value", "Gene", "logFC") + +# pathways with down-regulated genes +down.df <- heatmap.df %>% + dplyr::select(-Up_regulated) +down_tidy.df <- cSplit(down.df, 'Down_regulated', ', ', 'long') +down_tidy.df$logFC <- "Down_regulated" +colnames(down_tidy.df) <- c("ID", "Term", "Enrichment_Score", "P_value", "Gene", "logFC") + +# combine up_tidy.df and down_tidy.df +heatmap_final.df <- rbind(up_tidy.df, down_tidy.df) + +# add row annotation (log2FC) +dfGeneAnno <- data.frame(heatmap_final.df$Gene, heatmap_final.df$logFC) +dfGeneAnno <- dfGeneAnno %>% distinct(heatmap_final.df.Gene, .keep_all = TRUE) +dfGeneAnno <- dfGeneAnno %>% + dplyr::select(-heatmap_final.df.Gene) +colnames(dfGeneAnno) <- c('Log2FC') +colours <- list('Log2FC' = c('Up_regulated' = 'royalblue', 'Down_regulated' = 'yellow')) +haGenes <- rowAnnotation( + df = dfGeneAnno, + col = colours, + width = unit(1,'cm'), + show_annotation_name = FALSE) + +# add column annotation (enrichment term score) +tidy.df <- heatmap_final.df %>% distinct(Term, .keep_all = TRUE) +tidy.df <- tidy.df %>% + dplyr::select(Term, Enrichment_Score) +transpose.df <- t(tidy.df) +colnames(transpose.df) <- tidy.df$Term +transpose.df <- as.data.frame(transpose.df) +transpose.df <- transpose.df[-1,] +df.terms <- data.frame(tidy.df$Enrichment_Score) +colnames(df.terms) <- "Enrichment\nterm score" +haTerms <- HeatmapAnnotation( + df = df.terms, + Term = anno_text( + colnames(transpose.df), + rot = 45, + just = 'right', + gp = gpar(fontsize = 6)), + annotation_height = unit.c(unit(1, 'cm'), unit(8, 'cm')), + annotation_name_side = 'left') + +# prepare final heatmap data +df.wide <- heatmap_final.df %>% + dplyr::select(-ID, -Enrichment_Score, -logFC) +df.wider <- pivot_wider(df.wide, names_from = Term, values_from = P_value, values_fill = 0) +final.df <- df.wider[,-1] +final.df[final.df != 0] <- 1 +final.df <- as.matrix(final.df) +row.names(final.df) <- df.wider$Gene + +# add column annotation (P-value) +pv.df <- heatmap_final.df %>% distinct(Term, .keep_all = TRUE) +pv.df <- pv.df %>% + dplyr::select(Term, P_value) +colfunc <- colorRampPalette(c("red","yellow","green")) +TermsPvalue <- data.frame(pv.df$P_value) +colnames(TermsPvalue) <- "-log10(pvalue)" +colAnn <- HeatmapAnnotation(df = TermsPvalue, + which = 'col', + annotation_width = unit(1, 'cm'), + gap = unit(1, 'mm'), + annotation_name_side = 'right') + +# add mark annotation (frequent genes) +anno.df <- as.data.frame(final.df) +anno.df$counts <- rowSums(anno.df[,c(1:44)]) +anno.df <- anno.df %>% + filter(counts >= 3) +mark_gene <- rownames(anno.df) +gene_pos <- which(rownames(final.df) %in% mark_gene) +row_anno <- rowAnnotation(mark_gene = anno_mark(at = gene_pos, + labels = mark_gene)) + +# draw ComplexHeatmap +hmapGSEA <- Heatmap(final.df, + name = 'Enrichment result heatmap', + row_split = dfGeneAnno[,1], + + col = c('0' = 'white', '1' = '#D16103'), + + rect_gp = gpar(col = 'grey85'), + + cluster_rows = TRUE, + show_row_dend = TRUE, + #row_title = 'Top Genes', + #row_title_side = 'left', + #row_title_gp = gpar(fontsize = 11, fontface = 'bold'), + #row_title_rot = 90, + show_row_names = FALSE, + right_annotation = row_anno, + row_names_gp = gpar(fontsize = 11, fontface = 'bold'), + row_names_side = 'left', + row_dend_width = unit(35, 'mm'), + row_dend_gp = gpar(), + + cluster_columns = TRUE, + show_column_dend = TRUE, + column_title = 'Enriched terms', + column_title_side = 'top', + column_title_gp = gpar(fontsize = 12, fontface = 'bold'), + column_title_rot = 0, + show_column_names = FALSE, + + show_heatmap_legend = FALSE, + + clustering_distance_columns = 'euclidean', + clustering_method_columns = 'ward.D2', + clustering_distance_rows = 'euclidean', + clustering_method_rows = 'ward.D2', + + top_annotation=colAnn, + + left_annotation = haGenes, + + bottom_annotation = haTerms) +draw(hmapGSEA, + heatmap_legend_side = 'right', + annotation_legend_side = 'right') +``` +\newpage +```{r, , fig.width = 15, fig.height = 12, message=FALSE, warning=FALSE} +# A larger plot size for html output +draw(hmapGSEA, heatmap_legend_side = 'right', annotation_legend_side = 'right') +``` + +\newpage +## Term-Gene Heatmap (4mo) +```{r, , fig.width = 8, fig.height = 6, message=FALSE, warning=FALSE} +# specify time point of interest +my_timept_2 <- "4mo" + +# subset my celltype-specific Seurat object by time +my_seurat_2 <- subset(x = my_celltype_data, subset = Age == my_timept_2) + +# set active identities of my subset seurat to "Mt" attribute (denoting variant) +# for DE testing +Idents(my_seurat_2) <- "Mt" +summary(as.factor(my_seurat_2$Mt)) + +# run D.E. test to identify genes differentially expressed between +# between diseased and normal cells, for my celltype and at this timepoint +marker_genes_2.df <- FindMarkers(my_seurat_2, + ident.1 = "V337M", ident.2 = "V337V") + +# order results by p-value +ordered_marker_genes_2.df <- marker_genes_2.df[order(marker_genes_2.df$p_val),] + +# convert row names to column of gene IDs +# remove the original row names +ordered_marker_genes_2.df$geneIDs <- rownames(ordered_marker_genes_2.df) +rownames(ordered_marker_genes_2.df) <- NULL +head(ordered_marker_genes_2.df) + +# format a new data frame using the contents of +# ordered_marker_genes.df, to pass to pathfinder +pathfinder_2.df <- ordered_marker_genes_2.df %>% + dplyr::select(geneIDs, avg_log2FC, p_val) +colnames(pathfinder_2.df) <- c("geneIDs","logFC","pvalue") + +knitr::kable(head(pathfinder_2.df)) + +output_2_df <- run_pathfindR(pathfinder_2.df, gene_sets = "Reactome", pin_name_path = "STRING", + p_val_threshold = 0.05, plot_enrichment_chart = FALSE) + +clustered_2_df <- cluster_enriched_terms(output_2_df, plot_hmap = FALSE, plot_dend = FALSE, plot_clusters_graph = FALSE, clu_method = "centroid") + +## The representative terms +knitr::kable(clustered_2_df[clustered_2_df$Status == "Representative", ]) + +# format a new data frame from representative pathways only +heatmap_2.df <- clustered_2_df[clustered_2_df$Status == "Representative", ] +heatmap_2.df <- heatmap_2.df %>% + dplyr::select(-Status, -lowest_p, -support, -occurrence, -Cluster) +colnames(heatmap_2.df) <- c("ID", "Term", "Enrichment_Score", "P_value", "Up_regulated", "Down_regulated") + +# filter by P-value +heatmap_2.df <- heatmap_2.df %>% + filter(P_value <= 0.01) + +# normalize the P-value +heatmap_2.df$P_value <- -log10(heatmap_2.df$P_value) + +# pathways with up-regulated genes +up_2.df <- heatmap_2.df %>% + dplyr::select(-Down_regulated) +up_2_tidy.df <- cSplit(up_2.df, 'Up_regulated', ', ', 'long') +up_2_tidy.df$logFC <- "Up_regulated" +colnames(up_2_tidy.df) <- c("ID", "Term", "Enrichment_Score", "P_value", "Gene", "logFC") + +# pathways with down-regulated genes +down_2.df <- heatmap_2.df %>% + dplyr::select(-Up_regulated) +down_2_tidy.df <- cSplit(down_2.df, 'Down_regulated', ', ', 'long') +down_2_tidy.df$logFC <- "Down_regulated" +colnames(down_2_tidy.df) <- c("ID", "Term", "Enrichment_Score", "P_value", "Gene", "logFC") + +# combine up_tidy.df and down_tidy.df +heatmap_2_final.df <- rbind(up_2_tidy.df, down_2_tidy.df) + +# add row annotation (log2FC) +dfGeneAnno_2 <- data.frame(heatmap_2_final.df$Gene, heatmap_2_final.df$logFC) +dfGeneAnno_2 <- dfGeneAnno_2 %>% distinct(heatmap_2_final.df.Gene, .keep_all = TRUE) +dfGeneAnno_2 <- dfGeneAnno_2 %>% + dplyr::select(-heatmap_2_final.df.Gene) +colnames(dfGeneAnno_2) <- c('Log2FC') +colours <- list('Log2FC' = c('Up_regulated' = 'royalblue', 'Down_regulated' = 'yellow')) +haGenes_2 <- rowAnnotation( + df = dfGeneAnno_2, + col = colours, + width = unit(1,'cm'), + show_annotation_name = FALSE) + #annotation_name_side = 'top', + #annotation_name_rot = 45) + +# add column annotation (enrichment term score) +tidy_2.df <- heatmap_2_final.df %>% distinct(Term, .keep_all = TRUE) +tidy_2.df <- tidy_2.df %>% + dplyr::select(Term, Enrichment_Score) +transpose_2.df <- t(tidy_2.df) +colnames(transpose_2.df) <- tidy_2.df$Term +transpose_2.df <- as.data.frame(transpose_2.df) +transpose_2.df <- transpose_2.df[-1,] +df_2.terms <- data.frame(tidy_2.df$Enrichment_Score) +colnames(df_2.terms) <- "Enrichment\nterm score" +haTerms_2 <- HeatmapAnnotation( + df = df_2.terms, + Term = anno_text( + colnames(transpose_2.df), + rot = 45, + just = 'right', + gp = gpar(fontsize = 6)), + annotation_height = unit.c(unit(1, 'cm'), unit(8, 'cm')), + annotation_name_side = 'left') + +# prepare final heatmap data +df_2.wide <- heatmap_2_final.df %>% + dplyr::select(-ID, -Enrichment_Score, -logFC) +df_2.wider <- pivot_wider(df_2.wide, names_from = Term, values_from = P_value, values_fill = 0) +final_2.df <- df_2.wider[,-1] +final_2.df[final_2.df != 0] <- 1 +final_2.df <- as.matrix(final_2.df) +row.names(final_2.df) <- df_2.wider$Gene + +# add column annotation (P-value) +pv_2.df <- heatmap_2_final.df %>% distinct(Term, .keep_all = TRUE) +pv_2.df <- pv_2.df %>% + dplyr::select(Term, P_value) +colfunc <- colorRampPalette(c("red","yellow","green")) +TermsPvalue_2 <- data.frame(pv_2.df$P_value) +colnames(TermsPvalue_2) <- "-log10(pvalue)" +colAnn_2 <- HeatmapAnnotation(df = TermsPvalue_2, + which = 'col', + annotation_width = unit(1, 'cm'), + gap = unit(1, 'mm'), + annotation_name_side = 'left') + +# add mark annotation (frequent genes) +anno_2.df <- as.data.frame(final_2.df) +anno_2.df$counts <- rowSums(anno_2.df) +anno_2.df <- anno_2.df %>% + filter(counts >= 2) +mark_gene_2 <- rownames(anno_2.df) +gene_pos_2 <- which(rownames(final_2.df) %in% mark_gene_2) +row_anno_2 <- rowAnnotation(mark_gene = anno_mark(at = gene_pos_2, + labels = mark_gene_2)) + +# draw ComplexHeatmap +hmapGSEA_2 <- Heatmap(final_2.df, + name = 'Enrichment result heatmap', + row_split = dfGeneAnno_2[,1], + + col = c('0' = 'white', '1' = '#C4961A'), + + rect_gp = gpar(col = 'grey85'), + + cluster_rows = TRUE, + show_row_dend = TRUE, + #row_title = 'Top Genes', + #row_title_side = 'left', + #row_title_gp = gpar(fontsize = 11, fontface = 'bold'), + #row_title_rot = 90, + show_row_names = FALSE, + right_annotation = row_anno_2, + row_names_gp = gpar(fontsize = 11, fontface = 'bold'), + row_names_side = 'left', + row_dend_width = unit(35, 'mm'), + row_dend_gp = gpar(), + + cluster_columns = TRUE, + show_column_dend = TRUE, + column_title = 'Enriched terms', + column_title_side = 'top', + column_title_gp = gpar(fontsize = 12, fontface = 'bold'), + column_title_rot = 0, + show_column_names = FALSE, + + show_heatmap_legend = FALSE, + + clustering_distance_columns = 'euclidean', + clustering_method_columns = 'ward.D2', + clustering_distance_rows = 'euclidean', + clustering_method_rows = 'ward.D2', + + top_annotation=colAnn_2, + + left_annotation = haGenes_2, + + bottom_annotation = haTerms_2) +draw(hmapGSEA_2, + heatmap_legend_side = 'right', + annotation_legend_side = 'right') +``` + +\newpage +```{r, , fig.width = 15, fig.height = 12, message=FALSE, warning=FALSE} +# A larger plot size for html output +draw(hmapGSEA_2, heatmap_legend_side = 'right', annotation_legend_side = 'right') +``` + +\newpage +## Term-Gene Heatmap (6mo) +```{r, , fig.width = 8, fig.height = 6, message=FALSE, warning=FALSE} +# specify time point of interest +my_timept_3 <- "6mo" + +# subset my celltype-specific Seurat object by time +my_seurat_3 <- subset(x = my_celltype_data, subset = Age == my_timept_3) + +# set active identities of my subset seurat to "Mt" attribute (denoting variant) +# for DE testing +Idents(my_seurat_3) <- "Mt" +summary(as.factor(my_seurat_3$Mt)) + +# run D.E. test to identify genes differentially expressed between +# between diseased and normal cells, for my celltype and at this timepoint +marker_genes_3.df <- FindMarkers(my_seurat_3, + ident.1 = "V337M", ident.2 = "V337V") + +# order results by p-value +ordered_marker_genes_3.df <- marker_genes_3.df[order(marker_genes_3.df$p_val),] + +# convert row names to column of gene IDs +# remove the original row names +ordered_marker_genes_3.df$geneIDs <- rownames(ordered_marker_genes_3.df) +rownames(ordered_marker_genes_3.df) <- NULL +head(ordered_marker_genes_3.df) + +# format a new data frame using the contents of +# ordered_marker_genes.df, to pass to pathfinder +pathfinder_3.df <- ordered_marker_genes_3.df %>% + dplyr::select(geneIDs, avg_log2FC, p_val) +colnames(pathfinder_3.df) <- c("geneIDs","logFC","pvalue") + +knitr::kable(head(pathfinder_3.df)) + +output_3_df <- run_pathfindR(pathfinder_3.df, gene_sets = "Reactome", pin_name_path = "STRING", + p_val_threshold = 0.05, plot_enrichment_chart = FALSE) + +clustered_3_df <- cluster_enriched_terms(output_3_df, plot_hmap = FALSE, plot_dend = FALSE, plot_clusters_graph = FALSE, clu_method = "centroid") + +## The representative terms +knitr::kable(clustered_3_df[clustered_3_df$Status == "Representative", ]) + +# format a new data frame from representative pathways only +heatmap_3.df <- clustered_3_df[clustered_3_df$Status == "Representative", ] +heatmap_3.df <- heatmap_3.df %>% + dplyr::select(-Status, -lowest_p, -support, -occurrence, -Cluster) +colnames(heatmap_3.df) <- c("ID", "Term", "Enrichment_Score", "P_value", "Up_regulated", "Down_regulated") + +# filter by P-value +heatmap_3.df <- heatmap_3.df %>% + filter(P_value <= 0.01) + +# normalize the P-value +heatmap_3.df$P_value <- -log10(heatmap_3.df$P_value) + +# pathways with up-regulated genes +up_3.df <- heatmap_3.df %>% + dplyr::select(-Down_regulated) +up_3_tidy.df <- cSplit(up_3.df, 'Up_regulated', ', ', 'long') +up_3_tidy.df$logFC <- "Up_regulated" +colnames(up_3_tidy.df) <- c("ID", "Term", "Enrichment_Score", "P_value", "Gene", "logFC") + +# pathways with down-regulated genes +down_3.df <- heatmap_3.df %>% + dplyr::select(-Up_regulated) +down_3_tidy.df <- cSplit(down_3.df, 'Down_regulated', ', ', 'long') +down_3_tidy.df$logFC <- "Down_regulated" +colnames(down_3_tidy.df) <- c("ID", "Term", "Enrichment_Score", "P_value", "Gene", "logFC") + +# combine up_tidy.df and down_tidy.df +heatmap_3_final.df <- rbind(up_3_tidy.df, down_3_tidy.df) + +# add row annotation (log2FC) +dfGeneAnno_3 <- data.frame(heatmap_3_final.df$Gene, heatmap_3_final.df$logFC) +dfGeneAnno_3 <- dfGeneAnno_3 %>% distinct(heatmap_3_final.df.Gene, .keep_all = TRUE) +dfGeneAnno_3 <- dfGeneAnno_3 %>% + dplyr::select(-heatmap_3_final.df.Gene) +colnames(dfGeneAnno_3) <- c('Log2FC') +colours <- list('Log2FC' = c('Up_regulated' = 'royalblue', 'Down_regulated' = 'yellow')) +haGenes_3 <- rowAnnotation( + df = dfGeneAnno_3, + col = colours, + width = unit(1,'cm'), + show_annotation_name = FALSE) + #annotation_name_side = 'top', + #annotation_name_rot = 45) + +# add column annotation (enrichment term score) +tidy_3.df <- heatmap_3_final.df %>% distinct(Term, .keep_all = TRUE) +tidy_3.df <- tidy_3.df %>% + dplyr::select(Term, Enrichment_Score) +transpose_3.df <- t(tidy_3.df) +colnames(transpose_3.df) <- tidy_3.df$Term +transpose_3.df <- as.data.frame(transpose_3.df) +transpose_3.df <- transpose_3.df[-1,] +df_3.terms <- data.frame(tidy_3.df$Enrichment_Score) +colnames(df_3.terms) <- "Enrichment\nterm score" +haTerms_3 <- HeatmapAnnotation( + df = df_3.terms, + Term = anno_text( + colnames(transpose_3.df), + rot = 45, + just = 'right', + gp = gpar(fontsize = 6)), + annotation_height = unit.c(unit(1, 'cm'), unit(8, 'cm')), + annotation_name_side = 'left') + +# prepare final heatmap data +df_3.wide <- heatmap_3_final.df %>% + dplyr::select(-ID, -Enrichment_Score, -logFC) +df_3.wider <- pivot_wider(df_3.wide, names_from = Term, values_from = P_value, values_fill = 0) +final_3.df <- df_3.wider[,-1] +final_3.df[final_3.df != 0] <- 1 +final_3.df <- as.matrix(final_3.df) +row.names(final_3.df) <- df_3.wider$Gene + +# add column annotation (P-value) +pv_3.df <- heatmap_3_final.df %>% distinct(Term, .keep_all = TRUE) +pv_3.df <- pv_3.df %>% + dplyr::select(Term, P_value) +colfunc <- colorRampPalette(c("red","yellow","green")) +TermsPvalue_3 <- data.frame(pv_3.df$P_value) +colnames(TermsPvalue_3) <- "-log10(pvalue)" +colAnn_3 <- HeatmapAnnotation(df = TermsPvalue_3, + which = 'col', + annotation_width = unit(1, 'cm'), + gap = unit(1, 'mm'), + annotation_name_side = 'left') + +# add mark annotation (frequent genes) +anno_3.df <- as.data.frame(final_3.df) +anno_3.df$counts <- rowSums(anno_3.df) +anno_3.df <- anno_3.df %>% + filter(counts >= 2) +mark_gene_3 <- rownames(anno_3.df) +gene_pos_3 <- which(rownames(final_3.df) %in% mark_gene_3) +row_anno_3 <- rowAnnotation(mark_gene = anno_mark(at = gene_pos_3, + labels = mark_gene_3)) + +# draw ComplexHeatmap +hmapGSEA_3 <- Heatmap(final_3.df, + name = 'Enrichment result heatmap', + row_split = dfGeneAnno_3[,1], + + col = c('0' = 'white', '1' = '#52854C'), + + rect_gp = gpar(col = 'grey85'), + + cluster_rows = TRUE, + show_row_dend = TRUE, + #row_title = 'Top Genes', + #row_title_side = 'left', + #row_title_gp = gpar(fontsize = 11, fontface = 'bold'), + #row_title_rot = 90, + show_row_names = FALSE, + right_annotation = row_anno_3, + row_names_gp = gpar(fontsize = 11, fontface = 'bold'), + row_names_side = 'left', + row_dend_width = unit(35, 'mm'), + row_dend_gp = gpar(), + + cluster_columns = TRUE, + show_column_dend = TRUE, + column_title = 'Enriched terms', + column_title_side = 'top', + column_title_gp = gpar(fontsize = 12, fontface = 'bold'), + column_title_rot = 0, + show_column_names = FALSE, + + show_heatmap_legend = FALSE, + + clustering_distance_columns = 'euclidean', + clustering_method_columns = 'ward.D2', + clustering_distance_rows = 'euclidean', + clustering_method_rows = 'ward.D2', + + top_annotation=colAnn_3, + + left_annotation = haGenes_3, + + bottom_annotation = haTerms_3) +draw(hmapGSEA_3, + heatmap_legend_side = 'right', + annotation_legend_side = 'right') +``` + +\newpage +```{r, , fig.width = 15, fig.height = 12, message=FALSE, warning=FALSE} +# A larger plot size for html output +draw(hmapGSEA_3, heatmap_legend_side = 'right', annotation_legend_side = 'right') +``` \ No newline at end of file