Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
CSP_Rank/tSNE.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
410 lines (339 sloc)
16.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("bio3d") | |
library('dplyr') | |
library('ggplot2') | |
library('Rtsne') | |
library('umap') | |
library('cluster') | |
library('factoextra') | |
library('plotly') | |
library('irlba') | |
library('igraph') | |
library('FNN') | |
print(getwd()) | |
setwd('/home/tiburon/Desktop/ROT4/CSP_Rank') | |
print(getwd()) | |
CSP_Rank_Scores <- './CSP_Rank_Scores/' | |
PDB_FILES <- './PDB_FILES/' | |
experimental_structures <- paste0(PDB_FILES, 'experimental_structures/') | |
computational_structures <- paste0(PDB_FILES, 'computational_structures/') | |
CS_Lists <- './CS_Lists/' | |
CS_Predictions <- './CS_Predictions/' | |
CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/' | |
get_matrix <- function(pdb_id, chain) { | |
dir_path <- paste0(PDB_FILES, pdb_id, "_aligned") | |
path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned") | |
alt_dir_path <- dir_path | |
path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned") | |
pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$") | |
alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$") | |
csp_data <- read.csv(paste0(CSP_Rank_Scores, 'CSP_', tolower(pdb_id), "_CSpred.csv")) | |
chain_a_xyz <- list() | |
for (file in pdb_files) { | |
rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file)) | |
row <- rows[1, ] | |
if (!is.na(row$F1)) { | |
pdb <- read.pdb(file.path(dir_path, file)) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[file]] <- chain_to_analyze$xyz | |
} | |
} | |
len_AFS <- length(chain_a_xyz) | |
for (file in alt_pdb_files) { | |
rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file)) | |
row <- rows[1, ] | |
if (!is.na(row$F1)) { | |
pdb <- read.pdb(file.path(alt_dir_path, file)) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[file]] <- chain_to_analyze$xyz | |
} | |
} | |
if (len_AFS == length(chain_a_xyz)) { | |
print("DID YOU RUN add_aligned_suffix.py?") | |
} | |
special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb") | |
special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/') | |
pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file)) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz | |
special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb") | |
special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/') | |
pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file)) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz | |
pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$") | |
for (file in pdb_files) { | |
pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file)) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[file]] <- chain_to_analyze$xyz | |
} | |
return(chain_a_xyz) | |
} | |
read_consensus_data <- function(pdb_id, pdb_files_names) { | |
# Remove entries from pdb_files_names containing the substring 'exp' | |
#pdb_files_names <- pdb_files_names[!grepl('exp', pdb_files_names)] | |
#pdb_files_names <- pdb_files_names[!grepl('comp', pdb_files_names)] | |
csv_path <- paste0(CSP_Rank_Scores,"CSP_", tolower(pdb_id), "_CSpred.csv") | |
csp_data <- read.csv(csv_path) | |
# Filter out rows without a value in 'Confidence' | |
#if (TRUE){ | |
# csp_data <- csp_data[!is.na(csp_data$Confidence) & csp_data$Confidence != 0, ] | |
# } | |
# Extract the basename of holo_model_path for comparison | |
csp_data$basename_holo_model_path <- basename(csp_data$holo_model_path) | |
# Filter and ensure the order matches pdb_files_names | |
filtered_data <- csp_data %>% | |
filter(basename_holo_model_path %in% pdb_files_names)# %>% | |
#filter(!grepl('exp', basename_holo_model_path)) | |
filtered_data <- filtered_data[match(pdb_files_names, filtered_data$basename_holo_model_path), ] | |
consensus_values <- filtered_data$consensus | |
filtered_data2 <- na.omit(filtered_data) | |
if (FALSE){ | |
#consensus_values <- 2 * filtered_data$consensus * filtered_data$DP / (filtered_data$consensus + filtered_data$DP) | |
#consensus_values <- filtered_data$consensus * (filtered_data$DP - min(filtered_data2$DP)) / (max(filtered_data2$DP) - min(filtered_data2$DP)) | |
consensus_values <- filtered_data$consensus * (filtered_data$RPF_RECALL - min(filtered_data2$RPF_RECALL)) / (max(filtered_data2$RPF_RECALL) - min(filtered_data2$RPF_RECALL)) | |
#consensus_values <- filtered_data$DP | |
} | |
if (FALSE){ | |
consensus_values <- filtered_data$consensus * (1 - filtered_data$Q_score) | |
} | |
confidence_values <- filtered_data$Confidence | |
plddt_values <- filtered_data$plddt | |
special_cases <- ifelse(grepl(paste0("exp_", tolower(pdb_id)), filtered_data$holo_model_path), "green", | |
ifelse(grepl(paste0("comp_", tolower(pdb_id)), filtered_data$holo_model_path), "cyan", | |
ifelse(grepl('v3_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'blue', | |
ifelse(grepl('v2_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'pink', | |
ifelse(grepl('dropout', filtered_data$holo_model_path), 'red', | |
ifelse(grepl('v2_', filtered_data$holo_model_path), "purple", | |
ifelse(grepl('notemplate', filtered_data$holo_model_path), "orange", | |
ifelse(grepl('multimer', filtered_data$holo_model_path), "yellow", "NA")))))))) | |
return(list(consensus = consensus_values, plddt = plddt_values, Confidence = confidence_values, special_cases = special_cases, basename_holo_model_path = filtered_data$basename_holo_model_path)) | |
} | |
determine_optimal_clusters <- function(data, max_clusters = 20) { | |
wss <- sapply(1:max_clusters, function(k) { | |
kmeans(data, k, nstart = 10)$tot.withinss | |
}) | |
elbow_plot <- qplot(1:max_clusters, wss, geom = "line") + | |
geom_point() + | |
ggtitle("Elbow Method for Optimal Clusters") + | |
xlab("Number of Clusters") + | |
ylab("Total Within-Cluster Sum of Squares") | |
silhouette_scores <- sapply(2:max_clusters, function(k) { | |
km <- kmeans(data, centers = k, nstart = 10) | |
ss <- silhouette(km$cluster, dist(data)) | |
mean(ss[, 3]) | |
}) | |
silhouette_plot <- qplot(2:max_clusters, silhouette_scores, geom = "line") + | |
geom_point() + | |
ggtitle("Silhouette Method for Optimal Clusters") + | |
xlab("Number of Clusters") + | |
ylab("Average Silhouette Score") | |
optimal_clusters <- which.max(silhouette_scores) + 1 | |
return(list(optimal_clusters = optimal_clusters, | |
elbow_plot = elbow_plot, | |
silhouette_plot = silhouette_plot)) | |
} | |
plot_dimensional_reduction <- function(tsne_data, consensus_data, pdb_id, chain) { | |
# Remove rows in tsne_data where pdb_file contains the substring "exp" | |
#tsne_data <- tsne_data[!grepl('exp', tsne_data$pdb_file), ] | |
#tsne_data <- tsne_data[!grepl('comp', tsne_data$pdb_file), ] | |
special_cases <- consensus_data$special_cases | |
consensus_df <- data.frame(consensus_data) | |
print(consensus_df[consensus_df$special_cases == "cyan", ]) | |
print(consensus_df[consensus_df$special_cases == "green", ]) | |
consensus_values <- consensus_data$consensus | |
consensus_values <- ifelse( | |
consensus_data$special_cases == "green", | |
-1, | |
consensus_values | |
) | |
consensus_values <- ifelse( | |
consensus_data$special_cases == "cyan", | |
0, | |
consensus_values | |
) | |
consensus_data$bayesian_values <- as.numeric(ifelse(is.na(consensus_data$Confidence) | consensus_data$Confidence == "", -1, consensus_data$consensus * consensus_data$Confidence)) | |
consensus_data$basename_holo_model_path | |
# Define the mapping for special cases | |
special_cases_mapping1 <- c("AFSample v2.2" = "purple", "AFAlt" = "orange", "AFSample v2.1" = "yellow", "AFSample2 v2.3" = "blue", "AFSample2 v2.1" = "pink", "AFSample2 v1" = "red", "NA" = "gray", "NMR Model" = "green", "AF2 Baseline" = "cyan") | |
special_cases_mapping2 <- c("purple" = "AFSample v2.2", "orange" = "AFAlt", "yellow" = "AFSample v2.1", "blue" = "AFSample2 v2.3", "pink" = "AFSample2 v2.1", "red" = "AFSample2 v1", "gray" = "NA", "green" = "NMR Model", "cyan"= "AF2 Baseline") | |
special_cases <- factor(special_cases, levels = names(special_cases_mapping2)) | |
levels(special_cases) <- names(special_cases_mapping1) | |
# Assign colors to tsne_data$Special based on special_cases_mapping | |
tsne_data$Special <- special_cases | |
tsne_data$pdb_file | |
# Set consensus values for color mapping | |
tsne_data$Color <- consensus_values | |
if (FALSE){ | |
tsne_data$Color <- consensus_data$bayesian_values | |
} | |
# t-SNE Plot 1: Clusters | |
tsne_plot_clusters <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Cluster, text = pdb_file)) + | |
geom_point() + | |
#ggtitle(paste("t-SNE Plot for PDB ID=", pdb_id)) + | |
theme(legend.position = "none") + | |
theme_minimal() | |
tsne_plotly_clusters <- ggplotly(tsne_plot_clusters, tooltip = "text") | |
print(tsne_plotly_clusters) | |
# t-SNE Plot 2: Heatmap of consensus values | |
# tsne_data$Color <- ifelse(tsne_data$Color == -1, "green", tsne_data$Color) | |
# tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) + | |
# geom_point() + | |
# scale_color_gradient(low = "blue", high = "red", name = "Posterior") + | |
# ggtitle(paste("Posteriors for ", pdb_id)) + | |
# theme_minimal() | |
library(ggplot2) | |
tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, text = pdb_file)) + | |
# 1) Points where Color != -1 -> color by numeric Posterior | |
geom_point( | |
data = subset(tsne_data, Color > 0), | |
aes(color = as.numeric(Color)), | |
size = 2 | |
) + | |
# 2) Points where Color == -1 -> use a special shape | |
geom_point( | |
data = subset(tsne_data, Color == -1), | |
aes(shape = "NMR models"), # map them to a discrete shape | |
color = "green", # color them green | |
size = 3 | |
) + | |
# 3) Points where Color == -1 -> use a special shape | |
geom_point( | |
data = subset(tsne_data, Color == 0), | |
aes(shape = "AF2 model"), # map them to a discrete shape | |
color = "cyan", # color them green | |
size = 3 | |
) + | |
# Set up shape legend for discrete “NMR models” | |
scale_shape_manual( | |
values = c("NMR models" = 19), # pick any shape code you like | |
name = "Model Type" # legend title for shape | |
) + | |
# Set up shape legend for discrete “NMR models” | |
scale_shape_manual( | |
values = c("AF2 model" = 19), # pick any shape code you like | |
name = "Model Type" # legend title for shape | |
) + | |
# Set up color gradient for numeric Posterior | |
scale_color_gradient(low = "blue", high = "red", name = "Posterior") + | |
ggtitle(paste("Posteriors for", pdb_id)) + | |
theme_minimal() | |
#tsne_plot_heatmap + | |
# guides( | |
# shape = guide_legend(title = NULL)#, # no title for shape legend | |
# #color = guide_colorbar(title = "Posterior") | |
# ) | |
tsne_plotly_heatmap <- ggplotly(tsne_plot_heatmap, tooltip = "text") | |
print(tsne_plotly_heatmap) | |
# t-SNE Plot 3: Special cases | |
tsne_plot_special <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Special, text = pdb_file)) + | |
geom_point() + | |
scale_color_manual(values = special_cases_mapping1, name = "Model Source") + | |
ggtitle(paste("t-SNE for ", pdb_id)) + | |
theme_minimal() | |
tsne_plotly_special <- ggplotly(tsne_plot_special, tooltip = "text") | |
print(tsne_plotly_special) | |
# Save static t-SNE plots | |
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_clusters_plot.png'), plot = tsne_plot_clusters) | |
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_heatmap_plot.png'), plot = tsne_plot_heatmap) | |
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_special_plot.png'), plot = tsne_plot_special) | |
return(list(tsne_plotly_clusters = tsne_plotly_clusters, tsne_plotly_heatmap = tsne_plotly_heatmap, | |
tsne_plotly_special = tsne_plotly_special)) | |
} | |
plot_cluster_CSPRank_boxplot <- function(tsne_data, consensus_data, pdb_id, chain) { | |
# Match tsne_data$pdb_file to consensus_data$basename_holo_model_path | |
matched_data <- merge(tsne_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path") | |
# Create the boxplot using ggplot2 | |
#tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) + | |
tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) + | |
geom_boxplot() + | |
labs(x = "Cluster", y = "metric", title = paste("t-SNE Cluster CSP Bayesian model selection metric")) + | |
theme_minimal() + | |
scale_fill_discrete(name = "Cluster") | |
# Save the t-SNE plot | |
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_tsne_boxplot.png'), plot = tsne_cluster_CSPRank_boxplot) | |
return(list(tsne_cluster_CSPRank_boxplot = tsne_cluster_CSPRank_boxplot)) | |
} | |
# Function to read the output CSV file and trim the data | |
trim_data <- function(csv_file_path, xyz_matrix, pdb_files_names) { | |
# Read the CSV file | |
result_data <- read.csv(csv_file_path) | |
# Extract the pdb_file names from the result_data | |
remaining_pdb_files <- result_data$pdb_file # Assuming 'pdb_file' is the column in the CSV with the pdb file names | |
# Create a logical vector indicating which pdb_files_names are in the remaining_pdb_files | |
keep_indices <- pdb_files_names %in% remaining_pdb_files | |
# Trim the xyz_matrix and pdb_files_names using the logical vector | |
trimmed_xyz_matrix <- xyz_matrix[keep_indices, ] | |
trimmed_pdb_files_names <- pdb_files_names[keep_indices] | |
return(list(trimmed_xyz_matrix = trimmed_xyz_matrix, trimmed_pdb_files_names = trimmed_pdb_files_names)) | |
} | |
perform_analysis <- function(pdb_id, chain, max_clusters = 10) { | |
set.seed(42) | |
chain_a_xyz <- get_matrix(pdb_id, chain) | |
xyz_matrix <- do.call(rbind, chain_a_xyz) | |
# Identify unique rows and their indices | |
unique_indices <- !duplicated(xyz_matrix) | |
xyz_matrix <- xyz_matrix[unique_indices, ] | |
# Create a data frame for PCA results | |
pdb_files_names <- names(chain_a_xyz)[unique_indices] | |
# trim pdb_files_names and xyz_matrix to points which are not outliers in PCA space | |
#if (TRUE) { | |
if (FALSE) { | |
trimmed_pca_csv_file <- paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv') | |
results <- trim_data(trimmed_pca_csv_file, xyz_matrix, pdb_files_names) | |
pdb_files_names <- results$trimmed_pdb_files_names | |
xyz_matrix <- results$trimmed_xyz_matrix | |
} | |
print(dim(xyz_matrix)) | |
# t-SNE Analysis | |
tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 100, verbose = TRUE, max_iter = 3000, partial_pca = TRUE) | |
tsne_data <- data.frame(pdb_file = pdb_files_names, TSNE1 = tsne_result$Y[, 1], TSNE2 = tsne_result$Y[, 2]) | |
# Determine optimal clusters for t-SNE results | |
tsne_clusters_info <- determine_optimal_clusters(tsne_data[, c("TSNE1", "TSNE2")], max_clusters) | |
optimal_tsne_clusters <- tsne_clusters_info$optimal_clusters | |
optimal_tsne_clusters <- 20 | |
print(paste("Optimal number of clusters for t-SNE:", optimal_tsne_clusters)) | |
tsne_clusters <- kmeans(tsne_data[, c("TSNE1", "TSNE2")], centers = optimal_tsne_clusters)$cluster | |
tsne_data$Cluster <- factor(tsne_clusters) | |
# Save t-SNE cluster plots | |
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Elbow_plot.png'), plot = tsne_clusters_info$elbow_plot) | |
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Silhouette_plot.png'), plot = tsne_clusters_info$silhouette_plot) | |
# Write t-SNE results to CSV | |
write.csv(tsne_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_TSNE_chain_', chain, '_data.csv'), row.names = FALSE) | |
# Read consensus data | |
consensus_data <- read_consensus_data(pdb_id, pdb_files_names) | |
# Plot the results | |
plots <- plot_dimensional_reduction(tsne_data, consensus_data, pdb_id, chain) | |
# Print interactive plots to display in R | |
print(plots$tsne_plotly_clusters) | |
print(plots$tsne_plotly_heatmap) | |
print(plots$tsne_plotly_special) | |
} | |
max_clusters = 10 | |
chain <- 'B' | |
pdb_id <- '2N23' | |
perform_analysis(pdb_id, chain) |