Skip to content
Permalink
b9fe25bf2d
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
418 lines (346 sloc) 17.5 KB
library("bio3d")
library('dplyr')
library('ggplot2')
library('Rtsne')
library('umap')
library('cluster')
library('factoextra')
library('plotly')
library('irlba')
library('igraph')
library('FNN')
print(getwd())
setwd('/home/tiburon/Desktop/ROT4/CSP_Rank')
print(getwd())
CSP_Rank_Scores <- './CSP_Rank_Scores/'
PDB_FILES <- './PDB_FILES/'
experimental_structures <- paste0(PDB_FILES, 'experimental_structures/')
computational_structures <- paste0(PDB_FILES, 'computational_structures/')
CS_Lists <- './CS_Lists/'
CS_Predictions <- './CS_Predictions/'
CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/'
get_matrix <- function(pdb_id, chain) {
dir_path <- paste0(PDB_FILES, pdb_id, "_aligned")
path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned")
alt_dir_path <- dir_path
path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned")
pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$")
alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$")
csp_data <- read.csv(paste0(CSP_Rank_Scores, 'CSP_', tolower(pdb_id), "_CSpred.csv"))
chain_a_xyz <- list()
for (file in pdb_files) {
rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file))
row <- rows[1, ]
if (!is.na(row$F1)) {
pdb <- read.pdb(file.path(dir_path, file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[file]] <- chain_to_analyze$xyz
}
}
len_AFS <- length(chain_a_xyz)
for (file in alt_pdb_files) {
rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file))
row <- rows[1, ]
if (!is.na(row$F1)) {
pdb <- read.pdb(file.path(alt_dir_path, file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[file]] <- chain_to_analyze$xyz
}
}
if (len_AFS == length(chain_a_xyz)) {
print("DID YOU RUN add_aligned_suffix.py?")
}
special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb")
special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/')
pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb")
special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/')
pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$")
for (file in pdb_files) {
pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[file]] <- chain_to_analyze$xyz
}
return(chain_a_xyz)
}
read_consensus_data <- function(pdb_id, pdb_files_names) {
# Remove entries from pdb_files_names containing the substring 'exp'
#pdb_files_names <- pdb_files_names[!grepl('exp', pdb_files_names)]
#pdb_files_names <- pdb_files_names[!grepl('comp', pdb_files_names)]
csv_path <- paste0(CSP_Rank_Scores,"CSP_", tolower(pdb_id), "_CSpred.csv")
csp_data <- read.csv(csv_path)
# Filter out rows without a value in 'Confidence'
#if (TRUE){
# csp_data <- csp_data[!is.na(csp_data$Confidence) & csp_data$Confidence != 0, ]
# }
# Extract the basename of holo_model_path for comparison
csp_data$basename_holo_model_path <- basename(csp_data$holo_model_path)
# Filter and ensure the order matches pdb_files_names
filtered_data <- csp_data %>%
filter(basename_holo_model_path %in% pdb_files_names)# %>%
#filter(!grepl('exp', basename_holo_model_path))
filtered_data <- filtered_data[match(pdb_files_names, filtered_data$basename_holo_model_path), ]
consensus_values <- filtered_data$consensus
filtered_data2 <- na.omit(filtered_data)
if (TRUE){
#consensus_values <- 2 * filtered_data$consensus * filtered_data$DP / (filtered_data$consensus + filtered_data$DP)
#consensus_values <- filtered_data$consensus * (filtered_data$DP - min(filtered_data2$DP)) / (max(filtered_data2$DP) - min(filtered_data2$DP))
consensus_values <- filtered_data$consensus * (filtered_data$RPF_RECALL - min(filtered_data2$RPF_RECALL)) / (max(filtered_data2$RPF_RECALL) - min(filtered_data2$RPF_RECALL))
#consensus_values <- filtered_data$DP
}
if (FALSE){
consensus_values <- filtered_data$consensus * (1 - filtered_data$Q_score)
}
confidence_values <- filtered_data$Confidence
plddt_values <- filtered_data$plddt
special_cases <- ifelse(grepl(paste0("exp_", tolower(pdb_id)), filtered_data$holo_model_path), "green",
ifelse(grepl(paste0("comp_", tolower(pdb_id)), filtered_data$holo_model_path), "cyan",
ifelse(grepl('v3_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'blue',
ifelse(grepl('v2_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'pink',
ifelse(grepl('dropout', filtered_data$holo_model_path), 'red',
ifelse(grepl('v2_', filtered_data$holo_model_path), "purple",
ifelse(grepl('notemplate', filtered_data$holo_model_path), "orange",
ifelse(grepl('multimer', filtered_data$holo_model_path), "yellow", "NA"))))))))
return(list(consensus = consensus_values, plddt = plddt_values, Confidence = confidence_values, special_cases = special_cases, basename_holo_model_path = filtered_data$basename_holo_model_path))
}
determine_optimal_clusters <- function(data, max_clusters = 20) {
wss <- sapply(1:max_clusters, function(k) {
kmeans(data, k, nstart = 10)$tot.withinss
})
elbow_plot <- qplot(1:max_clusters, wss, geom = "line") +
geom_point() +
ggtitle("Elbow Method for Optimal Clusters") +
xlab("Number of Clusters") +
ylab("Total Within-Cluster Sum of Squares")
silhouette_scores <- sapply(2:max_clusters, function(k) {
km <- kmeans(data, centers = k, nstart = 10)
ss <- silhouette(km$cluster, dist(data))
mean(ss[, 3])
})
silhouette_plot <- qplot(2:max_clusters, silhouette_scores, geom = "line") +
geom_point() +
ggtitle("Silhouette Method for Optimal Clusters") +
xlab("Number of Clusters") +
ylab("Average Silhouette Score")
optimal_clusters <- which.max(silhouette_scores) + 1
return(list(optimal_clusters = optimal_clusters,
elbow_plot = elbow_plot,
silhouette_plot = silhouette_plot))
}
plot_dimensional_reduction <- function(tsne_data, consensus_data, pdb_id, chain) {
# Remove rows in tsne_data where pdb_file contains the substring "exp"
#tsne_data <- tsne_data[!grepl('exp', tsne_data$pdb_file), ]
#tsne_data <- tsne_data[!grepl('comp', tsne_data$pdb_file), ]
special_cases <- consensus_data$special_cases
consensus_df <- data.frame(consensus_data)
print(consensus_df[consensus_df$special_cases == "cyan", ])
print(consensus_df[consensus_df$special_cases == "green", ])
consensus_values <- consensus_df$consensus
negative_consensus_indices <- which(consensus_values <= 0)
consensus_df <- consensus_df[-negative_consensus_indices, ]
consensus_values <- consensus_df$consensus
consensus_values <- ifelse(
consensus_data$special_cases == "green",
-1,
consensus_values
)
consensus_values <- ifelse(
consensus_data$special_cases == "cyan",
0,
consensus_values
)
negative_consensus_indices <- which(consensus_values <= 0)
print(consensus_values[negative_consensus_indices])
print(consensus_data$basename_holo_model_path[negative_consensus_indices])
consensus_data$bayesian_values <- as.numeric(ifelse(is.na(consensus_data$Confidence) | consensus_data$Confidence == "", -1, consensus_data$consensus * consensus_data$Confidence))
consensus_data$bayesian_values <- ifelse(grepl('comp', consensus_data$basename_holo_model_path), 0, consensus_data$bayesian_values)
consensus_data$basename_holo_model_path
# Define the mapping for special cases
special_cases_mapping1 <- c("AFSample v2.2" = "purple", "AFAlt" = "orange", "AFSample v2.1" = "yellow", "AFSample2 v2.3" = "blue", "AFSample2 v2.1" = "pink", "AFSample2 v1" = "red", "NA" = "gray", "NMR Model" = "green", "AF2 Baseline" = "cyan")
special_cases_mapping2 <- c("purple" = "AFSample v2.2", "orange" = "AFAlt", "yellow" = "AFSample v2.1", "blue" = "AFSample2 v2.3", "pink" = "AFSample2 v2.1", "red" = "AFSample2 v1", "gray" = "NA", "green" = "NMR Model", "cyan"= "AF2 Baseline")
special_cases <- factor(special_cases, levels = names(special_cases_mapping2))
levels(special_cases) <- names(special_cases_mapping1)
# Assign colors to tsne_data$Special based on special_cases_mapping
tsne_data$Special <- special_cases
tsne_data$pdb_file
# Set consensus values for color mapping
tsne_data$Color <- consensus_values
if (FALSE){
tsne_data$Color <- consensus_data$bayesian_values
}
# t-SNE Plot 1: Clusters
tsne_plot_clusters <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Cluster, text = pdb_file)) +
geom_point() +
#ggtitle(paste("t-SNE Plot for PDB ID=", pdb_id)) +
theme(legend.position = "none") +
theme_minimal()
tsne_plotly_clusters <- ggplotly(tsne_plot_clusters, tooltip = "text")
print(tsne_plotly_clusters)
# t-SNE Plot 2: Heatmap of consensus values
# tsne_data$Color <- ifelse(tsne_data$Color == -1, "green", tsne_data$Color)
# tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) +
# geom_point() +
# scale_color_gradient(low = "blue", high = "red", name = "Posterior") +
# ggtitle(paste("Posteriors for ", pdb_id)) +
# theme_minimal()
library(ggplot2)
tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, text = pdb_file)) +
# 1) Points where Color != -1 -> color by numeric Posterior
geom_point(
data = subset(tsne_data, Color > 0),
aes(color = as.numeric(Color)),
size = 2
) +
# 2) Points where Color == -1 -> use a special shape
geom_point(
data = subset(tsne_data, Color == -1),
aes(shape = "NMR models"), # map them to a discrete shape
color = "green", # color them green
size = 3
) +
# 3) Points where Color == -1 -> use a special shape
geom_point(
data = subset(tsne_data, Color == 0),
aes(shape = "AF2 model"), # map them to a discrete shape
color = "cyan", # color them green
size = 3
) +
# Set up shape legend for discrete “NMR models”
scale_shape_manual(
values = c("NMR models" = 19), # pick any shape code you like
name = "Model Type" # legend title for shape
) +
# Set up shape legend for discrete “NMR models”
scale_shape_manual(
values = c("AF2 model" = 19), # pick any shape code you like
name = "Model Type" # legend title for shape
) +
# Set up color gradient for numeric Posterior
scale_color_gradient(low = "blue", high = "red", name = "Posterior") +
ggtitle(paste("Posteriors for", pdb_id)) +
theme_minimal()
#tsne_plot_heatmap +
# guides(
# shape = guide_legend(title = NULL)#, # no title for shape legend
# #color = guide_colorbar(title = "Posterior")
# )
tsne_plotly_heatmap <- ggplotly(tsne_plot_heatmap, tooltip = "text")
print(tsne_plotly_heatmap)
# t-SNE Plot 3: Special cases
tsne_plot_special <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Special, text = pdb_file)) +
geom_point() +
scale_color_manual(values = special_cases_mapping1, name = "Model Source") +
ggtitle(paste("t-SNE for ", pdb_id)) +
theme_minimal()
tsne_plotly_special <- ggplotly(tsne_plot_special, tooltip = "text")
print(tsne_plotly_special)
# Save static t-SNE plots
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_clusters_plot.png'), plot = tsne_plot_clusters)
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_heatmap_plot.png'), plot = tsne_plot_heatmap)
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_special_plot.png'), plot = tsne_plot_special)
return(list(tsne_plotly_clusters = tsne_plotly_clusters, tsne_plotly_heatmap = tsne_plotly_heatmap,
tsne_plotly_special = tsne_plotly_special))
}
plot_cluster_CSPRank_boxplot <- function(tsne_data, consensus_data, pdb_id, chain) {
# Match tsne_data$pdb_file to consensus_data$basename_holo_model_path
matched_data <- merge(tsne_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path")
# Create the boxplot using ggplot2
#tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) +
tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) +
geom_boxplot() +
labs(x = "Cluster", y = "metric", title = paste("t-SNE Cluster CSP Bayesian model selection metric")) +
theme_minimal() +
scale_fill_discrete(name = "Cluster")
# Save the t-SNE plot
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_tsne_boxplot.png'), plot = tsne_cluster_CSPRank_boxplot)
return(list(tsne_cluster_CSPRank_boxplot = tsne_cluster_CSPRank_boxplot))
}
# Function to read the output CSV file and trim the data
trim_data <- function(csv_file_path, xyz_matrix, pdb_files_names) {
# Read the CSV file
result_data <- read.csv(csv_file_path)
# Extract the pdb_file names from the result_data
remaining_pdb_files <- result_data$pdb_file # Assuming 'pdb_file' is the column in the CSV with the pdb file names
# Create a logical vector indicating which pdb_files_names are in the remaining_pdb_files
keep_indices <- pdb_files_names %in% remaining_pdb_files
# Trim the xyz_matrix and pdb_files_names using the logical vector
trimmed_xyz_matrix <- xyz_matrix[keep_indices, ]
trimmed_pdb_files_names <- pdb_files_names[keep_indices]
return(list(trimmed_xyz_matrix = trimmed_xyz_matrix, trimmed_pdb_files_names = trimmed_pdb_files_names))
}
perform_analysis <- function(pdb_id, chain, max_clusters = 10) {
set.seed(42)
chain_a_xyz <- get_matrix(pdb_id, chain)
xyz_matrix <- do.call(rbind, chain_a_xyz)
# Identify unique rows and their indices
unique_indices <- !duplicated(xyz_matrix)
xyz_matrix <- xyz_matrix[unique_indices, ]
# Create a data frame for PCA results
pdb_files_names <- names(chain_a_xyz)[unique_indices]
# trim pdb_files_names and xyz_matrix to points which are not outliers in PCA space
#if (TRUE) {
if (FALSE) {
trimmed_pca_csv_file <- paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv')
results <- trim_data(trimmed_pca_csv_file, xyz_matrix, pdb_files_names)
pdb_files_names <- results$trimmed_pdb_files_names
xyz_matrix <- results$trimmed_xyz_matrix
}
print(dim(xyz_matrix))
# t-SNE Analysis
tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 100, verbose = TRUE, max_iter = 3000, partial_pca = TRUE)
tsne_data <- data.frame(pdb_file = pdb_files_names, TSNE1 = tsne_result$Y[, 1], TSNE2 = tsne_result$Y[, 2])
# Determine optimal clusters for t-SNE results
tsne_clusters_info <- determine_optimal_clusters(tsne_data[, c("TSNE1", "TSNE2")], max_clusters)
optimal_tsne_clusters <- tsne_clusters_info$optimal_clusters
optimal_tsne_clusters <- 20
print(paste("Optimal number of clusters for t-SNE:", optimal_tsne_clusters))
tsne_clusters <- kmeans(tsne_data[, c("TSNE1", "TSNE2")], centers = optimal_tsne_clusters)$cluster
tsne_data$Cluster <- factor(tsne_clusters)
# Save t-SNE cluster plots
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Elbow_plot.png'), plot = tsne_clusters_info$elbow_plot)
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Silhouette_plot.png'), plot = tsne_clusters_info$silhouette_plot)
# Write t-SNE results to CSV
write.csv(tsne_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_TSNE_chain_', chain, '_data.csv'), row.names = FALSE)
# Read consensus data
consensus_data <- read_consensus_data(pdb_id, pdb_files_names)
# Plot the results
plots <- plot_dimensional_reduction(tsne_data, consensus_data, pdb_id, chain)
# Print interactive plots to display in R
print(plots$tsne_plotly_clusters)
print(plots$tsne_plotly_heatmap)
print(plots$tsne_plotly_special)
}
max_clusters = 10
chain <- 'B'
pdb_id <- '7JYN'
perform_analysis(pdb_id, chain)