Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
library("bio3d")
library('dplyr')
library('ggplot2')
library('Rtsne')
library('umap')
library('cluster')
library('factoextra')
library('plotly')
print(getwd())
setwd('/home/tiburon/Desktop/ROT4/CSPRank_ES_Processing')
print(getwd())
CSP_Rank_Scores <- './CSP_Rank_Scores/'
PDB_FILES <- './PDB_FILES/'
experimental_structures <- paste0(PDB_FILES, 'experimental_structures/')
computational_structures <- paste0(PDB_FILES, 'computational_structures/')
CS_Lists <- './CS_Lists/'
CS_Predictions <- './CS_Predictions/'
CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/'
get_matrix <- function(pdb_id, chain) {
dir_path <- paste0(PDB_FILES, pdb_id, "_aligned")
path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned")
alt_dir_path <- paste0(PDB_FILES, pdb_id, "_alt_aligned")
path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned")
pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$")
alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$")
csp_data <- read.csv(paste0(CSP_Rank_Scores, 'CSP_', tolower(pdb_id), "_CSpred.csv"))
chain_a_xyz <- list()
for (file in pdb_files) {
rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file))
row <- rows[1, ]
if (!is.na(row$F1)) {
pdb <- read.pdb(file.path(dir_path, file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[file]] <- chain_to_analyze$xyz
}
}
len_AFS <- length(chain_a_xyz)
for (file in alt_pdb_files) {
rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file))
row <- rows[1, ]
if (!is.na(row$F1)) {
pdb <- read.pdb(file.path(alt_dir_path, file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[file]] <- chain_to_analyze$xyz
}
}
if (len_AFS == length(chain_a_xyz)) {
print("DID YOU RUN add_aligned_suffix.py?")
}
special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb")
special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/')
pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb")
special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/')
pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$")
for (file in pdb_files) {
pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[file]] <- chain_to_analyze$xyz
}
return(chain_a_xyz)
}
read_consensus_data <- function(pdb_id, pdb_files_names) {
csv_path <- paste0(CSP_Rank_Scores,"CSP_", tolower(pdb_id), "_CSpred.csv")
csp_data <- read.csv(csv_path)
# Filter out rows without a value in 'Confidence'
if (FALSE){
csp_data <- csp_data[!is.na(csp_data$Confidence) & csp_data$Confidence != 0, ]
}
# Extract the basename of holo_model_path for comparison
csp_data$basename_holo_model_path <- basename(csp_data$holo_model_path)
# Filter and ensure the order matches pdb_files_names
filtered_data <- csp_data %>% filter(basename_holo_model_path %in% pdb_files_names)
filtered_data <- filtered_data[match(pdb_files_names, filtered_data$basename_holo_model_path), ]
consensus_values <- filtered_data$consensus
filtered_data2 <- na.omit(filtered_data)
if (FALSE){
consensus_values <- 2 * filtered_data$consensus * filtered_data$DP / (filtered_data$consensus + filtered_data$DP)
#consensus_values <- filtered_data$consensus * (filtered_data$DP - min(filtered_data2$DP)) / (max(filtered_data2$DP) - min(filtered_data2$DP))
#consensus_values <- filtered_data$consensus * (filtered_data$RPF_RECALL - min(filtered_data2$RPF_RECALL)) / (max(filtered_data2$RPF_RECALL) - min(filtered_data2$RPF_RECALL))
#consensus_values <- filtered_data$DP
}
confidence_values <- filtered_data$Confidence
plddt_values <- filtered_data$plddt
special_cases <- ifelse(grepl(paste0("exp_", tolower(pdb_id)), filtered_data$holo_model_path), "green",
ifelse(grepl(paste0("comp_", tolower(pdb_id)), filtered_data$holo_model_path), "purple",
ifelse(grepl('v3_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'blue',
ifelse(grepl('v2_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'pink',
ifelse(grepl('dropout', filtered_data$holo_model_path), 'red',
ifelse(grepl('v2_', filtered_data$holo_model_path), "cyan",
ifelse(grepl('notemplate', filtered_data$holo_model_path), "orange",
ifelse(grepl('multimer', filtered_data$holo_model_path), "yellow", "NA"))))))))
return(list(consensus = consensus_values, plddt = plddt_values, Confidence = confidence_values, special_cases = special_cases, basename_holo_model_path = filtered_data$basename_holo_model_path))
}
determine_optimal_clusters <- function(data, max_clusters = 20) {
wss <- sapply(1:max_clusters, function(k) {
kmeans(data, k, nstart = 10)$tot.withinss
})
elbow_plot <- qplot(1:max_clusters, wss, geom = "line") +
geom_point() +
ggtitle("Elbow Method for Optimal Clusters") +
xlab("Number of Clusters") +
ylab("Total Within-Cluster Sum of Squares")
silhouette_scores <- sapply(2:max_clusters, function(k) {
km <- kmeans(data, centers = k, nstart = 10)
ss <- silhouette(km$cluster, dist(data))
mean(ss[, 3])
})
silhouette_plot <- qplot(2:max_clusters, silhouette_scores, geom = "line") +
geom_point() +
ggtitle("Silhouette Method for Optimal Clusters") +
xlab("Number of Clusters") +
ylab("Average Silhouette Score")
optimal_clusters <- which.max(silhouette_scores) + 1
return(list(optimal_clusters = optimal_clusters,
elbow_plot = elbow_plot,
silhouette_plot = silhouette_plot))
}
plot_dimensional_reduction <- function(tsne_data, umap_data, consensus_data, pdb_id, chain) {
special_cases <- consensus_data$special_cases
consensus_values <- consensus_data$consensus
consensus_data$bayesian_values <- consensus_data$consensus * consensus_data$Confidence
#consensus_data$bayesian_values <- consensus_data$consensus * consensus_data$plddt
consensus_data$basename_holo_model_path
# Define the mapping for special cases
special_cases_mapping1 <- c("NMR Model" = "green", "AF2 Baseline" = "purple", "AFSample v2.2" = "cyan", "AFAlt" = "orange", "AFSample v2.1" = "yellow", "AFSample2 v2.3" = "blue", "AFSample2 v2.1" = "pink", "AFSample2 v1" = "red", "NA" = "gray")
special_cases_mapping2 <- c("green" = "NMR Model", "purple"= "AF2 Baseline", "cyan" = "AFSample v2.2", "orange" = "AFAlt", "yellow" = "AFSample v2.1", "blue" = "AFSample2 v2.3", "pink" = "AFSample2 v2.1", "red" = "AFSample2 v1", "gray" = "NA")
special_cases <- factor(special_cases, levels = names(special_cases_mapping2))
levels(special_cases) <- names(special_cases_mapping1)
# Assign colors to tsne_data$Special based on special_cases_mapping
tsne_data$Special <- special_cases
umap_data$Special <- special_cases
tsne_data$pdb_file
# Set consensus values for color mapping
tsne_data$Color <- consensus_values
umap_data$Color <- consensus_values
if (TRUE){
tsne_data$Color <- consensus_data$bayesian_values
umap_data$Color <- consensus_data$bayesian_values
}
# t-SNE Plot 1: Clusters
tsne_plot_clusters <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Cluster, text = pdb_file)) +
geom_point() +
scale_color_discrete(name = "Cluster") +
ggtitle(paste("t-SNE Plot for PDB ID=", pdb_id)) +
theme_minimal()
tsne_plotly_clusters <- ggplotly(tsne_plot_clusters, tooltip = "text")
# t-SNE Plot 2: Heatmap of consensus values
tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) +
geom_point() +
scale_color_gradient(low = "blue", high = "red", name = "Score") +
ggtitle(paste("t-SNE Heatmap for PDB ID=", pdb_id)) +
theme_minimal()
tsne_plotly_heatmap <- ggplotly(tsne_plot_heatmap, tooltip = "text")
# t-SNE Plot 3: Special cases
tsne_plot_special <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Special, text = pdb_file)) +
geom_point() +
scale_color_manual(values = special_cases_mapping1, name = "Model Source") +
ggtitle(paste("t-SNE for PDB ID=", pdb_id)) +
theme_minimal()
tsne_plotly_special <- ggplotly(tsne_plot_special, tooltip = "text")
# Save static t-SNE plots
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_clusters_plot.png'), plot = tsne_plot_clusters)
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_heatmap_plot.png'), plot = tsne_plot_heatmap)
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_special_plot.png'), plot = tsne_plot_special)
# UMAP Plot 1: Clusters
umap_plot_clusters <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = Cluster, text = pdb_file)) +
geom_point() +
scale_color_discrete(name = "Cluster") +
ggtitle(paste("UMAP Plot for PDB ID=", pdb_id)) +
theme_minimal()
umap_plotly_clusters <- ggplotly(umap_plot_clusters, tooltip = "text")
# UMAP Plot 2: Heatmap of consensus values
umap_plot_heatmap <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) +
geom_point() +
scale_color_gradient(low = "blue", high = "red", name = "Score") +
ggtitle(paste("UMAP Heatmap for PDB ID=", pdb_id)) +
theme_minimal()
umap_plotly_heatmap <- ggplotly(umap_plot_heatmap, tooltip = "text")
# UMAP Plot 3: Special cases
umap_plot_special <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = Special, text = pdb_file)) +
geom_point() +
scale_color_manual(values = special_cases_mapping1, name = "Model Source") +
ggtitle(paste("UMAP for PDB ID=", pdb_id)) +
theme_minimal()
umap_plotly_special <- ggplotly(umap_plot_special, tooltip = "text")
# Save static UMAP plots
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_clusters_plot.png'), plot = umap_plot_clusters)
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_heatmap_plot.png'), plot = umap_plot_heatmap)
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_special_plot.png'), plot = umap_plot_special)
return(list(tsne_plotly_clusters = tsne_plotly_clusters, tsne_plotly_heatmap = tsne_plotly_heatmap,
tsne_plotly_special = tsne_plotly_special, umap_plotly_clusters = umap_plotly_clusters,
umap_plotly_heatmap = umap_plotly_heatmap, umap_plotly_special = umap_plotly_special))
}
plot_cluster_CSPRank_boxplot <- function(tsne_data, umap_data, consensus_data, pdb_id, chain) {
# Match tsne_data$pdb_file to consensus_data$basename_holo_model_path
matched_data <- merge(tsne_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path")
# Create the boxplot using ggplot2
#tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) +
tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) +
geom_boxplot() +
labs(x = "Cluster", y = "metric", title = paste("t-SNE Cluster CSP Bayesian model selection metric")) +
theme_minimal() +
scale_fill_discrete(name = "Cluster")
# Save the t-SNE plot
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_tsne_boxplot.png'), plot = tsne_cluster_CSPRank_boxplot)
# Match tsne_data$pdb_file to consensus_data$basename_holo_model_path
matched_data <- merge(umap_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path")
# Create the boxplot using ggplot2
#umap_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) +
umap_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) +
geom_boxplot() +
labs(x = "Cluster", y = "metric", title = paste("UMAP Cluster CSP Bayesian model selection metric")) +
theme_minimal() +
scale_fill_discrete(name = "Cluster")
# Save the UMAP plot
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_umap_boxplot.png'), plot = umap_cluster_CSPRank_boxplot)
return(list(tsne_cluster_CSPRank_boxplot = tsne_cluster_CSPRank_boxplot, umap_cluster_CSPRank_boxplot = umap_cluster_CSPRank_boxplot))
}
# Function to read the output CSV file and trim the data
trim_data <- function(csv_file_path, xyz_matrix, pdb_files_names) {
# Read the CSV file
result_data <- read.csv(csv_file_path)
# Extract the pdb_file names from the result_data
remaining_pdb_files <- result_data$pdb_file # Assuming 'pdb_file' is the column in the CSV with the pdb file names
# Create a logical vector indicating which pdb_files_names are in the remaining_pdb_files
keep_indices <- pdb_files_names %in% remaining_pdb_files
# Trim the xyz_matrix and pdb_files_names using the logical vector
trimmed_xyz_matrix <- xyz_matrix[keep_indices, ]
trimmed_pdb_files_names <- pdb_files_names[keep_indices]
return(list(trimmed_xyz_matrix = trimmed_xyz_matrix, trimmed_pdb_files_names = trimmed_pdb_files_names))
}
perform_analysis <- function(pdb_id, chain, max_clusters = 10) {
set.seed(42)
chain_a_xyz <- get_matrix(pdb_id, chain)
xyz_matrix <- do.call(rbind, chain_a_xyz)
# Identify unique rows and their indices
unique_indices <- !duplicated(xyz_matrix)
xyz_matrix <- xyz_matrix[unique_indices, ]
# Create a data frame for PCA results
pdb_files_names <- names(chain_a_xyz)[unique_indices]
# trim pdb_files_names and xyz_matrix to points which are not outliers in PCA space
if (TRUE) {
#if (FALSE) {
trimmed_pca_csv_file <- paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv')
results <- trim_data(trimmed_pca_csv_file, xyz_matrix, pdb_files_names)
pdb_files_names <- results$trimmed_pdb_files_names
xyz_matrix <- results$trimmed_xyz_matrix
}
print(dim(xyz_matrix))
# t-SNE Analysis
tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 25, verbose = TRUE)
#tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 30, verbose = TRUE)
tsne_data <- data.frame(pdb_file = pdb_files_names, TSNE1 = tsne_result$Y[, 1], TSNE2 = tsne_result$Y[, 2])
# Determine optimal clusters for t-SNE results
tsne_clusters_info <- determine_optimal_clusters(tsne_data[, c("TSNE1", "TSNE2")], max_clusters)
optimal_tsne_clusters <- tsne_clusters_info$optimal_clusters
print(paste("Optimal number of clusters for t-SNE:", optimal_tsne_clusters))
tsne_clusters <- kmeans(tsne_data[, c("TSNE1", "TSNE2")], centers = optimal_tsne_clusters)$cluster
tsne_data$Cluster <- factor(tsne_clusters)
# Save t-SNE cluster plots
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Elbow_plot.png'), plot = tsne_clusters_info$elbow_plot)
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Silhouette_plot.png'), plot = tsne_clusters_info$silhouette_plot)
# Write t-SNE results to CSV
write.csv(tsne_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_TSNE_chain_', chain, '_data.csv'), row.names = FALSE)
# UMAP Analysis
umap_result <- umap(xyz_matrix)
umap_data <- data.frame(pdb_file = pdb_files_names, UMAP1 = umap_result$layout[, 1], UMAP2 = umap_result$layout[, 2])
# Determine optimal clusters for UMAP results
umap_clusters_info <- determine_optimal_clusters(umap_data[, c("UMAP1", "UMAP2")], max_clusters)
optimal_umap_clusters <- umap_clusters_info$optimal_clusters
print(paste("Optimal number of clusters for UMAP:", optimal_umap_clusters))
umap_clusters <- kmeans(umap_data[, c("UMAP1", "UMAP2")], centers = optimal_umap_clusters)$cluster
umap_data$Cluster <- factor(umap_clusters)
# Save UMAP cluster plots
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_Elbow_plot.png'), plot = umap_clusters_info$elbow_plot)
ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_Silhouette_plot.png'), plot = umap_clusters_info$silhouette_plot)
# Write UMAP results to CSV
write.csv(umap_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_chain_', chain, '_data.csv'), row.names = FALSE)
# Read consensus data
consensus_data <- read_consensus_data(pdb_id, pdb_files_names)
# Plot the results
plots <- plot_dimensional_reduction(tsne_data, umap_data, consensus_data, pdb_id, chain)
# Print interactive plots to display in R
print(plots$tsne_plotly_clusters)
print(plots$tsne_plotly_heatmap)
print(plots$tsne_plotly_special)
print(plots$umap_plotly_clusters)
print(plots$umap_plotly_heatmap)
print(plots$umap_plotly_special)
plots <- plot_cluster_CSPRank_boxplot(tsne_data, umap_data, consensus_data, pdb_id, chain)
print(plots$tsne_cluster_CSPRank_boxplot)
print(plots$umap_cluster_CSPRank_boxplot)
}
chain <- 'B'
pdb_id <- '5OEO'
perform_analysis(pdb_id, chain)
max_clusters <- 10
pdb_ids <- c('2nmb','2hug', '2nd1', '2jw1', '5m9d', '5xv8', '2kwv')
chains <- c('B')
for (pdb_id in pdb_ids) {
for (chain in chains) {
print(paste0('Running ', pdb_id, ' with chain ', chain))
perform_analysis(pdb_id, chain)
}
}