tSNE_UMAP_two_plot.R


library("bio3d")
library('dplyr')
library('ggplot2')
library('Rtsne')
library('umap')
library('cluster')
library('factoextra')
library('plotly')

print(getwd())
setwd('/home/tiburon/Desktop/ROT4/CSPRank_ES_Processing')
print(getwd())

CSP_Rank_Scores <- './CSP_Rank_Scores/'
PDB_FILES <- './PDB_FILES/'
experimental_structures <- paste0(PDB_FILES, 'experimental_structures/')
computational_structures <- paste0(PDB_FILES, 'computational_structures/')
CS_Lists <- './CS_Lists/'
CS_Predictions <- './CS_Predictions/'
CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/'

get_matrix <- function(pdb_id, chain) {

  dir_path <- paste0(PDB_FILES, pdb_id, "_aligned")
  path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned")
  alt_dir_path <- paste0(PDB_FILES, pdb_id, "_alt_aligned")
  path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned")
  pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$")
  alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$")

  csp_data <- read.csv(paste0(CSP_Rank_Scores, 'CSP_', tolower(pdb_id), "_CSpred.csv"))

  chain_a_xyz <- list()

  for (file in pdb_files) {
    rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file))
    row <- rows[1, ]

    if (!is.na(row$F1)) {
      pdb <- read.pdb(file.path(dir_path, file))
      chain_to_analyze <- pdb
      if (chain == 'A') {
        chain_to_analyze <- trim.pdb(pdb, chain = "A")
      } else if ( chain == 'B' ) {
        chain_to_analyze <- trim.pdb(pdb, chain = "B")
      }
      chain_a_xyz[[file]] <- chain_to_analyze$xyz
    }
  }

  len_AFS <- length(chain_a_xyz)
  for (file in alt_pdb_files) {
    rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file))
    row <- rows[1, ]

    if (!is.na(row$F1)) {
      pdb <- read.pdb(file.path(alt_dir_path, file))
      chain_to_analyze <- pdb
      if (chain == 'A') {
        chain_to_analyze <- trim.pdb(pdb, chain = "A")
      } else if ( chain == 'B' ) {
        chain_to_analyze <- trim.pdb(pdb, chain = "B")
      }
      chain_a_xyz[[file]] <- chain_to_analyze$xyz
    }
  }

  if (len_AFS == length(chain_a_xyz)) {
    print("DID YOU RUN add_aligned_suffix.py?")
  }

  special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb")
  special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/')
  pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
  chain_to_analyze <- pdb
  if (chain == 'A') {
    chain_to_analyze <- trim.pdb(pdb, chain = "A")
  } else if ( chain == 'B' ) {
    chain_to_analyze <- trim.pdb(pdb, chain = "B")
  }
  chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz

  special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb")
  special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/')
  pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
  chain_to_analyze <- pdb
  if (chain == 'A') {
    chain_to_analyze <- trim.pdb(pdb, chain = "A")
  } else if ( chain == 'B' ) {
    chain_to_analyze <- trim.pdb(pdb, chain = "B")
  }
  chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz


  pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$")

  for (file in pdb_files) {
    pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file))
    chain_to_analyze <- pdb
    if (chain == 'A') {
      chain_to_analyze <- trim.pdb(pdb, chain = "A")
    } else if ( chain == 'B' ) {
      chain_to_analyze <- trim.pdb(pdb, chain = "B")
    }
    chain_a_xyz[[file]] <- chain_to_analyze$xyz
  }

  return(chain_a_xyz)
}

read_consensus_data <- function(pdb_id, pdb_files_names) {
  csv_path <- paste0(CSP_Rank_Scores,"CSP_", tolower(pdb_id), "_CSpred.csv")
  csp_data <- read.csv(csv_path)

  # Filter out rows without a value in 'Confidence'
  if (FALSE){
    csp_data <- csp_data[!is.na(csp_data$Confidence) & csp_data$Confidence != 0, ]
  }

  # Extract the basename of holo_model_path for comparison
  csp_data$basename_holo_model_path <- basename(csp_data$holo_model_path)

  # Filter and ensure the order matches pdb_files_names
  filtered_data <- csp_data %>% filter(basename_holo_model_path %in% pdb_files_names)
  filtered_data <- filtered_data[match(pdb_files_names, filtered_data$basename_holo_model_path), ]

  consensus_values <- filtered_data$consensus
  filtered_data2 <- na.omit(filtered_data)
  if (FALSE){
    consensus_values <- 2 * filtered_data$consensus * filtered_data$DP / (filtered_data$consensus + filtered_data$DP)
    #consensus_values <- filtered_data$consensus * (filtered_data$DP - min(filtered_data2$DP)) / (max(filtered_data2$DP) - min(filtered_data2$DP))
    #consensus_values <- filtered_data$consensus * (filtered_data$RPF_RECALL - min(filtered_data2$RPF_RECALL)) / (max(filtered_data2$RPF_RECALL) - min(filtered_data2$RPF_RECALL))
    #consensus_values <- filtered_data$DP
  }
  confidence_values <- filtered_data$Confidence
  plddt_values <- filtered_data$plddt

  special_cases <- ifelse(grepl(paste0("exp_", tolower(pdb_id)), filtered_data$holo_model_path), "green",
                          ifelse(grepl(paste0("comp_", tolower(pdb_id)), filtered_data$holo_model_path), "purple",
                          ifelse(grepl('v3_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'blue',
                          ifelse(grepl('v2_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'pink',
                          ifelse(grepl('dropout', filtered_data$holo_model_path), 'red',
                          ifelse(grepl('v2_', filtered_data$holo_model_path), "cyan",
                          ifelse(grepl('notemplate', filtered_data$holo_model_path), "orange",
                          ifelse(grepl('multimer', filtered_data$holo_model_path), "yellow", "NA"))))))))

  return(list(consensus = consensus_values, plddt = plddt_values, Confidence = confidence_values, special_cases = special_cases, basename_holo_model_path = filtered_data$basename_holo_model_path))
}

determine_optimal_clusters <- function(data, max_clusters = 20) {
  wss <- sapply(1:max_clusters, function(k) {
    kmeans(data, k, nstart = 10)$tot.withinss
  })
  elbow_plot <- qplot(1:max_clusters, wss, geom = "line") +
    geom_point() +
    ggtitle("Elbow Method for Optimal Clusters") +
    xlab("Number of Clusters") +
    ylab("Total Within-Cluster Sum of Squares")

  silhouette_scores <- sapply(2:max_clusters, function(k) {
    km <- kmeans(data, centers = k, nstart = 10)
    ss <- silhouette(km$cluster, dist(data))
    mean(ss[, 3])
  })
  silhouette_plot <- qplot(2:max_clusters, silhouette_scores, geom = "line") +
    geom_point() +
    ggtitle("Silhouette Method for Optimal Clusters") +
    xlab("Number of Clusters") +
    ylab("Average Silhouette Score")

  optimal_clusters <- which.max(silhouette_scores) + 1

  return(list(optimal_clusters = optimal_clusters,
              elbow_plot = elbow_plot,
              silhouette_plot = silhouette_plot))
}

plot_dimensional_reduction <- function(tsne_data, umap_data, consensus_data, pdb_id, chain) {
  special_cases <- consensus_data$special_cases
  consensus_values <- consensus_data$consensus
  consensus_data$bayesian_values <- consensus_data$consensus * consensus_data$Confidence
  #consensus_data$bayesian_values <- consensus_data$consensus * consensus_data$plddt

  consensus_data$basename_holo_model_path
  # Define the mapping for special cases
  special_cases_mapping1 <- c("NMR Model" = "green", "AF2 Baseline" = "purple", "AFSample v2.2" = "cyan", "AFAlt" = "orange", "AFSample v2.1" = "yellow", "AFSample2 v2.3" = "blue", "AFSample2 v2.1" = "pink", "AFSample2 v1" = "red", "NA" = "gray")
  special_cases_mapping2 <- c("green" = "NMR Model", "purple"= "AF2 Baseline", "cyan" = "AFSample v2.2", "orange" = "AFAlt", "yellow" = "AFSample v2.1", "blue" = "AFSample2 v2.3", "pink" = "AFSample2 v2.1", "red" = "AFSample2 v1",  "gray" = "NA")
  special_cases <- factor(special_cases, levels = names(special_cases_mapping2))
  levels(special_cases) <- names(special_cases_mapping1)

  # Assign colors to tsne_data$Special based on special_cases_mapping
  tsne_data$Special <- special_cases
  umap_data$Special <- special_cases

  tsne_data$pdb_file
  # Set consensus values for color mapping
  tsne_data$Color <- consensus_values
  umap_data$Color <- consensus_values

  if (TRUE){
    tsne_data$Color <- consensus_data$bayesian_values
    umap_data$Color <- consensus_data$bayesian_values
  }

  # t-SNE Plot 1: Clusters
  tsne_plot_clusters <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Cluster, text = pdb_file)) +
    geom_point() +
    scale_color_discrete(name = "Cluster") +
    ggtitle(paste("t-SNE Plot for PDB ID=", pdb_id)) +
    theme_minimal()

  tsne_plotly_clusters <- ggplotly(tsne_plot_clusters, tooltip = "text")

  # t-SNE Plot 2: Heatmap of consensus values
  tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) +
    geom_point() +
    scale_color_gradient(low = "blue", high = "red", name = "Score") +
    ggtitle(paste("t-SNE Heatmap for PDB ID=", pdb_id)) +
    theme_minimal()

  tsne_plotly_heatmap <- ggplotly(tsne_plot_heatmap, tooltip = "text")

  # t-SNE Plot 3: Special cases
  tsne_plot_special <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Special, text = pdb_file)) +
    geom_point() +
    scale_color_manual(values = special_cases_mapping1, name = "Model Source") +
    ggtitle(paste("t-SNE for PDB ID=", pdb_id)) +
    theme_minimal()

  tsne_plotly_special <- ggplotly(tsne_plot_special, tooltip = "text")

  # Save static t-SNE plots
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_clusters_plot.png'), plot = tsne_plot_clusters)
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_heatmap_plot.png'), plot = tsne_plot_heatmap)
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_special_plot.png'), plot = tsne_plot_special)

  # UMAP Plot 1: Clusters
  umap_plot_clusters <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = Cluster, text = pdb_file)) +
    geom_point() +
    scale_color_discrete(name = "Cluster") +
    ggtitle(paste("UMAP Plot for PDB ID=", pdb_id)) +
    theme_minimal()

  umap_plotly_clusters <- ggplotly(umap_plot_clusters, tooltip = "text")

  # UMAP Plot 2: Heatmap of consensus values
  umap_plot_heatmap <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) +
    geom_point() +
    scale_color_gradient(low = "blue", high = "red", name = "Score") +
    ggtitle(paste("UMAP Heatmap for PDB ID=", pdb_id)) +
    theme_minimal()

  umap_plotly_heatmap <- ggplotly(umap_plot_heatmap, tooltip = "text")

  # UMAP Plot 3: Special cases
  umap_plot_special <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = Special, text = pdb_file)) +
    geom_point() +
    scale_color_manual(values = special_cases_mapping1, name = "Model Source") +
    ggtitle(paste("UMAP for PDB ID=", pdb_id)) +
    theme_minimal()

  umap_plotly_special <- ggplotly(umap_plot_special, tooltip = "text")

  # Save static UMAP plots
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_clusters_plot.png'), plot = umap_plot_clusters)
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_heatmap_plot.png'), plot = umap_plot_heatmap)
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_special_plot.png'), plot = umap_plot_special)

  return(list(tsne_plotly_clusters = tsne_plotly_clusters, tsne_plotly_heatmap = tsne_plotly_heatmap,
              tsne_plotly_special = tsne_plotly_special, umap_plotly_clusters = umap_plotly_clusters,
              umap_plotly_heatmap = umap_plotly_heatmap, umap_plotly_special = umap_plotly_special))
}


plot_cluster_CSPRank_boxplot <- function(tsne_data, umap_data, consensus_data, pdb_id, chain) {

  # Match tsne_data$pdb_file to consensus_data$basename_holo_model_path
  matched_data <- merge(tsne_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path")

  # Create the boxplot using ggplot2
  #tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) +
  tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) +
    geom_boxplot() +
    labs(x = "Cluster", y = "metric", title = paste("t-SNE Cluster CSP Bayesian model selection metric")) +
    theme_minimal() +
    scale_fill_discrete(name = "Cluster")

  # Save the t-SNE plot
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_tsne_boxplot.png'), plot = tsne_cluster_CSPRank_boxplot)

  # Match tsne_data$pdb_file to consensus_data$basename_holo_model_path
  matched_data <- merge(umap_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path")

  # Create the boxplot using ggplot2
  #umap_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) +
  umap_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) +
    geom_boxplot() +
    labs(x = "Cluster", y = "metric", title = paste("UMAP Cluster CSP Bayesian model selection metric")) +
    theme_minimal() +
    scale_fill_discrete(name = "Cluster")

  # Save the UMAP plot
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_umap_boxplot.png'), plot = umap_cluster_CSPRank_boxplot)

  return(list(tsne_cluster_CSPRank_boxplot = tsne_cluster_CSPRank_boxplot, umap_cluster_CSPRank_boxplot = umap_cluster_CSPRank_boxplot))

}
# Function to read the output CSV file and trim the data
trim_data <- function(csv_file_path, xyz_matrix, pdb_files_names) {
  # Read the CSV file
  result_data <- read.csv(csv_file_path)

  # Extract the pdb_file names from the result_data
  remaining_pdb_files <- result_data$pdb_file  # Assuming 'pdb_file' is the column in the CSV with the pdb file names

  # Create a logical vector indicating which pdb_files_names are in the remaining_pdb_files
  keep_indices <- pdb_files_names %in% remaining_pdb_files

  # Trim the xyz_matrix and pdb_files_names using the logical vector
  trimmed_xyz_matrix <- xyz_matrix[keep_indices, ]
  trimmed_pdb_files_names <- pdb_files_names[keep_indices]

  return(list(trimmed_xyz_matrix = trimmed_xyz_matrix, trimmed_pdb_files_names = trimmed_pdb_files_names))
}

perform_analysis <- function(pdb_id, chain, max_clusters = 10) {
  set.seed(42)
  chain_a_xyz <- get_matrix(pdb_id, chain)
  xyz_matrix <- do.call(rbind, chain_a_xyz)

  # Identify unique rows and their indices
  unique_indices <- !duplicated(xyz_matrix)
  xyz_matrix <- xyz_matrix[unique_indices, ]

  # Create a data frame for PCA results
  pdb_files_names <- names(chain_a_xyz)[unique_indices]
  # trim pdb_files_names and xyz_matrix to points which are not outliers in PCA space
  if (TRUE) {
  #if (FALSE) {
    trimmed_pca_csv_file <-  paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv')
    results <- trim_data(trimmed_pca_csv_file, xyz_matrix, pdb_files_names)

    pdb_files_names <- results$trimmed_pdb_files_names
    xyz_matrix <- results$trimmed_xyz_matrix
  }

  print(dim(xyz_matrix))
  # t-SNE Analysis
  tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 25, verbose = TRUE)
  #tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 30, verbose = TRUE)
  tsne_data <- data.frame(pdb_file = pdb_files_names, TSNE1 = tsne_result$Y[, 1], TSNE2 = tsne_result$Y[, 2])

  # Determine optimal clusters for t-SNE results
  tsne_clusters_info <- determine_optimal_clusters(tsne_data[, c("TSNE1", "TSNE2")], max_clusters)
  optimal_tsne_clusters <- tsne_clusters_info$optimal_clusters
  print(paste("Optimal number of clusters for t-SNE:", optimal_tsne_clusters))
  tsne_clusters <- kmeans(tsne_data[, c("TSNE1", "TSNE2")], centers = optimal_tsne_clusters)$cluster
  tsne_data$Cluster <- factor(tsne_clusters)

  # Save t-SNE cluster plots
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Elbow_plot.png'), plot = tsne_clusters_info$elbow_plot)
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Silhouette_plot.png'), plot = tsne_clusters_info$silhouette_plot)

  # Write t-SNE results to CSV
  write.csv(tsne_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_TSNE_chain_', chain, '_data.csv'), row.names = FALSE)

  # UMAP Analysis
  umap_result <- umap(xyz_matrix)
  umap_data <- data.frame(pdb_file = pdb_files_names, UMAP1 = umap_result$layout[, 1], UMAP2 = umap_result$layout[, 2])

  # Determine optimal clusters for UMAP results
  umap_clusters_info <- determine_optimal_clusters(umap_data[, c("UMAP1", "UMAP2")], max_clusters)
  optimal_umap_clusters <- umap_clusters_info$optimal_clusters
  print(paste("Optimal number of clusters for UMAP:", optimal_umap_clusters))
  umap_clusters <- kmeans(umap_data[, c("UMAP1", "UMAP2")], centers = optimal_umap_clusters)$cluster
  umap_data$Cluster <- factor(umap_clusters)

  # Save UMAP cluster plots
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_Elbow_plot.png'), plot = umap_clusters_info$elbow_plot)
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_Silhouette_plot.png'), plot = umap_clusters_info$silhouette_plot)

  # Write UMAP results to CSV
  write.csv(umap_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_chain_', chain, '_data.csv'), row.names = FALSE)

  # Read consensus data
  consensus_data <- read_consensus_data(pdb_id, pdb_files_names)

  # Plot the results
  plots <- plot_dimensional_reduction(tsne_data, umap_data, consensus_data, pdb_id, chain)

  # Print interactive plots to display in R
  print(plots$tsne_plotly_clusters)
  print(plots$tsne_plotly_heatmap)
  print(plots$tsne_plotly_special)
  print(plots$umap_plotly_clusters)
  print(plots$umap_plotly_heatmap)
  print(plots$umap_plotly_special)

  plots <- plot_cluster_CSPRank_boxplot(tsne_data, umap_data, consensus_data, pdb_id, chain)
  print(plots$tsne_cluster_CSPRank_boxplot)
  print(plots$umap_cluster_CSPRank_boxplot)
}

chain <- 'B'
pdb_id <- '5OEO'
perform_analysis(pdb_id, chain)
max_clusters <- 10

pdb_ids <- c('2nmb','2hug', '2nd1', '2jw1', '5m9d', '5xv8', '2kwv')
chains <- c('B')
for (pdb_id in pdb_ids) {
  for (chain in chains) {
    print(paste0('Running ', pdb_id, ' with chain ', chain))
    perform_analysis(pdb_id, chain)
  }
}

	library("bio3d")
	library('dplyr')
	library('ggplot2')
	library('Rtsne')
	library('umap')
	library('cluster')
	library('factoextra')
	library('plotly')

	print(getwd())
	setwd('/home/tiburon/Desktop/ROT4/CSPRank_ES_Processing')
	print(getwd())

	CSP_Rank_Scores <- './CSP_Rank_Scores/'
	PDB_FILES <- './PDB_FILES/'
	experimental_structures <- paste0(PDB_FILES, 'experimental_structures/')
	computational_structures <- paste0(PDB_FILES, 'computational_structures/')
	CS_Lists <- './CS_Lists/'
	CS_Predictions <- './CS_Predictions/'
	CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/'

	get_matrix <- function(pdb_id, chain) {

	dir_path <- paste0(PDB_FILES, pdb_id, "_aligned")
	path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned")
	alt_dir_path <- paste0(PDB_FILES, pdb_id, "_alt_aligned")
	path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned")
	pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$")
	alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$")

	csp_data <- read.csv(paste0(CSP_Rank_Scores, 'CSP_', tolower(pdb_id), "_CSpred.csv"))

	chain_a_xyz <- list()

	for (file in pdb_files) {
	rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file))
	row <- rows[1, ]

	if (!is.na(row$F1)) {
	pdb <- read.pdb(file.path(dir_path, file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[file]] <- chain_to_analyze$xyz
	}
	}

	len_AFS <- length(chain_a_xyz)
	for (file in alt_pdb_files) {
	rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file))
	row <- rows[1, ]

	if (!is.na(row$F1)) {
	pdb <- read.pdb(file.path(alt_dir_path, file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[file]] <- chain_to_analyze$xyz
	}
	}

	if (len_AFS == length(chain_a_xyz)) {
	print("DID YOU RUN add_aligned_suffix.py?")
	}

	special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb")
	special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/')
	pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz

	special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb")
	special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/')
	pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz


	pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$")

	for (file in pdb_files) {
	pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[file]] <- chain_to_analyze$xyz
	}

	return(chain_a_xyz)
	}

	read_consensus_data <- function(pdb_id, pdb_files_names) {
	csv_path <- paste0(CSP_Rank_Scores,"CSP_", tolower(pdb_id), "_CSpred.csv")
	csp_data <- read.csv(csv_path)

	# Filter out rows without a value in 'Confidence'
	if (FALSE){
	csp_data <- csp_data[!is.na(csp_data$Confidence) & csp_data$Confidence != 0, ]
	}

	# Extract the basename of holo_model_path for comparison
	csp_data$basename_holo_model_path <- basename(csp_data$holo_model_path)

	# Filter and ensure the order matches pdb_files_names
	filtered_data <- csp_data %>% filter(basename_holo_model_path %in% pdb_files_names)
	filtered_data <- filtered_data[match(pdb_files_names, filtered_data$basename_holo_model_path), ]

	consensus_values <- filtered_data$consensus
	filtered_data2 <- na.omit(filtered_data)
	if (FALSE){
	consensus_values <- 2 * filtered_data$consensus * filtered_data$DP / (filtered_data$consensus + filtered_data$DP)
	#consensus_values <- filtered_data$consensus * (filtered_data$DP - min(filtered_data2$DP)) / (max(filtered_data2$DP) - min(filtered_data2$DP))
	#consensus_values <- filtered_data$consensus * (filtered_data$RPF_RECALL - min(filtered_data2$RPF_RECALL)) / (max(filtered_data2$RPF_RECALL) - min(filtered_data2$RPF_RECALL))
	#consensus_values <- filtered_data$DP
	}
	confidence_values <- filtered_data$Confidence
	plddt_values <- filtered_data$plddt

	special_cases <- ifelse(grepl(paste0("exp_", tolower(pdb_id)), filtered_data$holo_model_path), "green",
	ifelse(grepl(paste0("comp_", tolower(pdb_id)), filtered_data$holo_model_path), "purple",
	ifelse(grepl('v3_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'blue',
	ifelse(grepl('v2_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'pink',
	ifelse(grepl('dropout', filtered_data$holo_model_path), 'red',
	ifelse(grepl('v2_', filtered_data$holo_model_path), "cyan",
	ifelse(grepl('notemplate', filtered_data$holo_model_path), "orange",
	ifelse(grepl('multimer', filtered_data$holo_model_path), "yellow", "NA"))))))))

	return(list(consensus = consensus_values, plddt = plddt_values, Confidence = confidence_values, special_cases = special_cases, basename_holo_model_path = filtered_data$basename_holo_model_path))
	}

	determine_optimal_clusters <- function(data, max_clusters = 20) {
	wss <- sapply(1:max_clusters, function(k) {
	kmeans(data, k, nstart = 10)$tot.withinss
	})
	elbow_plot <- qplot(1:max_clusters, wss, geom = "line") +
	geom_point() +
	ggtitle("Elbow Method for Optimal Clusters") +
	xlab("Number of Clusters") +
	ylab("Total Within-Cluster Sum of Squares")

	silhouette_scores <- sapply(2:max_clusters, function(k) {
	km <- kmeans(data, centers = k, nstart = 10)
	ss <- silhouette(km$cluster, dist(data))
	mean(ss[, 3])
	})
	silhouette_plot <- qplot(2:max_clusters, silhouette_scores, geom = "line") +
	geom_point() +
	ggtitle("Silhouette Method for Optimal Clusters") +
	xlab("Number of Clusters") +
	ylab("Average Silhouette Score")

	optimal_clusters <- which.max(silhouette_scores) + 1

	return(list(optimal_clusters = optimal_clusters,
	elbow_plot = elbow_plot,
	silhouette_plot = silhouette_plot))
	}

	plot_dimensional_reduction <- function(tsne_data, umap_data, consensus_data, pdb_id, chain) {
	special_cases <- consensus_data$special_cases
	consensus_values <- consensus_data$consensus
	consensus_data$bayesian_values <- consensus_data$consensus * consensus_data$Confidence
	#consensus_data$bayesian_values <- consensus_data$consensus * consensus_data$plddt

	consensus_data$basename_holo_model_path
	# Define the mapping for special cases
	special_cases_mapping1 <- c("NMR Model" = "green", "AF2 Baseline" = "purple", "AFSample v2.2" = "cyan", "AFAlt" = "orange", "AFSample v2.1" = "yellow", "AFSample2 v2.3" = "blue", "AFSample2 v2.1" = "pink", "AFSample2 v1" = "red", "NA" = "gray")
	special_cases_mapping2 <- c("green" = "NMR Model", "purple"= "AF2 Baseline", "cyan" = "AFSample v2.2", "orange" = "AFAlt", "yellow" = "AFSample v2.1", "blue" = "AFSample2 v2.3", "pink" = "AFSample2 v2.1", "red" = "AFSample2 v1", "gray" = "NA")
	special_cases <- factor(special_cases, levels = names(special_cases_mapping2))
	levels(special_cases) <- names(special_cases_mapping1)

	# Assign colors to tsne_data$Special based on special_cases_mapping
	tsne_data$Special <- special_cases
	umap_data$Special <- special_cases

	tsne_data$pdb_file
	# Set consensus values for color mapping
	tsne_data$Color <- consensus_values
	umap_data$Color <- consensus_values

	if (TRUE){
	tsne_data$Color <- consensus_data$bayesian_values
	umap_data$Color <- consensus_data$bayesian_values
	}

	# t-SNE Plot 1: Clusters
	tsne_plot_clusters <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Cluster, text = pdb_file)) +
	geom_point() +
	scale_color_discrete(name = "Cluster") +
	ggtitle(paste("t-SNE Plot for PDB ID=", pdb_id)) +
	theme_minimal()

	tsne_plotly_clusters <- ggplotly(tsne_plot_clusters, tooltip = "text")

	# t-SNE Plot 2: Heatmap of consensus values
	tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) +
	geom_point() +
	scale_color_gradient(low = "blue", high = "red", name = "Score") +
	ggtitle(paste("t-SNE Heatmap for PDB ID=", pdb_id)) +
	theme_minimal()

	tsne_plotly_heatmap <- ggplotly(tsne_plot_heatmap, tooltip = "text")

	# t-SNE Plot 3: Special cases
	tsne_plot_special <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Special, text = pdb_file)) +
	geom_point() +
	scale_color_manual(values = special_cases_mapping1, name = "Model Source") +
	ggtitle(paste("t-SNE for PDB ID=", pdb_id)) +
	theme_minimal()

	tsne_plotly_special <- ggplotly(tsne_plot_special, tooltip = "text")

	# Save static t-SNE plots
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_clusters_plot.png'), plot = tsne_plot_clusters)
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_heatmap_plot.png'), plot = tsne_plot_heatmap)
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_special_plot.png'), plot = tsne_plot_special)

	# UMAP Plot 1: Clusters
	umap_plot_clusters <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = Cluster, text = pdb_file)) +
	geom_point() +
	scale_color_discrete(name = "Cluster") +
	ggtitle(paste("UMAP Plot for PDB ID=", pdb_id)) +
	theme_minimal()

	umap_plotly_clusters <- ggplotly(umap_plot_clusters, tooltip = "text")

	# UMAP Plot 2: Heatmap of consensus values
	umap_plot_heatmap <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) +
	geom_point() +
	scale_color_gradient(low = "blue", high = "red", name = "Score") +
	ggtitle(paste("UMAP Heatmap for PDB ID=", pdb_id)) +
	theme_minimal()

	umap_plotly_heatmap <- ggplotly(umap_plot_heatmap, tooltip = "text")

	# UMAP Plot 3: Special cases
	umap_plot_special <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = Special, text = pdb_file)) +
	geom_point() +
	scale_color_manual(values = special_cases_mapping1, name = "Model Source") +
	ggtitle(paste("UMAP for PDB ID=", pdb_id)) +
	theme_minimal()

	umap_plotly_special <- ggplotly(umap_plot_special, tooltip = "text")

	# Save static UMAP plots
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_clusters_plot.png'), plot = umap_plot_clusters)
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_heatmap_plot.png'), plot = umap_plot_heatmap)
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_special_plot.png'), plot = umap_plot_special)

	return(list(tsne_plotly_clusters = tsne_plotly_clusters, tsne_plotly_heatmap = tsne_plotly_heatmap,
	tsne_plotly_special = tsne_plotly_special, umap_plotly_clusters = umap_plotly_clusters,
	umap_plotly_heatmap = umap_plotly_heatmap, umap_plotly_special = umap_plotly_special))
	}


	plot_cluster_CSPRank_boxplot <- function(tsne_data, umap_data, consensus_data, pdb_id, chain) {

	# Match tsne_data$pdb_file to consensus_data$basename_holo_model_path
	matched_data <- merge(tsne_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path")

	# Create the boxplot using ggplot2
	#tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) +
	tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) +
	geom_boxplot() +
	labs(x = "Cluster", y = "metric", title = paste("t-SNE Cluster CSP Bayesian model selection metric")) +
	theme_minimal() +
	scale_fill_discrete(name = "Cluster")

	# Save the t-SNE plot
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_tsne_boxplot.png'), plot = tsne_cluster_CSPRank_boxplot)

	# Match tsne_data$pdb_file to consensus_data$basename_holo_model_path
	matched_data <- merge(umap_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path")

	# Create the boxplot using ggplot2
	#umap_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) +
	umap_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) +
	geom_boxplot() +
	labs(x = "Cluster", y = "metric", title = paste("UMAP Cluster CSP Bayesian model selection metric")) +
	theme_minimal() +
	scale_fill_discrete(name = "Cluster")

	# Save the UMAP plot
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_umap_boxplot.png'), plot = umap_cluster_CSPRank_boxplot)

	return(list(tsne_cluster_CSPRank_boxplot = tsne_cluster_CSPRank_boxplot, umap_cluster_CSPRank_boxplot = umap_cluster_CSPRank_boxplot))

	}
	# Function to read the output CSV file and trim the data
	trim_data <- function(csv_file_path, xyz_matrix, pdb_files_names) {
	# Read the CSV file
	result_data <- read.csv(csv_file_path)

	# Extract the pdb_file names from the result_data
	remaining_pdb_files <- result_data$pdb_file # Assuming 'pdb_file' is the column in the CSV with the pdb file names

	# Create a logical vector indicating which pdb_files_names are in the remaining_pdb_files
	keep_indices <- pdb_files_names %in% remaining_pdb_files

	# Trim the xyz_matrix and pdb_files_names using the logical vector
	trimmed_xyz_matrix <- xyz_matrix[keep_indices, ]
	trimmed_pdb_files_names <- pdb_files_names[keep_indices]

	return(list(trimmed_xyz_matrix = trimmed_xyz_matrix, trimmed_pdb_files_names = trimmed_pdb_files_names))
	}

	perform_analysis <- function(pdb_id, chain, max_clusters = 10) {
	set.seed(42)
	chain_a_xyz <- get_matrix(pdb_id, chain)
	xyz_matrix <- do.call(rbind, chain_a_xyz)

	# Identify unique rows and their indices
	unique_indices <- !duplicated(xyz_matrix)
	xyz_matrix <- xyz_matrix[unique_indices, ]

	# Create a data frame for PCA results
	pdb_files_names <- names(chain_a_xyz)[unique_indices]
	# trim pdb_files_names and xyz_matrix to points which are not outliers in PCA space
	if (TRUE) {
	#if (FALSE) {
	trimmed_pca_csv_file <- paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv')
	results <- trim_data(trimmed_pca_csv_file, xyz_matrix, pdb_files_names)

	pdb_files_names <- results$trimmed_pdb_files_names
	xyz_matrix <- results$trimmed_xyz_matrix
	}

	print(dim(xyz_matrix))
	# t-SNE Analysis
	tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 25, verbose = TRUE)
	#tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 30, verbose = TRUE)
	tsne_data <- data.frame(pdb_file = pdb_files_names, TSNE1 = tsne_result$Y[, 1], TSNE2 = tsne_result$Y[, 2])

	# Determine optimal clusters for t-SNE results
	tsne_clusters_info <- determine_optimal_clusters(tsne_data[, c("TSNE1", "TSNE2")], max_clusters)
	optimal_tsne_clusters <- tsne_clusters_info$optimal_clusters
	print(paste("Optimal number of clusters for t-SNE:", optimal_tsne_clusters))
	tsne_clusters <- kmeans(tsne_data[, c("TSNE1", "TSNE2")], centers = optimal_tsne_clusters)$cluster
	tsne_data$Cluster <- factor(tsne_clusters)

	# Save t-SNE cluster plots
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Elbow_plot.png'), plot = tsne_clusters_info$elbow_plot)
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Silhouette_plot.png'), plot = tsne_clusters_info$silhouette_plot)

	# Write t-SNE results to CSV
	write.csv(tsne_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_TSNE_chain_', chain, '_data.csv'), row.names = FALSE)

	# UMAP Analysis
	umap_result <- umap(xyz_matrix)
	umap_data <- data.frame(pdb_file = pdb_files_names, UMAP1 = umap_result$layout[, 1], UMAP2 = umap_result$layout[, 2])

	# Determine optimal clusters for UMAP results
	umap_clusters_info <- determine_optimal_clusters(umap_data[, c("UMAP1", "UMAP2")], max_clusters)
	optimal_umap_clusters <- umap_clusters_info$optimal_clusters
	print(paste("Optimal number of clusters for UMAP:", optimal_umap_clusters))
	umap_clusters <- kmeans(umap_data[, c("UMAP1", "UMAP2")], centers = optimal_umap_clusters)$cluster
	umap_data$Cluster <- factor(umap_clusters)

	# Save UMAP cluster plots
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_Elbow_plot.png'), plot = umap_clusters_info$elbow_plot)
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_Silhouette_plot.png'), plot = umap_clusters_info$silhouette_plot)

	# Write UMAP results to CSV
	write.csv(umap_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_UMAP_chain_', chain, '_data.csv'), row.names = FALSE)

	# Read consensus data
	consensus_data <- read_consensus_data(pdb_id, pdb_files_names)

	# Plot the results
	plots <- plot_dimensional_reduction(tsne_data, umap_data, consensus_data, pdb_id, chain)

	# Print interactive plots to display in R
	print(plots$tsne_plotly_clusters)
	print(plots$tsne_plotly_heatmap)
	print(plots$tsne_plotly_special)
	print(plots$umap_plotly_clusters)
	print(plots$umap_plotly_heatmap)
	print(plots$umap_plotly_special)

	plots <- plot_cluster_CSPRank_boxplot(tsne_data, umap_data, consensus_data, pdb_id, chain)
	print(plots$tsne_cluster_CSPRank_boxplot)
	print(plots$umap_cluster_CSPRank_boxplot)
	}

	chain <- 'B'
	pdb_id <- '5OEO'
	perform_analysis(pdb_id, chain)
	max_clusters <- 10

	pdb_ids <- c('2nmb','2hug', '2nd1', '2jw1', '5m9d', '5xv8', '2kwv')
	chains <- c('B')
	for (pdb_id in pdb_ids) {
	for (chain in chains) {
	print(paste0('Running ', pdb_id, ' with chain ', chain))
	perform_analysis(pdb_id, chain)
	}
	}