tSNE_RPF.R


library("bio3d")
library('dplyr')
library('ggplot2')
library('Rtsne')
library('umap')
library('cluster')
library('factoextra')
library('plotly')
library('irlba')
library('igraph')
library('FNN')

print(getwd())
setwd('/home/tiburon/Desktop/ROT4/CSP_Rank')
print(getwd())

CSP_Rank_Scores <- './CSP_Rank_Scores/'
PDB_FILES <- './PDB_FILES/'
experimental_structures <- paste0(PDB_FILES, 'experimental_structures/')
computational_structures <- paste0(PDB_FILES, 'computational_structures/')
CS_Lists <- './CS_Lists/'
CS_Predictions <- './CS_Predictions/'
CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/'

get_matrix <- function(pdb_id, chain) {

  dir_path <- paste0(PDB_FILES, pdb_id, "_aligned")
  path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned")
  alt_dir_path <- dir_path
  path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned")
  pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$")
  alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$")

  csp_data <- read.csv(paste0(CSP_Rank_Scores, 'CSP_', tolower(pdb_id), "_CSpred.csv"))

  chain_a_xyz <- list()

  for (file in pdb_files) {
    rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file))
    row <- rows[1, ]

    if (!is.na(row$F1)) {
      pdb <- read.pdb(file.path(dir_path, file))
      chain_to_analyze <- pdb
      if (chain == 'A') {
        chain_to_analyze <- trim.pdb(pdb, chain = "A")
      } else if ( chain == 'B' ) {
        chain_to_analyze <- trim.pdb(pdb, chain = "B")
      }
      chain_a_xyz[[file]] <- chain_to_analyze$xyz
    }
  }

  len_AFS <- length(chain_a_xyz)
  for (file in alt_pdb_files) {
    rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file))
    row <- rows[1, ]

    if (!is.na(row$F1)) {
      pdb <- read.pdb(file.path(alt_dir_path, file))
      chain_to_analyze <- pdb
      if (chain == 'A') {
        chain_to_analyze <- trim.pdb(pdb, chain = "A")
      } else if ( chain == 'B' ) {
        chain_to_analyze <- trim.pdb(pdb, chain = "B")
      }
      chain_a_xyz[[file]] <- chain_to_analyze$xyz
    }
  }

  if (len_AFS == length(chain_a_xyz)) {
    print("DID YOU RUN add_aligned_suffix.py?")
  }

  special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb")
  special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/')
  pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
  chain_to_analyze <- pdb
  if (chain == 'A') {
    chain_to_analyze <- trim.pdb(pdb, chain = "A")
  } else if ( chain == 'B' ) {
    chain_to_analyze <- trim.pdb(pdb, chain = "B")
  }
  chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz

  special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb")
  special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/')
  pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
  chain_to_analyze <- pdb
  if (chain == 'A') {
    chain_to_analyze <- trim.pdb(pdb, chain = "A")
  } else if ( chain == 'B' ) {
    chain_to_analyze <- trim.pdb(pdb, chain = "B")
  }
  chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz


  pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$")

  for (file in pdb_files) {
    pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file))
    chain_to_analyze <- pdb
    if (chain == 'A') {
      chain_to_analyze <- trim.pdb(pdb, chain = "A")
    } else if ( chain == 'B' ) {
      chain_to_analyze <- trim.pdb(pdb, chain = "B")
    }
    chain_a_xyz[[file]] <- chain_to_analyze$xyz
  }

  return(chain_a_xyz)
}

read_consensus_data <- function(pdb_id, pdb_files_names) {

  # Remove entries from pdb_files_names containing the substring 'exp'
  #pdb_files_names <- pdb_files_names[!grepl('exp', pdb_files_names)]
  #pdb_files_names <- pdb_files_names[!grepl('comp', pdb_files_names)]
  csv_path <- paste0(CSP_Rank_Scores,"CSP_", tolower(pdb_id), "_CSpred.csv")
  csp_data <- read.csv(csv_path)

  # Filter out rows without a value in 'Confidence'
  #if (TRUE){
  #  csp_data <- csp_data[!is.na(csp_data$Confidence) & csp_data$Confidence != 0, ]
  # }

  # Extract the basename of holo_model_path for comparison
  csp_data$basename_holo_model_path <- basename(csp_data$holo_model_path)

  # Filter and ensure the order matches pdb_files_names
  filtered_data <- csp_data %>%
    filter(basename_holo_model_path %in% pdb_files_names)# %>%
  #filter(!grepl('exp', basename_holo_model_path))
  filtered_data <- filtered_data[match(pdb_files_names, filtered_data$basename_holo_model_path), ]

  consensus_values <- filtered_data$consensus
  filtered_data2 <- na.omit(filtered_data)
  if (TRUE){
    #consensus_values <- 2 * filtered_data$consensus * filtered_data$DP / (filtered_data$consensus + filtered_data$DP)
    #consensus_values <- filtered_data$consensus * (filtered_data$DP - min(filtered_data2$DP)) / (max(filtered_data2$DP) - min(filtered_data2$DP))
    consensus_values <- filtered_data$consensus * (filtered_data$RPF_RECALL - min(filtered_data2$RPF_RECALL)) / (max(filtered_data2$RPF_RECALL) - min(filtered_data2$RPF_RECALL))
    #consensus_values <- filtered_data$DP
  }
  if (FALSE){
    consensus_values <- filtered_data$consensus * (1 - filtered_data$Q_score)
  }
  confidence_values <- filtered_data$Confidence
  plddt_values <- filtered_data$plddt

  special_cases <- ifelse(grepl(paste0("exp_", tolower(pdb_id)), filtered_data$holo_model_path), "green",
                          ifelse(grepl(paste0("comp_", tolower(pdb_id)), filtered_data$holo_model_path), "cyan",
                                 ifelse(grepl('v3_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'blue',
                                        ifelse(grepl('v2_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'pink',
                                               ifelse(grepl('dropout', filtered_data$holo_model_path), 'red',
                                                      ifelse(grepl('v2_', filtered_data$holo_model_path), "purple",
                                                             ifelse(grepl('notemplate', filtered_data$holo_model_path), "orange",
                                                                    ifelse(grepl('multimer', filtered_data$holo_model_path), "yellow", "NA"))))))))

  return(list(consensus = consensus_values, plddt = plddt_values, Confidence = confidence_values, special_cases = special_cases, basename_holo_model_path = filtered_data$basename_holo_model_path))
}
determine_optimal_clusters <- function(data, max_clusters = 20) {
  wss <- sapply(1:max_clusters, function(k) {
    kmeans(data, k, nstart = 10)$tot.withinss
  })
  elbow_plot <- qplot(1:max_clusters, wss, geom = "line") +
    geom_point() +
    ggtitle("Elbow Method for Optimal Clusters") +
    xlab("Number of Clusters") +
    ylab("Total Within-Cluster Sum of Squares")

  silhouette_scores <- sapply(2:max_clusters, function(k) {
    km <- kmeans(data, centers = k, nstart = 10)
    ss <- silhouette(km$cluster, dist(data))
    mean(ss[, 3])
  })
  silhouette_plot <- qplot(2:max_clusters, silhouette_scores, geom = "line") +
    geom_point() +
    ggtitle("Silhouette Method for Optimal Clusters") +
    xlab("Number of Clusters") +
    ylab("Average Silhouette Score")

  optimal_clusters <- which.max(silhouette_scores) + 1

  return(list(optimal_clusters = optimal_clusters,
              elbow_plot = elbow_plot,
              silhouette_plot = silhouette_plot))
}

plot_dimensional_reduction <- function(tsne_data, consensus_data, pdb_id, chain) {


  # Remove rows in tsne_data where pdb_file contains the substring "exp"
  #tsne_data <- tsne_data[!grepl('exp', tsne_data$pdb_file), ]
  #tsne_data <- tsne_data[!grepl('comp', tsne_data$pdb_file), ]

  special_cases <- consensus_data$special_cases
  consensus_df <- data.frame(consensus_data)
  print(consensus_df[consensus_df$special_cases == "cyan", ])
  print(consensus_df[consensus_df$special_cases == "green", ])

  consensus_values <- consensus_df$consensus
  negative_consensus_indices <- which(consensus_values <= 0)
  consensus_df <- consensus_df[-negative_consensus_indices, ]
  consensus_values <- consensus_df$consensus

  consensus_values <- ifelse(
    consensus_data$special_cases == "green",
    -1,
    consensus_values
  )
  consensus_values <- ifelse(
    consensus_data$special_cases == "cyan",
    0,
    consensus_values
  )
  negative_consensus_indices <- which(consensus_values <= 0)
  print(consensus_values[negative_consensus_indices])
  print(consensus_data$basename_holo_model_path[negative_consensus_indices])

  consensus_data$bayesian_values <- as.numeric(ifelse(is.na(consensus_data$Confidence) | consensus_data$Confidence == "", -1, consensus_data$consensus * consensus_data$Confidence))
  consensus_data$bayesian_values <- ifelse(grepl('comp', consensus_data$basename_holo_model_path), 0, consensus_data$bayesian_values)

  consensus_data$basename_holo_model_path
  # Define the mapping for special cases
  special_cases_mapping1 <- c("AFSample v2.2" = "purple", "AFAlt" = "orange", "AFSample v2.1" = "yellow", "AFSample2 v2.3" = "blue", "AFSample2 v2.1" = "pink", "AFSample2 v1" = "red", "NA" = "gray", "NMR Model" = "green", "AF2 Baseline" = "cyan")
  special_cases_mapping2 <- c("purple" = "AFSample v2.2", "orange" = "AFAlt", "yellow" = "AFSample v2.1", "blue" = "AFSample2 v2.3", "pink" = "AFSample2 v2.1", "red" = "AFSample2 v1",  "gray" = "NA", "green" = "NMR Model", "cyan"= "AF2 Baseline")
  special_cases <- factor(special_cases, levels = names(special_cases_mapping2))
  levels(special_cases) <- names(special_cases_mapping1)

  # Assign colors to tsne_data$Special based on special_cases_mapping
  tsne_data$Special <- special_cases
  tsne_data$pdb_file
  # Set consensus values for color mapping
  tsne_data$Color <- consensus_values

  if (FALSE){
    tsne_data$Color <- consensus_data$bayesian_values
  }

  # t-SNE Plot 1: Clusters
  tsne_plot_clusters <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Cluster, text = pdb_file)) +
    geom_point() +
    #ggtitle(paste("t-SNE Plot for PDB ID=", pdb_id)) +
    theme(legend.position = "none") +
    theme_minimal()

  tsne_plotly_clusters <- ggplotly(tsne_plot_clusters, tooltip = "text")
  print(tsne_plotly_clusters)

  # t-SNE Plot 2: Heatmap of consensus values
  # tsne_data$Color <- ifelse(tsne_data$Color == -1, "green", tsne_data$Color)
  # tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) +
  #   geom_point() +
  #   scale_color_gradient(low = "blue", high = "red", name = "Posterior") +
  #   ggtitle(paste("Posteriors for ", pdb_id)) +
  #   theme_minimal()

  library(ggplot2)
  tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, text = pdb_file)) +
    # 1) Points where Color != -1 -> color by numeric Posterior
    geom_point(
      data = subset(tsne_data, Color > 0),
      aes(color = as.numeric(Color)),
      size = 2
    ) +
    # 2) Points where Color == -1 -> use a special shape
    geom_point(
      data = subset(tsne_data, Color == -1),
      aes(shape = "NMR models"),    # map them to a discrete shape
      color = "green",             # color them green
      size = 3
    ) +
    # 3) Points where Color == -1 -> use a special shape
    geom_point(
      data = subset(tsne_data, Color == 0),
      aes(shape = "AF2 model"),    # map them to a discrete shape
      color = "cyan",             # color them green
      size = 3
    ) +
    # Set up shape legend for discrete “NMR models”
    scale_shape_manual(
      values = c("NMR models" = 19),   # pick any shape code you like
      name = "Model Type"             # legend title for shape
    ) +
    # Set up shape legend for discrete “NMR models”
    scale_shape_manual(
      values = c("AF2 model" = 19),   # pick any shape code you like
      name = "Model Type"             # legend title for shape
    ) +
    # Set up color gradient for numeric Posterior
    scale_color_gradient(low = "blue", high = "red", name = "Posterior") +
    ggtitle(paste("Posteriors for", pdb_id)) +
    theme_minimal()
  #tsne_plot_heatmap +
  #  guides(
   #   shape = guide_legend(title = NULL)#,       # no title for shape legend
   #   #color = guide_colorbar(title = "Posterior")
  #  )
  tsne_plotly_heatmap <- ggplotly(tsne_plot_heatmap, tooltip = "text")
  print(tsne_plotly_heatmap)

  # t-SNE Plot 3: Special cases
  tsne_plot_special <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Special, text = pdb_file)) +
    geom_point() +
    scale_color_manual(values = special_cases_mapping1, name = "Model Source") +
    ggtitle(paste("t-SNE for ", pdb_id)) +
    theme_minimal()

  tsne_plotly_special <- ggplotly(tsne_plot_special, tooltip = "text")
  print(tsne_plotly_special)

  # Save static t-SNE plots
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_clusters_plot.png'), plot = tsne_plot_clusters)
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_heatmap_plot.png'), plot = tsne_plot_heatmap)
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_special_plot.png'), plot = tsne_plot_special)


  return(list(tsne_plotly_clusters = tsne_plotly_clusters, tsne_plotly_heatmap = tsne_plotly_heatmap,
              tsne_plotly_special = tsne_plotly_special))
}


plot_cluster_CSPRank_boxplot <- function(tsne_data, consensus_data, pdb_id, chain) {

  # Match tsne_data$pdb_file to consensus_data$basename_holo_model_path
  matched_data <- merge(tsne_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path")

  # Create the boxplot using ggplot2
  #tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) +
  tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) +
    geom_boxplot() +
    labs(x = "Cluster", y = "metric", title = paste("t-SNE Cluster CSP Bayesian model selection metric")) +
    theme_minimal() +
    scale_fill_discrete(name = "Cluster")

  # Save the t-SNE plot
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_tsne_boxplot.png'), plot = tsne_cluster_CSPRank_boxplot)


  return(list(tsne_cluster_CSPRank_boxplot = tsne_cluster_CSPRank_boxplot))

}
# Function to read the output CSV file and trim the data
trim_data <- function(csv_file_path, xyz_matrix, pdb_files_names) {
  # Read the CSV file
  result_data <- read.csv(csv_file_path)

  # Extract the pdb_file names from the result_data
  remaining_pdb_files <- result_data$pdb_file  # Assuming 'pdb_file' is the column in the CSV with the pdb file names

  # Create a logical vector indicating which pdb_files_names are in the remaining_pdb_files
  keep_indices <- pdb_files_names %in% remaining_pdb_files

  # Trim the xyz_matrix and pdb_files_names using the logical vector
  trimmed_xyz_matrix <- xyz_matrix[keep_indices, ]
  trimmed_pdb_files_names <- pdb_files_names[keep_indices]

  return(list(trimmed_xyz_matrix = trimmed_xyz_matrix, trimmed_pdb_files_names = trimmed_pdb_files_names))
}

perform_analysis <- function(pdb_id, chain, max_clusters = 10) {
  set.seed(42)
  chain_a_xyz <- get_matrix(pdb_id, chain)
  xyz_matrix <- do.call(rbind, chain_a_xyz)

  # Identify unique rows and their indices
  unique_indices <- !duplicated(xyz_matrix)
  xyz_matrix <- xyz_matrix[unique_indices, ]

  # Create a data frame for PCA results
  pdb_files_names <- names(chain_a_xyz)[unique_indices]
  # trim pdb_files_names and xyz_matrix to points which are not outliers in PCA space
  #if (TRUE) {
  if (FALSE) {
    trimmed_pca_csv_file <-  paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv')
    results <- trim_data(trimmed_pca_csv_file, xyz_matrix, pdb_files_names)

    pdb_files_names <- results$trimmed_pdb_files_names
    xyz_matrix <- results$trimmed_xyz_matrix
  }

  print(dim(xyz_matrix))
  # t-SNE Analysis
  tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 100, verbose = TRUE, max_iter = 3000, partial_pca = TRUE)
  tsne_data <- data.frame(pdb_file = pdb_files_names, TSNE1 = tsne_result$Y[, 1], TSNE2 = tsne_result$Y[, 2])

  # Determine optimal clusters for t-SNE results
  tsne_clusters_info <- determine_optimal_clusters(tsne_data[, c("TSNE1", "TSNE2")], max_clusters)
  optimal_tsne_clusters <- tsne_clusters_info$optimal_clusters
  optimal_tsne_clusters <- 20
  print(paste("Optimal number of clusters for t-SNE:", optimal_tsne_clusters))
  tsne_clusters <- kmeans(tsne_data[, c("TSNE1", "TSNE2")], centers = optimal_tsne_clusters)$cluster
  tsne_data$Cluster <- factor(tsne_clusters)

  # Save t-SNE cluster plots
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Elbow_plot.png'), plot = tsne_clusters_info$elbow_plot)
  ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Silhouette_plot.png'), plot = tsne_clusters_info$silhouette_plot)

  # Write t-SNE results to CSV
  write.csv(tsne_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_TSNE_chain_', chain, '_data.csv'), row.names = FALSE)

  # Read consensus data
  consensus_data <- read_consensus_data(pdb_id, pdb_files_names)

  # Plot the results
  plots <- plot_dimensional_reduction(tsne_data, consensus_data, pdb_id, chain)

  # Print interactive plots to display in R
  print(plots$tsne_plotly_clusters)
  print(plots$tsne_plotly_heatmap)
  print(plots$tsne_plotly_special)
}

max_clusters = 10
chain <- 'B'
pdb_id <- '7JYN'
perform_analysis(pdb_id, chain)

	library("bio3d")
	library('dplyr')
	library('ggplot2')
	library('Rtsne')
	library('umap')
	library('cluster')
	library('factoextra')
	library('plotly')
	library('irlba')
	library('igraph')
	library('FNN')

	print(getwd())
	setwd('/home/tiburon/Desktop/ROT4/CSP_Rank')
	print(getwd())

	CSP_Rank_Scores <- './CSP_Rank_Scores/'
	PDB_FILES <- './PDB_FILES/'
	experimental_structures <- paste0(PDB_FILES, 'experimental_structures/')
	computational_structures <- paste0(PDB_FILES, 'computational_structures/')
	CS_Lists <- './CS_Lists/'
	CS_Predictions <- './CS_Predictions/'
	CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/'

	get_matrix <- function(pdb_id, chain) {

	dir_path <- paste0(PDB_FILES, pdb_id, "_aligned")
	path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned")
	alt_dir_path <- dir_path
	path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned")
	pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$")
	alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$")

	csp_data <- read.csv(paste0(CSP_Rank_Scores, 'CSP_', tolower(pdb_id), "_CSpred.csv"))

	chain_a_xyz <- list()

	for (file in pdb_files) {
	rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file))
	row <- rows[1, ]

	if (!is.na(row$F1)) {
	pdb <- read.pdb(file.path(dir_path, file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[file]] <- chain_to_analyze$xyz
	}
	}

	len_AFS <- length(chain_a_xyz)
	for (file in alt_pdb_files) {
	rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file))
	row <- rows[1, ]

	if (!is.na(row$F1)) {
	pdb <- read.pdb(file.path(alt_dir_path, file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[file]] <- chain_to_analyze$xyz
	}
	}

	if (len_AFS == length(chain_a_xyz)) {
	print("DID YOU RUN add_aligned_suffix.py?")
	}

	special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb")
	special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/')
	pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz

	special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb")
	special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/')
	pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz


	pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$")

	for (file in pdb_files) {
	pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[file]] <- chain_to_analyze$xyz
	}

	return(chain_a_xyz)
	}

	read_consensus_data <- function(pdb_id, pdb_files_names) {

	# Remove entries from pdb_files_names containing the substring 'exp'
	#pdb_files_names <- pdb_files_names[!grepl('exp', pdb_files_names)]
	#pdb_files_names <- pdb_files_names[!grepl('comp', pdb_files_names)]
	csv_path <- paste0(CSP_Rank_Scores,"CSP_", tolower(pdb_id), "_CSpred.csv")
	csp_data <- read.csv(csv_path)

	# Filter out rows without a value in 'Confidence'
	#if (TRUE){
	# csp_data <- csp_data[!is.na(csp_data$Confidence) & csp_data$Confidence != 0, ]
	# }

	# Extract the basename of holo_model_path for comparison
	csp_data$basename_holo_model_path <- basename(csp_data$holo_model_path)

	# Filter and ensure the order matches pdb_files_names
	filtered_data <- csp_data %>%
	filter(basename_holo_model_path %in% pdb_files_names)# %>%
	#filter(!grepl('exp', basename_holo_model_path))
	filtered_data <- filtered_data[match(pdb_files_names, filtered_data$basename_holo_model_path), ]

	consensus_values <- filtered_data$consensus
	filtered_data2 <- na.omit(filtered_data)
	if (TRUE){
	#consensus_values <- 2 * filtered_data$consensus * filtered_data$DP / (filtered_data$consensus + filtered_data$DP)
	#consensus_values <- filtered_data$consensus * (filtered_data$DP - min(filtered_data2$DP)) / (max(filtered_data2$DP) - min(filtered_data2$DP))
	consensus_values <- filtered_data$consensus * (filtered_data$RPF_RECALL - min(filtered_data2$RPF_RECALL)) / (max(filtered_data2$RPF_RECALL) - min(filtered_data2$RPF_RECALL))
	#consensus_values <- filtered_data$DP
	}
	if (FALSE){
	consensus_values <- filtered_data$consensus * (1 - filtered_data$Q_score)
	}
	confidence_values <- filtered_data$Confidence
	plddt_values <- filtered_data$plddt

	special_cases <- ifelse(grepl(paste0("exp_", tolower(pdb_id)), filtered_data$holo_model_path), "green",
	ifelse(grepl(paste0("comp_", tolower(pdb_id)), filtered_data$holo_model_path), "cyan",
	ifelse(grepl('v3_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'blue',
	ifelse(grepl('v2_', filtered_data$holo_model_path) & grepl('dropout', filtered_data$holo_model_path), 'pink',
	ifelse(grepl('dropout', filtered_data$holo_model_path), 'red',
	ifelse(grepl('v2_', filtered_data$holo_model_path), "purple",
	ifelse(grepl('notemplate', filtered_data$holo_model_path), "orange",
	ifelse(grepl('multimer', filtered_data$holo_model_path), "yellow", "NA"))))))))

	return(list(consensus = consensus_values, plddt = plddt_values, Confidence = confidence_values, special_cases = special_cases, basename_holo_model_path = filtered_data$basename_holo_model_path))
	}
	determine_optimal_clusters <- function(data, max_clusters = 20) {
	wss <- sapply(1:max_clusters, function(k) {
	kmeans(data, k, nstart = 10)$tot.withinss
	})
	elbow_plot <- qplot(1:max_clusters, wss, geom = "line") +
	geom_point() +
	ggtitle("Elbow Method for Optimal Clusters") +
	xlab("Number of Clusters") +
	ylab("Total Within-Cluster Sum of Squares")

	silhouette_scores <- sapply(2:max_clusters, function(k) {
	km <- kmeans(data, centers = k, nstart = 10)
	ss <- silhouette(km$cluster, dist(data))
	mean(ss[, 3])
	})
	silhouette_plot <- qplot(2:max_clusters, silhouette_scores, geom = "line") +
	geom_point() +
	ggtitle("Silhouette Method for Optimal Clusters") +
	xlab("Number of Clusters") +
	ylab("Average Silhouette Score")

	optimal_clusters <- which.max(silhouette_scores) + 1

	return(list(optimal_clusters = optimal_clusters,
	elbow_plot = elbow_plot,
	silhouette_plot = silhouette_plot))
	}

	plot_dimensional_reduction <- function(tsne_data, consensus_data, pdb_id, chain) {


	# Remove rows in tsne_data where pdb_file contains the substring "exp"
	#tsne_data <- tsne_data[!grepl('exp', tsne_data$pdb_file), ]
	#tsne_data <- tsne_data[!grepl('comp', tsne_data$pdb_file), ]

	special_cases <- consensus_data$special_cases
	consensus_df <- data.frame(consensus_data)
	print(consensus_df[consensus_df$special_cases == "cyan", ])
	print(consensus_df[consensus_df$special_cases == "green", ])

	consensus_values <- consensus_df$consensus
	negative_consensus_indices <- which(consensus_values <= 0)
	consensus_df <- consensus_df[-negative_consensus_indices, ]
	consensus_values <- consensus_df$consensus

	consensus_values <- ifelse(
	consensus_data$special_cases == "green",
	-1,
	consensus_values
	)
	consensus_values <- ifelse(
	consensus_data$special_cases == "cyan",
	0,
	consensus_values
	)
	negative_consensus_indices <- which(consensus_values <= 0)
	print(consensus_values[negative_consensus_indices])
	print(consensus_data$basename_holo_model_path[negative_consensus_indices])

	consensus_data$bayesian_values <- as.numeric(ifelse(is.na(consensus_data$Confidence) \| consensus_data$Confidence == "", -1, consensus_data$consensus * consensus_data$Confidence))
	consensus_data$bayesian_values <- ifelse(grepl('comp', consensus_data$basename_holo_model_path), 0, consensus_data$bayesian_values)

	consensus_data$basename_holo_model_path
	# Define the mapping for special cases
	special_cases_mapping1 <- c("AFSample v2.2" = "purple", "AFAlt" = "orange", "AFSample v2.1" = "yellow", "AFSample2 v2.3" = "blue", "AFSample2 v2.1" = "pink", "AFSample2 v1" = "red", "NA" = "gray", "NMR Model" = "green", "AF2 Baseline" = "cyan")
	special_cases_mapping2 <- c("purple" = "AFSample v2.2", "orange" = "AFAlt", "yellow" = "AFSample v2.1", "blue" = "AFSample2 v2.3", "pink" = "AFSample2 v2.1", "red" = "AFSample2 v1", "gray" = "NA", "green" = "NMR Model", "cyan"= "AF2 Baseline")
	special_cases <- factor(special_cases, levels = names(special_cases_mapping2))
	levels(special_cases) <- names(special_cases_mapping1)

	# Assign colors to tsne_data$Special based on special_cases_mapping
	tsne_data$Special <- special_cases
	tsne_data$pdb_file
	# Set consensus values for color mapping
	tsne_data$Color <- consensus_values

	if (FALSE){
	tsne_data$Color <- consensus_data$bayesian_values
	}

	# t-SNE Plot 1: Clusters
	tsne_plot_clusters <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Cluster, text = pdb_file)) +
	geom_point() +
	#ggtitle(paste("t-SNE Plot for PDB ID=", pdb_id)) +
	theme(legend.position = "none") +
	theme_minimal()

	tsne_plotly_clusters <- ggplotly(tsne_plot_clusters, tooltip = "text")
	print(tsne_plotly_clusters)

	# t-SNE Plot 2: Heatmap of consensus values
	# tsne_data$Color <- ifelse(tsne_data$Color == -1, "green", tsne_data$Color)
	# tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Color, text = paste0("PDB File:", pdb_file, "\nCSPRank:", Color))) +
	# geom_point() +
	# scale_color_gradient(low = "blue", high = "red", name = "Posterior") +
	# ggtitle(paste("Posteriors for ", pdb_id)) +
	# theme_minimal()

	library(ggplot2)
	tsne_plot_heatmap <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, text = pdb_file)) +
	# 1) Points where Color != -1 -> color by numeric Posterior
	geom_point(
	data = subset(tsne_data, Color > 0),
	aes(color = as.numeric(Color)),
	size = 2
	) +
	# 2) Points where Color == -1 -> use a special shape
	geom_point(
	data = subset(tsne_data, Color == -1),
	aes(shape = "NMR models"), # map them to a discrete shape
	color = "green", # color them green
	size = 3
	) +
	# 3) Points where Color == -1 -> use a special shape
	geom_point(
	data = subset(tsne_data, Color == 0),
	aes(shape = "AF2 model"), # map them to a discrete shape
	color = "cyan", # color them green
	size = 3
	) +
	# Set up shape legend for discrete “NMR models”
	scale_shape_manual(
	values = c("NMR models" = 19), # pick any shape code you like
	name = "Model Type" # legend title for shape
	) +
	# Set up shape legend for discrete “NMR models”
	scale_shape_manual(
	values = c("AF2 model" = 19), # pick any shape code you like
	name = "Model Type" # legend title for shape
	) +
	# Set up color gradient for numeric Posterior
	scale_color_gradient(low = "blue", high = "red", name = "Posterior") +
	ggtitle(paste("Posteriors for", pdb_id)) +
	theme_minimal()
	#tsne_plot_heatmap +
	# guides(
	# shape = guide_legend(title = NULL)#, # no title for shape legend
	# #color = guide_colorbar(title = "Posterior")
	# )
	tsne_plotly_heatmap <- ggplotly(tsne_plot_heatmap, tooltip = "text")
	print(tsne_plotly_heatmap)

	# t-SNE Plot 3: Special cases
	tsne_plot_special <- ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, color = Special, text = pdb_file)) +
	geom_point() +
	scale_color_manual(values = special_cases_mapping1, name = "Model Source") +
	ggtitle(paste("t-SNE for ", pdb_id)) +
	theme_minimal()

	tsne_plotly_special <- ggplotly(tsne_plot_special, tooltip = "text")
	print(tsne_plotly_special)

	# Save static t-SNE plots
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_clusters_plot.png'), plot = tsne_plot_clusters)
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_heatmap_plot.png'), plot = tsne_plot_heatmap)
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_special_plot.png'), plot = tsne_plot_special)


	return(list(tsne_plotly_clusters = tsne_plotly_clusters, tsne_plotly_heatmap = tsne_plotly_heatmap,
	tsne_plotly_special = tsne_plotly_special))
	}


	plot_cluster_CSPRank_boxplot <- function(tsne_data, consensus_data, pdb_id, chain) {

	# Match tsne_data$pdb_file to consensus_data$basename_holo_model_path
	matched_data <- merge(tsne_data, consensus_data, by.x = "pdb_file", by.y = "basename_holo_model_path")

	# Create the boxplot using ggplot2
	#tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = bayesian_values, fill = as.factor(Cluster))) +
	tsne_cluster_CSPRank_boxplot <- ggplot(matched_data, aes(x = as.factor(Cluster), y = consensus, fill = as.factor(Cluster))) +
	geom_boxplot() +
	labs(x = "Cluster", y = "metric", title = paste("t-SNE Cluster CSP Bayesian model selection metric")) +
	theme_minimal() +
	scale_fill_discrete(name = "Cluster")

	# Save the t-SNE plot
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_', chain, '_tsne_boxplot.png'), plot = tsne_cluster_CSPRank_boxplot)


	return(list(tsne_cluster_CSPRank_boxplot = tsne_cluster_CSPRank_boxplot))

	}
	# Function to read the output CSV file and trim the data
	trim_data <- function(csv_file_path, xyz_matrix, pdb_files_names) {
	# Read the CSV file
	result_data <- read.csv(csv_file_path)

	# Extract the pdb_file names from the result_data
	remaining_pdb_files <- result_data$pdb_file # Assuming 'pdb_file' is the column in the CSV with the pdb file names

	# Create a logical vector indicating which pdb_files_names are in the remaining_pdb_files
	keep_indices <- pdb_files_names %in% remaining_pdb_files

	# Trim the xyz_matrix and pdb_files_names using the logical vector
	trimmed_xyz_matrix <- xyz_matrix[keep_indices, ]
	trimmed_pdb_files_names <- pdb_files_names[keep_indices]

	return(list(trimmed_xyz_matrix = trimmed_xyz_matrix, trimmed_pdb_files_names = trimmed_pdb_files_names))
	}

	perform_analysis <- function(pdb_id, chain, max_clusters = 10) {
	set.seed(42)
	chain_a_xyz <- get_matrix(pdb_id, chain)
	xyz_matrix <- do.call(rbind, chain_a_xyz)

	# Identify unique rows and their indices
	unique_indices <- !duplicated(xyz_matrix)
	xyz_matrix <- xyz_matrix[unique_indices, ]

	# Create a data frame for PCA results
	pdb_files_names <- names(chain_a_xyz)[unique_indices]
	# trim pdb_files_names and xyz_matrix to points which are not outliers in PCA space
	#if (TRUE) {
	if (FALSE) {
	trimmed_pca_csv_file <- paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv')
	results <- trim_data(trimmed_pca_csv_file, xyz_matrix, pdb_files_names)

	pdb_files_names <- results$trimmed_pdb_files_names
	xyz_matrix <- results$trimmed_xyz_matrix
	}

	print(dim(xyz_matrix))
	# t-SNE Analysis
	tsne_result <- Rtsne(xyz_matrix, dims = 2, perplexity = 100, verbose = TRUE, max_iter = 3000, partial_pca = TRUE)
	tsne_data <- data.frame(pdb_file = pdb_files_names, TSNE1 = tsne_result$Y[, 1], TSNE2 = tsne_result$Y[, 2])

	# Determine optimal clusters for t-SNE results
	tsne_clusters_info <- determine_optimal_clusters(tsne_data[, c("TSNE1", "TSNE2")], max_clusters)
	optimal_tsne_clusters <- tsne_clusters_info$optimal_clusters
	optimal_tsne_clusters <- 20
	print(paste("Optimal number of clusters for t-SNE:", optimal_tsne_clusters))
	tsne_clusters <- kmeans(tsne_data[, c("TSNE1", "TSNE2")], centers = optimal_tsne_clusters)$cluster
	tsne_data$Cluster <- factor(tsne_clusters)

	# Save t-SNE cluster plots
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Elbow_plot.png'), plot = tsne_clusters_info$elbow_plot)
	ggsave(filename = paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_tSNE_Silhouette_plot.png'), plot = tsne_clusters_info$silhouette_plot)

	# Write t-SNE results to CSV
	write.csv(tsne_data, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_TSNE_chain_', chain, '_data.csv'), row.names = FALSE)

	# Read consensus data
	consensus_data <- read_consensus_data(pdb_id, pdb_files_names)

	# Plot the results
	plots <- plot_dimensional_reduction(tsne_data, consensus_data, pdb_id, chain)

	# Print interactive plots to display in R
	print(plots$tsne_plotly_clusters)
	print(plots$tsne_plotly_heatmap)
	print(plots$tsne_plotly_special)
	}

	max_clusters = 10
	chain <- 'B'
	pdb_id <- '7JYN'
	perform_analysis(pdb_id, chain)