PCA_remove_outliers.R

# This R script takes as input the pdb_id of a pdb you'd like to run PCA on.
# This code assumes there is a folder './'+pdb_id+'_aligned' which contains many pdb files to use for the PCA.
# Additionally, we need values from a csv file named './CSP_'+pdb_id+'.csv' which contains a metric value to color the points on the PCA by.

library("bio3d")
library('dplyr')
library('ggplot2')
library('Rtsne')
library('umap')
library('cluster')
library('factoextra')
library('plotly')
library('MASS')  # For Mahalanobis distance

print(getwd())
setwd('/home/tiburon/Desktop/ROT4/CSPRank_ES_Processing')
print(getwd())

CSP_Rank_Scores <- './CSP_Rank_Scores/'
PDB_FILES <- './PDB_FILES/'
experimental_structures <- paste0(PDB_FILES, 'experimental_structures/')
computational_structures <- paste0(PDB_FILES, 'computational_structures/')
CS_Lists <- './CS_Lists/'
CS_Predictions <- './CS_Predictions/'
CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/'

get_matrix <- function(pdb_id, chain) {

  dir_path <- paste0(PDB_FILES, pdb_id, "_aligned")
  path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned")
  alt_dir_path <- paste0(PDB_FILES, pdb_id, "_alt_aligned")
  path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned")
  pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$")
  alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$")

  csp_data <- read.csv(paste0(CSP_Rank_Scores, "CSP_", tolower(pdb_id), "_CSpred.csv"))

  chain_a_xyz <- list()

  for (file in pdb_files) {
    rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file))
    row <- rows[1, ]

    if (!is.na(row$F1)) {
      pdb <- read.pdb(file.path(dir_path, file))
      chain_to_analyze <- pdb
      if (chain == 'A') {
        chain_to_analyze <- trim.pdb(pdb, chain = "A")
      } else if ( chain == 'B' ) {
        chain_to_analyze <- trim.pdb(pdb, chain = "B")
      }
      chain_a_xyz[[file]] <- chain_to_analyze$xyz
    }
  }

  for (file in alt_pdb_files) {
    rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file))
    row <- rows[1, ]

    if (!is.na(row$F1)) {
      pdb <- read.pdb(file.path(alt_dir_path, file))
      chain_to_analyze <- pdb
      if (chain == 'A') {
        chain_to_analyze <- trim.pdb(pdb, chain = "A")
      } else if ( chain == 'B' ) {
        chain_to_analyze <- trim.pdb(pdb, chain = "B")
      }
      chain_a_xyz[[file]] <- chain_to_analyze$xyz
    }
  }

  special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb")
  special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/')
  # Construct the file path
  pdb_file_path <- file.path(special_pdb_file_dir, special_pdb_file)
  # Check if the file exists
  if (file.exists(pdb_file_path)) {
    # If the file exists, read the pdb file
    pdb <- read.pdb(pdb_file_path)
    chain_to_analyze <- pdb
    if (chain == 'A') {
      chain_to_analyze <- trim.pdb(pdb, chain = "A")
    } else if ( chain == 'B' ) {
      chain_to_analyze <- trim.pdb(pdb, chain = "B")
    }
    chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
  }


  special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb")
  special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/')
  pdb_file_path <- file.path(special_pdb_file_dir, special_pdb_file)
  # Check if the file exists
  if (file.exists(pdb_file_path)) {
    pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
    chain_to_analyze <- pdb
    if (chain == 'A') {
      chain_to_analyze <- trim.pdb(pdb, chain = "A")
    } else if ( chain == 'B' ) {
      chain_to_analyze <- trim.pdb(pdb, chain = "B")
    }
    chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
  }


  pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$")

  for (file in pdb_files) {
    pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file))
    chain_to_analyze <- pdb
    if (chain == 'A') {
      chain_to_analyze <- trim.pdb(pdb, chain = "A")
    } else if ( chain == 'B' ) {
      chain_to_analyze <- trim.pdb(pdb, chain = "B")
    }
    chain_a_xyz[[file]] <- chain_to_analyze$xyz
    }

  return(chain_a_xyz)
}


# Function to compute Mahalanobis distances
compute_mahalanobis_distances <- function(pca_data) {
  cov_matrix <- cov(pca_data)
  mean_vector <- colMeans(pca_data)
  distances <- apply(pca_data, 1, function(row) {
    mahalanobis(row, mean_vector, cov_matrix)
  })
  return(distances)
}

get_pca <- function(data) {

  pca_result <- pca.xyz(data)

  # Extract the first three principal components
  pc1 <- pca_result$z[, 1]
  pc2 <- pca_result$z[, 2]
  pc3 <- pca_result$z[, 3]

  # Create a data frame with the principal components
  return(list(pca_data = data.frame(x = pc1, y = pc2, z = pc3), variance_explained = pca_result$sdev^2))
}

remove_outliers_pca <- function(data, pdb_files_names, threshold, max_iterations = 10) {
  iteration <- 0
  repeat {

    # Get the PCA results
    pca_result <- get_pca(data)
    pca_data <- pca_result$pca_data
    variance_explained <- pca_result$variance_explained

    if (iteration >= max_iterations) {
      break
    }

    # Compute Mahalanobis distances
    distances <- compute_mahalanobis_distances(pca_data)

    # Identify outliers
    outliers <- distances > threshold

    # Ensure rows with 'exp' or 'comp' in pdb_files_names are not removed
    protected_rows <- grepl("exp|comp", pdb_files_names)
    outliers <- outliers & !protected_rows

    # Count the number of outliers
    num_outliers <- sum(outliers)

    # Print the iteration number and number of outliers removed
    cat("Iteration:", iteration + 1, "- Number of outliers removed:", num_outliers, "\n")
    #cat(outliers)

    # Break the loop if no outliers are detected or max iterations are reached
    if (num_outliers == 0 || iteration >= max_iterations) {
      break
    }

    # Remove outliers from data and pdb_files_names
    data <- data[!outliers, ]
    pdb_files_names <- pdb_files_names[!outliers]

    iteration <- iteration + 1
  }

  return(list(final_data = data, final_pdb_files_names = pdb_files_names, pca_result = pca_data, variance_explained = variance_explained))
}

perform_analysis <- function(pdb_id, chain, max_clusters = 10, remove_outliers = TRUE) {
  chain_a_xyz <- get_matrix(pdb_id, chain)
  xyz_matrix <- do.call(rbind, chain_a_xyz)

  # Identify unique rows and their indices
  unique_indices <- !duplicated(xyz_matrix)
  xyz_matrix <- xyz_matrix[unique_indices, ]

  # Create a data frame for PCA results
  pdb_files_names <- names(chain_a_xyz)[unique_indices]

  threshold <- qchisq(0.975, df = 3)  # 97.5th percentile of Chi-squared distribution with 3 degrees of freedom
  if (remove_outliers) {
    result <- remove_outliers_pca(xyz_matrix, pdb_files_names, threshold)
  } else {
    result <- remove_outliers_pca(xyz_matrix, pdb_files_names, threshold, max_iterations = 0)
  }

  xyz_matrix <- result$final_data
  pca_result <- result$pca_result

  # Calculate variance explained by each PC
  variance_explained <- result$variance_explained
  total_variance <- sum(variance_explained)
  proportion_variance_explained <- variance_explained / total_variance

  # Print the variance explained by PC1 and PC2
  pc1_variance_explained <- proportion_variance_explained[1]
  pc2_variance_explained <- proportion_variance_explained[2]
  pc3_variance_explained <- proportion_variance_explained[3]

  print(paste("Proportion of variance explained by PC1:", pc1_variance_explained))
  print(paste("Proportion of variance explained by PC2:", pc2_variance_explained))
  print(paste("Proportion of variance explained by PC3:", pc3_variance_explained))

  # Extract PC1 and PC2
  pc1 <- pca_result$x
  pc2 <- pca_result$y
  pc3 <- pca_result$z

  # Create a data frame for PCA results
  output_df_pca <- data.frame(pdb_file = result$final_pdb_files_names, PC1 = pc1, PC2 = pc2, PC3 = pc3)

  # Write PCA results to CSV
  write.csv(output_df_pca, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv'), row.names = FALSE)

}

chain <- 'A'
pdb_id <- 'RHOA_A161P'
perform_analysis(pdb_id, chain, remove_outliers = FALSE)

pdb_ids <- c('2nmb','2hug', '2nd1', '2jw1', '5m9d', '5xv8', '2lox', '2kwv')
chains <- c('B')
for (pdb_id in pdb_ids) {
  for (chain in chains) {
    print(paste0('Running ', pdb_id, ' with chain ', chain))
    perform_analysis(pdb_id, chain)
  }
}
	# This R script takes as input the pdb_id of a pdb you'd like to run PCA on.
	# This code assumes there is a folder './'+pdb_id+'_aligned' which contains many pdb files to use for the PCA.
	# Additionally, we need values from a csv file named './CSP_'+pdb_id+'.csv' which contains a metric value to color the points on the PCA by.

	library("bio3d")
	library('dplyr')
	library('ggplot2')
	library('Rtsne')
	library('umap')
	library('cluster')
	library('factoextra')
	library('plotly')
	library('MASS') # For Mahalanobis distance

	print(getwd())
	setwd('/home/tiburon/Desktop/ROT4/CSPRank_ES_Processing')
	print(getwd())

	CSP_Rank_Scores <- './CSP_Rank_Scores/'
	PDB_FILES <- './PDB_FILES/'
	experimental_structures <- paste0(PDB_FILES, 'experimental_structures/')
	computational_structures <- paste0(PDB_FILES, 'computational_structures/')
	CS_Lists <- './CS_Lists/'
	CS_Predictions <- './CS_Predictions/'
	CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/'

	get_matrix <- function(pdb_id, chain) {

	dir_path <- paste0(PDB_FILES, pdb_id, "_aligned")
	path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned")
	alt_dir_path <- paste0(PDB_FILES, pdb_id, "_alt_aligned")
	path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned")
	pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$")
	alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$")

	csp_data <- read.csv(paste0(CSP_Rank_Scores, "CSP_", tolower(pdb_id), "_CSpred.csv"))

	chain_a_xyz <- list()

	for (file in pdb_files) {
	rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file))
	row <- rows[1, ]

	if (!is.na(row$F1)) {
	pdb <- read.pdb(file.path(dir_path, file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[file]] <- chain_to_analyze$xyz
	}
	}

	for (file in alt_pdb_files) {
	rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file))
	row <- rows[1, ]

	if (!is.na(row$F1)) {
	pdb <- read.pdb(file.path(alt_dir_path, file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[file]] <- chain_to_analyze$xyz
	}
	}

	special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb")
	special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/')
	# Construct the file path
	pdb_file_path <- file.path(special_pdb_file_dir, special_pdb_file)
	# Check if the file exists
	if (file.exists(pdb_file_path)) {
	# If the file exists, read the pdb file
	pdb <- read.pdb(pdb_file_path)
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
	}


	special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb")
	special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/')
	pdb_file_path <- file.path(special_pdb_file_dir, special_pdb_file)
	# Check if the file exists
	if (file.exists(pdb_file_path)) {
	pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
	}


	pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$")

	for (file in pdb_files) {
	pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file))
	chain_to_analyze <- pdb
	if (chain == 'A') {
	chain_to_analyze <- trim.pdb(pdb, chain = "A")
	} else if ( chain == 'B' ) {
	chain_to_analyze <- trim.pdb(pdb, chain = "B")
	}
	chain_a_xyz[[file]] <- chain_to_analyze$xyz
	}

	return(chain_a_xyz)
	}



	# Function to compute Mahalanobis distances
	compute_mahalanobis_distances <- function(pca_data) {
	cov_matrix <- cov(pca_data)
	mean_vector <- colMeans(pca_data)
	distances <- apply(pca_data, 1, function(row) {
	mahalanobis(row, mean_vector, cov_matrix)
	})
	return(distances)
	}

	get_pca <- function(data) {

	pca_result <- pca.xyz(data)

	# Extract the first three principal components
	pc1 <- pca_result$z[, 1]
	pc2 <- pca_result$z[, 2]
	pc3 <- pca_result$z[, 3]

	# Create a data frame with the principal components
	return(list(pca_data = data.frame(x = pc1, y = pc2, z = pc3), variance_explained = pca_result$sdev^2))
	}

	remove_outliers_pca <- function(data, pdb_files_names, threshold, max_iterations = 10) {
	iteration <- 0
	repeat {

	# Get the PCA results
	pca_result <- get_pca(data)
	pca_data <- pca_result$pca_data
	variance_explained <- pca_result$variance_explained

	if (iteration >= max_iterations) {
	break
	}

	# Compute Mahalanobis distances
	distances <- compute_mahalanobis_distances(pca_data)

	# Identify outliers
	outliers <- distances > threshold

	# Ensure rows with 'exp' or 'comp' in pdb_files_names are not removed
	protected_rows <- grepl("exp\|comp", pdb_files_names)
	outliers <- outliers & !protected_rows

	# Count the number of outliers
	num_outliers <- sum(outliers)

	# Print the iteration number and number of outliers removed
	cat("Iteration:", iteration + 1, "- Number of outliers removed:", num_outliers, "\n")
	#cat(outliers)

	# Break the loop if no outliers are detected or max iterations are reached
	if (num_outliers == 0 \|\| iteration >= max_iterations) {
	break
	}

	# Remove outliers from data and pdb_files_names
	data <- data[!outliers, ]
	pdb_files_names <- pdb_files_names[!outliers]

	iteration <- iteration + 1
	}

	return(list(final_data = data, final_pdb_files_names = pdb_files_names, pca_result = pca_data, variance_explained = variance_explained))
	}

	perform_analysis <- function(pdb_id, chain, max_clusters = 10, remove_outliers = TRUE) {
	chain_a_xyz <- get_matrix(pdb_id, chain)
	xyz_matrix <- do.call(rbind, chain_a_xyz)

	# Identify unique rows and their indices
	unique_indices <- !duplicated(xyz_matrix)
	xyz_matrix <- xyz_matrix[unique_indices, ]

	# Create a data frame for PCA results
	pdb_files_names <- names(chain_a_xyz)[unique_indices]

	threshold <- qchisq(0.975, df = 3) # 97.5th percentile of Chi-squared distribution with 3 degrees of freedom
	if (remove_outliers) {
	result <- remove_outliers_pca(xyz_matrix, pdb_files_names, threshold)
	} else {
	result <- remove_outliers_pca(xyz_matrix, pdb_files_names, threshold, max_iterations = 0)
	}

	xyz_matrix <- result$final_data
	pca_result <- result$pca_result

	# Calculate variance explained by each PC
	variance_explained <- result$variance_explained
	total_variance <- sum(variance_explained)
	proportion_variance_explained <- variance_explained / total_variance

	# Print the variance explained by PC1 and PC2
	pc1_variance_explained <- proportion_variance_explained[1]
	pc2_variance_explained <- proportion_variance_explained[2]
	pc3_variance_explained <- proportion_variance_explained[3]

	print(paste("Proportion of variance explained by PC1:", pc1_variance_explained))
	print(paste("Proportion of variance explained by PC2:", pc2_variance_explained))
	print(paste("Proportion of variance explained by PC3:", pc3_variance_explained))

	# Extract PC1 and PC2
	pc1 <- pca_result$x
	pc2 <- pca_result$y
	pc3 <- pca_result$z

	# Create a data frame for PCA results
	output_df_pca <- data.frame(pdb_file = result$final_pdb_files_names, PC1 = pc1, PC2 = pc2, PC3 = pc3)

	# Write PCA results to CSV
	write.csv(output_df_pca, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv'), row.names = FALSE)

	}

	chain <- 'A'
	pdb_id <- 'RHOA_A161P'
	perform_analysis(pdb_id, chain, remove_outliers = FALSE)

	pdb_ids <- c('2nmb','2hug', '2nd1', '2jw1', '5m9d', '5xv8', '2lox', '2kwv')
	chains <- c('B')
	for (pdb_id in pdb_ids) {
	for (chain in chains) {
	print(paste0('Running ', pdb_id, ' with chain ', chain))
	perform_analysis(pdb_id, chain)
	}
	}