Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
CSP_Rank/PCA_remove_outliers.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
252 lines (200 sloc)
8.36 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This R script takes as input the pdb_id of a pdb you'd like to run PCA on. | |
# This code assumes there is a folder './'+pdb_id+'_aligned' which contains many pdb files to use for the PCA. | |
# Additionally, we need values from a csv file named './CSP_'+pdb_id+'.csv' which contains a metric value to color the points on the PCA by. | |
library("bio3d") | |
library('dplyr') | |
library('ggplot2') | |
library('Rtsne') | |
library('umap') | |
library('cluster') | |
library('factoextra') | |
library('plotly') | |
library('MASS') # For Mahalanobis distance | |
print(getwd()) | |
setwd('/home/tiburon/Desktop/ROT4/CSPRank_ES_Processing') | |
print(getwd()) | |
CSP_Rank_Scores <- './CSP_Rank_Scores/' | |
PDB_FILES <- './PDB_FILES/' | |
experimental_structures <- paste0(PDB_FILES, 'experimental_structures/') | |
computational_structures <- paste0(PDB_FILES, 'computational_structures/') | |
CS_Lists <- './CS_Lists/' | |
CS_Predictions <- './CS_Predictions/' | |
CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/' | |
get_matrix <- function(pdb_id, chain) { | |
dir_path <- paste0(PDB_FILES, pdb_id, "_aligned") | |
path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned") | |
alt_dir_path <- paste0(PDB_FILES, pdb_id, "_alt_aligned") | |
path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned") | |
pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$") | |
alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$") | |
csp_data <- read.csv(paste0(CSP_Rank_Scores, "CSP_", tolower(pdb_id), "_CSpred.csv")) | |
chain_a_xyz <- list() | |
for (file in pdb_files) { | |
rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file)) | |
row <- rows[1, ] | |
if (!is.na(row$F1)) { | |
pdb <- read.pdb(file.path(dir_path, file)) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[file]] <- chain_to_analyze$xyz | |
} | |
} | |
for (file in alt_pdb_files) { | |
rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file)) | |
row <- rows[1, ] | |
if (!is.na(row$F1)) { | |
pdb <- read.pdb(file.path(alt_dir_path, file)) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[file]] <- chain_to_analyze$xyz | |
} | |
} | |
special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb") | |
special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/') | |
# Construct the file path | |
pdb_file_path <- file.path(special_pdb_file_dir, special_pdb_file) | |
# Check if the file exists | |
if (file.exists(pdb_file_path)) { | |
# If the file exists, read the pdb file | |
pdb <- read.pdb(pdb_file_path) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz | |
} | |
special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb") | |
special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/') | |
pdb_file_path <- file.path(special_pdb_file_dir, special_pdb_file) | |
# Check if the file exists | |
if (file.exists(pdb_file_path)) { | |
pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file)) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz | |
} | |
pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$") | |
for (file in pdb_files) { | |
pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file)) | |
chain_to_analyze <- pdb | |
if (chain == 'A') { | |
chain_to_analyze <- trim.pdb(pdb, chain = "A") | |
} else if ( chain == 'B' ) { | |
chain_to_analyze <- trim.pdb(pdb, chain = "B") | |
} | |
chain_a_xyz[[file]] <- chain_to_analyze$xyz | |
} | |
return(chain_a_xyz) | |
} | |
# Function to compute Mahalanobis distances | |
compute_mahalanobis_distances <- function(pca_data) { | |
cov_matrix <- cov(pca_data) | |
mean_vector <- colMeans(pca_data) | |
distances <- apply(pca_data, 1, function(row) { | |
mahalanobis(row, mean_vector, cov_matrix) | |
}) | |
return(distances) | |
} | |
get_pca <- function(data) { | |
pca_result <- pca.xyz(data) | |
# Extract the first three principal components | |
pc1 <- pca_result$z[, 1] | |
pc2 <- pca_result$z[, 2] | |
pc3 <- pca_result$z[, 3] | |
# Create a data frame with the principal components | |
return(list(pca_data = data.frame(x = pc1, y = pc2, z = pc3), variance_explained = pca_result$sdev^2)) | |
} | |
remove_outliers_pca <- function(data, pdb_files_names, threshold, max_iterations = 10) { | |
iteration <- 0 | |
repeat { | |
# Get the PCA results | |
pca_result <- get_pca(data) | |
pca_data <- pca_result$pca_data | |
variance_explained <- pca_result$variance_explained | |
if (iteration >= max_iterations) { | |
break | |
} | |
# Compute Mahalanobis distances | |
distances <- compute_mahalanobis_distances(pca_data) | |
# Identify outliers | |
outliers <- distances > threshold | |
# Ensure rows with 'exp' or 'comp' in pdb_files_names are not removed | |
protected_rows <- grepl("exp|comp", pdb_files_names) | |
outliers <- outliers & !protected_rows | |
# Count the number of outliers | |
num_outliers <- sum(outliers) | |
# Print the iteration number and number of outliers removed | |
cat("Iteration:", iteration + 1, "- Number of outliers removed:", num_outliers, "\n") | |
#cat(outliers) | |
# Break the loop if no outliers are detected or max iterations are reached | |
if (num_outliers == 0 || iteration >= max_iterations) { | |
break | |
} | |
# Remove outliers from data and pdb_files_names | |
data <- data[!outliers, ] | |
pdb_files_names <- pdb_files_names[!outliers] | |
iteration <- iteration + 1 | |
} | |
return(list(final_data = data, final_pdb_files_names = pdb_files_names, pca_result = pca_data, variance_explained = variance_explained)) | |
} | |
perform_analysis <- function(pdb_id, chain, max_clusters = 10, remove_outliers = TRUE) { | |
chain_a_xyz <- get_matrix(pdb_id, chain) | |
xyz_matrix <- do.call(rbind, chain_a_xyz) | |
# Identify unique rows and their indices | |
unique_indices <- !duplicated(xyz_matrix) | |
xyz_matrix <- xyz_matrix[unique_indices, ] | |
# Create a data frame for PCA results | |
pdb_files_names <- names(chain_a_xyz)[unique_indices] | |
threshold <- qchisq(0.975, df = 3) # 97.5th percentile of Chi-squared distribution with 3 degrees of freedom | |
if (remove_outliers) { | |
result <- remove_outliers_pca(xyz_matrix, pdb_files_names, threshold) | |
} else { | |
result <- remove_outliers_pca(xyz_matrix, pdb_files_names, threshold, max_iterations = 0) | |
} | |
xyz_matrix <- result$final_data | |
pca_result <- result$pca_result | |
# Calculate variance explained by each PC | |
variance_explained <- result$variance_explained | |
total_variance <- sum(variance_explained) | |
proportion_variance_explained <- variance_explained / total_variance | |
# Print the variance explained by PC1 and PC2 | |
pc1_variance_explained <- proportion_variance_explained[1] | |
pc2_variance_explained <- proportion_variance_explained[2] | |
pc3_variance_explained <- proportion_variance_explained[3] | |
print(paste("Proportion of variance explained by PC1:", pc1_variance_explained)) | |
print(paste("Proportion of variance explained by PC2:", pc2_variance_explained)) | |
print(paste("Proportion of variance explained by PC3:", pc3_variance_explained)) | |
# Extract PC1 and PC2 | |
pc1 <- pca_result$x | |
pc2 <- pca_result$y | |
pc3 <- pca_result$z | |
# Create a data frame for PCA results | |
output_df_pca <- data.frame(pdb_file = result$final_pdb_files_names, PC1 = pc1, PC2 = pc2, PC3 = pc3) | |
# Write PCA results to CSV | |
write.csv(output_df_pca, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv'), row.names = FALSE) | |
} | |
chain <- 'A' | |
pdb_id <- 'RHOA_A161P' | |
perform_analysis(pdb_id, chain, remove_outliers = FALSE) | |
pdb_ids <- c('2nmb','2hug', '2nd1', '2jw1', '5m9d', '5xv8', '2lox', '2kwv') | |
chains <- c('B') | |
for (pdb_id in pdb_ids) { | |
for (chain in chains) { | |
print(paste0('Running ', pdb_id, ' with chain ', chain)) | |
perform_analysis(pdb_id, chain) | |
} | |
} | |