Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# This R script takes as input the pdb_id of a pdb you'd like to run PCA on.
# This code assumes there is a folder './'+pdb_id+'_aligned' which contains many pdb files to use for the PCA.
# Additionally, we need values from a csv file named './CSP_'+pdb_id+'.csv' which contains a metric value to color the points on the PCA by.
library("bio3d")
library('dplyr')
library('ggplot2')
library('Rtsne')
library('umap')
library('cluster')
library('factoextra')
library('plotly')
library('MASS') # For Mahalanobis distance
print(getwd())
setwd('/home/tiburon/Desktop/ROT4/CSPRank_ES_Processing')
print(getwd())
CSP_Rank_Scores <- './CSP_Rank_Scores/'
PDB_FILES <- './PDB_FILES/'
experimental_structures <- paste0(PDB_FILES, 'experimental_structures/')
computational_structures <- paste0(PDB_FILES, 'computational_structures/')
CS_Lists <- './CS_Lists/'
CS_Predictions <- './CS_Predictions/'
CLUSTERING_RESULTS <- './CLUSTERING_RESULTS/'
get_matrix <- function(pdb_id, chain) {
dir_path <- paste0(PDB_FILES, pdb_id, "_aligned")
path_to_pdb_files <- dir_path#paste0("./", pdb_id, "_aligned")
alt_dir_path <- paste0(PDB_FILES, pdb_id, "_alt_aligned")
path_to_alt_pdb_files <- alt_dir_path#paste0("./", pdb_id, "_alt_aligned")
pdb_files <- list.files(path = dir_path, pattern = "\\.pdb$")
alt_pdb_files <- list.files(path = alt_dir_path, pattern = "\\.pdb$")
csp_data <- read.csv(paste0(CSP_Rank_Scores, "CSP_", tolower(pdb_id), "_CSpred.csv"))
chain_a_xyz <- list()
for (file in pdb_files) {
rows <- filter(csp_data, holo_model_path == file.path(path_to_pdb_files, file))
row <- rows[1, ]
if (!is.na(row$F1)) {
pdb <- read.pdb(file.path(dir_path, file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[file]] <- chain_to_analyze$xyz
}
}
for (file in alt_pdb_files) {
rows <- filter(csp_data, holo_model_path == file.path(path_to_alt_pdb_files, file))
row <- rows[1, ]
if (!is.na(row$F1)) {
pdb <- read.pdb(file.path(alt_dir_path, file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[file]] <- chain_to_analyze$xyz
}
}
special_pdb_file <- paste0('comp_', tolower(pdb_id), ".pdb")
special_pdb_file_dir <- paste0(PDB_FILES, 'computational_structures/')
# Construct the file path
pdb_file_path <- file.path(special_pdb_file_dir, special_pdb_file)
# Check if the file exists
if (file.exists(pdb_file_path)) {
# If the file exists, read the pdb file
pdb <- read.pdb(pdb_file_path)
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
}
special_pdb_file <- paste0('exp_', tolower(pdb_id), ".pdb")
special_pdb_file_dir <- paste0(PDB_FILES, 'experimental_structures/')
pdb_file_path <- file.path(special_pdb_file_dir, special_pdb_file)
# Check if the file exists
if (file.exists(pdb_file_path)) {
pdb <- read.pdb(file.path(special_pdb_file_dir, special_pdb_file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[special_pdb_file]] <- chain_to_analyze$xyz
}
pdb_files <- list.files(path = paste0(special_pdb_file_dir, tolower(pdb_id), '/'), pattern = "\\.pdb$")
for (file in pdb_files) {
pdb <- read.pdb(file.path(paste0(special_pdb_file_dir, tolower(pdb_id), '/'), file))
chain_to_analyze <- pdb
if (chain == 'A') {
chain_to_analyze <- trim.pdb(pdb, chain = "A")
} else if ( chain == 'B' ) {
chain_to_analyze <- trim.pdb(pdb, chain = "B")
}
chain_a_xyz[[file]] <- chain_to_analyze$xyz
}
return(chain_a_xyz)
}
# Function to compute Mahalanobis distances
compute_mahalanobis_distances <- function(pca_data) {
cov_matrix <- cov(pca_data)
mean_vector <- colMeans(pca_data)
distances <- apply(pca_data, 1, function(row) {
mahalanobis(row, mean_vector, cov_matrix)
})
return(distances)
}
get_pca <- function(data) {
pca_result <- pca.xyz(data)
# Extract the first three principal components
pc1 <- pca_result$z[, 1]
pc2 <- pca_result$z[, 2]
pc3 <- pca_result$z[, 3]
# Create a data frame with the principal components
return(list(pca_data = data.frame(x = pc1, y = pc2, z = pc3), variance_explained = pca_result$sdev^2))
}
remove_outliers_pca <- function(data, pdb_files_names, threshold, max_iterations = 10) {
iteration <- 0
repeat {
# Get the PCA results
pca_result <- get_pca(data)
pca_data <- pca_result$pca_data
variance_explained <- pca_result$variance_explained
if (iteration >= max_iterations) {
break
}
# Compute Mahalanobis distances
distances <- compute_mahalanobis_distances(pca_data)
# Identify outliers
outliers <- distances > threshold
# Ensure rows with 'exp' or 'comp' in pdb_files_names are not removed
protected_rows <- grepl("exp|comp", pdb_files_names)
outliers <- outliers & !protected_rows
# Count the number of outliers
num_outliers <- sum(outliers)
# Print the iteration number and number of outliers removed
cat("Iteration:", iteration + 1, "- Number of outliers removed:", num_outliers, "\n")
#cat(outliers)
# Break the loop if no outliers are detected or max iterations are reached
if (num_outliers == 0 || iteration >= max_iterations) {
break
}
# Remove outliers from data and pdb_files_names
data <- data[!outliers, ]
pdb_files_names <- pdb_files_names[!outliers]
iteration <- iteration + 1
}
return(list(final_data = data, final_pdb_files_names = pdb_files_names, pca_result = pca_data, variance_explained = variance_explained))
}
perform_analysis <- function(pdb_id, chain, max_clusters = 10, remove_outliers = TRUE) {
chain_a_xyz <- get_matrix(pdb_id, chain)
xyz_matrix <- do.call(rbind, chain_a_xyz)
# Identify unique rows and their indices
unique_indices <- !duplicated(xyz_matrix)
xyz_matrix <- xyz_matrix[unique_indices, ]
# Create a data frame for PCA results
pdb_files_names <- names(chain_a_xyz)[unique_indices]
threshold <- qchisq(0.975, df = 3) # 97.5th percentile of Chi-squared distribution with 3 degrees of freedom
if (remove_outliers) {
result <- remove_outliers_pca(xyz_matrix, pdb_files_names, threshold)
} else {
result <- remove_outliers_pca(xyz_matrix, pdb_files_names, threshold, max_iterations = 0)
}
xyz_matrix <- result$final_data
pca_result <- result$pca_result
# Calculate variance explained by each PC
variance_explained <- result$variance_explained
total_variance <- sum(variance_explained)
proportion_variance_explained <- variance_explained / total_variance
# Print the variance explained by PC1 and PC2
pc1_variance_explained <- proportion_variance_explained[1]
pc2_variance_explained <- proportion_variance_explained[2]
pc3_variance_explained <- proportion_variance_explained[3]
print(paste("Proportion of variance explained by PC1:", pc1_variance_explained))
print(paste("Proportion of variance explained by PC2:", pc2_variance_explained))
print(paste("Proportion of variance explained by PC3:", pc3_variance_explained))
# Extract PC1 and PC2
pc1 <- pca_result$x
pc2 <- pca_result$y
pc3 <- pca_result$z
# Create a data frame for PCA results
output_df_pca <- data.frame(pdb_file = result$final_pdb_files_names, PC1 = pc1, PC2 = pc2, PC3 = pc3)
# Write PCA results to CSV
write.csv(output_df_pca, paste0(CLUSTERING_RESULTS, pdb_id, '_aligned_CSPRED', chain, '_PCA_chain_', chain, '_data_trimmed.csv'), row.names = FALSE)
}
chain <- 'A'
pdb_id <- 'RHOA_A161P'
perform_analysis(pdb_id, chain, remove_outliers = FALSE)
pdb_ids <- c('2nmb','2hug', '2nd1', '2jw1', '5m9d', '5xv8', '2lox', '2kwv')
chains <- c('B')
for (pdb_id in pdb_ids) {
for (chain in chains) {
print(paste0('Running ', pdb_id, ' with chain ', chain))
perform_analysis(pdb_id, chain)
}
}