Skip to content
Permalink
5ab4ed8e4a
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
153 lines (124 sloc) 5.2 KB
# UMAP_TSNE_STATS.py
from util import *
import sys
import pymol
from pymol import cmd
def process_pdb(pdb_file, object_name):
    """Load a structure into PyMOL and apply the two-chain display style.

    The file is loaded under ``object_name``; chain A is coloured green,
    chain B is coloured cyan and drawn as sticks with its cartoon hidden.
    The view is then re-oriented and the viewport set to 800x800.

    Parameters:
    pdb_file: Path of the structure file handed to ``cmd.load``.
    object_name (str): Name of the PyMOL object to create; also used to
        build the per-chain selection strings.
    """
    # Make sure the PyMOL session is up before issuing any command.
    pymol.finish_launching()
    cmd.load(pdb_file, object_name)

    # Selection expressions restricted to this object.
    chain_a = f'{object_name} and chain A'
    chain_b = f'{object_name} and chain B'

    for colour, selection in (('green', chain_a), ('cyan', chain_b)):
        cmd.color(colour, selection)

    # Chain B is rendered as sticks only.
    cmd.show('sticks', chain_b)
    cmd.hide('cartoon', chain_b)

    # Called without a selection, exactly as in the original.
    cmd.orient()
    cmd.viewport(800, 800)
#import matplotlib.pyplot as plt
def plot_boxplots(data_dict):
    """
    Show a single figure containing one boxplot per dictionary entry.

    Parameters:
    data_dict (dict): Mapping from a label (the keys are used as the box
        labels) to the samples drawn for that box.

    NOTE(review): ``plt`` is not imported in the visible part of this file
    (the matplotlib import above is commented out); presumably it is
    supplied by ``from util import *`` - confirm before calling.
    """
    box_labels = list(data_dict)
    samples = [data_dict[label] for label in box_labels]

    plt.figure(figsize=(10, 6))
    plt.boxplot(samples, labels=box_labels)

    # Axis labels and title.
    plt.xlabel('Keys')
    plt.ylabel('Values')
    plt.title('Boxplots for Each Key:Query Pair')

    plt.show()
def _index_by_first_occurrence(names):
    """Return a dict mapping each name to the index of its first occurrence.

    Mirrors ``list.index`` semantics (first match wins) while allowing
    constant-time lookups instead of a linear scan per query.
    """
    first_index = {}
    for position, name in enumerate(names):
        first_index.setdefault(name, position)
    return first_index


def _group_by_cluster(pdb_files, clusters, name_to_index, model_paths, scores):
    """Group model paths and scores by cluster label.

    Parameters:
    pdb_files: File names, one per embedded structure.
    clusters: Cluster label for each entry of ``pdb_files`` (same order).
    name_to_index (dict): File name -> index into ``model_paths``/``scores``.
    model_paths: Full model paths, indexed by ``name_to_index`` values.
    scores: Scores, indexed by ``name_to_index`` values.

    Returns:
    tuple: ``(cluster_scores, cluster_files)``, two dicts keyed by cluster
    label. Every label seen gets an entry, even when none of its files is
    found in ``name_to_index`` (the lists are then empty).
    """
    cluster_scores = {}
    cluster_files = {}
    for pdb_file, cluster_number in zip(pdb_files, clusters):
        # Register the cluster before the lookup so unmatched clusters
        # still appear in the result, as in the original script.
        member_scores = cluster_scores.setdefault(cluster_number, [])
        member_files = cluster_files.setdefault(cluster_number, [])
        index = name_to_index.get(pdb_file)
        if index is None:
            # No model with this file name: skip the structure.
            continue
        member_files.append(model_paths[index])
        member_scores.append(scores[index])
    return cluster_scores, cluster_files


def _mean_scores(cluster_scores):
    """Return the arithmetic mean of the scores of every non-empty cluster.

    Clusters without any score are left out (a warning is written to
    stderr) instead of raising ``ZeroDivisionError``.
    """
    averages = {}
    for cluster_number, member_scores in cluster_scores.items():
        if not member_scores:
            print(f"warning: cluster {cluster_number} has no scored models; "
                  "skipping average", file=sys.stderr)
            continue
        averages[cluster_number] = sum(member_scores) / len(member_scores)
    return averages


def _load_cluster_medoids(cluster_files, method_label, object_prefix):
    """Find each cluster's medoid structure and display it in PyMOL.

    Parameters:
    cluster_files (dict): Cluster label -> list of model paths.
    method_label (str): Name inserted in the progress message.
    object_prefix (str): Prefix of the PyMOL object name; the cluster
        label is appended to it.

    Returns:
    list: The medoid files, in cluster iteration order. Clusters with no
    files are skipped with a warning on stderr.
    """
    medoid_files = []
    for cluster_number, member_files in cluster_files.items():
        if not member_files:
            print(f"warning: {method_label} cluster {cluster_number} has no "
                  "models; skipping medoid search", file=sys.stderr)
            continue
        medoid_file = find_medoid_structure(member_files)
        print("Medoid file for " + method_label + " CLUSTER "
              + str(cluster_number) + " = " + medoid_file)
        medoid_files.append(medoid_file)
        process_pdb(medoid_file, object_prefix + str(cluster_number))
    return medoid_files


def print_sorted_dicts(*dicts):
    """Print each dict as ``key: value`` lines sorted by key.

    Values are rounded to three decimals; a blank line follows every dict.
    """
    for d in dicts:
        for k, v in sorted(d.items()):
            print(f"{k}: {round(v, 3)}")
        print()  # Blank line separating consecutive dictionaries


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python UMAP_TSNE_STATS.py <bound>")
        sys.exit(1)
    bound = sys.argv[1]

    # Per-model consensus scores; rows are indexed by the
    # 'holo_model_path' and 'consensus' keys.
    data_source_file = './CSP_'+bound+'_CSpred.csv'
    parsed_data = parse_csv(data_source_file)
    holo_model_files_raw = [data['holo_model_path'] for data in parsed_data]
    # File name only: everything after the last '/'.
    holo_model_files = [path[path.rfind('/')+1:] for path in holo_model_files_raw]
    consensus_scores = [float(data['consensus']) for data in parsed_data]
    holo_model_index = _index_by_first_occurrence(holo_model_files)

    # Cluster assignments from the UMAP embedding.
    UMAP_file = './data/'+bound+'_aligned_CSPREDB_UMAP_chain_B_data.csv'
    UMAP_data = parse_csv(UMAP_file)
    UMAP_files = [data['pdb_file'] for data in UMAP_data]
    UMAP_clusters = [int(data['Cluster']) for data in UMAP_data]

    # Cluster assignments from the t-SNE embedding.
    TSNE_file = './data/'+bound+'_aligned_CSPREDB_TSNE_chain_B_data.csv'
    TSNE_data = parse_csv(TSNE_file)
    TSNE_files = [data['pdb_file'] for data in TSNE_data]
    TSNE_clusters = [int(data['Cluster']) for data in TSNE_data]

    print("getting TSNE cluster scores")
    TSNE_cluster_scores, TSNE_cluster_files = _group_by_cluster(
        TSNE_files, TSNE_clusters, holo_model_index,
        holo_model_files_raw, consensus_scores)
    #plot_boxplots(TSNE_cluster_scores)

    print("getting UMAP cluster scores")
    UMAP_cluster_scores, UMAP_cluster_files = _group_by_cluster(
        UMAP_files, UMAP_clusters, holo_model_index,
        holo_model_files_raw, consensus_scores)
    #plot_boxplots(UMAP_cluster_scores)

    print("getting TSNE cluster score averages")
    TSNE_cluster_score_averages = _mean_scores(TSNE_cluster_scores)

    print("getting UMAP cluster score averages")
    UMAP_cluster_score_averages = _mean_scores(UMAP_cluster_scores)

    print_sorted_dicts(TSNE_cluster_score_averages, UMAP_cluster_score_averages)

    print("getting TSNE cluster medoid structures")
    tSNE_medoid_files = _load_cluster_medoids(TSNE_cluster_files, "TSNE", 'tSNE_')

    print("getting UMAP cluster medoid structures")
    UMAP_medoid_files = _load_cluster_medoids(UMAP_cluster_files, "UMAP", 'UMAP_')

    # Hide hydrogens once every medoid has been loaded.
    cmd.hide('everything', 'hydro')