Skip to content
Permalink
5ab4ed8e4a
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
292 lines (244 sloc) 11.6 KB
# UMAP_TSNE_STATS.py
from util import *
import sys
import pymol
from pymol import cmd
from paths import *
def process_pdb(pdb_file, object_name):
# Initialize PyMOL
pymol.finish_launching()
# Load the PDB file
cmd.load(pdb_file, object_name)
# Color by chain
cmd.color('green', object_name + ' and chain A')
cmd.color('cyan', object_name + ' and chain B')
# Show chain B as sticks
cmd.show('sticks', f'{object_name} and chain B')
# Hide ribbon for chain B
cmd.hide('cartoon', f'{object_name} and chain B')
cmd.orient()
cmd.viewport(800, 800)
#import matplotlib.pyplot as plt
def plot_boxplots(data_dict):
"""
Plot boxplots for each key:query pair in the dictionary.
Parameters:
data_dict (dict): Dictionary with integer keys and lists of floats as values.
"""
# Extract keys and values
keys = list(data_dict.keys())
values = list(data_dict.values())
# Create the boxplot
plt.figure(figsize=(10, 6))
plt.boxplot(values, labels=keys)
# Set plot labels and title
plt.xlabel('Keys')
plt.ylabel('Values')
plt.title('Boxplots for Each Key:Query Pair')
# Show plot
plt.show()
from paths import *
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python UMAP_TSNE_STATS.py <bound>")
sys.exit(1)
bound = sys.argv[1].lower()
pdb_id = bound
data_source_file = CSP_Rank_Scores + './CSP_'+bound+'_CSpred.csv'
parsed_data = parse_csv(data_source_file)
holo_model_files = [data['holo_model_path'][data['holo_model_path'].rfind('/')+1:] for data in parsed_data]
holo_model_files_raw = [data['holo_model_path'] for data in parsed_data]
consensus_scores = []#[float(data['consensus']) for data in parsed_data]
Confidence_scores = []
dp_scores = []
recall_scores = []
precision_scores = []
plddt_scores = []
for data in parsed_data:
try:
conf = float(data['Confidence'])
cons = float(data['consensus'])
plddt = float(data['plddt'])
consensus_scores.append(cons)
#consensus_scores.append(cons)# * df / (cons + df))
Confidence_scores.append(conf)
plddt_scores.append(plddt)
except:
consensus_scores.append(0)
Confidence_scores.append(0)
plddt_scores.append(0)
try:
dp = float(data['DP'])
precision = float(data['RPF_PRECISION'])
recall = float(data['RPF_RECALL'])
recall_scores.append(recall)
precision_scores.append(precision)
dp_scores.append(dp)
except Exception as e:
recall_scores.append(0)
precision_scores.append(0)
dp_scores.append(0)
min_recall = min(score for score in recall_scores if score > 0)
max_recall = max(recall_scores)
min_dp = min(score for score in dp_scores if score > 0)
max_dp = max(dp_scores)
min_precision = min(score for score in precision_scores if score > 0)
max_precision = max(precision_scores)
#print(min_recall)
#print(max_recall)
method_names = ['CSPRank', 'DP_Harmonic', 'DP', 'Recall_weighted', 'DP_weighted']
for method in range(5):
print(f"Method {method_names[method]}")
bayesian_selection_metric = []
for i, consensus in enumerate(consensus_scores):
# METHOD 0: disregard RPF results in model selection
if method == 0:
bayesian_selection_metric.append(consensus_scores[i])
# METHOD 1
if method == 1:
try:
bayesian_selection_metric.append(consensus_scores[i] * dp_scores[i] * 2 / ( consensus_scores[i] + dp_scores[i] ))
except:
bayesian_selection_metric.append(0)
# METHOD 2
if method == 2:
bayesian_selection_metric.append(dp_scores[i])
# METHOD 3
if method == 3:
if recall_scores[i] == 0:
bayesian_selection_metric.append(0)
else:
bayesian_selection_metric.append(consensus_scores[i] * ((recall_scores[i] - min_recall) / (max_recall - min_recall)))
# METHOD 4
if method == 4:
bayesian_selection_metric.append(consensus_scores[i] * ((dp_scores[i] - min_dp) / (max_dp - min_dp)))
max_df = max(dp_scores)
n = 0
#Confidence_scores = [float(data['Confidence']) for data in parsed_data]
UMAP_file = f'{CLUSTERING_RESULTS}{bound}_aligned_CSPREDB_UMAP_chain_B_data.csv'
UMAP_data = parse_csv(UMAP_file)
UMAP_files = [ data['pdb_file'] for data in UMAP_data ]
UMAP_clusters = [ int(data['Cluster']) for data in UMAP_data ]
TSNE_file = f'{CLUSTERING_RESULTS}{bound}_aligned_CSPREDB_TSNE_chain_B_data.csv'
TSNE_data = parse_csv(TSNE_file)
TSNE_files = [ data['pdb_file'] for data in TSNE_data ]
TSNE_clusters = [ int(data['Cluster']) for data in TSNE_data ]
print("getting TSNE cluster scores")
TSNE_cluster_scores = {}
TSNE_CSPRank_scores = {}
TSNE_cluster_files = {}
for i, pdb_file in enumerate(TSNE_files):
cluster_number = TSNE_clusters[i]
if cluster_number not in list(TSNE_cluster_scores):
TSNE_cluster_scores[cluster_number] = []
TSNE_cluster_files[cluster_number] = []
TSNE_CSPRank_scores[cluster_number] = []
try:
index = holo_model_files.index(pdb_file)
except:
print("couldn't find " + pdb_file)
continue
TSNE_cluster_files[cluster_number].append(holo_model_files_raw[index])
TSNE_CSPRank_scores[cluster_number].append(bayesian_selection_metric[index])
if bayesian_selection_metric[index] <= 0:
TSNE_cluster_scores[cluster_number].append(0)
else:
TSNE_cluster_scores[cluster_number].append(math.sqrt(bayesian_selection_metric[index] * Confidence_scores[index]))
print("getting UMAP cluster scores")
UMAP_cluster_scores = {}
UMAP_CSPRank_scores = {}
UMAP_cluster_files = {}
for i, pdb_file in enumerate(UMAP_files):
cluster_number = UMAP_clusters[i]
if cluster_number not in list(UMAP_cluster_scores):
UMAP_cluster_scores[cluster_number] = []
UMAP_CSPRank_scores[cluster_number] = []
UMAP_cluster_files[cluster_number] = []
try:
index = holo_model_files.index(pdb_file)
except:
continue
UMAP_cluster_files[cluster_number].append(holo_model_files_raw[index])
UMAP_CSPRank_scores[cluster_number].append(bayesian_selection_metric[index])
if bayesian_selection_metric[index] <= 0:
UMAP_cluster_scores[cluster_number].append(0)
else:
UMAP_cluster_scores[cluster_number].append(math.sqrt(bayesian_selection_metric[index] * Confidence_scores[index]))
print("getting TSNE cluster score averages")
TSNE_cluster_score_averages = {}
for i in list(TSNE_cluster_scores):
sum_scores = 0
for j in TSNE_cluster_scores[i]:
sum_scores += j
sum_scores /= len(TSNE_cluster_scores[i])
TSNE_cluster_score_averages[i] = sum_scores
print("getting UMAP cluster score averages")
UMAP_cluster_score_averages = {}
for i in list(UMAP_cluster_scores):
sum_scores = 0
for j in UMAP_cluster_scores[i]:
sum_scores += j
sum_scores /= len(UMAP_cluster_scores[i])
UMAP_cluster_score_averages[i] = sum_scores
def print_sorted_dicts(*dicts):
for d in dicts:
sorted_dict = {k: round(v, 3) for k, v in sorted(d.items())}
for k, v in sorted_dict.items():
print(f"{k}: {v}")
print() # Print a newline for better separation between dictionaries
#print_sorted_dicts(TSNE_cluster_score_averages, UMAP_cluster_score_averages)
max_consensus_files = []
print("getting TSNE cluster max Bayes Score structures")
for cluster in list(TSNE_cluster_scores):
max_score = 0
max_score_itr = -1
for itr, score in enumerate(TSNE_cluster_scores[cluster]):
if score > max_score and TSNE_cluster_files[cluster][itr].find('exp') == -1:
max_score = score
max_score_itr = itr
#max_score_file = PDB_FILES + TSNE_cluster_files[cluster][max_score_itr]
max_score_file = TSNE_cluster_files[cluster][max_score_itr]
#print(TSNE_CSPRank_scores[cluster])
CSPRank_score = TSNE_CSPRank_scores[cluster][max_score_itr]
if CSPRank_score > 0:
print("Max Bayes Score for TSNE cluster " + str(cluster) + ' = ' + str(max_score) + '. CSPRank Score = ' + str(CSPRank_score)+ '. PDB file = ' + max_score_file)
max_consensus_files.append(max_score_file)
#process_pdb(max_score_file, 'tSNE_max' + str(cluster))
else:
print("Could not find CSPRank score > 0 for TSNE cluster " + str(cluster))
print("getting TSNE cluster max Bayes Score structures")
for cluster in list(UMAP_cluster_scores):
max_score = 0
max_score_itr = -1
for itr, score in enumerate(UMAP_cluster_scores[cluster]):
if score > max_score and UMAP_cluster_files[cluster][itr].find('exp') == -1:
max_score = score
max_score_itr = itr
#max_score_file = PDB_FILES + UMAP_cluster_files[cluster][max_score_itr]
max_score_file = UMAP_cluster_files[cluster][max_score_itr]
CSPRank_score = UMAP_CSPRank_scores[cluster][max_score_itr]
if CSPRank_score > 0:
print("Max consensus for UMAP cluster " + str(cluster) + ' = ' + str(max_score) + '. CSPRank Score = ' + str(CSPRank_score)+ '. PDB file = ' + max_score_file)
max_consensus_files.append(max_score_file)
#process_pdb(max_score_file, 'UMAP_max' + str(cluster))
else:
print("Could not find CSPRank score > 0 for UMAP cluster " + str(cluster))
experimental_medoid_file = experimental_structures + 'exp_'+ pdb_id + '.pdb'
#process_pdb(experimental_medoid_file, 'exp_' + pdb_id)
outdir = PDB_FILES + './'+pdb_id+'_max_RPF_NLDR_' + method_names[method] + '_files/'
if isdir(outdir) == False:
os.system('mkdir '+ outdir)
else:
os.system('rm -r ' + outdir)
os.system('mkdir '+ outdir)
for consensus_file in max_consensus_files:
os.system('cp ' + consensus_file + ' ' + outdir + consensus_file[consensus_file.rfind('/')+1:])
os.system('python3 compress.py ' + outdir)
cmd.hide('everything', 'hydro')
medoid_file = find_medoid_structure([outdir + f for f in listdir(outdir)])
medoid_file_basename = medoid_file[medoid_file.rfind('/')+1:]
for i in range(0, len(holo_model_files)):
if holo_model_files[i].find(medoid_file_basename) != -1:
print("Medoid file = " + medoid_file + " for method " + method_names[method])
print("Medoid file CSP_Rank_Score = " + str(consensus_scores[i]))
#os.system('python3 medoid.py ' + outdir)