UMAP_TSNE_STATS_max_RPF.py

# UMAP_TSNE_STATS.py

from util import *
import sys
import pymol
from pymol import cmd
from paths import *

def process_pdb(pdb_file, object_name):
    # Initialize PyMOL
    pymol.finish_launching()

    # Load the PDB file
    cmd.load(pdb_file, object_name)

    # Color by chain
    cmd.color('green', object_name + ' and chain A')
    cmd.color('cyan', object_name + ' and chain B')

    # Show chain B as sticks
    cmd.show('sticks', f'{object_name} and chain B')

    # Hide ribbon for chain B
    cmd.hide('cartoon', f'{object_name} and chain B')

    cmd.orient()
    cmd.viewport(800, 800)


#import matplotlib.pyplot as plt

def plot_boxplots(data_dict):
    """
    Plot boxplots for each key:query pair in the dictionary.

    Parameters:
    data_dict (dict): Dictionary with integer keys and lists of floats as values.
    """
    # Extract keys and values
    keys = list(data_dict.keys())
    values = list(data_dict.values())

    # Create the boxplot
    plt.figure(figsize=(10, 6))
    plt.boxplot(values, labels=keys)

    # Set plot labels and title
    plt.xlabel('Keys')
    plt.ylabel('Values')
    plt.title('Boxplots for Each Key:Query Pair')

    # Show plot
    plt.show()
from paths import *
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python UMAP_TSNE_STATS.py <bound>")
        sys.exit(1)

    bound = sys.argv[1].lower()
    pdb_id = bound

    data_source_file = CSP_Rank_Scores + './CSP_'+bound+'_CSpred.csv'
    parsed_data = parse_csv(data_source_file)
    holo_model_files = [data['holo_model_path'][data['holo_model_path'].rfind('/')+1:] for data in parsed_data]
    holo_model_files_raw = [data['holo_model_path'] for data in parsed_data]
    consensus_scores = []#[float(data['consensus']) for data in parsed_data]
    Confidence_scores = []
    dp_scores = []
    recall_scores = []
    precision_scores = []
    plddt_scores = []
    for data in parsed_data:
        try:
            conf = float(data['Confidence'])
            cons = float(data['consensus'])
            plddt = float(data['plddt'])
            consensus_scores.append(cons)
            #consensus_scores.append(cons)# * df / (cons + df))
            Confidence_scores.append(conf)
            plddt_scores.append(plddt)
        except:
            consensus_scores.append(0)
            Confidence_scores.append(0)
            plddt_scores.append(0)
        try:
            dp = float(data['DP'])
            precision = float(data['RPF_PRECISION'])
            recall = float(data['RPF_RECALL'])
            recall_scores.append(recall)
            precision_scores.append(precision)
            dp_scores.append(dp)
        except Exception as e:
            recall_scores.append(0)
            precision_scores.append(0)
            dp_scores.append(0)


    min_recall = min(score for score in recall_scores if score > 0)
    max_recall = max(recall_scores)
    min_dp = min(score for score in dp_scores if score > 0)
    max_dp = max(dp_scores)
    min_precision = min(score for score in precision_scores if score > 0)
    max_precision = max(precision_scores)

    #print(min_recall)
    #print(max_recall)

    for i, consensus in enumerate(consensus_scores):
        # METHOD 0: disregard RPF results in model selection
        #consensus_scores[i] = consensus_scores[i]
        # METHOD 1
        #try:
        #    consensus_scores[i] = consensus_scores[i] * dp_scores[i] * 2 / ( consensus_scores[i] + dp_scores[i] )
        #except:
        #    consensus_scores[i] = 0
        # METHOD 2
        #consensus_scores[i] = dp_scores[i]
        # METHOD 3
        #consensus_scores[i] = consensus_scores[i] * ((recall_scores[i] - min_recall) / (max_recall - min_recall))
        # METHOD 4
        consensus_scores[i] = consensus_scores[i] * ((dp_scores[i] - min_dp) / (max_dp - min_dp))

    #print(consensus_scores[:10])
    #print(Confidence_scores[:10])

    max_df = max(dp_scores)
    n = 0
    #raise
    if False:
        for i, consensus in enumerate(consensus_scores):
            if dp_scores[i] < max_df - 0.01:
                consensus_scores[i] = 0
            else:
                n += 1
        print(n)
    #raise

    #Confidence_scores = [float(data['Confidence']) for data in parsed_data]

    UMAP_file = f'{CLUSTERING_RESULTS}{bound}_aligned_CSPREDB_UMAP_chain_B_data.csv'
    UMAP_data = parse_csv(UMAP_file)
    UMAP_files = [ data['pdb_file'] for data in UMAP_data ]
    UMAP_clusters = [ int(data['Cluster']) for data in UMAP_data ]

    TSNE_file = f'{CLUSTERING_RESULTS}{bound}_aligned_CSPREDB_TSNE_chain_B_data.csv'
    TSNE_data = parse_csv(TSNE_file)
    TSNE_files = [ data['pdb_file'] for data in TSNE_data ]
    TSNE_clusters = [ int(data['Cluster']) for data in TSNE_data ]

    print("getting TSNE cluster scores")
    TSNE_cluster_scores = {}
    TSNE_CSPRank_scores = {}
    TSNE_cluster_files = {}
    for i, pdb_file in enumerate(TSNE_files):
        cluster_number = TSNE_clusters[i]
        if cluster_number not in list(TSNE_cluster_scores):
            TSNE_cluster_scores[cluster_number] = []
            TSNE_cluster_files[cluster_number] = []
            TSNE_CSPRank_scores[cluster_number] = []
        try:
            index = holo_model_files.index(pdb_file)
        except:
            print("couldn't find " + pdb_file)
            continue
        TSNE_cluster_files[cluster_number].append(holo_model_files_raw[index])
        TSNE_CSPRank_scores[cluster_number].append(consensus_scores[index])
        #TSNE_cluster_scores[cluster_number].append(consensus_scores[index] * Confidence_scores[index])
        #if Confidence_scores[index] > 0:
        #    if consensus_scores[index] > 0:
                #print(f"math.sqrt({consensus_scores[index]} * {Confidence_scores[index]})")
        TSNE_cluster_scores[cluster_number].append(math.sqrt(consensus_scores[index] * Confidence_scores[index]))
        #TSNE_cluster_scores[cluster_number].append(math.sqrt(consensus_scores[index] * (plddt_scores[index]/100)))
    #print(consensus_scores)
    #print(TSNE_cluster_scores)
    #plot_boxplots(TSNE_cluster_scores)
    #raise

    print("getting UMAP cluster scores")
    UMAP_cluster_scores = {}
    UMAP_CSPRank_scores = {}
    UMAP_cluster_files = {}
    for i, pdb_file in enumerate(UMAP_files):
        cluster_number = UMAP_clusters[i]
        if cluster_number not in list(UMAP_cluster_scores):
            UMAP_cluster_scores[cluster_number] = []
            UMAP_CSPRank_scores[cluster_number] = []
            UMAP_cluster_files[cluster_number] = []
        try:
            index = holo_model_files.index(pdb_file)
        except:
            continue
        UMAP_cluster_files[cluster_number].append(holo_model_files_raw[index])
        UMAP_CSPRank_scores[cluster_number].append(consensus_scores[index])
        #UMAP_cluster_scores[cluster_number].append(consensus_scores[index] * Confidence_scores[index])
        UMAP_cluster_scores[cluster_number].append(math.sqrt(consensus_scores[index] * Confidence_scores[index]))
        #UMAP_cluster_scores[cluster_number].append(math.sqrt(consensus_scores[index] * (plddt_scores[index]/100)))
    #plot_boxplots(UMAP_cluster_scores)

    print("getting TSNE cluster score averages")
    TSNE_cluster_score_averages = {}
    for i in list(TSNE_cluster_scores):
        sum_scores = 0
        for j in TSNE_cluster_scores[i]:
            sum_scores += j
        sum_scores /= len(TSNE_cluster_scores[i])
        TSNE_cluster_score_averages[i] = sum_scores

    print("getting UMAP cluster score averages")
    UMAP_cluster_score_averages = {}
    for i in list(UMAP_cluster_scores):
        sum_scores = 0
        for j in UMAP_cluster_scores[i]:
            sum_scores += j
        sum_scores /= len(UMAP_cluster_scores[i])
        UMAP_cluster_score_averages[i] = sum_scores

    def print_sorted_dicts(*dicts):
        for d in dicts:
            sorted_dict = {k: round(v, 3) for k, v in sorted(d.items())}
            for k, v in sorted_dict.items():
                print(f"{k}: {v}")
            print()  # Print a newline for better separation between dictionaries

    print_sorted_dicts(TSNE_cluster_score_averages, UMAP_cluster_score_averages)

    max_consensus_files = []
    print("getting TSNE cluster max Bayes Score structures")
    for cluster in list(TSNE_cluster_scores):
        max_score = 0
        max_score_itr = -1
        for itr, score in enumerate(TSNE_cluster_scores[cluster]):
            if score > max_score and TSNE_cluster_files[cluster][itr].find('exp') == -1:
                max_score = score
                max_score_itr = itr
        #max_score_file = PDB_FILES + TSNE_cluster_files[cluster][max_score_itr]
        max_score_file = TSNE_cluster_files[cluster][max_score_itr]
        #print(TSNE_CSPRank_scores[cluster])
        CSPRank_score = TSNE_CSPRank_scores[cluster][max_score_itr]
        if CSPRank_score > 0:
            print("Max Bayes Score for TSNE cluster " + str(cluster) + ' = ' + str(max_score) + '. CSPRank Score = ' + str(CSPRank_score)+ '. PDB file = ' + max_score_file)
            max_consensus_files.append(max_score_file)
            process_pdb(max_score_file, 'tSNE_max' + str(cluster))
        else:
            print("Could not find CSPRank score > 0 for TSNE cluster " + str(cluster))

    print("getting TSNE cluster max Bayes Score structures")
    for cluster in list(UMAP_cluster_scores):
        max_score = 0
        max_score_itr = -1
        for itr, score in enumerate(UMAP_cluster_scores[cluster]):
            if score > max_score and UMAP_cluster_files[cluster][itr].find('exp') == -1:
                max_score = score
                max_score_itr = itr
        #max_score_file = PDB_FILES + UMAP_cluster_files[cluster][max_score_itr]
        max_score_file = UMAP_cluster_files[cluster][max_score_itr]
        CSPRank_score = UMAP_CSPRank_scores[cluster][max_score_itr]
        if CSPRank_score > 0:
            print("Max consensus for UMAP cluster " + str(cluster) + ' = ' + str(max_score) + '. CSPRank Score = ' + str(CSPRank_score)+ '. PDB file = ' + max_score_file)
            max_consensus_files.append(max_score_file)
            process_pdb(max_score_file, 'UMAP_max' + str(cluster))
        else:
            print("Could not find CSPRank score > 0 for UMAP cluster " + str(cluster))


    experimental_medoid_file = experimental_structures + 'exp_'+ pdb_id + '.pdb'
    process_pdb(experimental_medoid_file, 'exp_' + pdb_id)

    outdir = PDB_FILES + './'+pdb_id+'_max_RPF_NLDR_consensus_files/'
    if isdir(outdir) == False:
        os.system('mkdir '+ outdir)
    else:
        os.system('rm -r ' + outdir)
        os.system('mkdir '+ outdir)
    for consensus_file in max_consensus_files:
        os.system('cp ' + consensus_file + ' ' + outdir + consensus_file[consensus_file.rfind('/')+1:])

    os.system('python3 compress.py ' + outdir)

    cmd.hide('everything', 'hydro')
	# UMAP_TSNE_STATS.py

	from util import *
	import sys
	import pymol
	from pymol import cmd
	from paths import *

	def process_pdb(pdb_file, object_name):
	# Initialize PyMOL
	pymol.finish_launching()

	# Load the PDB file
	cmd.load(pdb_file, object_name)

	# Color by chain
	cmd.color('green', object_name + ' and chain A')
	cmd.color('cyan', object_name + ' and chain B')

	# Show chain B as sticks
	cmd.show('sticks', f'{object_name} and chain B')

	# Hide ribbon for chain B
	cmd.hide('cartoon', f'{object_name} and chain B')

	cmd.orient()
	cmd.viewport(800, 800)


	#import matplotlib.pyplot as plt

	def plot_boxplots(data_dict):
	"""
	Plot boxplots for each key:query pair in the dictionary.

	Parameters:
	data_dict (dict): Dictionary with integer keys and lists of floats as values.
	"""
	# Extract keys and values
	keys = list(data_dict.keys())
	values = list(data_dict.values())

	# Create the boxplot
	plt.figure(figsize=(10, 6))
	plt.boxplot(values, labels=keys)

	# Set plot labels and title
	plt.xlabel('Keys')
	plt.ylabel('Values')
	plt.title('Boxplots for Each Key:Query Pair')

	# Show plot
	plt.show()
	from paths import *
	if __name__ == "__main__":
	if len(sys.argv) != 2:
	print("Usage: python UMAP_TSNE_STATS.py <bound>")
	sys.exit(1)

	bound = sys.argv[1].lower()
	pdb_id = bound

	data_source_file = CSP_Rank_Scores + './CSP_'+bound+'_CSpred.csv'
	parsed_data = parse_csv(data_source_file)
	holo_model_files = [data['holo_model_path'][data['holo_model_path'].rfind('/')+1:] for data in parsed_data]
	holo_model_files_raw = [data['holo_model_path'] for data in parsed_data]
	consensus_scores = []#[float(data['consensus']) for data in parsed_data]
	Confidence_scores = []
	dp_scores = []
	recall_scores = []
	precision_scores = []
	plddt_scores = []
	for data in parsed_data:
	try:
	conf = float(data['Confidence'])
	cons = float(data['consensus'])
	plddt = float(data['plddt'])
	consensus_scores.append(cons)
	#consensus_scores.append(cons)# * df / (cons + df))
	Confidence_scores.append(conf)
	plddt_scores.append(plddt)
	except:
	consensus_scores.append(0)
	Confidence_scores.append(0)
	plddt_scores.append(0)
	try:
	dp = float(data['DP'])
	precision = float(data['RPF_PRECISION'])
	recall = float(data['RPF_RECALL'])
	recall_scores.append(recall)
	precision_scores.append(precision)
	dp_scores.append(dp)
	except Exception as e:
	recall_scores.append(0)
	precision_scores.append(0)
	dp_scores.append(0)


	min_recall = min(score for score in recall_scores if score > 0)
	max_recall = max(recall_scores)
	min_dp = min(score for score in dp_scores if score > 0)
	max_dp = max(dp_scores)
	min_precision = min(score for score in precision_scores if score > 0)
	max_precision = max(precision_scores)

	#print(min_recall)
	#print(max_recall)

	for i, consensus in enumerate(consensus_scores):
	# METHOD 0: disregard RPF results in model selection
	#consensus_scores[i] = consensus_scores[i]
	# METHOD 1
	#try:
	# consensus_scores[i] = consensus_scores[i] * dp_scores[i] * 2 / ( consensus_scores[i] + dp_scores[i] )
	#except:
	# consensus_scores[i] = 0
	# METHOD 2
	#consensus_scores[i] = dp_scores[i]
	# METHOD 3
	#consensus_scores[i] = consensus_scores[i] * ((recall_scores[i] - min_recall) / (max_recall - min_recall))
	# METHOD 4
	consensus_scores[i] = consensus_scores[i] * ((dp_scores[i] - min_dp) / (max_dp - min_dp))

	#print(consensus_scores[:10])
	#print(Confidence_scores[:10])

	max_df = max(dp_scores)
	n = 0
	#raise
	if False:
	for i, consensus in enumerate(consensus_scores):
	if dp_scores[i] < max_df - 0.01:
	consensus_scores[i] = 0
	else:
	n += 1
	print(n)
	#raise

	#Confidence_scores = [float(data['Confidence']) for data in parsed_data]

	UMAP_file = f'{CLUSTERING_RESULTS}{bound}_aligned_CSPREDB_UMAP_chain_B_data.csv'
	UMAP_data = parse_csv(UMAP_file)
	UMAP_files = [ data['pdb_file'] for data in UMAP_data ]
	UMAP_clusters = [ int(data['Cluster']) for data in UMAP_data ]

	TSNE_file = f'{CLUSTERING_RESULTS}{bound}_aligned_CSPREDB_TSNE_chain_B_data.csv'
	TSNE_data = parse_csv(TSNE_file)
	TSNE_files = [ data['pdb_file'] for data in TSNE_data ]
	TSNE_clusters = [ int(data['Cluster']) for data in TSNE_data ]

	print("getting TSNE cluster scores")
	TSNE_cluster_scores = {}
	TSNE_CSPRank_scores = {}
	TSNE_cluster_files = {}
	for i, pdb_file in enumerate(TSNE_files):
	cluster_number = TSNE_clusters[i]
	if cluster_number not in list(TSNE_cluster_scores):
	TSNE_cluster_scores[cluster_number] = []
	TSNE_cluster_files[cluster_number] = []
	TSNE_CSPRank_scores[cluster_number] = []
	try:
	index = holo_model_files.index(pdb_file)
	except:
	print("couldn't find " + pdb_file)
	continue
	TSNE_cluster_files[cluster_number].append(holo_model_files_raw[index])
	TSNE_CSPRank_scores[cluster_number].append(consensus_scores[index])
	#TSNE_cluster_scores[cluster_number].append(consensus_scores[index] * Confidence_scores[index])
	#if Confidence_scores[index] > 0:
	# if consensus_scores[index] > 0:
	#print(f"math.sqrt({consensus_scores[index]} * {Confidence_scores[index]})")
	TSNE_cluster_scores[cluster_number].append(math.sqrt(consensus_scores[index] * Confidence_scores[index]))
	#TSNE_cluster_scores[cluster_number].append(math.sqrt(consensus_scores[index] * (plddt_scores[index]/100)))
	#print(consensus_scores)
	#print(TSNE_cluster_scores)
	#plot_boxplots(TSNE_cluster_scores)
	#raise

	print("getting UMAP cluster scores")
	UMAP_cluster_scores = {}
	UMAP_CSPRank_scores = {}
	UMAP_cluster_files = {}
	for i, pdb_file in enumerate(UMAP_files):
	cluster_number = UMAP_clusters[i]
	if cluster_number not in list(UMAP_cluster_scores):
	UMAP_cluster_scores[cluster_number] = []
	UMAP_CSPRank_scores[cluster_number] = []
	UMAP_cluster_files[cluster_number] = []
	try:
	index = holo_model_files.index(pdb_file)
	except:
	continue
	UMAP_cluster_files[cluster_number].append(holo_model_files_raw[index])
	UMAP_CSPRank_scores[cluster_number].append(consensus_scores[index])
	#UMAP_cluster_scores[cluster_number].append(consensus_scores[index] * Confidence_scores[index])
	UMAP_cluster_scores[cluster_number].append(math.sqrt(consensus_scores[index] * Confidence_scores[index]))
	#UMAP_cluster_scores[cluster_number].append(math.sqrt(consensus_scores[index] * (plddt_scores[index]/100)))
	#plot_boxplots(UMAP_cluster_scores)

	print("getting TSNE cluster score averages")
	TSNE_cluster_score_averages = {}
	for i in list(TSNE_cluster_scores):
	sum_scores = 0
	for j in TSNE_cluster_scores[i]:
	sum_scores += j
	sum_scores /= len(TSNE_cluster_scores[i])
	TSNE_cluster_score_averages[i] = sum_scores

	print("getting UMAP cluster score averages")
	UMAP_cluster_score_averages = {}
	for i in list(UMAP_cluster_scores):
	sum_scores = 0
	for j in UMAP_cluster_scores[i]:
	sum_scores += j
	sum_scores /= len(UMAP_cluster_scores[i])
	UMAP_cluster_score_averages[i] = sum_scores

	def print_sorted_dicts(*dicts):
	for d in dicts:
	sorted_dict = {k: round(v, 3) for k, v in sorted(d.items())}
	for k, v in sorted_dict.items():
	print(f"{k}: {v}")
	print() # Print a newline for better separation between dictionaries

	print_sorted_dicts(TSNE_cluster_score_averages, UMAP_cluster_score_averages)

	max_consensus_files = []
	print("getting TSNE cluster max Bayes Score structures")
	for cluster in list(TSNE_cluster_scores):
	max_score = 0
	max_score_itr = -1
	for itr, score in enumerate(TSNE_cluster_scores[cluster]):
	if score > max_score and TSNE_cluster_files[cluster][itr].find('exp') == -1:
	max_score = score
	max_score_itr = itr
	#max_score_file = PDB_FILES + TSNE_cluster_files[cluster][max_score_itr]
	max_score_file = TSNE_cluster_files[cluster][max_score_itr]
	#print(TSNE_CSPRank_scores[cluster])
	CSPRank_score = TSNE_CSPRank_scores[cluster][max_score_itr]
	if CSPRank_score > 0:
	print("Max Bayes Score for TSNE cluster " + str(cluster) + ' = ' + str(max_score) + '. CSPRank Score = ' + str(CSPRank_score)+ '. PDB file = ' + max_score_file)
	max_consensus_files.append(max_score_file)
	process_pdb(max_score_file, 'tSNE_max' + str(cluster))
	else:
	print("Could not find CSPRank score > 0 for TSNE cluster " + str(cluster))

	print("getting TSNE cluster max Bayes Score structures")
	for cluster in list(UMAP_cluster_scores):
	max_score = 0
	max_score_itr = -1
	for itr, score in enumerate(UMAP_cluster_scores[cluster]):
	if score > max_score and UMAP_cluster_files[cluster][itr].find('exp') == -1:
	max_score = score
	max_score_itr = itr
	#max_score_file = PDB_FILES + UMAP_cluster_files[cluster][max_score_itr]
	max_score_file = UMAP_cluster_files[cluster][max_score_itr]
	CSPRank_score = UMAP_CSPRank_scores[cluster][max_score_itr]
	if CSPRank_score > 0:
	print("Max consensus for UMAP cluster " + str(cluster) + ' = ' + str(max_score) + '. CSPRank Score = ' + str(CSPRank_score)+ '. PDB file = ' + max_score_file)
	max_consensus_files.append(max_score_file)
	process_pdb(max_score_file, 'UMAP_max' + str(cluster))
	else:
	print("Could not find CSPRank score > 0 for UMAP cluster " + str(cluster))


	experimental_medoid_file = experimental_structures + 'exp_'+ pdb_id + '.pdb'
	process_pdb(experimental_medoid_file, 'exp_' + pdb_id)

	outdir = PDB_FILES + './'+pdb_id+'_max_RPF_NLDR_consensus_files/'
	if isdir(outdir) == False:
	os.system('mkdir '+ outdir)
	else:
	os.system('rm -r ' + outdir)
	os.system('mkdir '+ outdir)
	for consensus_file in max_consensus_files:
	os.system('cp ' + consensus_file + ' ' + outdir + consensus_file[consensus_file.rfind('/')+1:])

	os.system('python3 compress.py ' + outdir)

	cmd.hide('everything', 'hydro')