calc_CSP_CSPred.py

# calc_CSP_metrics.py
# calculate Fmeasure, Precision, Recall for each apo/holo pair with CSP data
# determine binding residues using UCBShift predictions for holo form, spectra aligned to holo shifts

import sys
from os.path import basename
from util import *
from paths import *

method = "MONTE"
z_value = 0

def align_shifts_to_seq(aligned_sequence, sequence, shifts):
    new_shifts = []
    seq_index = 0
    for i in range(len(aligned_sequence)):
        if seq_index < len(sequence) and aligned_sequence[i] == sequence[seq_index]:
            new_shifts.append(shifts[seq_index])
            seq_index += 1
        else:
            new_shifts.append(-1)
    return new_shifts

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python calc_CSP_metrics.py <bound>")
        sys.exit(1)

    bound = sys.argv[1]

    data_source_file = f'{CSP_Rank_Scores}CSP_{bound}_CSpred.csv'
    parsed_data = parse_csv(data_source_file)

    apos = [str(data['apo_bmrb']) for data in parsed_data]
    bounds = [data['holo_pdb'] for data in parsed_data]
    bound_model_paths = [data['holo_model_path'] for data in parsed_data]
    data_check = []
    try:
        data_check = [data['F1'] for data in parsed_data]
    except:
        x = 0

    i = 0
    for apo in tqdm(apos):

        try:
            x = 0
            # uncomment if we only want to run for entries which don't have a calculated data_check column
            if len(str(data_check[i])) > 0:
                i += 1
                continue
        except:
            x = 0

        apo = apo.lower()
        bound = bounds[i].lower()

        bound_path = bound_model_paths[i]
        print(bound_path)

        #if apo != '2akl' or bound != '2js3':
        #if  bound != '2kfh':
        #    i += 1
        #    continue
        print(apo + ' ' + bound)

        def add_data(bound_path, i, model = False ):

            if bound_path is None or exists(bound_path) == False:
                if exists(PDB_FILES + bound_path):
                    bound_path = PDB_FILES + bound_path
                else:
                    print("Could not find :" + bound_path)
                    return

            bound_CSList_path = f'{CS_Lists}{bound.upper()}.csv'
            apo_CSList_path = f'{CS_Lists}{apo.upper()}.csv'

            bound_pred_CSList_path = f'{CS_Predictions}{bound.lower()}_AFS_shift_predictions/{basename(bound_path).split(".")[0]}.csv'
            if exists(bound_pred_CSList_path) == False:
                bound_pred_CSList_path = f'{CS_Predictions}{bound.lower()}_AFS2_shift_predictions/{basename(bound_path).split(".")[0]}.csv'
                if exists(bound_pred_CSList_path) == False:
                    print("Could not locate shift prediction file.")
                    return
                    raise

            bound_seq = get_sequences(bound_path, file = True)[get_longest_chain(bound_path)]
            new_path = bound_path[:bound_path.rfind('.')]+'_'+apo+'.pdb'

            real_CSPs = []
            real_CSP_outlier_cutoff = -1
            real_bound_sequence = ""

            pred_CSPs = []
            pred_CSP_outlier_cutoff = -1
            pred_bound_sequence = ""

            try:
                real_CSPs, real_CSP_outlier_cutoff, real_bound_sequence = calc_CSP_new(apo_CSList_path, bound_CSList_path, method = method)
            except Exception as e:
                print(e)
                raise

            try:
                pred_CSPs, pred_CSP_outlier_cutoff, pred_bound_sequence = calc_CSP_new(apo_CSList_path, bound_pred_CSList_path, method = method, UCBShift_pred_holo = True)
            except Exception as e:
                print(e)
                raise

            update_b_factors_longest_chain(bound_path, real_CSPs, real_bound_sequence, new_path)

            pred_sequence_aligned, real_sequence_aligned = align(pred_bound_sequence, real_bound_sequence)
            real_CSPs_aligned = align_shifts_to_seq(real_sequence_aligned, real_bound_sequence, real_CSPs)
            pred_CSPs_aligned = align_shifts_to_seq(pred_sequence_aligned, pred_bound_sequence, pred_CSPs)

            MCC = -1
            F = 0

            real_CSP_below_thresh = [ C for C in real_CSPs_aligned if C < real_CSP_outlier_cutoff and C > 0 ]
            pred_CSP_below_thresh = [ C for C in pred_CSPs_aligned if C < pred_CSP_outlier_cutoff and C > 0 ]

            pred_sig_cutoff = -1
            real_sig_cutoff = -1
            try:
                pred_sig_cutoff = calculate_z_score_threshold(pred_CSP_below_thresh, z_value)
                real_sig_cutoff = calculate_z_score_threshold(real_CSP_below_thresh, z_value)
            except Exception as e:
                print(e)
                pred_sig_cutoff = -1
                real_sig_cutoff = -1
                #continue

            residues_expected_to_have_sig_CSPS = [True if f > pred_sig_cutoff else False for i, f in enumerate(pred_CSPs_aligned)]
            TP = 0
            TN = 0
            FP = 0
            FN = 0
            F = 0
            Beta = 0
            z_score_at_zero = calculate_z_score(real_CSP_below_thresh, 0)

            for j,real_CSP in enumerate(real_CSPs_aligned):
                if real_CSP == -1:
                    continue
                CSP_z_score = calculate_z_score(real_CSP_below_thresh, real_CSP)
                if residues_expected_to_have_sig_CSPS[j]:
                    if CSP_z_score <= 0:
                        FN += -1 * max(CSP_z_score, z_score_at_zero)
                    else:
                        TP += min(CSP_z_score, 3)
                else: # not in binding site
                    if CSP_z_score > 0:
                        FP += min(CSP_z_score, 3)
                    else:
                        TN += -1 * max(CSP_z_score, z_score_at_zero)

            print(f"TP = {TP}, FP = {FP}\nTN = {TN}, FN = {FN}")
            Precision = 0
            if TP + FP > 0:
                Precision = TP / ( TP + FP )
            Recall = 0
            if TP + FN > 0:
                Recall = TP / ( TP + FN )

            print(f"F = 2 * ({Precision} * {Recall}) / ({Precision} + {Recall})")
            try:
                F = 2 * (Precision * Recall) / (Precision + Recall)
            except Exception as e:
                print(e)
                raise
                F = 0

            print("("+str(TP)+"*"+str(TN)+"-"+str(FP)+"*"+str(FN)+")/sqrt(("+str(TP)+"+"+str(FP)+")*("+str(TN)+"+"+str(FP)+")*("+str(TN)+"+"+str(FN)+"))")
            MCC = 0
            try:
                MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
            except Exception as e:
                print(e)
                raise
                MCC = -1

            consensus = ( F + (MCC/2)+0.5 ) / 2
            new_columns = 'consensus'

            print("holo = " + bound + ", apo = " + apo)
            print("Precision = " + str(Precision))
            print("Recall = " + str(Recall))
            print("F measure = " + str(F))
            print("Significance cutoff = " + str(real_sig_cutoff))
            print("consensus = " + str(consensus))

            new_values = []
            new_values = [F, Precision, Recall, MCC, consensus]
            new_columns = ['F1', 'Precision', 'Recall', 'MCC', 'consensus']

            print("new_values = " + str(new_values))
            update_row(data_source_file, apo.upper(), bound_path, new_values, new_columns)

            os.remove(new_path)

        print("running add data")
        try:
            add_data(bound_path, i)
        except:
            raise
            x = 0
        i += 1
	# calc_CSP_metrics.py
	# calculate Fmeasure, Precision, Recall for each apo/holo pair with CSP data
	# determine binding residues using UCBShift predictions for holo form, spectra aligned to holo shifts

	import sys
	from os.path import basename
	from util import *
	from paths import *

	method = "MONTE"
	z_value = 0

	def align_shifts_to_seq(aligned_sequence, sequence, shifts):
	new_shifts = []
	seq_index = 0
	for i in range(len(aligned_sequence)):
	if seq_index < len(sequence) and aligned_sequence[i] == sequence[seq_index]:
	new_shifts.append(shifts[seq_index])
	seq_index += 1
	else:
	new_shifts.append(-1)
	return new_shifts

	if __name__ == "__main__":
	if len(sys.argv) != 2:
	print("Usage: python calc_CSP_metrics.py <bound>")
	sys.exit(1)

	bound = sys.argv[1]

	data_source_file = f'{CSP_Rank_Scores}CSP_{bound}_CSpred.csv'
	parsed_data = parse_csv(data_source_file)

	apos = [str(data['apo_bmrb']) for data in parsed_data]
	bounds = [data['holo_pdb'] for data in parsed_data]
	bound_model_paths = [data['holo_model_path'] for data in parsed_data]
	data_check = []
	try:
	data_check = [data['F1'] for data in parsed_data]
	except:
	x = 0

	i = 0
	for apo in tqdm(apos):

	try:
	x = 0
	# uncomment if we only want to run for entries which don't have a calculated data_check column
	if len(str(data_check[i])) > 0:
	i += 1
	continue
	except:
	x = 0

	apo = apo.lower()
	bound = bounds[i].lower()

	bound_path = bound_model_paths[i]
	print(bound_path)

	#if apo != '2akl' or bound != '2js3':
	#if bound != '2kfh':
	# i += 1
	# continue
	print(apo + ' ' + bound)

	def add_data(bound_path, i, model = False ):

	if bound_path is None or exists(bound_path) == False:
	if exists(PDB_FILES + bound_path):
	bound_path = PDB_FILES + bound_path
	else:
	print("Could not find :" + bound_path)
	return

	bound_CSList_path = f'{CS_Lists}{bound.upper()}.csv'
	apo_CSList_path = f'{CS_Lists}{apo.upper()}.csv'

	bound_pred_CSList_path = f'{CS_Predictions}{bound.lower()}_AFS_shift_predictions/{basename(bound_path).split(".")[0]}.csv'
	if exists(bound_pred_CSList_path) == False:
	bound_pred_CSList_path = f'{CS_Predictions}{bound.lower()}_AFS2_shift_predictions/{basename(bound_path).split(".")[0]}.csv'
	if exists(bound_pred_CSList_path) == False:
	print("Could not locate shift prediction file.")
	return
	raise

	bound_seq = get_sequences(bound_path, file = True)[get_longest_chain(bound_path)]
	new_path = bound_path[:bound_path.rfind('.')]+'_'+apo+'.pdb'

	real_CSPs = []
	real_CSP_outlier_cutoff = -1
	real_bound_sequence = ""

	pred_CSPs = []
	pred_CSP_outlier_cutoff = -1
	pred_bound_sequence = ""

	try:
	real_CSPs, real_CSP_outlier_cutoff, real_bound_sequence = calc_CSP_new(apo_CSList_path, bound_CSList_path, method = method)
	except Exception as e:
	print(e)
	raise

	try:
	pred_CSPs, pred_CSP_outlier_cutoff, pred_bound_sequence = calc_CSP_new(apo_CSList_path, bound_pred_CSList_path, method = method, UCBShift_pred_holo = True)
	except Exception as e:
	print(e)
	raise

	update_b_factors_longest_chain(bound_path, real_CSPs, real_bound_sequence, new_path)

	pred_sequence_aligned, real_sequence_aligned = align(pred_bound_sequence, real_bound_sequence)
	real_CSPs_aligned = align_shifts_to_seq(real_sequence_aligned, real_bound_sequence, real_CSPs)
	pred_CSPs_aligned = align_shifts_to_seq(pred_sequence_aligned, pred_bound_sequence, pred_CSPs)

	MCC = -1
	F = 0

	real_CSP_below_thresh = [ C for C in real_CSPs_aligned if C < real_CSP_outlier_cutoff and C > 0 ]
	pred_CSP_below_thresh = [ C for C in pred_CSPs_aligned if C < pred_CSP_outlier_cutoff and C > 0 ]

	pred_sig_cutoff = -1
	real_sig_cutoff = -1
	try:
	pred_sig_cutoff = calculate_z_score_threshold(pred_CSP_below_thresh, z_value)
	real_sig_cutoff = calculate_z_score_threshold(real_CSP_below_thresh, z_value)
	except Exception as e:
	print(e)
	pred_sig_cutoff = -1
	real_sig_cutoff = -1
	#continue

	residues_expected_to_have_sig_CSPS = [True if f > pred_sig_cutoff else False for i, f in enumerate(pred_CSPs_aligned)]
	TP = 0
	TN = 0
	FP = 0
	FN = 0
	F = 0
	Beta = 0
	z_score_at_zero = calculate_z_score(real_CSP_below_thresh, 0)

	for j,real_CSP in enumerate(real_CSPs_aligned):
	if real_CSP == -1:
	continue
	CSP_z_score = calculate_z_score(real_CSP_below_thresh, real_CSP)
	if residues_expected_to_have_sig_CSPS[j]:
	if CSP_z_score <= 0:
	FN += -1 * max(CSP_z_score, z_score_at_zero)
	else:
	TP += min(CSP_z_score, 3)
	else: # not in binding site
	if CSP_z_score > 0:
	FP += min(CSP_z_score, 3)
	else:
	TN += -1 * max(CSP_z_score, z_score_at_zero)

	print(f"TP = {TP}, FP = {FP}\nTN = {TN}, FN = {FN}")
	Precision = 0
	if TP + FP > 0:
	Precision = TP / ( TP + FP )
	Recall = 0
	if TP + FN > 0:
	Recall = TP / ( TP + FN )

	print(f"F = 2 * ({Precision} * {Recall}) / ({Precision} + {Recall})")
	try:
	F = 2 * (Precision * Recall) / (Precision + Recall)
	except Exception as e:
	print(e)
	raise
	F = 0

	print("("+str(TP)+""+str(TN)+"-"+str(FP)+""+str(FN)+")/sqrt(("+str(TP)+"+"+str(FP)+")("+str(TN)+"+"+str(FP)+")("+str(TN)+"+"+str(FN)+"))")
	MCC = 0
	try:
	MCC = (TPTN-FPFN)/math.sqrt((TP+FP)(TP+FN)(TN+FP)*(TN+FN))
	except Exception as e:
	print(e)
	raise
	MCC = -1

	consensus = ( F + (MCC/2)+0.5 ) / 2
	new_columns = 'consensus'

	print("holo = " + bound + ", apo = " + apo)
	print("Precision = " + str(Precision))
	print("Recall = " + str(Recall))
	print("F measure = " + str(F))
	print("Significance cutoff = " + str(real_sig_cutoff))
	print("consensus = " + str(consensus))

	new_values = []
	new_values = [F, Precision, Recall, MCC, consensus]
	new_columns = ['F1', 'Precision', 'Recall', 'MCC', 'consensus']

	print("new_values = " + str(new_values))
	update_row(data_source_file, apo.upper(), bound_path, new_values, new_columns)

	os.remove(new_path)

	print("running add data")
	try:
	add_data(bound_path, i)
	except:
	raise
	x = 0
	i += 1