Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# calc_CSP_metrics.py
# calculate Fmeasure, Precision, Recall for each apo/holo pair with CSP data
# determine binding residues using UCBShift predictions for holo form, spectra aligned to holo shifts
import sys
from os.path import basename
from util import *
from paths import *
method = "MONTE"
z_value = 0
def align_shifts_to_seq(aligned_sequence, sequence, shifts):
new_shifts = []
seq_index = 0
for i in range(len(aligned_sequence)):
if seq_index < len(sequence) and aligned_sequence[i] == sequence[seq_index]:
new_shifts.append(shifts[seq_index])
seq_index += 1
else:
new_shifts.append(-1)
return new_shifts
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python calc_CSP_metrics.py <bound>")
sys.exit(1)
bound = sys.argv[1]
data_source_file = f'{CSP_Rank_Scores}CSP_{bound}_CSpred.csv'
parsed_data = parse_csv(data_source_file)
apos = [str(data['apo_bmrb']) for data in parsed_data]
bounds = [data['holo_pdb'] for data in parsed_data]
bound_model_paths = [data['holo_model_path'] for data in parsed_data]
data_check = []
try:
data_check = [data['F1'] for data in parsed_data]
except:
x = 0
i = 0
for apo in tqdm(apos):
try:
x = 0
# uncomment if we only want to run for entries which don't have a calculated data_check column
if len(str(data_check[i])) > 0:
i += 1
continue
except:
x = 0
apo = apo.lower()
bound = bounds[i].lower()
bound_path = bound_model_paths[i]
print(bound_path)
#if apo != '2akl' or bound != '2js3':
#if bound != '2kfh':
# i += 1
# continue
print(apo + ' ' + bound)
def add_data(bound_path, i, model = False ):
if bound_path is None or exists(bound_path) == False:
if exists(PDB_FILES + bound_path):
bound_path = PDB_FILES + bound_path
else:
print("Could not find :" + bound_path)
return
bound_CSList_path = f'{CS_Lists}{bound.upper()}.csv'
apo_CSList_path = f'{CS_Lists}{apo.upper()}.csv'
bound_pred_CSList_path = f'{CS_Predictions}{bound.lower()}_AFS_shift_predictions/{basename(bound_path).split(".")[0]}.csv'
if exists(bound_pred_CSList_path) == False:
bound_pred_CSList_path = f'{CS_Predictions}{bound.lower()}_AFS2_shift_predictions/{basename(bound_path).split(".")[0]}.csv'
if exists(bound_pred_CSList_path) == False:
print("Could not locate shift prediction file.")
return
raise
bound_seq = get_sequences(bound_path, file = True)[get_longest_chain(bound_path)]
new_path = bound_path[:bound_path.rfind('.')]+'_'+apo+'.pdb'
real_CSPs = []
real_CSP_outlier_cutoff = -1
real_bound_sequence = ""
pred_CSPs = []
pred_CSP_outlier_cutoff = -1
pred_bound_sequence = ""
try:
real_CSPs, real_CSP_outlier_cutoff, real_bound_sequence = calc_CSP_new(apo_CSList_path, bound_CSList_path, method = method)
except Exception as e:
print(e)
raise
try:
pred_CSPs, pred_CSP_outlier_cutoff, pred_bound_sequence = calc_CSP_new(apo_CSList_path, bound_pred_CSList_path, method = method, UCBShift_pred_holo = True)
except Exception as e:
print(e)
raise
update_b_factors_longest_chain(bound_path, real_CSPs, real_bound_sequence, new_path)
pred_sequence_aligned, real_sequence_aligned = align(pred_bound_sequence, real_bound_sequence)
real_CSPs_aligned = align_shifts_to_seq(real_sequence_aligned, real_bound_sequence, real_CSPs)
pred_CSPs_aligned = align_shifts_to_seq(pred_sequence_aligned, pred_bound_sequence, pred_CSPs)
MCC = -1
F = 0
real_CSP_below_thresh = [ C for C in real_CSPs_aligned if C < real_CSP_outlier_cutoff and C > 0 ]
pred_CSP_below_thresh = [ C for C in pred_CSPs_aligned if C < pred_CSP_outlier_cutoff and C > 0 ]
pred_sig_cutoff = -1
real_sig_cutoff = -1
try:
pred_sig_cutoff = calculate_z_score_threshold(pred_CSP_below_thresh, z_value)
real_sig_cutoff = calculate_z_score_threshold(real_CSP_below_thresh, z_value)
except Exception as e:
print(e)
pred_sig_cutoff = -1
real_sig_cutoff = -1
#continue
residues_expected_to_have_sig_CSPS = [True if f > pred_sig_cutoff else False for i, f in enumerate(pred_CSPs_aligned)]
TP = 0
TN = 0
FP = 0
FN = 0
F = 0
Beta = 0
z_score_at_zero = calculate_z_score(real_CSP_below_thresh, 0)
for j,real_CSP in enumerate(real_CSPs_aligned):
if real_CSP == -1:
continue
CSP_z_score = calculate_z_score(real_CSP_below_thresh, real_CSP)
if residues_expected_to_have_sig_CSPS[j]:
if CSP_z_score <= 0:
FN += -1 * max(CSP_z_score, z_score_at_zero)
else:
TP += min(CSP_z_score, 3)
else: # not in binding site
if CSP_z_score > 0:
FP += min(CSP_z_score, 3)
else:
TN += -1 * max(CSP_z_score, z_score_at_zero)
print(f"TP = {TP}, FP = {FP}\nTN = {TN}, FN = {FN}")
Precision = 0
if TP + FP > 0:
Precision = TP / ( TP + FP )
Recall = 0
if TP + FN > 0:
Recall = TP / ( TP + FN )
print(f"F = 2 * ({Precision} * {Recall}) / ({Precision} + {Recall})")
try:
F = 2 * (Precision * Recall) / (Precision + Recall)
except Exception as e:
print(e)
raise
F = 0
print("("+str(TP)+"*"+str(TN)+"-"+str(FP)+"*"+str(FN)+")/sqrt(("+str(TP)+"+"+str(FP)+")*("+str(TN)+"+"+str(FP)+")*("+str(TN)+"+"+str(FN)+"))")
MCC = 0
try:
MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
except Exception as e:
print(e)
raise
MCC = -1
consensus = ( F + (MCC/2)+0.5 ) / 2
new_columns = 'consensus'
print("holo = " + bound + ", apo = " + apo)
print("Precision = " + str(Precision))
print("Recall = " + str(Recall))
print("F measure = " + str(F))
print("Significance cutoff = " + str(real_sig_cutoff))
print("consensus = " + str(consensus))
new_values = []
new_values = [F, Precision, Recall, MCC, consensus]
new_columns = ['F1', 'Precision', 'Recall', 'MCC', 'consensus']
print("new_values = " + str(new_values))
update_row(data_source_file, apo.upper(), bound_path, new_values, new_columns)
os.remove(new_path)
print("running add data")
try:
add_data(bound_path, i)
except:
raise
x = 0
i += 1