Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# calc_CSP_metrics.py
# calculate Fmeasure, Precision, Recall for each apo/holo pair with CSP data
from util import *
import sys
#clean_output_dir()
#clean_boundpdbs()
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python calc_CSP_metrics.py <bound>")
sys.exit(1)
bound = sys.argv[1]
data_source_file = './CSP_'+bound+'.csv'
parsed_data = parse_csv(data_source_file)
apos = [str(data['apo_bmrb']) for data in parsed_data]
bounds = [data['holo_pdb'] for data in parsed_data]
bound_model_paths = [data['holo_model_path'] for data in parsed_data]
data_check = []
try:
data_check = [data['metric_comp_F1-harmonic_ind2.5_dir3'] for data in parsed_data]
except:
x = 0
#print(data_check)
indirect_cutoff = 2.5
z_score = 0
i = 0
for apo in tqdm(apos):
# uncomment if we only want to run for entries which don't have a calculated data_check column
if len(data_check) != 0 and len(str(data_check[i])) > 0:
i += 1
continue
apo = apo.lower()
#print(apo)
bound = bounds[i].lower()
bound_model_path = bound_model_paths[i]
print(bound_model_path)
#bound_path = bound_paths[i]
#print(bound_path)
#if apo != '2akl' or bound != '2js3':
# i += 1
# continue
print(apo + ' ' + bound)
def add_data(bound_path, i, model=False):
if bound_path is None or exists(bound_path) == False:
return
#continue
bound_seq = get_sequences(bound_path, file = True)[get_longest_chain(bound_path)]
new_path = bound_path[:bound_path.rfind('.')]+'_'+apo+'.pdb'
for buried in [False]:#, False]:
for method in ["MONTE"]:#, "GRZESIEK1996"]:
print(method)
CSPs = []
u_3std_cutoff = -1
bound_sequence = ""
try:
CSPs, u_3std_cutoff, bound_sequence = calc_CSP(bound, apo, method = method, buried = buried, bound_file = bound_path)
except Exception as e:
print(e)
continue
update_b_factors_longest_chain(bound_path, CSPs, bound_sequence, new_path)
#print(get_sequences(bound.lower()))
# Precision = TP / TP + FP
# Recall = TP / TP + FN
# F = Precision + Recall / 2
# get list per prot res of minimum distance between any atom in res to any atom in pep
distances = pdb_interchain_distances(new_path, chain = get_longest_chain(new_path))
peptide_distances = pdb_interchain_distances(new_path, chain = get_shortest_chain(new_path))
cb_distances = pdb_interchain_cb_distances(new_path)
if distances is None:
os.remove(new_path)
continue
#print(distances)
bfactors = get_bfactors(new_path)
#print(bfactors)
#for F_method in ['FB', 'F1-harmonic', 'MCC']:
# for pocket_method in ['custom', 'CB8A']:
for F_method in ['F1-harmonic', 'MCC']:
for pocket_method in ['custom']:
for indirect_cutoff in [ 2.5 ]:#, 4, 5]:#[1.5, 2, 2.5, 3, 3.5, 4, 5, 6]:
for contact_distance in [ 3 ]:#, 3, 3.5 ]:
#contact_distance = 3.5
num_peptide_residues_in_contact = len([l for l in peptide_distances if l <= contact_distance ])
num_peptide_residues = len(peptide_distances)
print(num_peptide_residues_in_contact)
direct_binding_interface = [ True if d <= contact_distance else False for d in distances]
indirect_binding_interface = get_indirect_binding_interface(direct_binding_interface, new_path, cutoff = indirect_cutoff)
# test CB 8 distance
if pocket_method == 'CB8A':
indirect_binding_interface = [True if d <= 8 else False for d in cb_distances]
#print(indirect_binding_interface)
CSP_below_thresh = [ C for C in CSPs if C < u_3std_cutoff and C > 0 ]
if len(CSP_below_thresh) == 0:
print("No CSPs below u + 3std threshold.")
continue
for z_value in [0]:# -2, 1, 2, '3std']:
cutoff = -1
if z_value == '3std':
mean = statistics.mean(CSP_below_thresh)
# Calculate the standard deviation of the data
std_dev = statistics.stdev(CSP_below_thresh)
cutoff = std_dev * 3
else:
try:
cutoff = calculate_z_score_threshold(CSP_below_thresh, z_value)
except:
continue
if len(CSP_below_thresh) == 0:
continue
TP = 0
TN = 0
FP = 0
FN = 0
F = 0
inTP = 0
inTN = 0
inFP = 0
inFN = 0
inF = 0
Beta = 0
inBeta = 0
for j, distance in enumerate(distances):
if bfactors[j] == -1:
#print(bound_path + " " + str(j))
#return
continue
if direct_binding_interface[j]:
if bfactors[j] <= cutoff:
#FN += 1
FN += -1 * calculate_z_score(CSP_below_thresh, bfactors[j])
else:
#TP += 1
TP += calculate_z_score(CSP_below_thresh, bfactors[j])
else: # not in binding site
if bfactors[j] > cutoff:
#FP += 1
FP += calculate_z_score(CSP_below_thresh, bfactors[j])
else:
#TN += 1
TN += -1 * calculate_z_score(CSP_below_thresh, bfactors[j])
if indirect_binding_interface[j]:
if bfactors[j] <= cutoff:
#inFN += 1
inFN += -1 * calculate_z_score(CSP_below_thresh, bfactors[j])
else:
#inTP += 1
inTP += calculate_z_score(CSP_below_thresh, bfactors[j])
else:
if bfactors[j] > cutoff:
#inFP += 1
inFP += calculate_z_score(CSP_below_thresh, bfactors[j])
else:
#inTN += 1
inTN += -1 * calculate_z_score(CSP_below_thresh, bfactors[j])
Precision = 0
if TP + FP > 0:
Precision = TP / ( TP + FP )
Recall = 0
if TP + FN > 0:
Recall = TP / ( TP + FN )
if F_method == 'F1-avg': # F1-avg is bad.
F = ( Precision + Recall ) / 2
elif F_method == 'F1-harmonic':
try:
F = 2 * (Precision * Recall) / (Precision + Recall)
except:
continue
elif F_method == 'FB':
try:
Beta = ((TP + FN) / ((num_peptide_residues_in_contact)**2 / num_peptide_residues))
print("Beta = " + str(Beta))
F = (1 + Beta**2) * (Precision * Recall) / ((Beta**2 * Precision) + Recall)
except:
continue
inPrecision = 0
if inTP + inFP > 0:
inPrecision = inTP / ( inTP + inFP )
inRecall = 0
if inTP + inFN > 0:
inRecall = inTP / ( inTP + inFN )
if F_method == 'F1-avg':
inF = ( inPrecision + inRecall ) / 2
elif F_method == 'F1-harmonic':
try:
inF = 2 * (inPrecision * inRecall) / (inPrecision + inRecall)
except:
continue
elif F_method == 'FB':
try:
#inBeta = ((inTP + inFN) / (num_peptide_residues_in_contact)) ** 2 / ((inTP+inFN) / num_peptide_residues)
inBeta = ((TP + FN) / (num_peptide_residues_in_contact))
print("indirect Beta = " + str(inBeta))
inF = (1 + inBeta**2) * (inPrecision * inRecall) / ((inBeta**2 * inPrecision) + inRecall)
except:
continue
inMCC = (inTP*inTN-inFP*inFN)/math.sqrt((inTP+inFP)*(inTP+inFN)*(inTN+inFP)*(inTN+inFN))
print("("+str(inTP)+"*"+str(inTN)+"-"+str(inFP)+"*"+str(inFN)+")/sqrt(("+str(inTP)+"+"+str(inFP)+")*("+str(inTN)+"+"+str(inFP)+")*("+str(inTN)+"+"+str(inFN)+"))")
print("holo = " + bound + ", apo = " + apo)
print("Precision = " + str(Precision))
print("Recall = " + str(Recall))
print("F measure = " + str(F))
print("Indirect Precision = " + str(inPrecision))
print("Indirect Recall = " + str(inRecall))
print("Indirect F measure = " + str(inF))
print("Significance cutoff = " + str(cutoff))
new_values = []#[inF, inPrecision, inRecall, cutoff]
if F_method == 'F1-harmonic':
new_values = [inF, inPrecision, inRecall, cutoff]
new_columns = ["metric", 'Precision', 'Recall', "Cutoff"]#, 'Precision', 'Recall', 'Cutoff']
elif F_method == 'FB':
new_values = [inF, inBeta]
new_columns = ["metric", 'Beta']
elif F_method == 'MCC':
new_values = [inMCC]
new_columns = ["metric"]
if model:
new_columns = [ f + '_comp' for f in new_columns ]
new_columns = [ f + '_'+F_method for f in new_columns ]
#new_columns = [ f + '_'+str(z_value) for f in new_columns ]
if pocket_method == 'CB8A':
new_columns = [ f + '_CB8A' for f in new_columns ]
else:
new_columns = [ f + '_ind'+str(indirect_cutoff) for f in new_columns ]
new_columns = [ f + '_dir'+str(contact_distance) for f in new_columns ]
#if buried:
# new_columns = [ f + '_buried_excluded' for f in new_columns ]
#print(new_columns[0])
print("new_values = " + str(new_values))
print(apo.upper())
print(bound_path)
update_row(data_source_file, apo.upper(), bound_path, new_values, new_columns)
print("removing new path = " + new_path)
os.remove(new_path)
print("running add data")
try:
add_data(bound_model_path, i, model = True)
except Exception as e:
print(e)
x = 0
try:
add_data(bound_path, i, model = False)
except:
x = 0
i += 1