Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import json
import pickle
import sys

import pandas as pd

from paths import *
from util import *
# Confidence formula from:
# Wallner, B. (2023b). Improved multimer prediction using massive sampling with AlphaFold in
# CASP15. PROTEINS: Structure, Function and Bioinformatics 91, 1734-1746.
# https://doi.org/10.1002/prot.26562.
def find_confidence(json_file_path):
    """Return (confidence, ptm, iptm) read from an AlphaFold score file.

    Accommodates both .json and .pkl formats. Confidence is
    0.8 * iptm + 0.2 * ptm, with every value rounded to 2 decimals.

    Returns:
        (confidence, ptm, iptm) on success; (None, None, None) when a JSON
        score file cannot be parsed.

    Raises:
        ValueError: if the path ends in neither 'json' nor 'pkl'.
        (The original did a bare `raise` with no active exception here,
        which surfaced as an unhelpful RuntimeError.)
    """
    if json_file_path.endswith('json'):
        with open(json_file_path) as file:
            try:
                data = json.load(file)
                iptm = round(data.get('iptm', 0), 2)
                ptm = round(data.get('ptm', 0), 2)
                confidence = round(0.8 * iptm + 0.2 * ptm, 2)
                return confidence, ptm, iptm
            # Narrowed from a bare `except:` so unrelated bugs (and
            # KeyboardInterrupt) are no longer silently swallowed.
            except (json.JSONDecodeError, TypeError, ValueError, KeyError):
                return None, None, None
    elif json_file_path.endswith('pkl'):
        # NOTE(review): pickle.load is unsafe on untrusted files; these are
        # assumed to be locally produced AlphaFold outputs.
        with open(json_file_path, 'rb') as handle:  # original leaked this handle
            data = pickle.load(handle)
        iptm = round(float(data.get('iptm', 0)), 2)
        ptm = round(float(data.get('ptm', 0)), 2)
        confidence = round(0.8 * iptm + 0.2 * ptm, 2)
        return confidence, ptm, iptm
    else:
        print("BAD JSON FILE")
        raise ValueError("unsupported score file format: " + json_file_path)
def find_avg_plddt(pdb_file_path):
    """Return the mean per-residue pLDDT of an AlphaFold model.

    AlphaFold writes pLDDT into the B-factor column, so this averages the
    atom B-factors within each standard amino-acid residue, then averages
    those per-residue means over the whole structure.

    Args:
        pdb_file_path: path to a PDB file readable by Bio.PDB's PDBParser.

    Raises:
        ValueError: if the structure contains no standard amino-acid
            residues (the original crashed with ZeroDivisionError here).
    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('structure', pdb_file_path)
    bfactors_by_residue = defaultdict(list)
    for model in structure:
        for chain in model:
            for residue in chain:
                if is_aa(residue, standard=True):
                    for atom in residue:
                        bfactors_by_residue[residue.get_id()].append(atom.get_bfactor())
    if not bfactors_by_residue:
        raise ValueError("no standard amino-acid residues found in " + pdb_file_path)
    # Average of the per-residue averages (iteration order doesn't affect a sum,
    # so the original's sort over residue ids was unnecessary).
    per_residue_means = [sum(vals) / len(vals) for vals in bfactors_by_residue.values()]
    return sum(per_residue_means) / len(per_residue_means)
def get_PostProcess_pdb_file(pdb_file, pdb):
    """Map a model identifier to its unrelaxed PDB file path.

    When *pdb_file* contains 'min_', the basename is the slice between the
    last 'min_' and the last '_af2'; otherwise the name is used verbatim.
    The result lives under PDB_FILES/<pdb lowercased>/unrelaxed_<basename>.pdb.
    """
    if 'min_' in pdb_file:
        start = pdb_file.rfind('min_') + 4
        stop = pdb_file.rfind('_af2')
        basename = pdb_file[start:stop]
    else:
        basename = pdb_file
    return PDB_FILES + pdb.lower() + '/unrelaxed_' + basename + '.pdb'
# ---- Script entry point ----------------------------------------------------
# Usage: python get_confidences.py <bound>
# Reads CSP_<bound>_CSpred.csv, fills in Confidence/ptm/iptm (from the
# AlphaFold score files) and plddt (from the unrelaxed model PDB) for every
# minimized multimer model, then writes the CSV back in place.
if len(sys.argv) < 2:
    print("Usage: python script.py <bound>")
    print('e.g python3 get_confidences.py 2nd1')  # stray trailing quote removed
    sys.exit(1)

bound = sys.argv[1].lower()

# Input CSV produced by the CSpred step; overwritten in place at the end.
input_file = CSP_Rank_Scores + './CSP_' + bound.lower() + '_CSpred.csv'
data = pd.read_csv(input_file)

# Directories holding the AlphaFold score files and the model PDBs.
zip_file_dir = PDB_FILES + bound.upper() + '_json/'
model_file_dir = PDB_FILES + bound.upper() + '/'

for index, pdb in tqdm(data['holo_model_path'].items()):
    # Only minimized multimer models carry the score files we need.
    if 'multimer' not in pdb or 'min' not in pdb:
        continue
    basename_match = pdb[pdb.rfind('model_'):pdb.rfind('_af2')]
    try:
        # First score file whose name contains the model basename.
        zip_file = [zip_file_dir + f for f in listdir(zip_file_dir)
                    if f.find(basename_match) != -1][0]
        print(zip_file)
        confidence, ptm, iptm = find_confidence(zip_file)
        data.at[index, 'Confidence'] = confidence
        data.at[index, 'ptm'] = ptm
        data.at[index, 'iptm'] = iptm
    except Exception as e:
        # Best-effort: log the miss and move on. (The original had an
        # unreachable bare `raise` after this `continue` — removed.)
        print(e)
        print("couldn't find " + basename_match + ' in ' + zip_file_dir)
        continue
    pdb_file = get_PostProcess_pdb_file(basename_match, bound)
    plddt = find_avg_plddt(pdb_file)
    data.at[index, 'plddt'] = plddt

# Persist the augmented table back over the input file.
data.to_csv(input_file, index=False)
print("FINISHED getting ipTM/pTM, Confidence, and plDDT")