calc_DockQ_top_rank.py

from util import *
from Bio import PDB
import seaborn as sns

data_source_file = './CSPRANK.csv'
parsed_data = pd.read_csv(data_source_file)
holo_NMR_structure_dir = './PDB_FILES/NMR_holo/'
AF2_structure_dir = './PDB_FILES/AF2_holo/'


def parse_ranges(ranges_str):
    """Parse the input string to extract chains and residue ranges."""
    ranges = {}
    for part in ranges_str.split(', '):
        chain, start_end = part.split(":")
        start_res, end_res = map(int, start_end.split(".."))
        if chain not in ranges:
            ranges[chain] = []
        ranges[chain].append((start_res, end_res))
    return ranges

def adjust_residue_numbers(ranges):
    """Adjust residue numbers to start from 1 for each chain."""
    adjustment_maps = {}
    for chain, chain_ranges in ranges.items():
        adjustment_map = {}
        new_residue_num = 1
        for start_res, end_res in chain_ranges:
            for original_res_num in range(start_res, end_res + 1):
                adjustment_map[original_res_num] = new_residue_num
                new_residue_num += 1
        adjustment_maps[chain] = adjustment_map
    return adjustment_maps

def trim_pdb_by_residues(pdb_file_path, ranges_str, new_file):
    """Trim and reindex residues in a PDB file based on provided ranges string."""
    ranges = parse_ranges(ranges_str)
    adjustment_maps = adjust_residue_numbers(ranges)
    print(ranges)
    with open(pdb_file_path, 'r') as pdb_file, open(new_file, 'w') as output_file:
        for line in pdb_file:
            if line.startswith("ATOM") or line.startswith("HETATM"):
                chain_id = line[21]
                residue_num = int(line[22:26].strip())
                if chain_id in ranges:
                    for start, end in ranges[chain_id]:
                        if start <= residue_num <= end:
                            # Adjust residue number
                            adjusted_residue_num = adjustment_maps[chain_id][residue_num]
                            # Rewrite line with adjusted residue number
                            new_line = line[:22] + "{:>4}".format(adjusted_residue_num) + line[26:]
                            output_file.write(new_line)
                            break
                else:
                    # Write lines for chains not in ranges as they are
                    output_file.write(line)

    return new_file


def align_raw(NMR_file, AF2_file):
    NMR_sequence = get_pdb_sequence(NMR_file)
    AF2_sequence = get_pdb_sequence(AF2_file)
    NMR_aligned, AF2_aligned = align(NMR_sequence, AF2_sequence)
    print(NMR_aligned)
    print(AF2_aligned)
    # raise

# data_check = parsed_data['AF2_DockQ_top_rank']

for i, holo_pdb in enumerate(parsed_data['holo_pdb']):

    # if str(data_check[i]) != 'nan':
    #     continue

    apo = str(parsed_data['apo_bmrb'][i])
    holo = holo_pdb.lower()
    print(holo)

    if holo in ['2kne', '2lsk', '2m55', '2mps', '5j8h']:
        continue

    def get_largest_chain(structure):
        largest_chain = None
        max_residues = 0
        for model in structure:
            for chain in model:
                num_residues = len([residue for residue in chain if PDB.is_aa(residue)])
                if num_residues > max_residues:
                    max_residues = num_residues
                    largest_chain = chain
        return largest_chain

    def align_chains(ref_chain, mobile_chain):
        # Create a Superimposer object
        super_imposer = PDB.Superimposer()

        # Extract alpha carbon (CA) atoms from the reference chain, only from 'ATOM' lines
        ref_atoms = [residue['CA'] for residue in ref_chain if 'CA' in residue and residue.id[0] == ' ']

        # Extract alpha carbon (CA) atoms from the mobile chain, only from 'ATOM' lines
        mobile_atoms = [residue['CA'] for residue in mobile_chain if 'CA' in residue and residue.id[0] == ' ']

        # Print the number of CA atoms in the reference and mobile chains
        print("REF: ", len(ref_atoms))
        print("MOBILE: ", len(mobile_atoms))

        # Set the atoms to be used for alignment
        super_imposer.set_atoms(ref_atoms, mobile_atoms)

        # Return the Superimposer object
        return super_imposer

    # get top rank AF2 file
    AF2_files = [AF2_structure_dir + f for f in os.listdir(AF2_structure_dir) if f.find(holo+"_1") != -1]
    if len(AF2_files) == 0:
        print(f"Could not find AF2 file for {holo}")
        continue

    NMR_files = [holo_NMR_structure_dir + f for f in os.listdir(holo_NMR_structure_dir) if f.find(holo) != -1]
    if len(NMR_files) == 0:
        print(f"Could not find NMR file for {holo}")
        continue
    medoid_NMR_file = find_medoid_structure(NMR_files)

    # get medoid NMR file
    NMR_files = [medoid_NMR_file]

    # TRIM THE NMR FILE

    def superpose(NMR_file, AF2_file):
        parser = PDB.PDBParser(QUIET=True)
        ref_structure = parser.get_structure('NMR', NMR_file)
        mobile_structure = parser.get_structure('AF2', AF2_file)

        # Get the largest chains
        ref_chain = get_largest_chain(ref_structure)
        mobile_chain = get_largest_chain(mobile_structure)

        # Align the chains
        super_imposer = align_chains(ref_chain, mobile_chain)

        # Apply the transformation to all atoms in the mobile structure
        for model in mobile_structure:
            for chain in model:
                super_imposer.apply(chain.get_atoms())

        # Save the aligned structure
        io = PDB.PDBIO()
        io.set_structure(mobile_structure)
        aligned_AF2_file = AF2_file.replace('.pdb', '_aligned.pdb')
        io.save(aligned_AF2_file)
        os.system('mv ' + aligned_AF2_file + ' ' + AF2_file)

        # Update the AF2_file variable to point to the aligned file
        return AF2_file

    def trim(NMR_file):
        trim = pd.read_csv(data_source_file)
        well_defined_residues = ""
        try:
            well_defined_residues = trim.loc[trim['holo_pdb'] == holo]['Well_Defined_Residues'].values[0]
        except:
            print(holo + " not found in the data source file.")
        # Construct the full path to the pdb file
        pdb_path = NMR_file
        # Construct the full path to the output directory
        output_file = f'{holo_NMR_structure_dir}{pdb_path.split('/')[-1].split('.')[0]}_trim.pdb'
        # Move the pdb file to the output directory
        trim_pdb_by_residues(NMR_file, well_defined_residues, output_file)
        return output_file


    iRMS_scores = []
    LRMS_scores = []
    DockQ_scores = []
    for i, NMR_file in enumerate(NMR_files):
        iRMS_scores.append([])
        LRMS_scores.append([])
        DockQ_scores.append([])
        for AF2_file in AF2_files:
            # align_raw(NMR_file, AF2_file)
            # break
            try:
                AF2_file = superpose(NMR_file, AF2_file)
            except:
                try:
                    print("TRIMMING NMR FILE")
                    NMR_file = trim(NMR_file)
                    print("NMR_FILE TRIMMED")
                    print("New NMR_file = " + NMR_file)
                    print("ALIGNING")
                    AF2_file = superpose(NMR_file, AF2_file)
                except:
                    # try aligning the sequences then trim

                    print("Could not align or trim the structure.")
                    raise
            iRMS, LRMS, DockQ = compute_DockQ_score(AF2_file, NMR_file)
            iRMS_scores[i].append(iRMS)
            LRMS_scores[i].append(LRMS)
            DockQ_scores[i].append(DockQ)
    import matplotlib.pyplot as plt

    def plot_heatmap(data, title, xlabel, ylabel, output_file):
        """Plot a 2D heatmap from a 2D array."""
        plt.figure(figsize=(10, 8))
        sns.heatmap(data, annot=True, fmt=".2f", cmap="viridis")
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.savefig(output_file)
        # plt.show()
        plt.cla()
        plt.clf()
        plt.close()

    # Example usage:
    # plot_heatmap(iRMS_scores, 'iRMS Scores Heatmap', 'AF2 Files', 'NMR Files', f'./images/{holo}_iRMS_scores_heatmap.png')
    # plot_heatmap(LRMS_scores, 'LRMS Scores Heatmap', 'AF2 Files', 'NMR Files', f'./images/{holo}_LRMS_scores_heatmap.png')
    # plot_heatmap(DockQ_scores, 'DockQ Scores Heatmap', 'AF2 Files', 'NMR Files', f'./images/{holo}_DockQ_scores_heatmap.png')


    iRMS = np.mean(iRMS_scores)
    LRMS = np.mean(LRMS_scores)
    DockQ = np.mean(DockQ_scores)

    new_values = [iRMS, LRMS, DockQ]
    new_columns = ['AF2_iRMS_top_rank', 'AF2_LRMS_top_rank', 'AF2_DockQ_top_rank']

    print(new_values)
    update_row(data_source_file, apo.lower(), holo, new_values, new_columns)
    # raise
	from util import *
	from Bio import PDB
	import seaborn as sns

	data_source_file = './CSPRANK.csv'
	parsed_data = pd.read_csv(data_source_file)
	holo_NMR_structure_dir = './PDB_FILES/NMR_holo/'
	AF2_structure_dir = './PDB_FILES/AF2_holo/'


	def parse_ranges(ranges_str):
	"""Parse the input string to extract chains and residue ranges."""
	ranges = {}
	for part in ranges_str.split(', '):
	chain, start_end = part.split(":")
	start_res, end_res = map(int, start_end.split(".."))
	if chain not in ranges:
	ranges[chain] = []
	ranges[chain].append((start_res, end_res))
	return ranges

	def adjust_residue_numbers(ranges):
	"""Adjust residue numbers to start from 1 for each chain."""
	adjustment_maps = {}
	for chain, chain_ranges in ranges.items():
	adjustment_map = {}
	new_residue_num = 1
	for start_res, end_res in chain_ranges:
	for original_res_num in range(start_res, end_res + 1):
	adjustment_map[original_res_num] = new_residue_num
	new_residue_num += 1
	adjustment_maps[chain] = adjustment_map
	return adjustment_maps

	def trim_pdb_by_residues(pdb_file_path, ranges_str, new_file):
	"""Trim and reindex residues in a PDB file based on provided ranges string."""
	ranges = parse_ranges(ranges_str)
	adjustment_maps = adjust_residue_numbers(ranges)
	print(ranges)
	with open(pdb_file_path, 'r') as pdb_file, open(new_file, 'w') as output_file:
	for line in pdb_file:
	if line.startswith("ATOM") or line.startswith("HETATM"):
	chain_id = line[21]
	residue_num = int(line[22:26].strip())
	if chain_id in ranges:
	for start, end in ranges[chain_id]:
	if start <= residue_num <= end:
	# Adjust residue number
	adjusted_residue_num = adjustment_maps[chain_id][residue_num]
	# Rewrite line with adjusted residue number
	new_line = line[:22] + "{:>4}".format(adjusted_residue_num) + line[26:]
	output_file.write(new_line)
	break
	else:
	# Write lines for chains not in ranges as they are
	output_file.write(line)

	return new_file


	def align_raw(NMR_file, AF2_file):
	NMR_sequence = get_pdb_sequence(NMR_file)
	AF2_sequence = get_pdb_sequence(AF2_file)
	NMR_aligned, AF2_aligned = align(NMR_sequence, AF2_sequence)
	print(NMR_aligned)
	print(AF2_aligned)
	# raise

	# data_check = parsed_data['AF2_DockQ_top_rank']

	for i, holo_pdb in enumerate(parsed_data['holo_pdb']):

	# if str(data_check[i]) != 'nan':
	# continue

	apo = str(parsed_data['apo_bmrb'][i])
	holo = holo_pdb.lower()
	print(holo)

	if holo in ['2kne', '2lsk', '2m55', '2mps', '5j8h']:
	continue

	def get_largest_chain(structure):
	largest_chain = None
	max_residues = 0
	for model in structure:
	for chain in model:
	num_residues = len([residue for residue in chain if PDB.is_aa(residue)])
	if num_residues > max_residues:
	max_residues = num_residues
	largest_chain = chain
	return largest_chain

	def align_chains(ref_chain, mobile_chain):
	# Create a Superimposer object
	super_imposer = PDB.Superimposer()

	# Extract alpha carbon (CA) atoms from the reference chain, only from 'ATOM' lines
	ref_atoms = [residue['CA'] for residue in ref_chain if 'CA' in residue and residue.id[0] == ' ']

	# Extract alpha carbon (CA) atoms from the mobile chain, only from 'ATOM' lines
	mobile_atoms = [residue['CA'] for residue in mobile_chain if 'CA' in residue and residue.id[0] == ' ']

	# Print the number of CA atoms in the reference and mobile chains
	print("REF: ", len(ref_atoms))
	print("MOBILE: ", len(mobile_atoms))

	# Set the atoms to be used for alignment
	super_imposer.set_atoms(ref_atoms, mobile_atoms)

	# Return the Superimposer object
	return super_imposer

	# get top rank AF2 file
	AF2_files = [AF2_structure_dir + f for f in os.listdir(AF2_structure_dir) if f.find(holo+"_1") != -1]
	if len(AF2_files) == 0:
	print(f"Could not find AF2 file for {holo}")
	continue

	NMR_files = [holo_NMR_structure_dir + f for f in os.listdir(holo_NMR_structure_dir) if f.find(holo) != -1]
	if len(NMR_files) == 0:
	print(f"Could not find NMR file for {holo}")
	continue
	medoid_NMR_file = find_medoid_structure(NMR_files)

	# get medoid NMR file
	NMR_files = [medoid_NMR_file]

	# TRIM THE NMR FILE

	def superpose(NMR_file, AF2_file):
	parser = PDB.PDBParser(QUIET=True)
	ref_structure = parser.get_structure('NMR', NMR_file)
	mobile_structure = parser.get_structure('AF2', AF2_file)

	# Get the largest chains
	ref_chain = get_largest_chain(ref_structure)
	mobile_chain = get_largest_chain(mobile_structure)

	# Align the chains
	super_imposer = align_chains(ref_chain, mobile_chain)

	# Apply the transformation to all atoms in the mobile structure
	for model in mobile_structure:
	for chain in model:
	super_imposer.apply(chain.get_atoms())

	# Save the aligned structure
	io = PDB.PDBIO()
	io.set_structure(mobile_structure)
	aligned_AF2_file = AF2_file.replace('.pdb', '_aligned.pdb')
	io.save(aligned_AF2_file)
	os.system('mv ' + aligned_AF2_file + ' ' + AF2_file)

	# Update the AF2_file variable to point to the aligned file
	return AF2_file

	def trim(NMR_file):
	trim = pd.read_csv(data_source_file)
	well_defined_residues = ""
	try:
	well_defined_residues = trim.loc[trim['holo_pdb'] == holo]['Well_Defined_Residues'].values[0]
	except:
	print(holo + " not found in the data source file.")
	# Construct the full path to the pdb file
	pdb_path = NMR_file
	# Construct the full path to the output directory
	output_file = f'{holo_NMR_structure_dir}{pdb_path.split('/')[-1].split('.')[0]}_trim.pdb'
	# Move the pdb file to the output directory
	trim_pdb_by_residues(NMR_file, well_defined_residues, output_file)
	return output_file


	iRMS_scores = []
	LRMS_scores = []
	DockQ_scores = []
	for i, NMR_file in enumerate(NMR_files):
	iRMS_scores.append([])
	LRMS_scores.append([])
	DockQ_scores.append([])
	for AF2_file in AF2_files:
	# align_raw(NMR_file, AF2_file)
	# break
	try:
	AF2_file = superpose(NMR_file, AF2_file)
	except:
	try:
	print("TRIMMING NMR FILE")
	NMR_file = trim(NMR_file)
	print("NMR_FILE TRIMMED")
	print("New NMR_file = " + NMR_file)
	print("ALIGNING")
	AF2_file = superpose(NMR_file, AF2_file)
	except:
	# try aligning the sequences then trim

	print("Could not align or trim the structure.")
	raise
	iRMS, LRMS, DockQ = compute_DockQ_score(AF2_file, NMR_file)
	iRMS_scores[i].append(iRMS)
	LRMS_scores[i].append(LRMS)
	DockQ_scores[i].append(DockQ)
	import matplotlib.pyplot as plt

	def plot_heatmap(data, title, xlabel, ylabel, output_file):
	"""Plot a 2D heatmap from a 2D array."""
	plt.figure(figsize=(10, 8))
	sns.heatmap(data, annot=True, fmt=".2f", cmap="viridis")
	plt.title(title)
	plt.xlabel(xlabel)
	plt.ylabel(ylabel)
	plt.savefig(output_file)
	# plt.show()
	plt.cla()
	plt.clf()
	plt.close()

	# Example usage:
	# plot_heatmap(iRMS_scores, 'iRMS Scores Heatmap', 'AF2 Files', 'NMR Files', f'./images/{holo}_iRMS_scores_heatmap.png')
	# plot_heatmap(LRMS_scores, 'LRMS Scores Heatmap', 'AF2 Files', 'NMR Files', f'./images/{holo}_LRMS_scores_heatmap.png')
	# plot_heatmap(DockQ_scores, 'DockQ Scores Heatmap', 'AF2 Files', 'NMR Files', f'./images/{holo}_DockQ_scores_heatmap.png')


	iRMS = np.mean(iRMS_scores)
	LRMS = np.mean(LRMS_scores)
	DockQ = np.mean(DockQ_scores)

	new_values = [iRMS, LRMS, DockQ]
	new_columns = ['AF2_iRMS_top_rank', 'AF2_LRMS_top_rank', 'AF2_DockQ_top_rank']

	print(new_values)
	update_row(data_source_file, apo.lower(), holo, new_values, new_columns)
	# raise