Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
from util import *
from Bio import PDB
import seaborn as sns
data_source_file = './CSPRANK.csv'
parsed_data = pd.read_csv(data_source_file)
holo_NMR_structure_dir = './PDB_FILES/NMR_holo/'
AF2_structure_dir = './PDB_FILES/AF3_holo/'
import pandas as pd
import subprocess
import re
# Function to call TM-align and return the TM-score
def get_tm_score(pdb_path1, pdb_path2):
try:
# Call TM-align with the two PDB paths
result = subprocess.run(['./MMalign', pdb_path1, pdb_path2], capture_output=True, text=True)
output = result.stdout
# Extract TM-score from the output
# This regex looks for TM-score in the output assuming it's in the format "TM-score= X.XXXX"
match = re.search(r"TM-score\s*=\s*(\d.\d+)", output)
if match:
return float(match.group(1))
else:
print("TM-score not found in the output.")
return None
except Exception as e:
print(f"Error calculating TM-score for {pdb_path1} and {pdb_path2}: {e}")
return None
def parse_ranges(ranges_str):
"""Parse the input string to extract chains and residue ranges."""
ranges = {}
for part in ranges_str.split(', '):
chain, start_end = part.split(":")
start_res, end_res = map(int, start_end.split(".."))
if chain not in ranges:
ranges[chain] = []
ranges[chain].append((start_res, end_res))
return ranges
def adjust_residue_numbers(ranges):
"""Adjust residue numbers to start from 1 for each chain."""
adjustment_maps = {}
for chain, chain_ranges in ranges.items():
adjustment_map = {}
new_residue_num = 1
for start_res, end_res in chain_ranges:
for original_res_num in range(start_res, end_res + 1):
adjustment_map[original_res_num] = new_residue_num
new_residue_num += 1
adjustment_maps[chain] = adjustment_map
return adjustment_maps
def trim_pdb_by_residues(pdb_file_path, ranges_str, new_file):
"""Trim and reindex residues in a PDB file based on provided ranges string."""
ranges = parse_ranges(ranges_str)
adjustment_maps = adjust_residue_numbers(ranges)
print(ranges)
with open(pdb_file_path, 'r') as pdb_file, open(new_file, 'w') as output_file:
for line in pdb_file:
if line.startswith("ATOM") or line.startswith("HETATM"):
chain_id = line[21]
residue_num = int(line[22:26].strip())
if chain_id in ranges:
for start, end in ranges[chain_id]:
if start <= residue_num <= end:
# Adjust residue number
adjusted_residue_num = adjustment_maps[chain_id][residue_num]
# Rewrite line with adjusted residue number
new_line = line[:22] + "{:>4}".format(adjusted_residue_num) + line[26:]
output_file.write(new_line)
break
else:
# Write lines for chains not in ranges as they are
output_file.write(line)
return new_file
def align_raw(NMR_file, AF2_file):
NMR_sequence = get_pdb_sequence(NMR_file)
AF2_sequence = get_pdb_sequence(AF2_file)
NMR_aligned, AF2_aligned = align(NMR_sequence, AF2_sequence)
print(NMR_aligned)
print(AF2_aligned)
# raise
for i, holo_pdb in enumerate(parsed_data['holo_pdb']):
apo = str(parsed_data['apo_bmrb'][i])
holo = holo_pdb.lower()
# if holo != '2jw1':
# continue
print(holo)
if holo in ['2kne', '2lsk', '2m55', '2mps', '5j8h']:
continue
def get_largest_chain(structure):
largest_chain = None
max_residues = 0
for model in structure:
for chain in model:
num_residues = len([residue for residue in chain if PDB.is_aa(residue)])
if num_residues > max_residues:
max_residues = num_residues
largest_chain = chain
return largest_chain
def align_chains(ref_chain, mobile_chain):
# Create a Superimposer object
super_imposer = PDB.Superimposer()
# Extract alpha carbon (CA) atoms from the reference chain, only from 'ATOM' lines
ref_atoms = [residue['CA'] for residue in ref_chain if 'CA' in residue and residue.id[0] == ' ']
# Extract alpha carbon (CA) atoms from the mobile chain, only from 'ATOM' lines
mobile_atoms = [residue['CA'] for residue in mobile_chain if 'CA' in residue and residue.id[0] == ' ']
# Print the number of CA atoms in the reference and mobile chains
# print("REF: ", len(ref_atoms))
# print("MOBILE: ", len(mobile_atoms))
# Set the atoms to be used for alignment
super_imposer.set_atoms(ref_atoms, mobile_atoms)
# Return the Superimposer object
return super_imposer
AF2_files = [AF2_structure_dir + f for f in os.listdir(AF2_structure_dir) if f.find(holo) != -1]
if len(AF2_files) == 0:
print(f"Could not find AF3 file for {holo}")
continue
NMR_files = [holo_NMR_structure_dir + f for f in os.listdir(holo_NMR_structure_dir) if f.find(holo) != -1]
if len(NMR_files) == 0:
print(f"Could not find NMR file for {holo}")
continue
ordered_NMR_files = ['' for i in range(len(NMR_files))]
for NMR_file in NMR_files:
index = int(NMR_file.split('_')[-1].split('.')[0])-1
ordered_NMR_files[index] = NMR_file
NMR_files = ordered_NMR_files
ordered_AF2_files = ['' for i in range(len(AF2_files))]
for AF2_file in AF2_files:
index = int(AF2_file.split('_')[-1].split('.')[0])-1
ordered_AF2_files[index] = AF2_file
AF2_files = ordered_AF2_files
# TRIM THE NMR FILE
def superpose(NMR_file, AF2_file):
parser = PDB.PDBParser(QUIET=True)
ref_structure = parser.get_structure('NMR', NMR_file)
mobile_structure = parser.get_structure('AF2', AF2_file)
# Get the largest chains
ref_chain = get_largest_chain(ref_structure)
mobile_chain = get_largest_chain(mobile_structure)
# Align the chains
super_imposer = align_chains(ref_chain, mobile_chain)
# Apply the transformation to all atoms in the mobile structure
for model in mobile_structure:
for chain in model:
super_imposer.apply(chain.get_atoms())
# Save the aligned structure
io = PDB.PDBIO()
io.set_structure(mobile_structure)
aligned_AF2_file = AF2_file.replace('.pdb', '_aligned.pdb')
io.save(aligned_AF2_file)
os.system('mv ' + aligned_AF2_file + ' ' + AF2_file)
# Update the AF2_file variable to point to the aligned file
return AF2_file
def trim(NMR_file):
trim = pd.read_csv(data_source_file)
well_defined_residues = ""
try:
well_defined_residues = trim.loc[trim['holo_pdb'] == holo]['Well_Defined_Residues'].values[0]
except:
print(holo + " not found in the data source file.")
# Construct the full path to the pdb file
pdb_path = NMR_file
# Construct the full path to the output directory
output_file = f'{holo_NMR_structure_dir}{pdb_path.split('/')[-1].split('.')[0]}_trim.pdb'
# Move the pdb file to the output directory
trim_pdb_by_residues(NMR_file, well_defined_residues, output_file)
return output_file
tm_scores = []
for i, NMR_file in enumerate(NMR_files):
tm_scores.append([])
for AF2_file in AF2_files:
# align_raw(NMR_file, AF2_file)
# break
try:
AF2_file = superpose(NMR_file, AF2_file)
except:
try:
print("TRIMMING NMR FILE")
NMR_file = trim(NMR_file)
print("NMR_FILE TRIMMED")
print("New NMR_file = " + NMR_file)
print("ALIGNING")
AF2_file = superpose(NMR_file, AF2_file)
except:
# try aligning the sequences then trim
print("Could not align or trim the structure.")
raise
tm_score = get_tm_score(AF2_file, NMR_file)
tm_scores[i].append(tm_score)
import matplotlib.pyplot as plt
def plot_heatmap(data, title, xlabel, ylabel, output_file):
"""Plot a 2D heatmap from a 2D array."""
plt.figure(figsize=(10, 8))
sns.heatmap(data, annot=True, fmt=".2f", cmap="viridis")
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.savefig(output_file)
# plt.show()
plt.cla()
plt.clf()
plt.close()
# Example usage:
plot_heatmap(tm_scores, 'TM Scores Heatmap', 'AF3 Files', 'NMR Files', f'./images/{holo}_TM_scores_heatmap.png')
TM = np.mean(tm_scores)
new_values = [TM]
new_columns = ['AF3_TM']
print(new_values)
update_row(data_source_file, apo.lower(), holo, new_values, new_columns)
# raise