Skip to content
Permalink
5ab4ed8e4a
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
293 lines (228 sloc) 10.3 KB
import argparse
import re
from Bio import PDB
import numpy as np
import os
from tqdm import tqdm
from os.path import exists
from os import listdir
from os.path import isdir
import pandas as pd
from paths import PDB_FILES, experimental_structures, computational_structures, CSP_Rank_Scores
def standardize(input_string):
if input_string[0].isdigit():
return input_string[1:] + input_string[0]
return input_string
def read_pdb_file(file_path):
chain_a_atoms = []
with open(file_path, 'r') as file:
for line in file:
if line.startswith("ATOM"):# or line.startswith("HETATM"):
atom_info = standardize(line[12:16].strip()) + line[17:26].strip()
chain_a_atoms.append(atom_info)
return chain_a_atoms
def find_exclusive_atoms(list1, list2):
exclusive_to_list1 = set(list1) - set(list2)
exclusive_to_list2 = set(list2) - set(list1)
return exclusive_to_list1, exclusive_to_list2
def remove_atoms_from_pdb(file_path, new_file_path, exclusive_atoms):
updated_lines = []
removed = 0
with open(file_path, 'r') as file:
for line in file:
if line.startswith("ATOM"):# or line.startswith("HETATM"):
atom_info = standardize(line[12:16].strip()) + line[17:26].strip()
#atom_info = line[13:26].strip() # Extracting the atom name and residue info
if atom_info not in exclusive_atoms:
updated_lines.append(line)
else:
removed += 1
else:
updated_lines.append(line)
# Saving the updated PDB file
with open(new_file_path, 'w') as file:
for line in updated_lines:
file.write(line)
def process(reference_pdb, target_pdb):
reference_atoms = read_pdb_file(reference_pdb)
target_atoms = read_pdb_file(target_pdb)
reference_exclusive, target_exclusive = find_exclusive_atoms(reference_atoms, target_atoms)
#print(target_exclusive)
#print(reference_exclusive)
return reference_exclusive, target_exclusive
def calculate_transformation(reference_structure, target_structure, residue_ranges):
"""
Calculate the transformation (rotation and translation) needed to superimpose target_structure onto reference_structure.
This function now uses a list of residue ranges for superposition.
Args:
reference_structure (Structure): The reference structure (Biopython Structure object).
target_structure (Structure): The target structure to be superimposed on the reference (Biopython Structure object).
residue_ranges (list of tuples): Each tuple contains (chain ID, min residue index, max residue index).
Returns:
tuple: A tuple containing the rotation matrix and translation vector.
"""
ref_atoms = []
target_atoms = []
# Process each chain and residue range
for chain_id, min_res, max_res in residue_ranges:
ref_atoms_new = [atom.get_coord() for residue in reference_structure[0][chain_id]
for atom in residue if min_res <= residue.get_id()[1] <= max_res]
ref_atoms.extend(ref_atoms_new)
target_atoms_new = [atom.get_coord() for residue in target_structure[0][chain_id]
for atom in residue if min_res <= residue.get_id()[1] <= max_res]
target_atoms.extend(target_atoms_new)
ref_atoms = np.array(ref_atoms)
target_atoms = np.array(target_atoms)
ref_center = np.mean(ref_atoms, axis=0)
target_center = np.mean(target_atoms, axis=0)
ref_atoms -= ref_center
target_atoms -= target_center
correlation_matrix = np.dot(np.transpose(target_atoms), ref_atoms)
u, s, vh = np.linalg.svd(correlation_matrix)
rotation = np.dot(u, vh)
translation = ref_center - np.dot(target_center, rotation)
return rotation, translation
def apply_transformation(structure, rotation, translation):
for atom in structure.get_atoms():
atom.coord = np.dot(atom.coord, rotation) + translation
return structure
def find_atoms_to_remove(reference_pdb_file, target_pdb_files):
atoms_remove = []
for filename in tqdm(target_pdb_files):
if exists(filename) and filename.endswith(".pdb"):
new_reference_atoms, new_target_atoms = process(reference_pdb_file, filename)
atom_len_pre = len(atoms_remove)
for ref in new_reference_atoms:
if ref not in atoms_remove:
atoms_remove.append(ref)
for tar in new_target_atoms:
if tar not in atoms_remove:
atoms_remove.append(tar)
atom_len_post = len(atoms_remove)
if atom_len_post - atom_len_pre > 80:
print(reference_pdb_file)
print(filename)
print(atoms_remove)
raise
return atoms_remove
def continue_prompt():
while True:
user_input = input("Do you want to continue? (y/n): ").lower()
if user_input in ['n', 'no']:
print("Terminating script.")
exit()
elif user_input in ['y', 'yes']:
print("Continuing execution.")
break
else:
print("Invalid input. Please enter 'y' for yes or 'n' for no.")
parser = argparse.ArgumentParser(description="Transform PDB files.")
parser.add_argument("-pdb", help="PDB ID", required=True)
parser.add_argument("-c", help="Chain ID", required=True)
parser.add_argument("-minr", help="min res ID", required=False)
parser.add_argument("-maxr", help="max res ID", required=False)
args = parser.parse_args()
pdb_id = args.pdb
chain_ids = [ e.strip() for e in args.c.split(',') ]
min_ress = [1]
try:
min_ress = [ int(e.strip()) for e in args.minr.split(',') ]
except:
min_ress = [1]
max_ress = [1000]
try:
max_ress = [ int(e.strip()) for e in args.maxr.split(',') ]
except:
max_ress = [1000]
if len(chain_ids) != len(min_ress) or len(chain_ids) != len(max_ress):
print("chain ids = " + str(chain_ids))
print("min residues = " + str(min_ress))
print("max residues = " + str(max_ress))
print("number of elements in chains, minr, and maxr must be the same.")
residue_ranges = []
for i in range(0, len(chain_ids)):
residue_ranges.append((chain_ids[i], min_ress[i], max_ress[i]))
csv_file = CSP_Rank_Scores + 'CSP_'+pdb_id+'_CSpred.csv'
if not(exists(csv_file)):
csv_file = CSP_Rank_Scores + 'CSP_'+pdb_id+'.csv'
csv_df = None
if exists(csv_file):
csv_df = pd.read_csv(csv_file)
cwd = os.getcwd() + '/'
def get_ref_structure_file(df, pdb_id = None):
if exists(experimental_structures +'exp_'+pdb_id+'.pdb'):
return experimental_structures +'exp_'+pdb_id+'.pdb'
else:
ref = ''
while not(exists(ref)):
ref = input("Set reference structure file")
return ref
source_dir = PDB_FILES + pdb_id.upper() + '/'
dest_dir = PDB_FILES + pdb_id.upper() + '_aligned/'
alt_source_dir = PDB_FILES + pdb_id.upper() + '_alt/'
alt_dest_dir = PDB_FILES + pdb_id.upper() + '_aligned/'
#reference_structure_file = "/home/tiburon/Desktop/ROT4/AFSample/experimental_structures/exp_"+pdb_id+'.pdb'
reference_structure_file = get_ref_structure_file(csv_df, pdb_id = pdb_id)
print("REFERENCE STRUCTURE FILE = " + reference_structure_file)
# TODO: CHANGE RESIDUE RANGES TO EXPRESS ORDERED RESIDUES TO BE USED FOR SUPERPOSITION
target_pdb_files = []
target_pdb_files_new = []
if exists(source_dir):
if not(exists(dest_dir)):
os.system('mkdir ' + dest_dir)
target_pdb_files = [ cwd + source_dir + f for f in listdir(source_dir) if f.endswith('.pdb') and f.find('filtered') == -1]
target_pdb_files_new = [ dest_dir + f for f in listdir(source_dir) if f.endswith('.pdb') and f.find('filtered') == -1]
if isdir(alt_source_dir):
if not(exists(alt_dest_dir)):
os.system('mkdir ' + alt_dest_dir)
for file in listdir(alt_source_dir):
target_pdb_files.append(cwd + alt_source_dir + file)
target_pdb_files_new.append(alt_dest_dir+file)
experiment_dir = experimental_structures + pdb_id + '/'
if exists(experiment_dir) and isdir(experiment_dir):
for f in listdir(experiment_dir):
if f.endswith('.pdb'):
target_pdb_files.append(cwd + experiment_dir + f)
target_pdb_files_new.append(experiment_dir + f)
experiment_file = experimental_structures + 'exp_'+pdb_id+'.pdb'
if exists(experiment_file):
target_pdb_files.append(cwd + experiment_file)
target_pdb_files_new.append(experiment_file)
compute_file = computational_structures + 'comp_'+pdb_id+'.pdb'
if exists(compute_file):
target_pdb_files.append(cwd + compute_file)
target_pdb_files_new.append(compute_file)
if not os.path.exists(dest_dir):
os.mkdir(dest_dir)
atoms_remove = find_atoms_to_remove(reference_structure_file, target_pdb_files)
print("THESE ATOMS WILL BE REMOVED FROM ALL FILES: ")
print(atoms_remove)
# TODO: DO NOT CONTINUE IF THERE ARE LOTS OF ATOMS TO BE REMOVED, THIS MAY REQUIRE REFORMATTING THE INPUT FILES.
continue_prompt()
print("REMOVING EXCLUSIVE ATOMS.")
remove_atoms_from_pdb(reference_structure_file, reference_structure_file, atoms_remove)
for i,f in enumerate(target_pdb_files):
remove_atoms_from_pdb(f, target_pdb_files_new[i], atoms_remove)
parser = PDB.PDBParser(QUIET=True)
reference_structure = parser.get_structure('reference.pdb', reference_structure_file)
print("SUPERPOSING STRUCTURES")
i = 0
for filename in tqdm(target_pdb_files_new):
if exists(filename) and filename.endswith(".pdb"):
reference_atoms, target_atoms = process(reference_structure_file, filename)
if len(reference_atoms) > 0 or len(target_atoms) > 0:
print("still have exclusive atoms = ")
print("reference = " + str(reference_atoms))
print("target = " + str(target_atoms))
raise
structure = parser.get_structure(filename[filename.rfind('/')+1:], filename)
rotation = None
translation = None
rotation, translation = calculate_transformation(reference_structure, structure, residue_ranges)
structure = apply_transformation(structure, rotation, translation)
io = PDB.PDBIO()
io.set_structure(structure)
io.save(target_pdb_files_new[i])
i += 1
print('python3 add_aligned_suffix_csv.py ' + pdb_id)
os.system('python3 add_aligned_suffix_csv.py ' + pdb_id)