transform.py

import argparse
import re
from Bio import PDB
import numpy as np
import os
from tqdm import tqdm
from os.path import exists
from os import listdir
from os.path import isdir
import pandas as pd
from paths import PDB_FILES, experimental_structures, computational_structures, CSP_Rank_Scores

def standardize(input_string):
    if input_string[0].isdigit():
        return input_string[1:] + input_string[0]
    return input_string

def read_pdb_file(file_path):
    chain_a_atoms = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("ATOM"):# or line.startswith("HETATM"):
                atom_info = standardize(line[12:16].strip()) + line[17:26].strip()
                chain_a_atoms.append(atom_info)
    return chain_a_atoms


def find_exclusive_atoms(list1, list2):
    exclusive_to_list1 = set(list1) - set(list2)
    exclusive_to_list2 = set(list2) - set(list1)
    return exclusive_to_list1, exclusive_to_list2


def remove_atoms_from_pdb(file_path, new_file_path, exclusive_atoms):
    updated_lines = []
    removed = 0
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("ATOM"):# or line.startswith("HETATM"):
                atom_info = standardize(line[12:16].strip()) + line[17:26].strip()
                #atom_info = line[13:26].strip()  # Extracting the atom name and residue info
                if atom_info not in exclusive_atoms:
                    updated_lines.append(line)
                else:
                    removed += 1
            else:
                updated_lines.append(line)
    # Saving the updated PDB file
    with open(new_file_path, 'w') as file:
        for line in updated_lines:
            file.write(line)


def process(reference_pdb, target_pdb):
    reference_atoms = read_pdb_file(reference_pdb)
    target_atoms = read_pdb_file(target_pdb)
    reference_exclusive, target_exclusive = find_exclusive_atoms(reference_atoms, target_atoms)
    #print(target_exclusive)
    #print(reference_exclusive)
    return reference_exclusive, target_exclusive


def calculate_transformation(reference_structure, target_structure, residue_ranges):
    """
    Calculate the transformation (rotation and translation) needed to superimpose target_structure onto reference_structure.

    This function now uses a list of residue ranges for superposition.

    Args:
    reference_structure (Structure): The reference structure (Biopython Structure object).
    target_structure (Structure): The target structure to be superimposed on the reference (Biopython Structure object).
    residue_ranges (list of tuples): Each tuple contains (chain ID, min residue index, max residue index).

    Returns:
    tuple: A tuple containing the rotation matrix and translation vector.
    """
    ref_atoms = []
    target_atoms = []

    # Process each chain and residue range
    for chain_id, min_res, max_res in residue_ranges:
        ref_atoms_new = [atom.get_coord() for residue in reference_structure[0][chain_id]
                         for atom in residue if min_res <= residue.get_id()[1] <= max_res]
        ref_atoms.extend(ref_atoms_new)

        target_atoms_new = [atom.get_coord() for residue in target_structure[0][chain_id]
                            for atom in residue if min_res <= residue.get_id()[1] <= max_res]
        target_atoms.extend(target_atoms_new)

    ref_atoms = np.array(ref_atoms)
    target_atoms = np.array(target_atoms)

    ref_center = np.mean(ref_atoms, axis=0)
    target_center = np.mean(target_atoms, axis=0)

    ref_atoms -= ref_center
    target_atoms -= target_center

    correlation_matrix = np.dot(np.transpose(target_atoms), ref_atoms)
    u, s, vh = np.linalg.svd(correlation_matrix)
    rotation = np.dot(u, vh)
    translation = ref_center - np.dot(target_center, rotation)

    return rotation, translation

def apply_transformation(structure, rotation, translation):
    for atom in structure.get_atoms():
        atom.coord = np.dot(atom.coord, rotation) + translation

    return structure

def find_atoms_to_remove(reference_pdb_file, target_pdb_files):

    atoms_remove = []

    for filename in tqdm(target_pdb_files):
        if exists(filename) and filename.endswith(".pdb"):

            new_reference_atoms, new_target_atoms = process(reference_pdb_file, filename)


            atom_len_pre = len(atoms_remove)

            for ref in new_reference_atoms:
                if ref not in atoms_remove:
                    atoms_remove.append(ref)

            for tar in new_target_atoms:
                if tar not in atoms_remove:
                    atoms_remove.append(tar)

            atom_len_post = len(atoms_remove)
            if atom_len_post - atom_len_pre > 80:
                print(reference_pdb_file)
                print(filename)
                print(atoms_remove)
                raise


    return atoms_remove

def continue_prompt():
    while True:
        user_input = input("Do you want to continue? (y/n): ").lower()
        if user_input in ['n', 'no']:
            print("Terminating script.")
            exit()
        elif user_input in ['y', 'yes']:
            print("Continuing execution.")
            break
        else:
            print("Invalid input. Please enter 'y' for yes or 'n' for no.")

parser = argparse.ArgumentParser(description="Transform PDB files.")
parser.add_argument("-pdb", help="PDB ID", required=True)
parser.add_argument("-c", help="Chain ID", required=True)
parser.add_argument("-minr", help="min res ID", required=False)
parser.add_argument("-maxr", help="max res ID", required=False)

args = parser.parse_args()

pdb_id = args.pdb
chain_ids = [ e.strip() for e in args.c.split(',') ]
min_ress = [1]
try:
    min_ress = [ int(e.strip()) for e in args.minr.split(',') ]
except:
    min_ress = [1]
max_ress = [1000]
try:
    max_ress = [ int(e.strip()) for e in args.maxr.split(',') ]
except:
    max_ress = [1000]

if len(chain_ids) != len(min_ress) or len(chain_ids) != len(max_ress):
    print("chain ids = " + str(chain_ids))
    print("min residues = " + str(min_ress))
    print("max residues = " + str(max_ress))
    print("number of elements in chains, minr, and maxr must be the same.")

residue_ranges = []
for i in range(0, len(chain_ids)):
    residue_ranges.append((chain_ids[i], min_ress[i], max_ress[i]))

csv_file = CSP_Rank_Scores + 'CSP_'+pdb_id+'_CSpred.csv'
if not(exists(csv_file)):
    csv_file = CSP_Rank_Scores + 'CSP_'+pdb_id+'.csv'
csv_df = None
if exists(csv_file):
    csv_df = pd.read_csv(csv_file)
cwd = os.getcwd() + '/'

def get_ref_structure_file(df, pdb_id = None):
    if exists(experimental_structures +'exp_'+pdb_id+'.pdb'):
        return experimental_structures +'exp_'+pdb_id+'.pdb'
    else:
        ref = ''
        while not(exists(ref)):
            ref = input("Set reference structure file")
        return ref

source_dir = PDB_FILES + pdb_id.upper() + '/'
dest_dir = PDB_FILES + pdb_id.upper() + '_aligned/'
alt_source_dir = PDB_FILES + pdb_id.upper() + '_alt/'
alt_dest_dir = PDB_FILES + pdb_id.upper() + '_aligned/'


#reference_structure_file = "/home/tiburon/Desktop/ROT4/AFSample/experimental_structures/exp_"+pdb_id+'.pdb'
reference_structure_file = get_ref_structure_file(csv_df, pdb_id = pdb_id)
print("REFERENCE STRUCTURE FILE = " + reference_structure_file)

# TODO: CHANGE RESIDUE RANGES TO EXPRESS ORDERED RESIDUES TO BE USED FOR SUPERPOSITION
target_pdb_files = []
target_pdb_files_new = []
if exists(source_dir):
    if not(exists(dest_dir)):
        os.system('mkdir ' + dest_dir)
    target_pdb_files =     [ cwd + source_dir + f for f in listdir(source_dir) if f.endswith('.pdb') and f.find('filtered') == -1]
    target_pdb_files_new = [ dest_dir   + f for f in listdir(source_dir) if f.endswith('.pdb') and f.find('filtered') == -1]

if isdir(alt_source_dir):
    if not(exists(alt_dest_dir)):
        os.system('mkdir ' + alt_dest_dir)
    for file in listdir(alt_source_dir):
        target_pdb_files.append(cwd + alt_source_dir + file)
        target_pdb_files_new.append(alt_dest_dir+file)

experiment_dir = experimental_structures + pdb_id + '/'
if exists(experiment_dir) and isdir(experiment_dir):
    for f in listdir(experiment_dir):
        if f.endswith('.pdb'):
            target_pdb_files.append(cwd + experiment_dir + f)
            target_pdb_files_new.append(experiment_dir + f)

experiment_file = experimental_structures + 'exp_'+pdb_id+'.pdb'
if exists(experiment_file):
    target_pdb_files.append(cwd + experiment_file)
    target_pdb_files_new.append(experiment_file)

compute_file = computational_structures + 'comp_'+pdb_id+'.pdb'
if exists(compute_file):
    target_pdb_files.append(cwd + compute_file)
    target_pdb_files_new.append(compute_file)

if not os.path.exists(dest_dir):
    os.mkdir(dest_dir)

atoms_remove = find_atoms_to_remove(reference_structure_file, target_pdb_files)

print("THESE ATOMS WILL BE REMOVED FROM ALL FILES: ")
print(atoms_remove)

# TODO: DO NOT CONTINUE IF THERE ARE LOTS OF ATOMS TO BE REMOVED, THIS MAY REQUIRE REFORMATTING THE INPUT FILES.
continue_prompt()

print("REMOVING EXCLUSIVE ATOMS.")
remove_atoms_from_pdb(reference_structure_file, reference_structure_file, atoms_remove)
for i,f in enumerate(target_pdb_files):
    remove_atoms_from_pdb(f, target_pdb_files_new[i], atoms_remove)

parser = PDB.PDBParser(QUIET=True)
reference_structure = parser.get_structure('reference.pdb', reference_structure_file)

print("SUPERPOSING STRUCTURES")
i = 0
for filename in tqdm(target_pdb_files_new):
    if exists(filename) and filename.endswith(".pdb"):

        reference_atoms, target_atoms = process(reference_structure_file, filename)

        if len(reference_atoms) > 0 or len(target_atoms) > 0:
            print("still have exclusive atoms = ")
            print("reference = " + str(reference_atoms))
            print("target = " + str(target_atoms))
            raise

        structure = parser.get_structure(filename[filename.rfind('/')+1:], filename)

        rotation = None
        translation = None

        rotation, translation = calculate_transformation(reference_structure, structure, residue_ranges)

        structure = apply_transformation(structure, rotation, translation)

        io = PDB.PDBIO()
        io.set_structure(structure)
        io.save(target_pdb_files_new[i])
    i += 1

print('python3 add_aligned_suffix_csv.py ' + pdb_id)
os.system('python3 add_aligned_suffix_csv.py ' + pdb_id)
	import argparse
	import re
	from Bio import PDB
	import numpy as np
	import os
	from tqdm import tqdm
	from os.path import exists
	from os import listdir
	from os.path import isdir
	import pandas as pd
	from paths import PDB_FILES, experimental_structures, computational_structures, CSP_Rank_Scores

	def standardize(input_string):
	if input_string[0].isdigit():
	return input_string[1:] + input_string[0]
	return input_string

	def read_pdb_file(file_path):
	chain_a_atoms = []
	with open(file_path, 'r') as file:
	for line in file:
	if line.startswith("ATOM"):# or line.startswith("HETATM"):
	atom_info = standardize(line[12:16].strip()) + line[17:26].strip()
	chain_a_atoms.append(atom_info)
	return chain_a_atoms


	def find_exclusive_atoms(list1, list2):
	exclusive_to_list1 = set(list1) - set(list2)
	exclusive_to_list2 = set(list2) - set(list1)
	return exclusive_to_list1, exclusive_to_list2


	def remove_atoms_from_pdb(file_path, new_file_path, exclusive_atoms):
	updated_lines = []
	removed = 0
	with open(file_path, 'r') as file:
	for line in file:
	if line.startswith("ATOM"):# or line.startswith("HETATM"):
	atom_info = standardize(line[12:16].strip()) + line[17:26].strip()
	#atom_info = line[13:26].strip() # Extracting the atom name and residue info
	if atom_info not in exclusive_atoms:
	updated_lines.append(line)
	else:
	removed += 1
	else:
	updated_lines.append(line)
	# Saving the updated PDB file
	with open(new_file_path, 'w') as file:
	for line in updated_lines:
	file.write(line)


	def process(reference_pdb, target_pdb):
	reference_atoms = read_pdb_file(reference_pdb)
	target_atoms = read_pdb_file(target_pdb)
	reference_exclusive, target_exclusive = find_exclusive_atoms(reference_atoms, target_atoms)
	#print(target_exclusive)
	#print(reference_exclusive)
	return reference_exclusive, target_exclusive


	def calculate_transformation(reference_structure, target_structure, residue_ranges):
	"""
	Calculate the transformation (rotation and translation) needed to superimpose target_structure onto reference_structure.

	This function now uses a list of residue ranges for superposition.

	Args:
	reference_structure (Structure): The reference structure (Biopython Structure object).
	target_structure (Structure): The target structure to be superimposed on the reference (Biopython Structure object).
	residue_ranges (list of tuples): Each tuple contains (chain ID, min residue index, max residue index).

	Returns:
	tuple: A tuple containing the rotation matrix and translation vector.
	"""
	ref_atoms = []
	target_atoms = []

	# Process each chain and residue range
	for chain_id, min_res, max_res in residue_ranges:
	ref_atoms_new = [atom.get_coord() for residue in reference_structure[0][chain_id]
	for atom in residue if min_res <= residue.get_id()[1] <= max_res]
	ref_atoms.extend(ref_atoms_new)

	target_atoms_new = [atom.get_coord() for residue in target_structure[0][chain_id]
	for atom in residue if min_res <= residue.get_id()[1] <= max_res]
	target_atoms.extend(target_atoms_new)

	ref_atoms = np.array(ref_atoms)
	target_atoms = np.array(target_atoms)

	ref_center = np.mean(ref_atoms, axis=0)
	target_center = np.mean(target_atoms, axis=0)

	ref_atoms -= ref_center
	target_atoms -= target_center

	correlation_matrix = np.dot(np.transpose(target_atoms), ref_atoms)
	u, s, vh = np.linalg.svd(correlation_matrix)
	rotation = np.dot(u, vh)
	translation = ref_center - np.dot(target_center, rotation)

	return rotation, translation

	def apply_transformation(structure, rotation, translation):
	for atom in structure.get_atoms():
	atom.coord = np.dot(atom.coord, rotation) + translation

	return structure

	def find_atoms_to_remove(reference_pdb_file, target_pdb_files):

	atoms_remove = []

	for filename in tqdm(target_pdb_files):
	if exists(filename) and filename.endswith(".pdb"):

	new_reference_atoms, new_target_atoms = process(reference_pdb_file, filename)


	atom_len_pre = len(atoms_remove)

	for ref in new_reference_atoms:
	if ref not in atoms_remove:
	atoms_remove.append(ref)

	for tar in new_target_atoms:
	if tar not in atoms_remove:
	atoms_remove.append(tar)

	atom_len_post = len(atoms_remove)
	if atom_len_post - atom_len_pre > 80:
	print(reference_pdb_file)
	print(filename)
	print(atoms_remove)
	raise


	return atoms_remove

	def continue_prompt():
	while True:
	user_input = input("Do you want to continue? (y/n): ").lower()
	if user_input in ['n', 'no']:
	print("Terminating script.")
	exit()
	elif user_input in ['y', 'yes']:
	print("Continuing execution.")
	break
	else:
	print("Invalid input. Please enter 'y' for yes or 'n' for no.")

	parser = argparse.ArgumentParser(description="Transform PDB files.")
	parser.add_argument("-pdb", help="PDB ID", required=True)
	parser.add_argument("-c", help="Chain ID", required=True)
	parser.add_argument("-minr", help="min res ID", required=False)
	parser.add_argument("-maxr", help="max res ID", required=False)

	args = parser.parse_args()

	pdb_id = args.pdb
	chain_ids = [ e.strip() for e in args.c.split(',') ]
	min_ress = [1]
	try:
	min_ress = [ int(e.strip()) for e in args.minr.split(',') ]
	except:
	min_ress = [1]
	max_ress = [1000]
	try:
	max_ress = [ int(e.strip()) for e in args.maxr.split(',') ]
	except:
	max_ress = [1000]

	if len(chain_ids) != len(min_ress) or len(chain_ids) != len(max_ress):
	print("chain ids = " + str(chain_ids))
	print("min residues = " + str(min_ress))
	print("max residues = " + str(max_ress))
	print("number of elements in chains, minr, and maxr must be the same.")

	residue_ranges = []
	for i in range(0, len(chain_ids)):
	residue_ranges.append((chain_ids[i], min_ress[i], max_ress[i]))

	csv_file = CSP_Rank_Scores + 'CSP_'+pdb_id+'_CSpred.csv'
	if not(exists(csv_file)):
	csv_file = CSP_Rank_Scores + 'CSP_'+pdb_id+'.csv'
	csv_df = None
	if exists(csv_file):
	csv_df = pd.read_csv(csv_file)
	cwd = os.getcwd() + '/'

	def get_ref_structure_file(df, pdb_id = None):
	if exists(experimental_structures +'exp_'+pdb_id+'.pdb'):
	return experimental_structures +'exp_'+pdb_id+'.pdb'
	else:
	ref = ''
	while not(exists(ref)):
	ref = input("Set reference structure file")
	return ref

	source_dir = PDB_FILES + pdb_id.upper() + '/'
	dest_dir = PDB_FILES + pdb_id.upper() + '_aligned/'
	alt_source_dir = PDB_FILES + pdb_id.upper() + '_alt/'
	alt_dest_dir = PDB_FILES + pdb_id.upper() + '_aligned/'


	#reference_structure_file = "/home/tiburon/Desktop/ROT4/AFSample/experimental_structures/exp_"+pdb_id+'.pdb'
	reference_structure_file = get_ref_structure_file(csv_df, pdb_id = pdb_id)
	print("REFERENCE STRUCTURE FILE = " + reference_structure_file)

	# TODO: CHANGE RESIDUE RANGES TO EXPRESS ORDERED RESIDUES TO BE USED FOR SUPERPOSITION
	target_pdb_files = []
	target_pdb_files_new = []
	if exists(source_dir):
	if not(exists(dest_dir)):
	os.system('mkdir ' + dest_dir)
	target_pdb_files = [ cwd + source_dir + f for f in listdir(source_dir) if f.endswith('.pdb') and f.find('filtered') == -1]
	target_pdb_files_new = [ dest_dir + f for f in listdir(source_dir) if f.endswith('.pdb') and f.find('filtered') == -1]

	if isdir(alt_source_dir):
	if not(exists(alt_dest_dir)):
	os.system('mkdir ' + alt_dest_dir)
	for file in listdir(alt_source_dir):
	target_pdb_files.append(cwd + alt_source_dir + file)
	target_pdb_files_new.append(alt_dest_dir+file)

	experiment_dir = experimental_structures + pdb_id + '/'
	if exists(experiment_dir) and isdir(experiment_dir):
	for f in listdir(experiment_dir):
	if f.endswith('.pdb'):
	target_pdb_files.append(cwd + experiment_dir + f)
	target_pdb_files_new.append(experiment_dir + f)

	experiment_file = experimental_structures + 'exp_'+pdb_id+'.pdb'
	if exists(experiment_file):
	target_pdb_files.append(cwd + experiment_file)
	target_pdb_files_new.append(experiment_file)

	compute_file = computational_structures + 'comp_'+pdb_id+'.pdb'
	if exists(compute_file):
	target_pdb_files.append(cwd + compute_file)
	target_pdb_files_new.append(compute_file)

	if not os.path.exists(dest_dir):
	os.mkdir(dest_dir)

	atoms_remove = find_atoms_to_remove(reference_structure_file, target_pdb_files)

	print("THESE ATOMS WILL BE REMOVED FROM ALL FILES: ")
	print(atoms_remove)

	# TODO: DO NOT CONTINUE IF THERE ARE LOTS OF ATOMS TO BE REMOVED, THIS MAY REQUIRE REFORMATTING THE INPUT FILES.
	continue_prompt()

	print("REMOVING EXCLUSIVE ATOMS.")
	remove_atoms_from_pdb(reference_structure_file, reference_structure_file, atoms_remove)
	for i,f in enumerate(target_pdb_files):
	remove_atoms_from_pdb(f, target_pdb_files_new[i], atoms_remove)

	parser = PDB.PDBParser(QUIET=True)
	reference_structure = parser.get_structure('reference.pdb', reference_structure_file)

	print("SUPERPOSING STRUCTURES")
	i = 0
	for filename in tqdm(target_pdb_files_new):
	if exists(filename) and filename.endswith(".pdb"):

	reference_atoms, target_atoms = process(reference_structure_file, filename)

	if len(reference_atoms) > 0 or len(target_atoms) > 0:
	print("still have exclusive atoms = ")
	print("reference = " + str(reference_atoms))
	print("target = " + str(target_atoms))
	raise

	structure = parser.get_structure(filename[filename.rfind('/')+1:], filename)

	rotation = None
	translation = None

	rotation, translation = calculate_transformation(reference_structure, structure, residue_ranges)

	structure = apply_transformation(structure, rotation, translation)

	io = PDB.PDBIO()
	io.set_structure(structure)
	io.save(target_pdb_files_new[i])
	i += 1

	print('python3 add_aligned_suffix_csv.py ' + pdb_id)
	os.system('python3 add_aligned_suffix_csv.py ' + pdb_id)