import pandas as pd
import glob
import os
import sys
import tempfile
from os.path import isdir
from util import *
import math
def get_comp_pdbs(pdb_id):
    """Copy the computational model(s) for *pdb_id* into the AFS_FINAL tree.

    Every entry of the source directory whose name ends with
    ``<pdb_id>.pdb`` is copied with a shell ``cp`` to ``comp_<pdb_id>.pdb``
    in the ``computational_structures`` directory and then in its ``_copy``
    sibling. Each command is echoed before it is executed. Because the
    destination name is fixed, a later match overwrites an earlier one.
    """
    # NOTE(review): `listdir` is not imported explicitly in this file;
    # presumably it is provided by `from util import *` - confirm.
    source_root = '/home/tiburon/Desktop/ROT4/complex4/o/'
    target_roots = (
        '/home/tiburon/Desktop/ROT4/AFS_FINAL/computational_structures/',
        '/home/tiburon/Desktop/ROT4/AFS_FINAL/computational_structures_copy/',
    )
    wanted_suffix = pdb_id + '.pdb'
    target_name = 'comp_' + pdb_id + '.pdb'

    for entry in listdir(source_root):
        if not entry.endswith(wanted_suffix):
            continue
        for target_root in target_roots:
            command = 'cp ' + source_root + entry + ' ' + target_root + target_name
            print(command)
            os.system(command)
def _stage_exp_directory(source_dir, target_root, pdb_id):
    """Copy *source_dir* under *target_root* and prefix its files with ``exp_``.

    Runs ``cp -r``, renames every entry of ``<target_root><pdb_id>/`` to
    ``exp_<name>``, then deletes anything matching ``exp_exp_*`` (entries
    that already carried the prefix before the rename). Every shell command
    is echoed before it runs. Returns the staged directory path.
    """
    command = 'cp -r ' + source_dir + ' ' + target_root
    print(command)
    os.system(command)

    staged_dir = target_root + pdb_id + '/'
    for name in listdir(staged_dir):
        command = 'mv ' + staged_dir + name + ' ' + staged_dir + 'exp_' + name
        print(command)
        os.system(command)

    # NOTE(review): the source's indentation was lost; the cleanup is assumed
    # to run once after the rename loop rather than once per renamed file.
    command = 'rm ' + staged_dir + 'exp_exp_*'
    print(command)
    os.system(command)
    return staged_dir


def get_exp_pdbs(pdb_id):
    """Copy the experimental structure(s) for *pdb_id* into the AFS_FINAL tree.

    Every entry of the bound-PDB directory whose name contains *pdb_id* is
    handled according to its kind:

    * a directory is staged (copied and ``exp_``-prefixed) first under the
      ``_copy`` destination and then under the main destination; the medoid
      of the files staged in the main destination, as chosen by
      ``find_medoid_structure``, is then copied to ``exp_<pdb_id>.pdb`` in
      the main destination and in the ``_copy`` destination;
    * a ``.pdb`` file is copied directly to ``exp_<pdb_id>.pdb`` in both
      destinations.
    """
    # NOTE(review): `listdir` and `find_medoid_structure` are not defined in
    # this file; presumably they come from `from util import *` - confirm.
    base_dir = '/home/tiburon/Desktop/ROT4/complex4/boundpdbs/'
    dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/experimental_structures/'
    copy_dest_dir = r'/home/tiburon/Desktop/ROT4/AFS_FINAL/experimental_structures_copy/'
    final_name = 'exp_' + pdb_id + '.pdb'

    candidates = [base_dir + entry for entry in listdir(base_dir) if pdb_id in entry]
    for path in candidates:
        if isdir(path):
            _stage_exp_directory(path, copy_dest_dir, pdb_id)
            staged_dir = _stage_exp_directory(path, dest_dir, pdb_id)
            medoid = find_medoid_structure(
                [staged_dir + name for name in listdir(staged_dir)]
            )
            single_source = medoid
        elif path.endswith('.pdb'):
            single_source = path
        else:
            continue
        for target_root in (dest_dir, copy_dest_dir):
            command = 'cp ' + single_source + ' ' + target_root + final_name
            print(command)
            os.system(command)
def convert_aa_name(three_letter_code):
    """Return the one-letter code for a three-letter amino-acid name.

    Only the 20 standard residues are known and the lookup is exact
    (upper-case keys). Any other input yields ``'?'``.
    """
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
        'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
        'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
    }
    return aa_dict.get(three_letter_code, '?')
def get_pdb_sequence(pdb_path):
    """Read the residue sequence of a PDB file as an aligned text block.

    Only ``ATOM`` records are considered. One column is produced per
    residue (a new column starts whenever the residue number or the chain
    changes), plus one separator column at every chain change.

    Returns:
        The output of ``format_residue_numbers`` for the collected residue
        numbers and chain IDs, a newline, and the one-letter sequence with
        ``':'`` between chains.

    Raises:
        ValueError: if the file contains no ``ATOM`` records, or a residue
            number field is not an integer.
    """
    sequence = []
    residue_numbers = []
    chain_ids = []
    prev_chain = ""
    prev_res_id = None
    with open(pdb_path, "r") as pdb_file:
        for line in pdb_file:
            if not line.startswith("ATOM"):
                continue
            chain = line[21]
            res_name = line[17:20].strip()
            res_id = int(line[22:26])
            chain_changed = prev_chain != "" and prev_chain != chain
            if chain_changed:
                # Separator column: ':' is what parse_sequence splits on and
                # -1 is the residue number format_residue_numbers skips.
                sequence.append(':')
                residue_numbers.append(-1)
                chain_ids.append(' ')
            # A chain change always starts a new residue, even when the new
            # chain happens to begin with the previous residue number.
            if chain_changed or prev_res_id != res_id:
                sequence.append(convert_aa_name(res_name))
                residue_numbers.append(res_id)
                chain_ids.append(chain)
            prev_chain = chain
            prev_res_id = res_id
    if not sequence:
        raise ValueError(f"no ATOM records found in {pdb_path}")
    formatted_sequence = "".join(sequence)
    formatted_residue_numbers = format_residue_numbers(residue_numbers, chain_ids)
    return formatted_residue_numbers + "\n" + formatted_sequence
def format_residue_numbers(residue_numbers, chain_ids):
    """Format residue numbers to align with the sequence positions.

    The result has one text row of chain IDs followed by one row per
    decimal digit of the largest residue number, most significant digit
    first. Column ``i`` of every row describes ``residue_numbers[i]``;
    shorter numbers are shown with leading zeros. A residue number of
    ``-1`` marks a separator column and is left blank in the digit rows.

    Args:
        residue_numbers: one integer per column (``-1`` for separators).
        chain_ids: one single-character chain ID per column.

    Returns:
        The rows joined with newlines, or ``""`` when there are no columns.
    """
    if not residue_numbers:
        return ""
    # Count digits exactly. The previous ceil(log10(max) + 1) produced one
    # digit row too few whenever the maximum was 1, 10 or 100.
    digit_rows = len(str(max(max(residue_numbers), 1)))
    num_lines = digit_rows + 1
    positions = [[" "] * len(residue_numbers) for _ in range(num_lines)]

    for col, chain_id in enumerate(chain_ids):
        positions[0][col] = chain_id

    for line in range(1, num_lines):
        place = 10 ** (num_lines - line - 1)
        for col, residue_number in enumerate(residue_numbers):
            if residue_number == -1:
                continue  # separator column stays blank
            positions[line][col] = str((residue_number // place) % 10)

    return '\n'.join(''.join(row) for row in positions)
def find_removal_ranges(experimental_sequence, afs_trimmed_sequence):
    """Locate each AFS chain in the experimental sequence block.

    Both arguments are text blocks whose last line is the sequence row
    (chains separated by ``':'``). In *experimental_sequence* the first line
    is skipped and the lines between it and the sequence row are read as
    residue-number digit rows, most significant digit first.

    For every chain of the experimental block, the matching AFS chain
    sequence is searched for in the experimental sequence row; the residue
    number read from the digit rows at the match column is the start of the
    range, and the end is ``start + len(chain) - 1`` (i.e. numbering is
    assumed to be contiguous over the match).

    Returns:
        A string such as ``"A:8..76, B:1..13"`` - the format accepted by
        ``parse_ranges``.

    Raises:
        KeyError: if the AFS block has fewer chains than the experimental one.
        ValueError: if an AFS chain sequence does not occur in the
            experimental sequence row.
    """
    exp_seq = parse_sequence(experimental_sequence)
    afs_seq = parse_sequence(afs_trimmed_sequence)

    exp_rows = experimental_sequence.split('\n')
    sequence_row = exp_rows[-1]
    digit_rows = exp_rows[1:-1]

    ranges = []
    for chain in exp_seq:
        afs_chain_seq = afs_seq[chain]
        start_pos = sequence_row.find(afs_chain_seq)
        if start_pos == -1:
            # Indexing with -1 would silently read the last column instead.
            raise ValueError(
                f"AFS sequence of chain {chain} not found in the experimental sequence"
            )
        # Rebuild the residue number from its digits at the match column.
        start_ind = 0
        for row in digit_rows:
            start_ind = start_ind * 10 + int(row[start_pos])
        end_ind = start_ind + len(afs_chain_seq) - 1
        ranges.append(f"{chain}:{start_ind}..{end_ind}")
    return ', '.join(ranges)
def parse_sequence(sequence_str):
    """Split the sequence row of a text block into per-chain sequences.

    After stripping surrounding whitespace from *sequence_str*, its last
    line is taken as the sequence row. That row is cut at every ``':'`` and
    the pieces are labelled ``'A'``, ``'B'``, ``'C'``, ... in order, with
    any spaces inside a piece removed.

    Returns:
        dict mapping the generated chain label to its sequence string.
    """
    final_row = sequence_str.strip().rsplit('\n', 1)[-1]
    return {
        chr(ord('A') + index): piece.replace(" ", "")
        for index, piece in enumerate(final_row.split(':'))
    }
def parse_ranges(ranges_str):
    """Parse the input string to extract chains and residue ranges.

    The expected format is ``"A:8..76, B:1..13"``: comma-separated items,
    each ``<chain>:<start>..<end>``. Whitespace around items is ignored, so
    ``"A:8..76,B:1..13"`` is accepted too, and empty items (including an
    empty string) are skipped. A chain may appear more than once.

    Returns:
        dict mapping chain ID to a list of ``(start, end)`` integer tuples,
        in the order they were given.

    Raises:
        ValueError: if an item does not match the expected format.
    """
    ranges = {}
    for part in ranges_str.split(','):
        part = part.strip()
        if not part:
            continue
        chain, start_end = part.split(":")
        start_res, end_res = map(int, start_end.split(".."))
        ranges.setdefault(chain, []).append((start_res, end_res))
    return ranges
def adjust_residue_numbers(ranges):
    """Build, per chain, a map from original residue number to new number.

    Within each chain the residues covered by its ``(start, end)`` ranges
    (both ends inclusive) are numbered consecutively from 1 in the order
    the ranges are listed; numbering continues across ranges of the same
    chain. If ranges overlap, a residue keeps the number assigned last.

    Returns:
        dict mapping chain ID to ``{original_number: new_number}``.
    """
    renumbering = {}
    for chain_id, spans in ranges.items():
        kept_residues = (
            residue
            for first, last in spans
            for residue in range(first, last + 1)
        )
        renumbering[chain_id] = {
            residue: new_number
            for new_number, residue in enumerate(kept_residues, start=1)
        }
    return renumbering
def trim_pdb_by_residues(pdb_file_path, ranges_str):
    """Trim and reindex residues in a PDB file based on provided ranges string.

    For every ``ATOM``/``HETATM`` record whose chain is named in
    *ranges_str* (format ``"A:8..76, B:1..13"``), the record is kept only if
    its residue number lies inside one of that chain's ranges, and its
    residue number is rewritten so the kept residues of the chain count up
    from 1. Records of chains not named in *ranges_str* are written
    unchanged.

    The file is rewritten in place: output goes to a temporary file in the
    same directory, which then replaces the original.

    NOTE(review): because residues are renumbered, trimming the same file a
    second time with the same ranges removes further residues - callers
    must not re-run this on an already trimmed file.

    Returns:
        *pdb_file_path*, unchanged.
    """
    ranges = parse_ranges(ranges_str)
    adjustment_maps = adjust_residue_numbers(ranges)

    # Create the temp file next to the target: os.replace cannot move a
    # file across filesystems, which the default temp dir may be on.
    target_dir = os.path.dirname(os.path.abspath(pdb_file_path))
    temp_file, temp_file_path = tempfile.mkstemp(dir=target_dir, suffix='.tmp')
    try:
        # Wrap the descriptor first so it is closed even if the input
        # file cannot be opened.
        with os.fdopen(temp_file, 'w') as output_file, open(pdb_file_path, 'r') as pdb_file:
            for line in pdb_file:
                if not line.startswith(("ATOM", "HETATM")):
                    # NOTE(review): the original handling of non-coordinate
                    # records was lost from the source; they are preserved
                    # here so no information is dropped - confirm.
                    output_file.write(line)
                    continue
                chain_id = line[21]
                if chain_id not in adjustment_maps:
                    # Write lines for chains not in ranges as they are
                    output_file.write(line)
                    continue
                residue_num = int(line[22:26].strip())
                # The map holds exactly the residues inside the ranges, so
                # a miss means the residue is to be trimmed away.
                adjusted_residue_num = adjustment_maps[chain_id].get(residue_num)
                if adjusted_residue_num is None:
                    continue
                # Rewrite line with adjusted residue number
                output_file.write(
                    line[:22] + "{:>4}".format(adjusted_residue_num) + line[26:]
                )
        # Replace the original file with the filtered content
        os.replace(temp_file_path, pdb_file_path)
    except BaseException:
        # Do not leave a stray temp file behind on failure.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
        raise
    return pdb_file_path
def swap_and_relabel_chains(input_pdb_path, output_pdb_path):
    """Swap chains A and B of a PDB file and write the result.

    ``ATOM`` records of chain A are relabelled B and those of chain B are
    relabelled A. The output contains the former chain-B records first,
    then the former chain-A records, then every other line of the input
    (other chains' ``ATOM`` records and all non-``ATOM`` lines) in its
    original order.

    The input is read completely before the output is opened, so both
    paths may refer to the same file.
    """
    chain_a_atoms = []
    chain_b_atoms = []
    other_lines = []

    # Read the PDB file and segregate lines based on chain A, chain B, and others
    with open(input_pdb_path, 'r') as pdb_file:
        for line in pdb_file:
            # The chain identifier is at index 21 (PDB column 22).
            chain_id = line[21] if line.startswith('ATOM') and len(line) > 21 else None
            if chain_id == 'A':
                chain_a_atoms.append(line)
            elif chain_id == 'B':
                chain_b_atoms.append(line)
            else:
                other_lines.append(line)

    # Swap chain identifiers: A -> B and B -> A
    swapped_chain_a_atoms = [atom[:21] + 'B' + atom[22:] for atom in chain_a_atoms]
    swapped_chain_b_atoms = [atom[:21] + 'A' + atom[22:] for atom in chain_b_atoms]

    # Combine swapped chains and other lines, with chain B atoms first
    modified_lines = swapped_chain_b_atoms + swapped_chain_a_atoms + other_lines

    # Write the modified content to a new PDB file
    with open(output_pdb_path, 'w') as output_pdb:
        output_pdb.writelines(modified_lines)
def continue_prompt():
    """Ask the user on stdin whether the script should continue.

    Keeps asking until a recognised answer is given (case-insensitive,
    surrounding whitespace ignored). ``y``/``yes`` returns normally;
    ``n``/``no`` terminates the program.

    Raises:
        SystemExit: when the user answers no.
    """
    while True:
        user_input = input("Do you want to continue? (y/n): ").strip().lower()
        if user_input in ('n', 'no'):
            print("Terminating script.")
            sys.exit()
        if user_input in ('y', 'yes'):
            print("Continuing execution.")
            return
        # Only reached for unrecognised answers; ask again.
        print("Invalid input. Please enter 'y' for yes or 'n' for no.")
def main(bound, apo, ranges_str):
    """Register model files for one complex in ``./CSP_<bound>.csv``.

    The CSV (columns ``apo_bmrb``, ``holo_pdb``, ``holo_model_path``) is
    loaded if it exists, otherwise started empty. Rows are added for files
    not yet listed in ``holo_model_path``:

    * ``*.pdb`` models in ``./<bound lower-case>/`` and, if present,
      ``./<bound lower-case>_alt/``. New models whose name ends with
      ``<APO>.pdb`` are deleted instead of being registered.
    * ``exp_<bound>*.pdb`` in ``./experimental_structures/<bound>/`` (if
      that directory exists) and in ``./experimental_structures/``.
    * ``comp_<bound>*.pdb`` in ``./computational_structures/``.

    Experimental and computational files are trimmed in place with
    ``trim_pdb_by_residues(path, ranges_str)`` before registration.

    NOTE(review): trimming is applied even to files already listed in the
    CSV, as in the original code; since trimming renumbers residues,
    re-running on already trimmed files removes further residues - confirm
    the inputs are refreshed before every run.

    Args:
        bound: PDB ID of the bound (holo) complex.
        apo: BMRB ID of the apo form.
        ranges_str: residue ranges to keep, e.g. ``"A:8..76, B:1..13"``.
    """
    csv_file_path = f'./CSP_{bound}.csv'
    if os.path.isfile(csv_file_path):
        data = pd.read_csv(csv_file_path)
    else:
        data = pd.DataFrame(columns=['apo_bmrb', 'holo_pdb', 'holo_model_path'])

    new_rows = []
    unique_keys = set(data['holo_model_path'])

    def register(path):
        # Add a row for *path* unless it is already known; remembering the
        # key also prevents duplicates within this run.
        if path in unique_keys:
            return
        unique_keys.add(path)
        new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': path})

    # --- predicted models ------------------------------------------------
    model_dirs = [f'./{bound.lower()}/']
    alt_dir = f'./{bound.lower()}_alt/'
    if isdir(alt_dir):
        model_dirs.append(alt_dir)
    apo_suffix = apo.upper() + '.pdb'
    for model_dir in model_dirs:
        for path in glob.glob(model_dir + '*.pdb'):
            if path in unique_keys:
                continue
            if path.endswith(apo_suffix):
                os.remove(path)
                # NOTE(review): a deleted file is no longer registered; the
                # statement following the removal was lost from the source.
                continue
            register(path)

    # --- experimental structures ------------------------------------------
    # Each directory is visited exactly once. Previously, when the
    # per-complex directory was missing, ./experimental_structures/ was
    # processed twice: files were trimmed twice and rows were duplicated.
    exp_dirs = ['./experimental_structures/']
    per_complex_dir = f'./experimental_structures/{bound}/'
    if os.path.isdir(per_complex_dir):
        exp_dirs.insert(0, per_complex_dir)
    for exp_dir in exp_dirs:
        for path in glob.glob(exp_dir + f'exp_{bound}*.pdb'):
            trim_pdb_by_residues(path, ranges_str)
            print("Trimmed " + path)
            # If chains A and B need to be exchanged for a complex, call
            # swap_and_relabel_chains(path, path) here before registering.
            register(path)

    # --- computational structures -----------------------------------------
    comp_dir = './computational_structures/'
    for path in glob.glob(comp_dir + f'comp_{bound}*.pdb'):
        trim_pdb_by_residues(path, ranges_str)
        print("Trimmed " + path)
        # Same optional chain swap as for the experimental structures.
        register(path)

    if new_rows:
        data = pd.concat([data, pd.DataFrame(new_rows)]).reset_index(drop=True)
    data.to_csv(csv_file_path, index=False)
if __name__ == "__main__":
if len(sys.argv) not in [2, 4]:
print("Usage: python <bound> <apo> <residue_ranges>")
print('e.g python3 6bnh 15125 "A:8..76, B:1..13"')
print("Alternative usage: python <bound>")
bound = sys.argv[1]
df = pd.read_csv('/home/tiburon/Desktop/ROT4/complex4/csp.csv')
filtered_df = df[df['holo_pdb'] == bound.upper()]
apo = str(list(filtered_df['apo_bmrb'])[0])
experimental_sequence = get_pdb_sequence('./experimental_structures/exp_' + bound + '.pdb')
AFS_file_example = ""
if isdir('./'+bound+'/'):
AFS_file_example = './'+bound+'/' + listdir('./'+bound+'/')[0]
elif not(isdir('./'+bound+'/')) and isdir('./'+bound+'_alt/'):
AFS_file_example = './'+bound+'_alt/' + listdir('./'+bound+'_alt/')[0]
elif not(isdir('./'+bound+'/')) and not(isdir('./'+bound+'_alt/')):
# raise
AFS_trimmed_sequence = get_pdb_sequence(AFS_file_example)