Skip to content
Permalink
5ab4ed8e4a
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
386 lines (321 sloc) 15.8 KB
import pandas as pd
import glob
import os
import sys
import tempfile
from os.path import isdir
from util import *
import math
def get_comp_pdbs(pdb_id):
base_dir = '/home/tiburon/Desktop/ROT4/complex4/o/'
dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/computational_structures/'
copy_dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/computational_structures_copy/'
files_to_move = [ base_dir + f for f in listdir(base_dir) if f.endswith(pdb_id+'.pdb') ]
for f in files_to_move:
print('cp ' + f + ' ' + dest_dir + 'comp_' + pdb_id + '.pdb' )
os.system('cp ' + f + ' ' + dest_dir + 'comp_' + pdb_id + '.pdb' )
print('cp ' + f + ' ' + copy_dest_dir + 'comp_' + pdb_id + '.pdb' )
os.system('cp ' + f + ' ' + copy_dest_dir + 'comp_' + pdb_id + '.pdb' )
def get_exp_pdbs(pdb_id):
base_dir = '/home/tiburon/Desktop/ROT4/complex4/boundpdbs/'
dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/experimental_structures/'
copy_dest_dir = r'/home/tiburon/Desktop/ROT4/AFS_FINAL/experimental_structures_copy/'
files_to_move = [ base_dir + f for f in listdir(base_dir) if f.find(pdb_id) != -1]
for f in files_to_move:
if isdir(f):
print('cp -r ' + f + ' ' + copy_dest_dir)
os.system('cp -r ' + f + ' ' + copy_dest_dir)
new_dir = copy_dest_dir + pdb_id + '/'
for f2 in listdir(new_dir):
print('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
os.system('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
print('rm ' + new_dir + 'exp_exp_*')
os.system('rm ' + new_dir + 'exp_exp_*')
print('cp -r ' + f + ' ' + dest_dir)
os.system('cp -r ' + f + ' ' + dest_dir)
new_dir = dest_dir + pdb_id + '/'
for f2 in listdir(new_dir):
print('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
os.system('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
print('rm ' + new_dir + 'exp_exp_*')
os.system('rm ' + new_dir + 'exp_exp_*')
medoid = find_medoid_structure([ new_dir + f for f in listdir(new_dir) ])
print('cp ' + medoid + ' ' + dest_dir + 'exp_'+pdb_id+'.pdb')
os.system('cp ' + medoid + ' ' + dest_dir + 'exp_'+pdb_id+'.pdb')
print('cp ' + medoid + ' ' + copy_dest_dir + 'exp_'+pdb_id+'.pdb')
os.system('cp ' + medoid + ' ' + copy_dest_dir + 'exp_'+pdb_id+'.pdb')
elif f.endswith('.pdb'):
print('cp ' + f + ' ' + dest_dir + 'exp_' + pdb_id + '.pdb' )
os.system('cp ' + f + ' ' + dest_dir + 'exp_' + pdb_id + '.pdb' )
print('cp ' + f + ' ' + copy_dest_dir + 'exp_' + pdb_id + '.pdb' )
os.system('cp ' + f + ' ' + copy_dest_dir + 'exp_' + pdb_id + '.pdb' )
def convert_aa_name(three_letter_code):
"""Convert three-letter amino acid codes to one-letter codes. Placeholder function."""
aa_dict = {
'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}
return aa_dict.get(three_letter_code, '?')
def get_pdb_sequence(pdb_path):
sequence = []
residue_numbers = []
chain_ids = []
current_chain = ""
prev_chain = ""
prev_res_id = None
with open(pdb_path, "r") as pdb_file:
for line in pdb_file:
if line.startswith("ATOM"):
chain = line[21]
res_name = line[17:20].strip()
res_id = int(line[22:26])
if prev_chain != "" and prev_chain != chain:
sequence.append(':')
residue_numbers.append(-1)
chain_ids.append(' ')
if prev_res_id != res_id:
sequence.append(convert_aa_name(res_name))
residue_numbers.append(res_id)
chain_ids.append(chain)
prev_chain = chain
prev_res_id = res_id
# Convert residue numbers into the desired format
formatted_sequence = "".join(sequence)
formatted_residue_numbers = format_residue_numbers(residue_numbers, chain_ids)
return formatted_residue_numbers + "\n" + formatted_sequence
def format_residue_numbers(residue_numbers, chain_ids):
num_lines = math.ceil(math.log(max(residue_numbers), 10) + 1)
"""Format residue numbers to align with the sequence positions."""
positions = [[" "] * len(residue_numbers) for i in range(0, num_lines)]
for line in range(0, num_lines):
if line == 0:
for col, chain_id in enumerate(chain_ids):
positions[line][col] = chain_id
else:
for col,residue_number in enumerate(residue_numbers):
if residue_number == -1:
continue
t_res = residue_number - (residue_number // int(math.pow(10, num_lines-line))) * int(math.pow(10, num_lines-line))
positions[line][col] = str( t_res // int(math.pow(10,num_lines - line - 1)))
lines = [''.join(position) for position in positions]
formatted_lines = '\n'.join(lines[:]) # Reverse to have the ones on top
return formatted_lines
def find_removal_ranges(experimental_sequence, afs_trimmed_sequence):
# Placeholder for the logic to parse your actual sequence data
# This should map sequence identifiers (e.g., 'A', 'B') to their respective sequences and positions
exp_seq = parse_sequence(experimental_sequence)
afs_seq = parse_sequence(afs_trimmed_sequence)
#print('exp_seq')
#print(exp_seq)
#print('afs_seq')
#print(afs_seq)
ranges = []
for chain, exp_chain_seq in exp_seq.items():
afs_chain_seq = afs_seq[chain]
# Find the start and end of the AFS sequence within the experimental sequence
exp_seq_split = experimental_sequence.split('\n')
start_pos = exp_seq_split[len(exp_seq_split)-1].find(afs_chain_seq)
start_ind = 0
end_ind = 0
for i in range(1, len(exp_seq_split)-1):
add = int(int(exp_seq_split[i][start_pos]) * math.pow(10, len(exp_seq_split)-i-2))
start_ind += add
end_ind = start_ind + len(afs_chain_seq) - 1
# Assuming positions are 1-indexed in your system
ranges.append(f"{chain}:{start_ind}..{end_ind}")
return ', '.join(ranges)
def parse_sequence(sequence_str):
# Split the string into lines
lines = sequence_str.strip().split('\n')
# Assume the last line contains the actual sequence
sequence_line = lines[-1]
# Split sequences for different chains based on ':'
chain_sequences = sequence_line.split(':')
sequences = {}
chain_id = 'A' # Starting chain ID, increment for each chain
for chain_sequence in chain_sequences:
sequences[chain_id] = chain_sequence.replace(" ", "") # Remove spaces
# Increment chain ID, simple logic assuming sequence of A, B, C, etc.
chain_id = chr(ord(chain_id) + 1)
return sequences
def parse_ranges(ranges_str):
"""Parse the input string to extract chains and residue ranges."""
ranges = {}
for part in ranges_str.split(', '):
chain, start_end = part.split(":")
start_res, end_res = map(int, start_end.split(".."))
if chain not in ranges:
ranges[chain] = []
ranges[chain].append((start_res, end_res))
return ranges
def adjust_residue_numbers(ranges):
"""Adjust residue numbers to start from 1 for each chain."""
adjustment_maps = {}
for chain, chain_ranges in ranges.items():
adjustment_map = {}
new_residue_num = 1
for start_res, end_res in chain_ranges:
for original_res_num in range(start_res, end_res + 1):
adjustment_map[original_res_num] = new_residue_num
new_residue_num += 1
adjustment_maps[chain] = adjustment_map
return adjustment_maps
def trim_pdb_by_residues(pdb_file_path, ranges_str):
"""Trim and reindex residues in a PDB file based on provided ranges string."""
ranges = parse_ranges(ranges_str)
adjustment_maps = adjust_residue_numbers(ranges)
temp_file, temp_file_path = tempfile.mkstemp()
with open(pdb_file_path, 'r') as pdb_file, os.fdopen(temp_file, 'w') as output_file:
for line in pdb_file:
if line.startswith("ATOM") or line.startswith("HETATM"):
chain_id = line[21]
residue_num = int(line[22:26].strip())
if chain_id in ranges:
for start, end in ranges[chain_id]:
if start <= residue_num <= end:
# Adjust residue number
adjusted_residue_num = adjustment_maps[chain_id][residue_num]
# Rewrite line with adjusted residue number
new_line = line[:22] + "{:>4}".format(adjusted_residue_num) + line[26:]
output_file.write(new_line)
break
else:
# Write lines for chains not in ranges as they are
output_file.write(line)
# Replace the original file with the filtered content
os.replace(temp_file_path, pdb_file_path)
return pdb_file_path
def swap_and_relabel_chains(input_pdb_path, output_pdb_path):
chain_a_atoms = []
chain_b_atoms = []
other_lines = []
# Read the PDB file and segregate lines based on chain A, chain B, and others
with open(input_pdb_path, 'r') as pdb_file:
for line in pdb_file:
if line.startswith('ATOM'):
chain_id = line[21] # Chain identifier is at column 22 (0-based indexing)
if chain_id == 'A':
chain_a_atoms.append(line)
elif chain_id == 'B':
chain_b_atoms.append(line)
else:
other_lines.append(line)
else:
other_lines.append(line)
# Swap chain identifiers: A -> B and B -> A
swapped_chain_a_atoms = [atom[:21] + 'B' + atom[22:] for atom in chain_a_atoms]
swapped_chain_b_atoms = [atom[:21] + 'A' + atom[22:] for atom in chain_b_atoms]
# Combine swapped chains and other lines, with chain B atoms first
modified_lines = swapped_chain_b_atoms + swapped_chain_a_atoms + other_lines
# Write the modified content to a new PDB file
with open(output_pdb_path, 'w') as output_pdb:
for line in modified_lines:
output_pdb.write(line)
def continue_prompt():
while True:
user_input = input("Do you want to continue? (y/n): ").lower()
if user_input in ['n', 'no']:
print("Terminating script.")
exit()
elif user_input in ['y', 'yes']:
print("Continuing execution.")
break
else:
print("Invalid input. Please enter 'y' for yes or 'n' for no.")
def main(bound, apo, ranges_str):
bound_identifier = bound
csv_file_path = f'./CSP_{bound_identifier}.csv'
if not os.path.isfile(csv_file_path):
data = pd.DataFrame(columns=['apo_bmrb', 'holo_pdb', 'holo_model_path'])
else:
data = pd.read_csv(csv_file_path)
pdb_files = glob.glob(f'./{bound.lower()}/*.pdb')
new_rows = []
unique_keys = set(data['holo_model_path'])
for file in pdb_files:
new_row_key = file
#print(new_row_key)
if new_row_key not in unique_keys:
if new_row_key.endswith(apo.upper() + '.pdb'):
os.system(f'rm "{new_row_key}"')
continue
# Filter the PDB file before adding it to the CSV
new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
unique_keys.add(new_row_key)
if isdir(f'./{bound.lower()}_alt/'):
pdb_files = glob.glob(f'./{bound.lower()}_alt/*.pdb')
unique_keys = set(data['holo_model_path'])
for file in pdb_files:
new_row_key = file
if new_row_key not in unique_keys:
if new_row_key.endswith(apo.upper() + '.pdb'):
os.system(f'rm "{new_row_key}"')
continue
# Filter the PDB file before adding it to the CSV
new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
unique_keys.add(new_row_key)
exp_dir = f'./experimental_structures/{bound}/'
if not os.path.isdir(exp_dir):
exp_dir = './experimental_structures/'
experimental_pdb_files = glob.glob(exp_dir + f'exp_{bound}*.pdb')
for file in experimental_pdb_files:
new_row_key = file
trim_pdb_by_residues(new_row_key, ranges_str)
print("Trimmed " + new_row_key)
if new_row_key not in unique_keys:
#swap_and_relabel_chains(new_row_key, new_row_key) # SOMETIMES WE NEED TO SWAP CHAIN A AND B
# Assume experimental PDB files do not need filtering
new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
unique_keys.add(new_row_key)
exp_dir = './experimental_structures/'
experimental_pdb_files = glob.glob(exp_dir + f'exp_{bound}*.pdb')
for file in experimental_pdb_files:
new_row_key = file
trim_pdb_by_residues(new_row_key, ranges_str)
print("Trimmed " + new_row_key)
if new_row_key not in unique_keys:
#swap_and_relabel_chains(new_row_key, new_row_key) # SOMETIMES WE NEED TO SWAP CHAIN A AND B
# Assume experimental PDB files do not need filtering
new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
unique_keys.add(new_row_key)
comp_dir = f'./computational_structures/'
comp_pdb_files = glob.glob(comp_dir + f'comp_{bound}*.pdb')
for file in comp_pdb_files:
new_row_key = file
trim_pdb_by_residues(new_row_key, ranges_str)
print("Trimmed " + new_row_key)
if new_row_key not in unique_keys:
#swap_and_relabel_chains(new_row_key, new_row_key) # SOMETIMES WE NEED TO SWAP CHAIN A AND B
new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
unique_keys.add(new_row_key)
new_data = pd.DataFrame(new_rows)
print(len(new_data))
data = pd.concat([data, new_data]).reset_index(drop=True)
data.to_csv(csv_file_path, index=False)
if __name__ == "__main__":
if len(sys.argv) not in [2, 4]:
print("Usage: python script.py <bound> <apo> <residue_ranges>")
print('e.g python3 init_csv_and_trim.py 6bnh 15125 "A:8..76, B:1..13"')
print("Alternative usage: python script.py <bound>")
sys.exit(1)
bound = sys.argv[1]
df = pd.read_csv('/home/tiburon/Desktop/ROT4/complex4/csp.csv')
filtered_df = df[df['holo_pdb'] == bound.upper()]
apo = str(list(filtered_df['apo_bmrb'])[0])
experimental_sequence = get_pdb_sequence('./experimental_structures/exp_' + bound + '.pdb')
print('EXPERIMENTAL_SEQUENCE')
print(experimental_sequence)
AFS_file_example = ""
if isdir('./'+bound+'/'):
AFS_file_example = './'+bound+'/' + listdir('./'+bound+'/')[0]
elif not(isdir('./'+bound+'/')) and isdir('./'+bound+'_alt/'):
AFS_file_example = './'+bound+'_alt/' + listdir('./'+bound+'_alt/')[0]
elif not(isdir('./'+bound+'/')) and not(isdir('./'+bound+'_alt/')):
print("FOUND NO ENHANCED SAMPLING DIRECTORY, PLEASE FINISH POST PROCESSING PROCEDURE.")
# raise
AFS_trimmed_sequence = get_pdb_sequence(AFS_file_example)
print('AFS_TRIMMED_SEQUENCE')
print(AFS_trimmed_sequence)