get_seq.py

import pandas as pd
import glob
import os
import sys
import tempfile
from os.path import isdir
from util import *
import math

def get_comp_pdbs(pdb_id):
    base_dir = '/home/tiburon/Desktop/ROT4/complex4/o/'
    dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/computational_structures/'
    copy_dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/computational_structures_copy/'
    files_to_move = [ base_dir + f for f in listdir(base_dir) if f.endswith(pdb_id+'.pdb') ]
    for f in files_to_move:
        print('cp ' + f + ' ' + dest_dir + 'comp_' + pdb_id + '.pdb' )
        os.system('cp ' + f + ' ' + dest_dir + 'comp_' + pdb_id + '.pdb' )
        print('cp ' + f + ' ' + copy_dest_dir + 'comp_' + pdb_id + '.pdb' )
        os.system('cp ' + f + ' ' + copy_dest_dir + 'comp_' + pdb_id + '.pdb' )

def get_exp_pdbs(pdb_id):
    base_dir = '/home/tiburon/Desktop/ROT4/complex4/boundpdbs/'
    dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/experimental_structures/'
    copy_dest_dir = r'/home/tiburon/Desktop/ROT4/AFS_FINAL/experimental_structures_copy/'

    files_to_move = [ base_dir + f for f in listdir(base_dir) if f.find(pdb_id) != -1]

    for f in files_to_move:
        if isdir(f):
            print('cp -r ' + f + ' ' + copy_dest_dir)
            os.system('cp -r ' + f + ' ' + copy_dest_dir)
            new_dir = copy_dest_dir + pdb_id + '/'
            for f2 in listdir(new_dir):
                print('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
                os.system('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
                print('rm ' + new_dir + 'exp_exp_*')
                os.system('rm ' + new_dir + 'exp_exp_*')

            print('cp -r ' + f + ' ' + dest_dir)
            os.system('cp -r ' + f + ' ' + dest_dir)
            new_dir = dest_dir + pdb_id + '/'
            for f2 in listdir(new_dir):
                print('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
                os.system('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
                print('rm ' + new_dir + 'exp_exp_*')
                os.system('rm ' + new_dir + 'exp_exp_*')

            medoid = find_medoid_structure([ new_dir + f for f in listdir(new_dir) ])
            print('cp ' + medoid + ' ' + dest_dir + 'exp_'+pdb_id+'.pdb')
            os.system('cp ' + medoid + ' ' + dest_dir + 'exp_'+pdb_id+'.pdb')
            print('cp ' + medoid + ' ' + copy_dest_dir + 'exp_'+pdb_id+'.pdb')
            os.system('cp ' + medoid + ' ' + copy_dest_dir + 'exp_'+pdb_id+'.pdb')
        elif f.endswith('.pdb'):
            print('cp ' + f + ' ' + dest_dir + 'exp_' + pdb_id + '.pdb' )
            os.system('cp ' + f + ' ' + dest_dir + 'exp_' + pdb_id + '.pdb' )
            print('cp ' + f + ' ' + copy_dest_dir + 'exp_' + pdb_id + '.pdb' )
            os.system('cp ' + f + ' ' + copy_dest_dir + 'exp_' + pdb_id + '.pdb' )


def convert_aa_name(three_letter_code):
    """Convert three-letter amino acid codes to one-letter codes. Placeholder function."""
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
        'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
        'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }
    return aa_dict.get(three_letter_code, '?')

def get_pdb_sequence(pdb_path):
    sequence = []
    residue_numbers = []
    chain_ids = []
    current_chain = ""
    prev_chain = ""
    prev_res_id = None

    with open(pdb_path, "r") as pdb_file:
        for line in pdb_file:
            if line.startswith("ATOM"):
                chain = line[21]
                res_name = line[17:20].strip()
                res_id = int(line[22:26])


                if prev_chain != "" and prev_chain != chain:
                    sequence.append(':')
                    residue_numbers.append(-1)
                    chain_ids.append(' ')
                if prev_res_id != res_id:
                    sequence.append(convert_aa_name(res_name))
                    residue_numbers.append(res_id)
                    chain_ids.append(chain)


                prev_chain = chain
                prev_res_id = res_id

    # Convert residue numbers into the desired format
    formatted_sequence = "".join(sequence)
    formatted_residue_numbers = format_residue_numbers(residue_numbers, chain_ids)

    return formatted_residue_numbers + "\n" + formatted_sequence

def format_residue_numbers(residue_numbers, chain_ids):

    num_lines = math.ceil(math.log(max(residue_numbers), 10) + 1)
    """Format residue numbers to align with the sequence positions."""

    positions = [[" "] * len(residue_numbers) for i in range(0, num_lines)]
    for line in range(0, num_lines):
        if line == 0:
            for col, chain_id in enumerate(chain_ids):
                positions[line][col] = chain_id
        else:
            for col,residue_number in enumerate(residue_numbers):
                if residue_number == -1:
                    continue
                t_res = residue_number - (residue_number // int(math.pow(10, num_lines-line))) * int(math.pow(10, num_lines-line))
                positions[line][col] = str( t_res // int(math.pow(10,num_lines - line - 1)))

    lines = [''.join(position) for position in positions]
    formatted_lines = '\n'.join(lines[:])  # Reverse to have the ones on top

    return formatted_lines

def find_removal_ranges(experimental_sequence, afs_trimmed_sequence):
    # Placeholder for the logic to parse your actual sequence data
    # This should map sequence identifiers (e.g., 'A', 'B') to their respective sequences and positions
    exp_seq = parse_sequence(experimental_sequence)
    afs_seq = parse_sequence(afs_trimmed_sequence)

    #print('exp_seq')
    #print(exp_seq)
    #print('afs_seq')
    #print(afs_seq)
    ranges = []

    for chain, exp_chain_seq in exp_seq.items():
        afs_chain_seq = afs_seq[chain]

        # Find the start and end of the AFS sequence within the experimental sequence
        exp_seq_split = experimental_sequence.split('\n')
        start_pos = exp_seq_split[len(exp_seq_split)-1].find(afs_chain_seq)

        start_ind = 0
        end_ind = 0
        for i in range(1, len(exp_seq_split)-1):
            add = int(int(exp_seq_split[i][start_pos]) * math.pow(10, len(exp_seq_split)-i-2))
            start_ind += add

        end_ind = start_ind + len(afs_chain_seq) - 1


        # Assuming positions are 1-indexed in your system
        ranges.append(f"{chain}:{start_ind}..{end_ind}")

    return ', '.join(ranges)

def parse_sequence(sequence_str):
    # Split the string into lines
    lines = sequence_str.strip().split('\n')

    # Assume the last line contains the actual sequence
    sequence_line = lines[-1]

    # Split sequences for different chains based on ':'
    chain_sequences = sequence_line.split(':')

    sequences = {}
    chain_id = 'A'  # Starting chain ID, increment for each chain

    for chain_sequence in chain_sequences:
        sequences[chain_id] = chain_sequence.replace(" ", "")  # Remove spaces
        # Increment chain ID, simple logic assuming sequence of A, B, C, etc.
        chain_id = chr(ord(chain_id) + 1)

    return sequences

def parse_ranges(ranges_str):
    """Parse the input string to extract chains and residue ranges."""
    ranges = {}
    for part in ranges_str.split(', '):
        chain, start_end = part.split(":")
        start_res, end_res = map(int, start_end.split(".."))
        if chain not in ranges:
            ranges[chain] = []
        ranges[chain].append((start_res, end_res))
    return ranges

def adjust_residue_numbers(ranges):
    """Adjust residue numbers to start from 1 for each chain."""
    adjustment_maps = {}
    for chain, chain_ranges in ranges.items():
        adjustment_map = {}
        new_residue_num = 1
        for start_res, end_res in chain_ranges:
            for original_res_num in range(start_res, end_res + 1):
                adjustment_map[original_res_num] = new_residue_num
                new_residue_num += 1
        adjustment_maps[chain] = adjustment_map
    return adjustment_maps

def trim_pdb_by_residues(pdb_file_path, ranges_str):
    """Trim and reindex residues in a PDB file based on provided ranges string."""
    ranges = parse_ranges(ranges_str)
    adjustment_maps = adjust_residue_numbers(ranges)
    temp_file, temp_file_path = tempfile.mkstemp()

    with open(pdb_file_path, 'r') as pdb_file, os.fdopen(temp_file, 'w') as output_file:
        for line in pdb_file:
            if line.startswith("ATOM") or line.startswith("HETATM"):
                chain_id = line[21]
                residue_num = int(line[22:26].strip())
                if chain_id in ranges:
                    for start, end in ranges[chain_id]:
                        if start <= residue_num <= end:
                            # Adjust residue number
                            adjusted_residue_num = adjustment_maps[chain_id][residue_num]
                            # Rewrite line with adjusted residue number
                            new_line = line[:22] + "{:>4}".format(adjusted_residue_num) + line[26:]
                            output_file.write(new_line)
                            break
                else:
                    # Write lines for chains not in ranges as they are
                    output_file.write(line)

    # Replace the original file with the filtered content
    os.replace(temp_file_path, pdb_file_path)
    return pdb_file_path

def swap_and_relabel_chains(input_pdb_path, output_pdb_path):
    chain_a_atoms = []
    chain_b_atoms = []
    other_lines = []

    # Read the PDB file and segregate lines based on chain A, chain B, and others
    with open(input_pdb_path, 'r') as pdb_file:
        for line in pdb_file:
            if line.startswith('ATOM'):
                chain_id = line[21]  # Chain identifier is at column 22 (0-based indexing)
                if chain_id == 'A':
                    chain_a_atoms.append(line)
                elif chain_id == 'B':
                    chain_b_atoms.append(line)
                else:
                    other_lines.append(line)
            else:
                other_lines.append(line)

    # Swap chain identifiers: A -> B and B -> A
    swapped_chain_a_atoms = [atom[:21] + 'B' + atom[22:] for atom in chain_a_atoms]
    swapped_chain_b_atoms = [atom[:21] + 'A' + atom[22:] for atom in chain_b_atoms]

    # Combine swapped chains and other lines, with chain B atoms first
    modified_lines = swapped_chain_b_atoms + swapped_chain_a_atoms + other_lines

    # Write the modified content to a new PDB file
    with open(output_pdb_path, 'w') as output_pdb:
        for line in modified_lines:
            output_pdb.write(line)

def continue_prompt():
    while True:
        user_input = input("Do you want to continue? (y/n): ").lower()
        if user_input in ['n', 'no']:
            print("Terminating script.")
            exit()
        elif user_input in ['y', 'yes']:
            print("Continuing execution.")
            break
        else:
            print("Invalid input. Please enter 'y' for yes or 'n' for no.")

def main(bound, apo, ranges_str):
    bound_identifier = bound
    csv_file_path = f'./CSP_{bound_identifier}.csv'

    if not os.path.isfile(csv_file_path):
        data = pd.DataFrame(columns=['apo_bmrb', 'holo_pdb', 'holo_model_path'])
    else:
        data = pd.read_csv(csv_file_path)

    pdb_files = glob.glob(f'./{bound.lower()}/*.pdb')
    new_rows = []
    unique_keys = set(data['holo_model_path'])

    for file in pdb_files:
        new_row_key = file
        #print(new_row_key)
        if new_row_key not in unique_keys:
            if new_row_key.endswith(apo.upper() + '.pdb'):
                os.system(f'rm "{new_row_key}"')
                continue
            # Filter the PDB file before adding it to the CSV
            new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
            unique_keys.add(new_row_key)

    if isdir(f'./{bound.lower()}_alt/'):
        pdb_files = glob.glob(f'./{bound.lower()}_alt/*.pdb')
        unique_keys = set(data['holo_model_path'])

        for file in pdb_files:
            new_row_key = file
            if new_row_key not in unique_keys:
                if new_row_key.endswith(apo.upper() + '.pdb'):
                    os.system(f'rm "{new_row_key}"')
                    continue
                # Filter the PDB file before adding it to the CSV
                new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
                unique_keys.add(new_row_key)


    exp_dir = f'./experimental_structures/{bound}/'
    if not os.path.isdir(exp_dir):
        exp_dir = './experimental_structures/'

    experimental_pdb_files = glob.glob(exp_dir + f'exp_{bound}*.pdb')
    for file in experimental_pdb_files:
        new_row_key = file
        trim_pdb_by_residues(new_row_key, ranges_str)
        print("Trimmed " + new_row_key)
        if new_row_key not in unique_keys:
            #swap_and_relabel_chains(new_row_key, new_row_key) # SOMETIMES WE NEED TO SWAP CHAIN A AND B
            # Assume experimental PDB files do not need filtering
            new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
            unique_keys.add(new_row_key)

    exp_dir = './experimental_structures/'
    experimental_pdb_files = glob.glob(exp_dir + f'exp_{bound}*.pdb')
    for file in experimental_pdb_files:
        new_row_key = file
        trim_pdb_by_residues(new_row_key, ranges_str)
        print("Trimmed " + new_row_key)
        if new_row_key not in unique_keys:
            #swap_and_relabel_chains(new_row_key, new_row_key) # SOMETIMES WE NEED TO SWAP CHAIN A AND B
            # Assume experimental PDB files do not need filtering
            new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
            unique_keys.add(new_row_key)

    comp_dir = f'./computational_structures/'
    comp_pdb_files = glob.glob(comp_dir + f'comp_{bound}*.pdb')
    for file in comp_pdb_files:
        new_row_key = file
        trim_pdb_by_residues(new_row_key, ranges_str)
        print("Trimmed " + new_row_key)
        if new_row_key not in unique_keys:
            #swap_and_relabel_chains(new_row_key, new_row_key) # SOMETIMES WE NEED TO SWAP CHAIN A AND B
            new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
            unique_keys.add(new_row_key)

    new_data = pd.DataFrame(new_rows)
    print(len(new_data))

    data = pd.concat([data, new_data]).reset_index(drop=True)
    data.to_csv(csv_file_path, index=False)

if __name__ == "__main__":
    if len(sys.argv) not in [2, 4]:
        print("Usage: python script.py <bound> <apo> <residue_ranges>")
        print('e.g python3 init_csv_and_trim.py 6bnh 15125 "A:8..76, B:1..13"')
        print("Alternative usage: python script.py <bound>")
        sys.exit(1)

    bound = sys.argv[1]
    df = pd.read_csv('/home/tiburon/Desktop/ROT4/complex4/csp.csv')
    filtered_df = df[df['holo_pdb'] == bound.upper()]
    apo = str(list(filtered_df['apo_bmrb'])[0])


    experimental_sequence = get_pdb_sequence('./experimental_structures/exp_' + bound + '.pdb')
    print('EXPERIMENTAL_SEQUENCE')
    print(experimental_sequence)

    AFS_file_example = ""
    if isdir('./'+bound+'/'):
        AFS_file_example = './'+bound+'/' + listdir('./'+bound+'/')[0]
    elif not(isdir('./'+bound+'/')) and isdir('./'+bound+'_alt/'):
        AFS_file_example = './'+bound+'_alt/' + listdir('./'+bound+'_alt/')[0]
    elif not(isdir('./'+bound+'/')) and not(isdir('./'+bound+'_alt/')):
        print("FOUND NO ENHANCED SAMPLING DIRECTORY, PLEASE FINISH POST PROCESSING PROCEDURE.")
    #    raise
    AFS_trimmed_sequence = get_pdb_sequence(AFS_file_example)
    print('AFS_TRIMMED_SEQUENCE')
    print(AFS_trimmed_sequence)
	import pandas as pd
	import glob
	import os
	import sys
	import tempfile
	from os.path import isdir
	from util import *
	import math

	def get_comp_pdbs(pdb_id):
	base_dir = '/home/tiburon/Desktop/ROT4/complex4/o/'
	dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/computational_structures/'
	copy_dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/computational_structures_copy/'
	files_to_move = [ base_dir + f for f in listdir(base_dir) if f.endswith(pdb_id+'.pdb') ]
	for f in files_to_move:
	print('cp ' + f + ' ' + dest_dir + 'comp_' + pdb_id + '.pdb' )
	os.system('cp ' + f + ' ' + dest_dir + 'comp_' + pdb_id + '.pdb' )
	print('cp ' + f + ' ' + copy_dest_dir + 'comp_' + pdb_id + '.pdb' )
	os.system('cp ' + f + ' ' + copy_dest_dir + 'comp_' + pdb_id + '.pdb' )

	def get_exp_pdbs(pdb_id):
	base_dir = '/home/tiburon/Desktop/ROT4/complex4/boundpdbs/'
	dest_dir = '/home/tiburon/Desktop/ROT4/AFS_FINAL/experimental_structures/'
	copy_dest_dir = r'/home/tiburon/Desktop/ROT4/AFS_FINAL/experimental_structures_copy/'

	files_to_move = [ base_dir + f for f in listdir(base_dir) if f.find(pdb_id) != -1]

	for f in files_to_move:
	if isdir(f):
	print('cp -r ' + f + ' ' + copy_dest_dir)
	os.system('cp -r ' + f + ' ' + copy_dest_dir)
	new_dir = copy_dest_dir + pdb_id + '/'
	for f2 in listdir(new_dir):
	print('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
	os.system('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
	print('rm ' + new_dir + 'exp_exp_*')
	os.system('rm ' + new_dir + 'exp_exp_*')

	print('cp -r ' + f + ' ' + dest_dir)
	os.system('cp -r ' + f + ' ' + dest_dir)
	new_dir = dest_dir + pdb_id + '/'
	for f2 in listdir(new_dir):
	print('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
	os.system('mv ' + new_dir + f2 + ' ' + new_dir + 'exp_'+f2)
	print('rm ' + new_dir + 'exp_exp_*')
	os.system('rm ' + new_dir + 'exp_exp_*')

	medoid = find_medoid_structure([ new_dir + f for f in listdir(new_dir) ])
	print('cp ' + medoid + ' ' + dest_dir + 'exp_'+pdb_id+'.pdb')
	os.system('cp ' + medoid + ' ' + dest_dir + 'exp_'+pdb_id+'.pdb')
	print('cp ' + medoid + ' ' + copy_dest_dir + 'exp_'+pdb_id+'.pdb')
	os.system('cp ' + medoid + ' ' + copy_dest_dir + 'exp_'+pdb_id+'.pdb')
	elif f.endswith('.pdb'):
	print('cp ' + f + ' ' + dest_dir + 'exp_' + pdb_id + '.pdb' )
	os.system('cp ' + f + ' ' + dest_dir + 'exp_' + pdb_id + '.pdb' )
	print('cp ' + f + ' ' + copy_dest_dir + 'exp_' + pdb_id + '.pdb' )
	os.system('cp ' + f + ' ' + copy_dest_dir + 'exp_' + pdb_id + '.pdb' )


	def convert_aa_name(three_letter_code):
	"""Convert three-letter amino acid codes to one-letter codes. Placeholder function."""
	aa_dict = {
	'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
	'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
	'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
	'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
	'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
	}
	return aa_dict.get(three_letter_code, '?')

	def get_pdb_sequence(pdb_path):
	sequence = []
	residue_numbers = []
	chain_ids = []
	current_chain = ""
	prev_chain = ""
	prev_res_id = None

	with open(pdb_path, "r") as pdb_file:
	for line in pdb_file:
	if line.startswith("ATOM"):
	chain = line[21]
	res_name = line[17:20].strip()
	res_id = int(line[22:26])


	if prev_chain != "" and prev_chain != chain:
	sequence.append(':')
	residue_numbers.append(-1)
	chain_ids.append(' ')
	if prev_res_id != res_id:
	sequence.append(convert_aa_name(res_name))
	residue_numbers.append(res_id)
	chain_ids.append(chain)


	prev_chain = chain
	prev_res_id = res_id

	# Convert residue numbers into the desired format
	formatted_sequence = "".join(sequence)
	formatted_residue_numbers = format_residue_numbers(residue_numbers, chain_ids)

	return formatted_residue_numbers + "\n" + formatted_sequence

	def format_residue_numbers(residue_numbers, chain_ids):

	num_lines = math.ceil(math.log(max(residue_numbers), 10) + 1)
	"""Format residue numbers to align with the sequence positions."""

	positions = [[" "] * len(residue_numbers) for i in range(0, num_lines)]
	for line in range(0, num_lines):
	if line == 0:
	for col, chain_id in enumerate(chain_ids):
	positions[line][col] = chain_id
	else:
	for col,residue_number in enumerate(residue_numbers):
	if residue_number == -1:
	continue
	t_res = residue_number - (residue_number // int(math.pow(10, num_lines-line))) * int(math.pow(10, num_lines-line))
	positions[line][col] = str( t_res // int(math.pow(10,num_lines - line - 1)))

	lines = [''.join(position) for position in positions]
	formatted_lines = '\n'.join(lines[:]) # Reverse to have the ones on top

	return formatted_lines

	def find_removal_ranges(experimental_sequence, afs_trimmed_sequence):
	# Placeholder for the logic to parse your actual sequence data
	# This should map sequence identifiers (e.g., 'A', 'B') to their respective sequences and positions
	exp_seq = parse_sequence(experimental_sequence)
	afs_seq = parse_sequence(afs_trimmed_sequence)

	#print('exp_seq')
	#print(exp_seq)
	#print('afs_seq')
	#print(afs_seq)
	ranges = []

	for chain, exp_chain_seq in exp_seq.items():
	afs_chain_seq = afs_seq[chain]

	# Find the start and end of the AFS sequence within the experimental sequence
	exp_seq_split = experimental_sequence.split('\n')
	start_pos = exp_seq_split[len(exp_seq_split)-1].find(afs_chain_seq)

	start_ind = 0
	end_ind = 0
	for i in range(1, len(exp_seq_split)-1):
	add = int(int(exp_seq_split[i][start_pos]) * math.pow(10, len(exp_seq_split)-i-2))
	start_ind += add

	end_ind = start_ind + len(afs_chain_seq) - 1


	# Assuming positions are 1-indexed in your system
	ranges.append(f"{chain}:{start_ind}..{end_ind}")

	return ', '.join(ranges)

	def parse_sequence(sequence_str):
	# Split the string into lines
	lines = sequence_str.strip().split('\n')

	# Assume the last line contains the actual sequence
	sequence_line = lines[-1]

	# Split sequences for different chains based on ':'
	chain_sequences = sequence_line.split(':')

	sequences = {}
	chain_id = 'A' # Starting chain ID, increment for each chain

	for chain_sequence in chain_sequences:
	sequences[chain_id] = chain_sequence.replace(" ", "") # Remove spaces
	# Increment chain ID, simple logic assuming sequence of A, B, C, etc.
	chain_id = chr(ord(chain_id) + 1)

	return sequences

	def parse_ranges(ranges_str):
	"""Parse the input string to extract chains and residue ranges."""
	ranges = {}
	for part in ranges_str.split(', '):
	chain, start_end = part.split(":")
	start_res, end_res = map(int, start_end.split(".."))
	if chain not in ranges:
	ranges[chain] = []
	ranges[chain].append((start_res, end_res))
	return ranges

	def adjust_residue_numbers(ranges):
	"""Adjust residue numbers to start from 1 for each chain."""
	adjustment_maps = {}
	for chain, chain_ranges in ranges.items():
	adjustment_map = {}
	new_residue_num = 1
	for start_res, end_res in chain_ranges:
	for original_res_num in range(start_res, end_res + 1):
	adjustment_map[original_res_num] = new_residue_num
	new_residue_num += 1
	adjustment_maps[chain] = adjustment_map
	return adjustment_maps

	def trim_pdb_by_residues(pdb_file_path, ranges_str):
	"""Trim and reindex residues in a PDB file based on provided ranges string."""
	ranges = parse_ranges(ranges_str)
	adjustment_maps = adjust_residue_numbers(ranges)
	temp_file, temp_file_path = tempfile.mkstemp()

	with open(pdb_file_path, 'r') as pdb_file, os.fdopen(temp_file, 'w') as output_file:
	for line in pdb_file:
	if line.startswith("ATOM") or line.startswith("HETATM"):
	chain_id = line[21]
	residue_num = int(line[22:26].strip())
	if chain_id in ranges:
	for start, end in ranges[chain_id]:
	if start <= residue_num <= end:
	# Adjust residue number
	adjusted_residue_num = adjustment_maps[chain_id][residue_num]
	# Rewrite line with adjusted residue number
	new_line = line[:22] + "{:>4}".format(adjusted_residue_num) + line[26:]
	output_file.write(new_line)
	break
	else:
	# Write lines for chains not in ranges as they are
	output_file.write(line)

	# Replace the original file with the filtered content
	os.replace(temp_file_path, pdb_file_path)
	return pdb_file_path

	def swap_and_relabel_chains(input_pdb_path, output_pdb_path):
	chain_a_atoms = []
	chain_b_atoms = []
	other_lines = []

	# Read the PDB file and segregate lines based on chain A, chain B, and others
	with open(input_pdb_path, 'r') as pdb_file:
	for line in pdb_file:
	if line.startswith('ATOM'):
	chain_id = line[21] # Chain identifier is at column 22 (0-based indexing)
	if chain_id == 'A':
	chain_a_atoms.append(line)
	elif chain_id == 'B':
	chain_b_atoms.append(line)
	else:
	other_lines.append(line)
	else:
	other_lines.append(line)

	# Swap chain identifiers: A -> B and B -> A
	swapped_chain_a_atoms = [atom[:21] + 'B' + atom[22:] for atom in chain_a_atoms]
	swapped_chain_b_atoms = [atom[:21] + 'A' + atom[22:] for atom in chain_b_atoms]

	# Combine swapped chains and other lines, with chain B atoms first
	modified_lines = swapped_chain_b_atoms + swapped_chain_a_atoms + other_lines

	# Write the modified content to a new PDB file
	with open(output_pdb_path, 'w') as output_pdb:
	for line in modified_lines:
	output_pdb.write(line)

	def continue_prompt():
	while True:
	user_input = input("Do you want to continue? (y/n): ").lower()
	if user_input in ['n', 'no']:
	print("Terminating script.")
	exit()
	elif user_input in ['y', 'yes']:
	print("Continuing execution.")
	break
	else:
	print("Invalid input. Please enter 'y' for yes or 'n' for no.")

	def main(bound, apo, ranges_str):
	bound_identifier = bound
	csv_file_path = f'./CSP_{bound_identifier}.csv'

	if not os.path.isfile(csv_file_path):
	data = pd.DataFrame(columns=['apo_bmrb', 'holo_pdb', 'holo_model_path'])
	else:
	data = pd.read_csv(csv_file_path)

	pdb_files = glob.glob(f'./{bound.lower()}/*.pdb')
	new_rows = []
	unique_keys = set(data['holo_model_path'])

	for file in pdb_files:
	new_row_key = file
	#print(new_row_key)
	if new_row_key not in unique_keys:
	if new_row_key.endswith(apo.upper() + '.pdb'):
	os.system(f'rm "{new_row_key}"')
	continue
	# Filter the PDB file before adding it to the CSV
	new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
	unique_keys.add(new_row_key)

	if isdir(f'./{bound.lower()}_alt/'):
	pdb_files = glob.glob(f'./{bound.lower()}_alt/*.pdb')
	unique_keys = set(data['holo_model_path'])

	for file in pdb_files:
	new_row_key = file
	if new_row_key not in unique_keys:
	if new_row_key.endswith(apo.upper() + '.pdb'):
	os.system(f'rm "{new_row_key}"')
	continue
	# Filter the PDB file before adding it to the CSV
	new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
	unique_keys.add(new_row_key)


	exp_dir = f'./experimental_structures/{bound}/'
	if not os.path.isdir(exp_dir):
	exp_dir = './experimental_structures/'

	experimental_pdb_files = glob.glob(exp_dir + f'exp_{bound}*.pdb')
	for file in experimental_pdb_files:
	new_row_key = file
	trim_pdb_by_residues(new_row_key, ranges_str)
	print("Trimmed " + new_row_key)
	if new_row_key not in unique_keys:
	#swap_and_relabel_chains(new_row_key, new_row_key) # SOMETIMES WE NEED TO SWAP CHAIN A AND B
	# Assume experimental PDB files do not need filtering
	new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
	unique_keys.add(new_row_key)

	exp_dir = './experimental_structures/'
	experimental_pdb_files = glob.glob(exp_dir + f'exp_{bound}*.pdb')
	for file in experimental_pdb_files:
	new_row_key = file
	trim_pdb_by_residues(new_row_key, ranges_str)
	print("Trimmed " + new_row_key)
	if new_row_key not in unique_keys:
	#swap_and_relabel_chains(new_row_key, new_row_key) # SOMETIMES WE NEED TO SWAP CHAIN A AND B
	# Assume experimental PDB files do not need filtering
	new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
	unique_keys.add(new_row_key)

	comp_dir = f'./computational_structures/'
	comp_pdb_files = glob.glob(comp_dir + f'comp_{bound}*.pdb')
	for file in comp_pdb_files:
	new_row_key = file
	trim_pdb_by_residues(new_row_key, ranges_str)
	print("Trimmed " + new_row_key)
	if new_row_key not in unique_keys:
	#swap_and_relabel_chains(new_row_key, new_row_key) # SOMETIMES WE NEED TO SWAP CHAIN A AND B
	new_rows.append({'apo_bmrb': apo.upper(), 'holo_pdb': bound.upper(), 'holo_model_path': new_row_key})
	unique_keys.add(new_row_key)

	new_data = pd.DataFrame(new_rows)
	print(len(new_data))

	data = pd.concat([data, new_data]).reset_index(drop=True)
	data.to_csv(csv_file_path, index=False)

	if __name__ == "__main__":
	if len(sys.argv) not in [2, 4]:
	print("Usage: python script.py <bound> <apo> <residue_ranges>")
	print('e.g python3 init_csv_and_trim.py 6bnh 15125 "A:8..76, B:1..13"')
	print("Alternative usage: python script.py <bound>")
	sys.exit(1)

	bound = sys.argv[1]
	df = pd.read_csv('/home/tiburon/Desktop/ROT4/complex4/csp.csv')
	filtered_df = df[df['holo_pdb'] == bound.upper()]
	apo = str(list(filtered_df['apo_bmrb'])[0])


	experimental_sequence = get_pdb_sequence('./experimental_structures/exp_' + bound + '.pdb')
	print('EXPERIMENTAL_SEQUENCE')
	print(experimental_sequence)

	AFS_file_example = ""
	if isdir('./'+bound+'/'):
	AFS_file_example = './'+bound+'/' + listdir('./'+bound+'/')[0]
	elif not(isdir('./'+bound+'/')) and isdir('./'+bound+'_alt/'):
	AFS_file_example = './'+bound+'_alt/' + listdir('./'+bound+'_alt/')[0]
	elif not(isdir('./'+bound+'/')) and not(isdir('./'+bound+'_alt/')):
	print("FOUND NO ENHANCED SAMPLING DIRECTORY, PLEASE FINISH POST PROCESSING PROCEDURE.")
	# raise
	AFS_trimmed_sequence = get_pdb_sequence(AFS_file_example)
	print('AFS_TRIMMED_SEQUENCE')
	print(AFS_trimmed_sequence)