get_NMR_holo_pdb_files.py

import pandas as pd
import os
from util import *
from Bio.PDB import PDBList, PDBParser, PPBuilder

print("Reading data from CSPRANK.csv")
csv_file_path = '/home/tiburon/Desktop/ROT4/CSP_Rank/CSPRANK.csv'
data = pd.read_csv(csv_file_path)

NMR_holo_structure_dir = '/home/tiburon/Desktop/ROT4/CSP_Rank/NMR_holo_structure_files/'
print(f"Creating directory {NMR_holo_structure_dir} if it does not exist")
if not(exists(NMR_holo_structure_dir)):
    os.system('mkdir ' + NMR_holo_structure_dir)

print("Processing holo_pdb IDs")
for i, pdb_id in enumerate(data['holo_pdb']):
    print(f"Downloading PDB file for holo PDB ID {pdb_id}")
    get_pdb_file(pdb_id, output_dir=NMR_holo_structure_dir)
    old_file_path = f'{NMR_holo_structure_dir}{pdb_id}.pdb'

print("Listing files in NMR_holo_structure_dir")
files = [f for f in os.listdir(NMR_holo_structure_dir)]
NMR_holo_pdb_split_dir = './NMR_holo_pdb_split_files/'
print(f"Creating directory {NMR_holo_pdb_split_dir} if it does not exist")
if not os.path.exists(NMR_holo_pdb_split_dir):
    os.makedirs(NMR_holo_pdb_split_dir)

new_files = []
print("Expanding PDB files")
for file in files:
    basename = os.path.basename(file)
    stri = f"python3 expand.py {NMR_holo_structure_dir}{file} {NMR_holo_pdb_split_dir}"
    print(stri)
    os.system(stri)

new_files = [NMR_holo_pdb_split_dir + f for f in os.listdir(NMR_holo_pdb_split_dir) if f.endswith('.pdb')]

outdirect = NMR_holo_pdb_split_dir
print(f"Creating directory {outdirect} if it does not exist")
if not(exists(outdirect)):
    os.system('mkdir ' + outdirect)

def renumber_pdb_residues(input_pdb, output_pdb):
    """
    Renumber residues for all chains in a PDB file so that each chain starts at residue 1
    and increments sequentially.

    Parameters
    ----------
    input_pdb : str
        Path to the input PDB file.
    output_pdb : str
        Path to the output PDB file.
    """

    # Data structures to keep track of renumbering
    current_chain = None
    current_res_label = None  # A tuple (chain, old_resnum, iCode)
    chain_res_map = {}        # Maps (chain, old_resnum, iCode) to new resnum
    chain_current_res_count = {}

    with open(input_pdb, 'r') as infile, open(output_pdb, 'w') as outfile:
        for line in infile:
            record_type = line[0:6].strip()

            # We will focus on ATOM/HETATM lines for renumbering
            if record_type in ('ATOM', 'HETATM'):
                chain_id = line[21].strip()  # Chain identifier
                old_resnum = line[22:26].strip()  # Original residue number
                iCode = line[26].strip()  # Insertion code if present

                # Check if chain changed or first time seeing this chain
                if chain_id not in chain_current_res_count:
                    chain_current_res_count[chain_id] = 0

                # Create a residue label tuple
                res_label = (chain_id, old_resnum, iCode)

                # If we encounter a new residue within the chain, increment the count
                if res_label not in chain_res_map:
                    chain_current_res_count[chain_id] += 1
                    chain_res_map[res_label] = chain_current_res_count[chain_id]

                # Get the new residue number
                new_resnum = chain_res_map[res_label]

                # Construct the new line with updated residue number
                # Residue number occupies columns 23–26 in PDB format (1-based indexing)
                # We must ensure correct formatting: right-justified in a width of 4
                new_line = line[:22] + f"{new_resnum:>4}" + line[26:]
                outfile.write(new_line)

            else:
                # Non-ATOM/HETATM lines are written unchanged
                outfile.write(line)


for file in tqdm(new_files):
    new_file = file[:file.rfind('.')] + '_renumbered.pdb'
    renumber_pdb_residues(file, new_file)
    os.system('mv ' + new_file +  ' ' + file)

print("CONTINUE IF YOU WANT TO CONVERT TO IUPAC")
if continue_prompt():
    strings = []
    for i, inf in tqdm(enumerate(listdir(outdirect))):
        string = ""
        string += 'load coo pdb ' + outdirect + inf + '\n'
        string += 'to iupac\n'
        string += 'write coo pdb ' + outdirect + inf + "\n"
        with open('cmd.txt', 'w') as outf:
            outf.write(string)
        os.system('pdbstat -s < cmd.txt')
	import pandas as pd
	import os
	from util import *
	from Bio.PDB import PDBList, PDBParser, PPBuilder

	print("Reading data from CSPRANK.csv")
	csv_file_path = '/home/tiburon/Desktop/ROT4/CSP_Rank/CSPRANK.csv'
	data = pd.read_csv(csv_file_path)

	NMR_holo_structure_dir = '/home/tiburon/Desktop/ROT4/CSP_Rank/NMR_holo_structure_files/'
	print(f"Creating directory {NMR_holo_structure_dir} if it does not exist")
	if not(exists(NMR_holo_structure_dir)):
	os.system('mkdir ' + NMR_holo_structure_dir)

	print("Processing holo_pdb IDs")
	for i, pdb_id in enumerate(data['holo_pdb']):
	print(f"Downloading PDB file for holo PDB ID {pdb_id}")
	get_pdb_file(pdb_id, output_dir=NMR_holo_structure_dir)
	old_file_path = f'{NMR_holo_structure_dir}{pdb_id}.pdb'

	print("Listing files in NMR_holo_structure_dir")
	files = [f for f in os.listdir(NMR_holo_structure_dir)]
	NMR_holo_pdb_split_dir = './NMR_holo_pdb_split_files/'
	print(f"Creating directory {NMR_holo_pdb_split_dir} if it does not exist")
	if not os.path.exists(NMR_holo_pdb_split_dir):
	os.makedirs(NMR_holo_pdb_split_dir)

	new_files = []
	print("Expanding PDB files")
	for file in files:
	basename = os.path.basename(file)
	stri = f"python3 expand.py {NMR_holo_structure_dir}{file} {NMR_holo_pdb_split_dir}"
	print(stri)
	os.system(stri)

	new_files = [NMR_holo_pdb_split_dir + f for f in os.listdir(NMR_holo_pdb_split_dir) if f.endswith('.pdb')]

	outdirect = NMR_holo_pdb_split_dir
	print(f"Creating directory {outdirect} if it does not exist")
	if not(exists(outdirect)):
	os.system('mkdir ' + outdirect)

	def renumber_pdb_residues(input_pdb, output_pdb):
	"""
	Renumber residues for all chains in a PDB file so that each chain starts at residue 1
	and increments sequentially.

	Parameters
	----------
	input_pdb : str
	Path to the input PDB file.
	output_pdb : str
	Path to the output PDB file.
	"""

	# Data structures to keep track of renumbering
	current_chain = None
	current_res_label = None # A tuple (chain, old_resnum, iCode)
	chain_res_map = {} # Maps (chain, old_resnum, iCode) to new resnum
	chain_current_res_count = {}

	with open(input_pdb, 'r') as infile, open(output_pdb, 'w') as outfile:
	for line in infile:
	record_type = line[0:6].strip()

	# We will focus on ATOM/HETATM lines for renumbering
	if record_type in ('ATOM', 'HETATM'):
	chain_id = line[21].strip() # Chain identifier
	old_resnum = line[22:26].strip() # Original residue number
	iCode = line[26].strip() # Insertion code if present

	# Check if chain changed or first time seeing this chain
	if chain_id not in chain_current_res_count:
	chain_current_res_count[chain_id] = 0

	# Create a residue label tuple
	res_label = (chain_id, old_resnum, iCode)

	# If we encounter a new residue within the chain, increment the count
	if res_label not in chain_res_map:
	chain_current_res_count[chain_id] += 1
	chain_res_map[res_label] = chain_current_res_count[chain_id]

	# Get the new residue number
	new_resnum = chain_res_map[res_label]

	# Construct the new line with updated residue number
	# Residue number occupies columns 23–26 in PDB format (1-based indexing)
	# We must ensure correct formatting: right-justified in a width of 4
	new_line = line[:22] + f"{new_resnum:>4}" + line[26:]
	outfile.write(new_line)

	else:
	# Non-ATOM/HETATM lines are written unchanged
	outfile.write(line)



	for file in tqdm(new_files):
	new_file = file[:file.rfind('.')] + '_renumbered.pdb'
	renumber_pdb_residues(file, new_file)
	os.system('mv ' + new_file + ' ' + file)

	print("CONTINUE IF YOU WANT TO CONVERT TO IUPAC")
	if continue_prompt():
	strings = []
	for i, inf in tqdm(enumerate(listdir(outdirect))):
	string = ""
	string += 'load coo pdb ' + outdirect + inf + '\n'
	string += 'to iupac\n'
	string += 'write coo pdb ' + outdirect + inf + "\n"
	with open('cmd.txt', 'w') as outf:
	outf.write(string)
	os.system('pdbstat -s < cmd.txt')