get_NMR_apo_pdb_files.py

import pandas as pd
import os
from util import *

from Bio.PDB import PDBList, PDBParser, PPBuilder

def is_protein_monomer(pdb_id):
    """
    Check if the given PDB ID corresponds to a protein monomer.

    Parameters:
        pdb_id (str): The 4-character PDB ID code.

    Returns:
        bool: True if the structure is a single protein chain, False otherwise.
    """
    # Fetch PDB file
    pdbl = PDBList()
    pdb_file = pdbl.retrieve_pdb_file(pdb_id, pdir='.', file_format='pdb')

    # Parse the structure
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_id, pdb_file)

    # Extract polypeptide chains
    ppb = PPBuilder()
    polypeptides = []
    for model in structure:
        for chain in model:
            # Get polypeptide segments in the chain
            pps = ppb.build_peptides(chain)
            if pps:
                polypeptides.append(chain)
        os.remove(pdb_file)
        # Check if we have exactly one polypeptide chain
        if len(polypeptides) == 1:
            return True
        else:
            return False


print("Reading data from CSPRANK.csv")
csv_file_path = '/home/tiburon/Desktop/ROT4/CSP_Rank/CSPRANK.csv'
data = pd.read_csv(csv_file_path)

NMR_apo_pdb_dir = './NMR_apo_pdb_files/'
print(f"Creating directory {NMR_apo_pdb_dir} if it does not exist")
if not os.path.exists(NMR_apo_pdb_dir):
    os.makedirs(NMR_apo_pdb_dir)

manual_apo_bmrbs = []
visited_bmb_ids = []
print("Processing apo_bmrb IDs")
for bmrb_id in data['apo_bmrb']:
    bmrb_id = str(bmrb_id)
    if bmrb_id in visited_bmb_ids:
        continue
    if bmrb_id in manual_apo_bmrbs:
        continue
    visited_bmb_ids.append(bmrb_id)

    pdb_id = get_pdb_from_bmrb(bmrb_id)

    # raise
    if pdb_id is None:
        manual_apo_bmrbs.append(bmrb_id)
        continue

    if not is_protein_monomer(pdb_id):
        print(f"Skipping PDB ID {pdb_id} for BMRB ID {bmrb_id} as it is not a protein monomer")
        manual_apo_bmrbs.append(bmrb_id)
        continue

    print(f"Downloading PDB file for BMRB ID {bmrb_id} with PDB ID {pdb_id}")
    get_pdb_file(pdb_id, output_dir=NMR_apo_pdb_dir)
    old_file_path = f'{NMR_apo_pdb_dir}{pdb_id}.pdb'
    new_file_path = f'{NMR_apo_pdb_dir}{bmrb_id}_{pdb_id}.pdb'
    if os.path.exists(old_file_path):
        os.rename(old_file_path, new_file_path)

print("Processing holo_pdb IDs")
for i, pdb_id in enumerate(data['holo_pdb']):
    if str(data['apo_bmrb'][i]) not in manual_apo_bmrbs:
        continue
    bmrb_id = str(data['apo_bmrb'][i])
    print(f"Downloading PDB file for holo PDB ID {pdb_id}")
    get_pdb_file(pdb_id, output_dir=NMR_apo_pdb_dir)
    old_file_path = f'{NMR_apo_pdb_dir}{pdb_id}.pdb'
    new_file_path = f'{NMR_apo_pdb_dir}{bmrb_id}_{pdb_id}.pdb'
    if os.path.exists(old_file_path):
        os.rename(old_file_path, new_file_path)

print("Listing files in NMR_apo_pdb_dir")
files = [f for f in os.listdir(NMR_apo_pdb_dir)]
NMR_apo_pdb_split_dir = './NMR_apo_pdb_split_files/'
print(f"Creating directory {NMR_apo_pdb_split_dir} if it does not exist")
if not os.path.exists(NMR_apo_pdb_split_dir):
    os.makedirs(NMR_apo_pdb_split_dir)

new_files = []
print("Expanding PDB files")
for file in files:
    basename = os.path.basename(file)
    stri = f"python3 expand.py {NMR_apo_pdb_dir}{file} {NMR_apo_pdb_split_dir}"
    print(stri)
    os.system(stri)

new_files = [NMR_apo_pdb_split_dir + f for f in os.listdir(NMR_apo_pdb_split_dir) if f.endswith('.pdb')]

def remove_chain_from_file(file, chain):
    with open(file, 'r') as f:
        lines = f.readlines()

    stri = ""
    for line in lines:
        if line.startswith('ATOM') and line[21] != chain:
            continue
        stri += line

    with open(file, 'w') as f:
        f.write(stri)

print("Removing chains from new files")
for file in new_files:
    longest_chain = get_longest_chain(file)
    remove_chain_from_file(file, longest_chain)

output_dir = f'NMR_apo_shift_predictions/'
outdirect = 'NMR_apo_for_CSpred/'
print(f"Creating directory {outdirect} if it does not exist")
if not(exists(outdirect)):
    os.system('mkdir ' + outdirect)

print("Making files for CSpred with single chain.")
for f in tqdm(new_files):
    last_residue_id = 0
    current_residue_id = None
    outs = ""
    basename = f[f.rfind('/')+1:]
    with open(f, 'r') as infi:
        for line in infi:
            if line.startswith('ATOM') or line.startswith('HETATM'):
                residue_id = int(line[22:26].strip())
                if residue_id != current_residue_id:
                    current_residue_id = residue_id
                    last_residue_id += 1
                new_residue_id_str = str(last_residue_id).rjust(4, ' ')
                updated_line = line[:21] + "A" + new_residue_id_str + line[26:]
                outs += updated_line
            elif not line.startswith('TER'):
                outs += line

    outfile = outdirect + basename
    with open(outfile, 'w') as outf:
        outf.write(outs)

def get_ph(pdb):
    ph = -1
    with open('./db_apo_cond.txt', 'r') as inf:
        for l in inf:
            this_pdb = l.split(' ')[0].split('/')[-1].split('.')[0].split('_')[0]
            if this_pdb == pdb:
                ph = float(l.split(' ')[1].strip())
    if ph != -1:
        return ph
    if pdb == '1dmo':
        return 7.5
    print(pdb + " not found in db_apo_cond.txt")
    raise

print("Generating NMR_apo_cond.txt")
string = ''
for f in tqdm(new_files):
    pdb = f[f.rfind('/')+1:].split('.')[0].split('_')[0].split('-')[0]
    ph = get_ph(pdb.lower())
    basename = f[f.rfind('/')+1:]
    string += outdirect + basename + ' ' + str(ph) + '\n'
print(string)
with open('NMR_apo_cond.txt', 'w') as fout:
    fout.write(string)

print("CONTINUE IF YOU WANT TO CONVERT TO IUPAC")
if continue_prompt():
    strings = []
    for i, inf in tqdm(enumerate(listdir(outdirect))):
        string = ""
        string += 'load coo pdb ' + outdirect + inf + '\n'
        string += 'to iupac\n'
        string += 'write coo pdb ' + outdirect + inf + "\n"
        with open('cmd.txt', 'w') as outf:
            outf.write(string)
        os.system('pdbstat -s < cmd.txt')
	import pandas as pd
	import os
	from util import *

	from Bio.PDB import PDBList, PDBParser, PPBuilder

	def is_protein_monomer(pdb_id):
	"""
	Check if the given PDB ID corresponds to a protein monomer.

	Parameters:
	pdb_id (str): The 4-character PDB ID code.

	Returns:
	bool: True if the structure is a single protein chain, False otherwise.
	"""
	# Fetch PDB file
	pdbl = PDBList()
	pdb_file = pdbl.retrieve_pdb_file(pdb_id, pdir='.', file_format='pdb')

	# Parse the structure
	parser = PDBParser(QUIET=True)
	structure = parser.get_structure(pdb_id, pdb_file)

	# Extract polypeptide chains
	ppb = PPBuilder()
	polypeptides = []
	for model in structure:
	for chain in model:
	# Get polypeptide segments in the chain
	pps = ppb.build_peptides(chain)
	if pps:
	polypeptides.append(chain)
	os.remove(pdb_file)
	# Check if we have exactly one polypeptide chain
	if len(polypeptides) == 1:
	return True
	else:
	return False


	print("Reading data from CSPRANK.csv")
	csv_file_path = '/home/tiburon/Desktop/ROT4/CSP_Rank/CSPRANK.csv'
	data = pd.read_csv(csv_file_path)

	NMR_apo_pdb_dir = './NMR_apo_pdb_files/'
	print(f"Creating directory {NMR_apo_pdb_dir} if it does not exist")
	if not os.path.exists(NMR_apo_pdb_dir):
	os.makedirs(NMR_apo_pdb_dir)

	manual_apo_bmrbs = []
	visited_bmb_ids = []
	print("Processing apo_bmrb IDs")
	for bmrb_id in data['apo_bmrb']:
	bmrb_id = str(bmrb_id)
	if bmrb_id in visited_bmb_ids:
	continue
	if bmrb_id in manual_apo_bmrbs:
	continue
	visited_bmb_ids.append(bmrb_id)

	pdb_id = get_pdb_from_bmrb(bmrb_id)

	# raise
	if pdb_id is None:
	manual_apo_bmrbs.append(bmrb_id)
	continue

	if not is_protein_monomer(pdb_id):
	print(f"Skipping PDB ID {pdb_id} for BMRB ID {bmrb_id} as it is not a protein monomer")
	manual_apo_bmrbs.append(bmrb_id)
	continue

	print(f"Downloading PDB file for BMRB ID {bmrb_id} with PDB ID {pdb_id}")
	get_pdb_file(pdb_id, output_dir=NMR_apo_pdb_dir)
	old_file_path = f'{NMR_apo_pdb_dir}{pdb_id}.pdb'
	new_file_path = f'{NMR_apo_pdb_dir}{bmrb_id}_{pdb_id}.pdb'
	if os.path.exists(old_file_path):
	os.rename(old_file_path, new_file_path)

	print("Processing holo_pdb IDs")
	for i, pdb_id in enumerate(data['holo_pdb']):
	if str(data['apo_bmrb'][i]) not in manual_apo_bmrbs:
	continue
	bmrb_id = str(data['apo_bmrb'][i])
	print(f"Downloading PDB file for holo PDB ID {pdb_id}")
	get_pdb_file(pdb_id, output_dir=NMR_apo_pdb_dir)
	old_file_path = f'{NMR_apo_pdb_dir}{pdb_id}.pdb'
	new_file_path = f'{NMR_apo_pdb_dir}{bmrb_id}_{pdb_id}.pdb'
	if os.path.exists(old_file_path):
	os.rename(old_file_path, new_file_path)

	print("Listing files in NMR_apo_pdb_dir")
	files = [f for f in os.listdir(NMR_apo_pdb_dir)]
	NMR_apo_pdb_split_dir = './NMR_apo_pdb_split_files/'
	print(f"Creating directory {NMR_apo_pdb_split_dir} if it does not exist")
	if not os.path.exists(NMR_apo_pdb_split_dir):
	os.makedirs(NMR_apo_pdb_split_dir)

	new_files = []
	print("Expanding PDB files")
	for file in files:
	basename = os.path.basename(file)
	stri = f"python3 expand.py {NMR_apo_pdb_dir}{file} {NMR_apo_pdb_split_dir}"
	print(stri)
	os.system(stri)

	new_files = [NMR_apo_pdb_split_dir + f for f in os.listdir(NMR_apo_pdb_split_dir) if f.endswith('.pdb')]

	def remove_chain_from_file(file, chain):
	with open(file, 'r') as f:
	lines = f.readlines()

	stri = ""
	for line in lines:
	if line.startswith('ATOM') and line[21] != chain:
	continue
	stri += line

	with open(file, 'w') as f:
	f.write(stri)

	print("Removing chains from new files")
	for file in new_files:
	longest_chain = get_longest_chain(file)
	remove_chain_from_file(file, longest_chain)

	output_dir = f'NMR_apo_shift_predictions/'
	outdirect = 'NMR_apo_for_CSpred/'
	print(f"Creating directory {outdirect} if it does not exist")
	if not(exists(outdirect)):
	os.system('mkdir ' + outdirect)

	print("Making files for CSpred with single chain.")
	for f in tqdm(new_files):
	last_residue_id = 0
	current_residue_id = None
	outs = ""
	basename = f[f.rfind('/')+1:]
	with open(f, 'r') as infi:
	for line in infi:
	if line.startswith('ATOM') or line.startswith('HETATM'):
	residue_id = int(line[22:26].strip())
	if residue_id != current_residue_id:
	current_residue_id = residue_id
	last_residue_id += 1
	new_residue_id_str = str(last_residue_id).rjust(4, ' ')
	updated_line = line[:21] + "A" + new_residue_id_str + line[26:]
	outs += updated_line
	elif not line.startswith('TER'):
	outs += line

	outfile = outdirect + basename
	with open(outfile, 'w') as outf:
	outf.write(outs)

	def get_ph(pdb):
	ph = -1
	with open('./db_apo_cond.txt', 'r') as inf:
	for l in inf:
	this_pdb = l.split(' ')[0].split('/')[-1].split('.')[0].split('_')[0]
	if this_pdb == pdb:
	ph = float(l.split(' ')[1].strip())
	if ph != -1:
	return ph
	if pdb == '1dmo':
	return 7.5
	print(pdb + " not found in db_apo_cond.txt")
	raise

	print("Generating NMR_apo_cond.txt")
	string = ''
	for f in tqdm(new_files):
	pdb = f[f.rfind('/')+1:].split('.')[0].split('_')[0].split('-')[0]
	ph = get_ph(pdb.lower())
	basename = f[f.rfind('/')+1:]
	string += outdirect + basename + ' ' + str(ph) + '\n'
	print(string)
	with open('NMR_apo_cond.txt', 'w') as fout:
	fout.write(string)

	print("CONTINUE IF YOU WANT TO CONVERT TO IUPAC")
	if continue_prompt():
	strings = []
	for i, inf in tqdm(enumerate(listdir(outdirect))):
	string = ""
	string += 'load coo pdb ' + outdirect + inf + '\n'
	string += 'to iupac\n'
	string += 'write coo pdb ' + outdirect + inf + "\n"
	with open('cmd.txt', 'w') as outf:
	outf.write(string)
	os.system('pdbstat -s < cmd.txt')