# prep_for_CSpred.py

import os
from os import listdir
from tqdm import tqdm
import sys
from os.path import exists, isdir
from paths import *
from util import *

# ---------------------------------------------------------------------------
# Stage 1: collect the candidate PDB models for one PDB ID and rewrite each
# of them as a single chain "A" with residues renumbered consecutively from 1.
# ---------------------------------------------------------------------------
if len(sys.argv) < 2:
    print('usage: prep_for_CSpred.py <pdb_id>')
    # A bare `raise` here only produced "RuntimeError: No active exception to
    # reraise"; exit with a non-zero status instead of a confusing traceback.
    sys.exit(1)

pdb = sys.argv[1]

# 9/1/24 remove _aligned suffix, possibility of running CS predictions using models with missing hydrogens
directs = [PDB_FILES + pdb.upper() + '/', PDB_FILES + pdb.upper() + '_alt/']
files = []
for direct in directs:
    # Either model directory may be absent; every file found is taken as-is.
    if isdir(direct):
        files.extend(direct + f for f in listdir(direct))

# Experimental structures: a per-entry directory of .pdb files and/or a single
# exp_<id>.pdb file. The directory is checked with isdir() so that its absence
# no longer crashes the script before the single-file fallback is tried.
experimental_direct = experimental_structures + pdb.lower() + '/'
if isdir(experimental_direct):
    files.extend(
        experimental_direct + f
        for f in listdir(experimental_direct)
        if f.endswith('.pdb')
    )
single_experimental = experimental_structures + 'exp_' + pdb.lower() + '.pdb'
if exists(single_experimental):
    files.append(single_experimental)

# Computational structures: any .pdb file whose name contains "<id>.pdb".
computational_direct = computational_structures
pdb_suffix = pdb.lower() + '.pdb'
files.extend(
    computational_direct + f
    for f in listdir(computational_direct)
    if pdb_suffix in f and f.endswith('.pdb')
)

# Directory where the CS prediction outputs are stored
output_dir = f'{CS_Predictions}{pdb.lower()}_AFS_shift_predictions/'

# If predictions already exist, drop inputs whose basename is already there.
if os.path.isdir(output_dir):
    output_files = os.listdir(output_dir)
    output_basenames = set(output_files)

    # IT IS POSSIBLE TO FILTER WHICH FILES GO INTO THE DIRECTORY BOUND FOR NMRBOX. YOU CAN DO SO HERE::
    # NOTE(review): `and` binds tighter than `or`, so only paths containing
    # 'dropout' are skipped when already predicted; any path containing 'exp'
    # or 'comp' (matched against the full path, directories included) is
    # always kept. The parentheses below make that existing behaviour
    # explicit -- confirm it is what was intended.
    files = [
        file for file in files
        if (os.path.basename(file) not in output_basenames and 'dropout' in file)
        or 'exp' in file
        or 'comp' in file
    ]

print("number of files = " + str(len(files)) + ".")
if not continue_prompt():
    # User declined: stop with a non-zero status (previously a bare `raise`).
    sys.exit(1)

outdirect = PDB_FILES + pdb.lower() + '_for_CSpred/'
outdirect_for_cond_file = pdb.lower() + '_for_CSpred/'
# os.makedirs avoids passing an argv-derived path through a shell command.
os.makedirs(outdirect, exist_ok=True)

print("Making files for CSpred with single chain.")
for f in tqdm(files):
    last_residue_id = 0          # running, gap-free residue number
    current_residue_key = None   # (chain, residue number) of the previous atom
    out_lines = []
    basename = os.path.basename(f)
    with open(f, 'r') as infi:
        for line in infi:
            if line.startswith(('ATOM', 'HETATM')):
                # Column 22 is the chain ID and columns 23-26 the residue
                # number. The chain is part of the key so that two adjacent
                # residues from different chains that happen to share a
                # number are not merged into one.
                # NOTE(review): the insertion code (column 27) is not part of
                # the key and is copied through unchanged, and the counter is
                # not reset at MODEL/ENDMDL records -- confirm inputs are
                # single-model without insertion codes.
                residue_key = (line[21:22], int(line[22:26]))
                if residue_key != current_residue_key:
                    current_residue_key = residue_key
                    last_residue_id += 1
                # Force chain "A" and the new residue number; keep the rest.
                out_lines.append(
                    line[:21] + "A" + str(last_residue_id).rjust(4) + line[26:]
                )
            elif not line.startswith('TER'):
                # TER records are dropped so the output reads as one chain.
                out_lines.append(line)

    # IT IS POSSIBLE TO FILTER WHICH FILES GO INTO THE DIRECTORY BOUND FOR NMRBOX. YOU CAN DO SO HERE::
    outfile = outdirect + basename
    with open(outfile, 'w') as outf:
        outf.writelines(out_lines)

def get_ph(pdb, cond_file='./db_holo_cond.txt'):
    """Return the pH recorded for ``pdb`` in a conditions file.

    Each usable line holds a structure path followed by a pH value, separated
    by whitespace (for example ``some_dir/1abc.pdb 6.5``). The path is reduced
    to its file name without extension and compared with ``pdb``. Lines with
    fewer than two fields are ignored. If several lines match, the last one
    wins.

    Args:
        pdb: Identifier to look up; the comparison is case-sensitive.
        cond_file: Path of the conditions file. Defaults to
            ``./db_holo_cond.txt`` in the current working directory.

    Returns:
        The pH of the last matching line, as a float.

    Raises:
        RuntimeError: If no line matches ``pdb``.
        ValueError: If the pH field of a matching line is not a number.
        OSError: If ``cond_file`` cannot be opened.
    """
    ph = None  # None rather than -1, so no numeric value doubles as "missing"
    with open(cond_file, 'r') as inf:
        for l in inf:
            # split() tolerates tabs and repeated spaces; split(' ') produced
            # empty fields for them and then failed in float('').
            fields = l.split()
            if len(fields) < 2:
                continue
            # splitext() leaves a name without an extension intact instead of
            # chopping off its last character.
            this_pdb = os.path.splitext(os.path.basename(fields[0]))[0]
            if this_pdb == pdb:
                ph = float(fields[1])
    if ph is None:
        # Same exception type the former bare `raise` produced, now with an
        # actionable message.
        raise RuntimeError(f'no pH entry for {pdb!r} in {cond_file}')
    return ph

# ---------------------------------------------------------------------------
# Stage 2: write the "<file> <pH>" condition list for the prepared models and
# optionally convert every prepared model to IUPAC naming with pdbstat.
# ---------------------------------------------------------------------------
ph = get_ph(pdb.lower())

# One line per file currently in the output directory, written with the
# directory path relative to PDB_FILES (outdirect_for_cond_file).
string = ''.join(
    outdirect_for_cond_file + f + ' ' + str(ph) + '\n'
    for f in tqdm(listdir(outdirect))
)

with open(pdb + '_cond.txt', 'w') as fout:
    fout.write(string)

print("CONTINUE IF YOU WANT TO CONVERT TO IUPAC")
if continue_prompt():
    # Iterating the list directly (no enumerate) lets tqdm see the total and
    # show real progress; the index was never used.
    for inf in tqdm(listdir(outdirect)):
        target = outdirect + inf
        # pdbstat reads its commands from stdin: load the file, convert the
        # nomenclature, and overwrite the same file in place.
        commands = (
            'load coo pdb ' + target + '\n'
            'to iupac\n'
            'write coo pdb ' + target + '\n'
        )
        with open('cmd.txt', 'w') as outf:
            outf.write(commands)
        os.system('pdbstat -s < cmd.txt')
	# NOTE: a second copy of this entire script used to follow here, pasted with
	# every line flattened to a single tab of indentation. It could not be parsed
	# (tabs mixed with the space indentation above, and no indented block bodies)
	# and repeated the code above verbatim, so it has been removed.