Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
CSP_Rank/get_NMR_apo_pdb_files.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
192 lines (164 sloc)
6.08 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import os | |
from util import * | |
from Bio.PDB import PDBList, PDBParser, PPBuilder | |
def is_protein_monomer(pdb_id): | |
""" | |
Check if the given PDB ID corresponds to a protein monomer. | |
Parameters: | |
pdb_id (str): The 4-character PDB ID code. | |
Returns: | |
bool: True if the structure is a single protein chain, False otherwise. | |
""" | |
# Fetch PDB file | |
pdbl = PDBList() | |
pdb_file = pdbl.retrieve_pdb_file(pdb_id, pdir='.', file_format='pdb') | |
# Parse the structure | |
parser = PDBParser(QUIET=True) | |
structure = parser.get_structure(pdb_id, pdb_file) | |
# Extract polypeptide chains | |
ppb = PPBuilder() | |
polypeptides = [] | |
for model in structure: | |
for chain in model: | |
# Get polypeptide segments in the chain | |
pps = ppb.build_peptides(chain) | |
if pps: | |
polypeptides.append(chain) | |
os.remove(pdb_file) | |
# Check if we have exactly one polypeptide chain | |
if len(polypeptides) == 1: | |
return True | |
else: | |
return False | |
print("Reading data from CSPRANK.csv") | |
csv_file_path = '/home/tiburon/Desktop/ROT4/CSP_Rank/CSPRANK.csv' | |
data = pd.read_csv(csv_file_path) | |
NMR_apo_pdb_dir = './NMR_apo_pdb_files/' | |
print(f"Creating directory {NMR_apo_pdb_dir} if it does not exist") | |
if not os.path.exists(NMR_apo_pdb_dir): | |
os.makedirs(NMR_apo_pdb_dir) | |
manual_apo_bmrbs = [] | |
visited_bmb_ids = [] | |
print("Processing apo_bmrb IDs") | |
for bmrb_id in data['apo_bmrb']: | |
bmrb_id = str(bmrb_id) | |
if bmrb_id in visited_bmb_ids: | |
continue | |
if bmrb_id in manual_apo_bmrbs: | |
continue | |
visited_bmb_ids.append(bmrb_id) | |
pdb_id = get_pdb_from_bmrb(bmrb_id) | |
# raise | |
if pdb_id is None: | |
manual_apo_bmrbs.append(bmrb_id) | |
continue | |
if not is_protein_monomer(pdb_id): | |
print(f"Skipping PDB ID {pdb_id} for BMRB ID {bmrb_id} as it is not a protein monomer") | |
manual_apo_bmrbs.append(bmrb_id) | |
continue | |
print(f"Downloading PDB file for BMRB ID {bmrb_id} with PDB ID {pdb_id}") | |
get_pdb_file(pdb_id, output_dir=NMR_apo_pdb_dir) | |
old_file_path = f'{NMR_apo_pdb_dir}{pdb_id}.pdb' | |
new_file_path = f'{NMR_apo_pdb_dir}{bmrb_id}_{pdb_id}.pdb' | |
if os.path.exists(old_file_path): | |
os.rename(old_file_path, new_file_path) | |
print("Processing holo_pdb IDs") | |
for i, pdb_id in enumerate(data['holo_pdb']): | |
if str(data['apo_bmrb'][i]) not in manual_apo_bmrbs: | |
continue | |
bmrb_id = str(data['apo_bmrb'][i]) | |
print(f"Downloading PDB file for holo PDB ID {pdb_id}") | |
get_pdb_file(pdb_id, output_dir=NMR_apo_pdb_dir) | |
old_file_path = f'{NMR_apo_pdb_dir}{pdb_id}.pdb' | |
new_file_path = f'{NMR_apo_pdb_dir}{bmrb_id}_{pdb_id}.pdb' | |
if os.path.exists(old_file_path): | |
os.rename(old_file_path, new_file_path) | |
print("Listing files in NMR_apo_pdb_dir") | |
files = [f for f in os.listdir(NMR_apo_pdb_dir)] | |
NMR_apo_pdb_split_dir = './NMR_apo_pdb_split_files/' | |
print(f"Creating directory {NMR_apo_pdb_split_dir} if it does not exist") | |
if not os.path.exists(NMR_apo_pdb_split_dir): | |
os.makedirs(NMR_apo_pdb_split_dir) | |
new_files = [] | |
print("Expanding PDB files") | |
for file in files: | |
basename = os.path.basename(file) | |
stri = f"python3 expand.py {NMR_apo_pdb_dir}{file} {NMR_apo_pdb_split_dir}" | |
print(stri) | |
os.system(stri) | |
new_files = [NMR_apo_pdb_split_dir + f for f in os.listdir(NMR_apo_pdb_split_dir) if f.endswith('.pdb')] | |
def remove_chain_from_file(file, chain): | |
with open(file, 'r') as f: | |
lines = f.readlines() | |
stri = "" | |
for line in lines: | |
if line.startswith('ATOM') and line[21] != chain: | |
continue | |
stri += line | |
with open(file, 'w') as f: | |
f.write(stri) | |
print("Removing chains from new files") | |
for file in new_files: | |
longest_chain = get_longest_chain(file) | |
remove_chain_from_file(file, longest_chain) | |
output_dir = f'NMR_apo_shift_predictions/' | |
outdirect = 'NMR_apo_for_CSpred/' | |
print(f"Creating directory {outdirect} if it does not exist") | |
if not(exists(outdirect)): | |
os.system('mkdir ' + outdirect) | |
print("Making files for CSpred with single chain.") | |
for f in tqdm(new_files): | |
last_residue_id = 0 | |
current_residue_id = None | |
outs = "" | |
basename = f[f.rfind('/')+1:] | |
with open(f, 'r') as infi: | |
for line in infi: | |
if line.startswith('ATOM') or line.startswith('HETATM'): | |
residue_id = int(line[22:26].strip()) | |
if residue_id != current_residue_id: | |
current_residue_id = residue_id | |
last_residue_id += 1 | |
new_residue_id_str = str(last_residue_id).rjust(4, ' ') | |
updated_line = line[:21] + "A" + new_residue_id_str + line[26:] | |
outs += updated_line | |
elif not line.startswith('TER'): | |
outs += line | |
outfile = outdirect + basename | |
with open(outfile, 'w') as outf: | |
outf.write(outs) | |
def get_ph(pdb): | |
ph = -1 | |
with open('./db_apo_cond.txt', 'r') as inf: | |
for l in inf: | |
this_pdb = l.split(' ')[0].split('/')[-1].split('.')[0].split('_')[0] | |
if this_pdb == pdb: | |
ph = float(l.split(' ')[1].strip()) | |
if ph != -1: | |
return ph | |
if pdb == '1dmo': | |
return 7.5 | |
print(pdb + " not found in db_apo_cond.txt") | |
raise | |
print("Generating NMR_apo_cond.txt") | |
string = '' | |
for f in tqdm(new_files): | |
pdb = f[f.rfind('/')+1:].split('.')[0].split('_')[0].split('-')[0] | |
ph = get_ph(pdb.lower()) | |
basename = f[f.rfind('/')+1:] | |
string += outdirect + basename + ' ' + str(ph) + '\n' | |
print(string) | |
with open('NMR_apo_cond.txt', 'w') as fout: | |
fout.write(string) | |
print("CONTINUE IF YOU WANT TO CONVERT TO IUPAC") | |
if continue_prompt(): | |
strings = [] | |
for i, inf in tqdm(enumerate(listdir(outdirect))): | |
string = "" | |
string += 'load coo pdb ' + outdirect + inf + '\n' | |
string += 'to iupac\n' | |
string += 'write coo pdb ' + outdirect + inf + "\n" | |
with open('cmd.txt', 'w') as outf: | |
outf.write(string) | |
os.system('pdbstat -s < cmd.txt') |