import os
from typing import List, Optional, Tuple, Dict
from Sequence import Sequence
from paths import *
import numpy as np
from Bio import PDB as BioPDB
import subprocess
import re
import pandas as pd
from util import compute_DockQ_score, compute_structure_similarity
class PDB:
    """Class for handling PDB files and their associated sequences.

    Instance fields:
      pdb_id   -- lower-cased PDB identifier, or None when unknown
      pdb_path -- path to the structure file, or None when unresolved
      sequence -- result of ``Sequence.from_pdb(pdb_path)``, or None when no
                  path is available
    """

    def __init__(self, pdb_id: Optional[str] = None, pdb_path: Optional[str] = None):
        """Initialize PDB object from either PDB ID or path.

        Args:
            pdb_id: PDB identifier (e.g., '2jw1'); stored lower-cased.
            pdb_path: Direct path to PDB file. When omitted and ``pdb_id`` is
                given, the file is looked up via ``_find_pdb_file``.

        Raises:
            FileNotFoundError: only ``pdb_id`` was given and no file exists
                for it in the standard locations.
        """
        self.pdb_id = pdb_id.lower() if pdb_id else None
        self.pdb_path = pdb_path
        self.sequence = None

        if pdb_id and not pdb_path:
            self.pdb_path = self._find_pdb_file()

        if self.pdb_path:
            self.sequence = Sequence.from_pdb(self.pdb_path)

    def _find_pdb_file(self) -> str:
        """Find PDB file in standard locations based on PDB ID.

        Returns:
            The first candidate path that exists on disk.

        Raises:
            ValueError: no PDB ID is set on this object.
            FileNotFoundError: none of the candidate paths exists.
        """
        if not self.pdb_id:
            raise ValueError("No PDB ID provided")

        # Standard locations to check, in priority order
        possible_paths = [
            # Experimental structures
            os.path.join(experimental_structures, f'exp_{self.pdb_id}.pdb'),
            os.path.join(experimental_structures, self.pdb_id, f'exp_{self.pdb_id}.pdb'),
            # Computational structures
            os.path.join(computational_structures, f'comp_{self.pdb_id}.pdb'),
            # General PDB files
            os.path.join(PDB_FILES, f'{self.pdb_id}.pdb'),
        ]

        # Return first existing file
        for path in possible_paths:
            if os.path.exists(path):
                return path

        raise FileNotFoundError(f"No PDB file found for {self.pdb_id}")

    @classmethod
    def from_id(cls, pdb_id: str) -> 'PDB':
        """Create PDB object from PDB ID."""
        return cls(pdb_id=pdb_id)

    @classmethod
    def from_path(cls, pdb_path: str) -> 'PDB':
        """Create PDB object from file path.

        The PDB ID is guessed from the file name when it contains a
        four-character token directly followed by '.pdb' or '.ent'
        (optionally prefixed by 'pdb', 'exp_' or 'comp_'); otherwise the
        object is created without an ID.

        Raises:
            FileNotFoundError: ``pdb_path`` does not exist.
        """
        if not os.path.exists(pdb_path):
            raise FileNotFoundError(f"PDB file not found: {pdb_path}")

        # Try to extract PDB ID from filename
        filename = os.path.basename(pdb_path)
        # Look for patterns like 'pdb1abc.ent', '1abc.pdb', 'exp_1abc.pdb', etc
        pdb_id_match = re.search(r'(?:pdb|exp_|comp_)?([0-9a-zA-Z]{4})(?:\.pdb|\.ent)', filename)
        pdb_id = pdb_id_match.group(1) if pdb_id_match else None

        return cls(pdb_path=pdb_path, pdb_id=pdb_id)

    def save(self, output_path: Optional[str] = None) -> str:
        """Save PDB file to specified path or default location.

        Args:
            output_path: Destination file. Defaults to
                ``<PDB_FILES>/<pdb_id>.pdb`` when omitted.

        Returns:
            Path where file was saved.

        Raises:
            ValueError: neither ``output_path`` nor a PDB ID is available, or
                this object has no source file to copy from.
        """
        if not output_path and not self.pdb_id:
            raise ValueError("Must provide output_path or initialize with pdb_id")
        if not self.pdb_path:
            raise ValueError("No PDB file loaded")

        if not output_path:
            # Create default path in PDB_FILES directory
            output_path = os.path.join(PDB_FILES, f'{self.pdb_id}.pdb')

        # Ensure directory exists; a bare filename has no directory part and
        # os.makedirs('') would raise.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Copy file to new location. Compare normalized paths: opening the
        # source itself for writing would truncate it before it is read.
        if os.path.abspath(self.pdb_path) != os.path.abspath(output_path):
            with open(self.pdb_path, 'r') as src, open(output_path, 'w') as dst:
                dst.write(src.read())

        return output_path

    def get_chain_ids(self) -> List[str]:
        """Get list of chain IDs in the PDB.

        Raises:
            ValueError: no sequence is loaded.
        """
        if not self.sequence:
            raise ValueError("No sequence loaded")
        return [seq.chain_id for seq in self.sequence.subsequences]

    def get_protein_chains(self) -> List[str]:
        """Get list of protein chain IDs.

        Raises:
            ValueError: no sequence is loaded.
        """
        if not self.sequence:
            raise ValueError("No sequence loaded")
        return [seq.chain_id for seq in self.sequence.protein_sequences]

    def get_peptide_chains(self) -> List[str]:
        """Get list of peptide chain IDs.

        Raises:
            ValueError: no sequence is loaded.
        """
        if not self.sequence:
            raise ValueError("No sequence loaded")
        return [seq.chain_id for seq in self.sequence.peptide_sequences]

    def __str__(self) -> str:
        """String representation showing PDB ID/path and sequences."""
        parts = []
        if self.pdb_id:
            parts.append(f"PDB ID: {self.pdb_id}")
        parts.append(f"Path: {self.pdb_path}")
        if self.sequence:
            parts.append(str(self.sequence))
        return "\n".join(parts)

    def _parse_ph(self, raw) -> float:
        """Convert a raw table/text value to a pH float, rejecting NaN."""
        try:
            ph = float(raw)
        except (TypeError, ValueError) as err:
            raise ValueError(f"Invalid pH value for {self.pdb_id}") from err
        # float('nan') and empty CSV cells parse fine but are not usable.
        if pd.isna(ph):
            raise ValueError(f"Invalid pH value for {self.pdb_id}")
        return ph

    def get_ph(self) -> float:
        """Get pH value for this PDB ID.

        Reads ``./db_holo_cond.txt`` (lines of ``<path> <ph>``) when it
        exists. Otherwise falls back to ``./csp_stats_consensus.csv``, matching
        the ID against the ``holo_pdb`` column first (value from ``holo ph``)
        and then the ``apo_pdb`` column (value from ``apo ph``).

        Raises:
            FileNotFoundError: neither data file exists.
            ValueError: no PDB ID is set, no entry matches it, or the stored
                value is not a valid number.
        """
        if not self.pdb_id:
            raise ValueError("No PDB ID provided")
        wanted = self.pdb_id.lower()

        if not os.path.exists('./db_holo_cond.txt'):
            # Try to get pH from csp_stats_consensus.csv
            if not os.path.exists('./csp_stats_consensus.csv'):
                raise FileNotFoundError("Could not find csp_stats_consensus.csv")

            df = pd.read_csv('./csp_stats_consensus.csv')
            # First try to match holo_pdb
            match = df[df['holo_pdb'].str.lower() == wanted]
            column = 'holo ph'
            if len(match) == 0:
                # If no holo match, try apo_pdb
                match = df[df['apo_pdb'].str.lower() == wanted]
                column = 'apo ph'
            if len(match) == 0:
                raise ValueError(f"Could not find pH value for {self.pdb_id}")
            return self._parse_ph(match[column].iloc[0])

        with open('./db_holo_cond.txt', 'r') as inf:
            for line in inf:
                fields = line.split(' ')
                this_pdb = fields[0]
                # Strip directory and extension from the listed path
                this_pdb = this_pdb[this_pdb.rfind('/') + 1:this_pdb.rfind('.')]
                if this_pdb.lower() == wanted:
                    if len(fields) < 2:
                        raise ValueError(f"Invalid pH value for {self.pdb_id}")
                    return self._parse_ph(fields[1].strip())

        raise ValueError(f"Could not find pH value for {self.pdb_id}")

    def prep_for_CSpred(self, out_directory: str, ph: Optional[float] = None) -> str:
        """Prepare PDB file for CSpred.

        Converts all chains to chain 'A' and renumbers residues continuously,
        leaving a numbering gap wherever the input changes chain or its
        residue numbers are not consecutive. 'TER' records are dropped; all
        other non-atom lines are copied unchanged.

        Args:
            out_directory: Directory to save modified PDB file.
            ph: pH to report. When omitted, it is looked up with ``get_ph``;
                if that lookup fails a warning is printed and '7.0' is used.

        Returns:
            Condition string for CSpred in format: "path/to/file.pdb ph_value"

        Raises:
            ValueError: this object has no PDB file.
        """
        if not self.pdb_path:
            raise ValueError("No PDB file loaded")

        # Ensure output directory exists
        os.makedirs(out_directory, exist_ok=True)

        # Get basename of original file
        basename = os.path.basename(self.pdb_path)
        output_path = os.path.join(out_directory, basename)

        # Process PDB file
        new_number = 0
        # (chain id, residue number, insertion code) of the residue currently
        # being written. Keying on all three keeps two chains that reuse the
        # same residue number from being merged into one residue.
        prev_key = None
        output_lines = []

        with open(self.pdb_path, 'r') as infile:
            for line in infile:
                if line.startswith(('ATOM', 'HETATM')):
                    chain_id = line[21:22]
                    residue_id = int(line[22:26].strip())
                    key = (chain_id, residue_id, line[26:27])

                    # Update residue numbering when we see a new residue
                    if key != prev_key:
                        if prev_key is not None:
                            step = residue_id - prev_key[1]
                            # step 0 = same number, different insertion code
                            if chain_id != prev_key[0] or step not in (0, 1):
                                # Add extra increment for chain break
                                new_number += 2
                        new_number += 1
                        prev_key = key

                    # Reconstruct line with chain A and new residue number
                    output_lines.append(
                        line[:21] + "A" + str(new_number).rjust(4) + line[26:]
                    )
                elif not line.startswith('TER'):
                    output_lines.append(line)

        # Write modified file
        with open(output_path, 'w') as outfile:
            outfile.writelines(output_lines)

        # Get relative path for condition string
        relative_outdir = os.path.basename(out_directory.rstrip('/')) + '/'

        # If pH not provided, try to get it from the condition tables. The
        # lookup is best-effort: missing data must not abort the export.
        if ph is None and self.pdb_id:
            try:
                ph = self.get_ph()
            except (ValueError, FileNotFoundError) as e:
                print(f"Warning: {e}")

        condition_string = f"{relative_outdir}{basename} {ph if ph is not None else '7.0'}"
        return condition_string.strip()

    def calc_dockq(self, other: 'PDB') -> Tuple[float, float, float, float, float, float, int]:
        """Calculate DockQ score using this structure as reference and other as target.

        Args:
            other: PDB object to compare against.

        Returns:
            Tuple of (iRMS, LRMS, DockQ, Fnat, Fnonnat, F1, clashes) scores where:
            - iRMS: Interface Root Mean Square Deviation
            - LRMS: Ligand Root Mean Square Deviation
            - DockQ: Overall DockQ score
            - Fnat: Fraction of native contacts
            - Fnonnat: Fraction of non-native contacts
            - F1: F1 score
            - clashes: Number of clashes

        Raises:
            ValueError: either object has no PDB file.
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")

        # The helper is called with the target path first, reference second.
        iRMS, LRMS, DockQ, Fnat, Fnonnat, F1, clashes = compute_DockQ_score(other.pdb_path, self.pdb_path)
        return iRMS, LRMS, DockQ, Fnat, Fnonnat, F1, clashes

    def calc_tm(self, other: 'PDB', multimer: bool = True) -> float:
        """Calculate TM-score using this structure as reference and other as target.

        Args:
            other: PDB object to compare against.
            multimer: Whether to use multimer mode for TM-score calculation.

        Returns:
            TM-score between the structures.

        Raises:
            ValueError: either object has no PDB file.
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")

        return compute_structure_similarity(self.pdb_path, other.pdb_path, multimer=multimer)

    @staticmethod
    def _ca_coordinates(structure) -> Dict[Tuple[str, int], object]:
        """Map (chain_id, residue_number) to the C-alpha vector of each residue."""
        coords = {}
        # get_chains() walks every model, so for multi-model files a later
        # model's residue replaces an earlier one with the same key.
        for chain in structure.get_chains():
            chain_id = chain.get_id()
            for residue in chain.get_residues():
                if residue.has_id('CA'):
                    coords[(chain_id, residue.get_id()[1])] = residue['CA'].get_vector()
        return coords

    def calc_gdt_ts(self, other: 'PDB', cutoffs: Optional[List[float]] = None) -> float:
        """Calculate GDT_TS score using this structure as reference and other as target.

        Residues are paired by (chain id, residue number) and C-alpha
        distances are measured on the coordinates as stored in the files; no
        superposition is performed here.

        Args:
            other: PDB object to compare against.
            cutoffs: Distance cutoffs in angstroms for GDT calculation.
                Defaults to [1.0, 2.0, 4.0, 8.0].

        Returns:
            GDT_TS score between the structures (0-100).

        Raises:
            ValueError: either object has no PDB file, ``cutoffs`` is empty,
                or the structures share no residues.
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")

        cutoffs = [1.0, 2.0, 4.0, 8.0] if cutoffs is None else list(cutoffs)
        if len(cutoffs) == 0:
            raise ValueError("At least one distance cutoff is required")

        parser = BioPDB.PDBParser(QUIET=True)
        structure_ref = parser.get_structure("reference", self.pdb_path)
        structure_model = parser.get_structure("model", other.pdb_path)

        # Extract C-alpha coordinates by (chain_id, residue_number)
        ref_coords = self._ca_coordinates(structure_ref)
        model_coords = self._ca_coordinates(structure_model)

        # Identify matching residues
        common_keys = set(ref_coords.keys()).intersection(model_coords.keys())
        if not common_keys:
            raise ValueError("No matching residues found between structures")

        # Calculate distances for each matching residue
        distances = [(ref_coords[key] - model_coords[key]).norm() for key in common_keys]

        # For each cutoff, compute fraction of residues within cutoff
        total_matched = len(distances)
        gdt_scores = [sum(d <= c for d in distances) / total_matched for c in cutoffs]

        # GDT_TS is average of fractions * 100
        return (sum(gdt_scores) / len(gdt_scores)) * 100.0

    @staticmethod
    def _largest_chain(structure):
        """Return the chain with the most amino-acid residues, or None."""
        largest_chain = None
        max_residues = 0
        for model in structure:
            for chain in model:
                num_residues = len([r for r in chain if BioPDB.is_aa(r)])
                if num_residues > max_residues:
                    max_residues = num_residues
                    largest_chain = chain
        return largest_chain

    def superpose_onto(self, other: 'PDB', output_path: Optional[str] = None) -> str:
        """Superpose this structure onto another structure.

        The fit is computed on the C-alpha atoms of the largest chain of each
        structure and then applied to every chain of this structure.

        Args:
            other: Reference PDB structure to superpose onto.
            output_path: Optional path to save superposed structure. Defaults
                to this file's path with '_aligned' inserted before the
                extension.

        Returns:
            Path to superposed structure.

        Raises:
            ValueError: either object has no PDB file, a structure has no
                amino-acid chain, or no C-alpha atoms can be paired.
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")

        parser = BioPDB.PDBParser(QUIET=True)
        ref_structure = parser.get_structure('ref', other.pdb_path)
        mobile_structure = parser.get_structure('mobile', self.pdb_path)

        # Get largest chains
        ref_chain = self._largest_chain(ref_structure)
        mobile_chain = self._largest_chain(mobile_structure)
        if ref_chain is None or mobile_chain is None:
            raise ValueError("No amino-acid chain found for superposition")

        # Align chains
        ref_ca = {r.get_id(): r['CA'] for r in ref_chain if 'CA' in r}
        mobile_ca = {r.get_id(): r['CA'] for r in mobile_chain if 'CA' in r}
        if len(ref_ca) == len(mobile_ca):
            # Equal counts: pair atoms in file order.
            ref_atoms = list(ref_ca.values())
            mobile_atoms = list(mobile_ca.values())
        else:
            # Superimposer needs equal-length lists; fall back to the
            # residues both chains have in common.
            shared_ids = [res_id for res_id in ref_ca if res_id in mobile_ca]
            ref_atoms = [ref_ca[res_id] for res_id in shared_ids]
            mobile_atoms = [mobile_ca[res_id] for res_id in shared_ids]
        if not ref_atoms:
            raise ValueError("No matching C-alpha atoms found for superposition")

        super_imposer = BioPDB.Superimposer()
        super_imposer.set_atoms(ref_atoms, mobile_atoms)

        # Apply transformation
        rotation, translation = super_imposer.rotran
        for model in mobile_structure:
            for chain in model:
                chain.transform(rotation, translation)

        # Save superposed structure
        if not output_path:
            # splitext keeps non-'.pdb' inputs from being overwritten in place
            root, ext = os.path.splitext(self.pdb_path)
            output_path = f"{root}_aligned{ext}"

        io = BioPDB.PDBIO()
        io.set_structure(mobile_structure)
        io.save(output_path)

        return output_path
def find_pdb_files(pdb_id: str) -> List[str]:
    """Find all PDB files matching the given ID in standard locations.

    Useful for when multiple files exist for the same PDB ID. Both the
    lower-case and upper-case spelling of the ID are tried.

    Args:
        pdb_id: PDB identifier, in any case.

    Returns:
        Existing candidate paths in priority order, without repeats.
    """
    lower_id = pdb_id.lower()
    upper_id = pdb_id.upper()

    # Patterns to check
    patterns = [
        # Experimental structures
        os.path.join(experimental_structures, f'exp_{lower_id}.pdb'),
        os.path.join(experimental_structures, lower_id, f'exp_{lower_id}.pdb'),
        os.path.join(experimental_structures, upper_id, f'exp_{upper_id}.pdb'),
        # Computational structures
        os.path.join(computational_structures, f'comp_{lower_id}.pdb'),
        os.path.join(computational_structures, f'comp_{upper_id}.pdb'),
        # General PDB files
        os.path.join(PDB_FILES, f'{lower_id}.pdb'),
        os.path.join(PDB_FILES, f'{upper_id}.pdb'),
    ]

    # Collect all existing files. dict.fromkeys drops repeated candidates
    # (the two spellings coincide for all-digit IDs) while keeping order.
    return [path for path in dict.fromkeys(patterns) if os.path.exists(path)]