Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import os
from typing import List, Optional, Tuple, Dict
from Sequence import Sequence
from paths import *
import numpy as np
from Bio import PDB as BioPDB
import subprocess
import re
import pandas as pd
from util import compute_DockQ_score, compute_structure_similarity
class PDB:
"""Class for handling PDB files and their associated sequences"""
def __init__(self, pdb_id: str = None, pdb_path: str = None):
"""
Initialize PDB object from either PDB ID or path
Args:
pdb_id: PDB identifier (e.g., '2jw1')
pdb_path: Direct path to PDB file
"""
self.pdb_id = pdb_id.lower() if pdb_id else None
self.pdb_path = pdb_path
self.sequence = None
if pdb_id and not pdb_path:
self.pdb_path = self._find_pdb_file()
if self.pdb_path:
self.sequence = Sequence.from_pdb(self.pdb_path)
def _find_pdb_file(self) -> str:
"""Find PDB file in standard locations based on PDB ID"""
if not self.pdb_id:
raise ValueError("No PDB ID provided")
# Standard locations to check
possible_paths = [
# Experimental structures
os.path.join(experimental_structures, f'exp_{self.pdb_id}.pdb'),
os.path.join(experimental_structures, self.pdb_id, f'exp_{self.pdb_id}.pdb'),
# Computational structures
os.path.join(computational_structures, f'comp_{self.pdb_id}.pdb'),
# General PDB files
os.path.join(PDB_FILES, f'{self.pdb_id}.pdb'),
]
# Return first existing file
for path in possible_paths:
if os.path.exists(path):
return path
raise FileNotFoundError(f"No PDB file found for {self.pdb_id}")
@classmethod
def from_id(cls, pdb_id: str) -> 'PDB':
"""Create PDB object from PDB ID"""
return cls(pdb_id=pdb_id)
@classmethod
def from_path(cls, pdb_path: str) -> 'PDB':
"""Create PDB object from file path"""
if not os.path.exists(pdb_path):
raise FileNotFoundError(f"PDB file not found: {pdb_path}")
else:
#print(f"PDB file found: {pdb_path}")
# Try to extract PDB ID from filename
filename = os.path.basename(pdb_path)
# Look for patterns like 'pdb1abc.ent', '1abc.pdb', 'exp_1abc.pdb', etc
pdb_id_match = re.search(r'(?:pdb|exp_|comp_)?([0-9a-zA-Z]{4})(?:\.pdb|\.ent)', filename)
pdb_id = pdb_id_match.group(1).lower() if pdb_id_match else None
#if pdb_id:
# print(f"Found matching pdb_id {pdb_id}")
return cls(pdb_path=pdb_path, pdb_id=pdb_id)
def save(self, output_path: str = None) -> str:
"""
Save PDB file to specified path or default location
Returns path where file was saved
"""
if not output_path and not self.pdb_id:
raise ValueError("Must provide output_path or initialize with pdb_id")
if not output_path:
# Create default path in PDB_FILES directory
output_path = os.path.join(PDB_FILES, f'{self.pdb_id}.pdb')
# Ensure directory exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Copy file to new location
if self.pdb_path != output_path:
with open(self.pdb_path, 'r') as src, open(output_path, 'w') as dst:
dst.write(src.read())
return output_path
def get_chain_ids(self) -> List[str]:
"""Get list of chain IDs in the PDB"""
if not self.sequence:
raise ValueError("No sequence loaded")
return [seq.chain_id for seq in self.sequence.subsequences]
def get_protein_chains(self) -> List[str]:
"""Get list of protein chain IDs"""
if not self.sequence:
raise ValueError("No sequence loaded")
return [seq.chain_id for seq in self.sequence.protein_sequences]
def get_peptide_chains(self) -> List[str]:
"""Get list of peptide chain IDs"""
if not self.sequence:
raise ValueError("No sequence loaded")
return [seq.chain_id for seq in self.sequence.peptide_sequences]
def __str__(self) -> str:
"""String representation showing PDB ID/path and sequences"""
parts = []
if self.pdb_id:
parts.append(f"PDB ID: {self.pdb_id}")
parts.append(f"Path: {self.pdb_path}")
if self.sequence:
parts.append("\nSequences:")
parts.append(str(self.sequence))
return "\n".join(parts)
def get_ph(self) -> float:
"""Get pH value from db_holo_cond.txt file for given PDB ID"""
if not os.path.exists('./db_holo_cond.txt'):
# Try to get pH from csp_stats_consensus.csv
if not os.path.exists('./csp_stats_consensus.csv'):
raise FileNotFoundError("Could not find csp_stats_consensus.csv")
df = pd.read_csv('./csp_stats_consensus.csv')
# First try to match holo_pdb
match = df[df['holo_pdb'].str.lower() == self.pdb_id.lower()]
if len(match) == 0:
# If no holo match, try apo_pdb
match = df[df['apo_pdb'].str.lower() == self.pdb_id.lower()]
if len(match) == 0:
raise ValueError(f"Could not find pH value for {self.pdb_id}")
try:
self.ph = float(match['apo ph'].iloc[0])
return self.ph
except (ValueError, IndexError):
raise ValueError(f"Invalid pH value for {self.pdb_id}")
try:
self.ph = float(match['holo ph'].iloc[0])
return self.ph
except (ValueError, IndexError):
raise ValueError(f"Invalid pH value for {self.pdb_id}")
else:
with open('./db_holo_cond.txt', 'r') as inf:
for line in inf:
this_pdb = line.split(' ')[0]
this_pdb = this_pdb[this_pdb.rfind('/')+1:this_pdb.rfind('.')]
if this_pdb == self.pdb_id:
self.ph = float(line.split(' ')[1].strip())
return self.ph
raise ValueError(f"Could not find pH value for {self.pdb_id}")
def prep_for_CSpred(self, out_directory: str) -> str:
"""
Prepare PDB file for CSpred by converting all chains to chain 'A'
and renumbering residues continuously.
Args:
out_directory: Directory to save modified PDB file
Returns:
Condition string for CSpred in format: "path/to/file.pdb ph_value"
"""
# Ensure output directory exists
os.makedirs(out_directory, exist_ok=True)
# Get basename of original file
basename = os.path.basename(self.pdb_path)
output_path = os.path.join(out_directory, basename)
# Process PDB file
last_residue_id = 0
current_residue_id = None
prev_residue_id = None
output_content = ""
with open(self.pdb_path, 'r') as infile:
for line in infile:
if line.startswith('ATOM') or line.startswith('HETATM'):
residue_id = int(line[22:26].strip())
# Update residue numbering when we see a new residue
if residue_id != current_residue_id:
current_residue_id = residue_id
# Check for chain break by looking for non-consecutive residue IDs
if prev_residue_id is not None and residue_id != prev_residue_id + 1:
# Add extra increment for chain break
last_residue_id += 2
else:
last_residue_id += 1
prev_residue_id = residue_id
# Reconstruct line with chain A and new residue number
new_residue_id_str = str(last_residue_id).rjust(4)
updated_line = line[:21] + "A" + new_residue_id_str + line[26:]
output_content += updated_line
elif not line.startswith('TER'):
output_content += line
# Write modified file
with open(output_path, 'w') as outfile:
outfile.write(output_content)
# Get relative path for condition string
relative_outdir = os.path.basename(out_directory.rstrip('/')) + '/'
# If pH not provided, try to get it from db_holo_cond.txt
if self.ph is None and self.pdb_id:
try:
ph = self.get_ph()
except ValueError as e:
print(f"Warning: {e}")
condition_string = f"{relative_outdir}{basename} {self.ph if self.ph is not None else '7.0'}"
return condition_string.strip()
def calc_dockq(self, other: 'PDB') -> Tuple[float, float, float, float, float, float, int]:
"""
Calculate DockQ score using this structure as reference and other as target
Args:
other: PDB object to compare against
Returns:
Tuple of (iRMS, LRMS, DockQ, Fnat, Fnonnat, F1, clashes) scores where:
- iRMS: Interface Root Mean Square Deviation
- LRMS: Ligand Root Mean Square Deviation
- DockQ: Overall DockQ score
- Fnat: Fraction of native contacts
- Fnonnat: Fraction of non-native contacts
- F1: F1 score
- clashes: Number of clashes
"""
if not self.pdb_path or not other.pdb_path:
raise ValueError("Both PDB files must exist")
iRMS, LRMS, DockQ, Fnat, Fnonnat, F1, clashes = compute_DockQ_score(other.pdb_path, self.pdb_path)
return iRMS, LRMS, DockQ, Fnat, Fnonnat, F1, clashes
def calc_tm(self, other: 'PDB', multimer: bool = True) -> float:
"""
Calculate TM-score using this structure as reference and other as target
Args:
other: PDB object to compare against
multimer: Whether to use multimer mode for TM-score calculation
Returns:
TM-score between the structures
"""
if not self.pdb_path or not other.pdb_path:
raise ValueError("Both PDB files must exist")
return compute_structure_similarity(self.pdb_path, other.pdb_path, multimer=multimer)
def calc_gdt_ts(self, other: 'PDB', cutoffs: List[float] = [1.0, 2.0, 4.0, 8.0]) -> float:
"""
Calculate GDT_TS score using this structure as reference and other as target
Args:
other: PDB object to compare against
cutoffs: Distance cutoffs in angstroms for GDT calculation
Returns:
GDT_TS score between the structures
"""
if not self.pdb_path or not other.pdb_path:
raise ValueError("Both PDB files must exist")
parser = BioPDB.PDBParser(QUIET=True)
structure_ref = parser.get_structure("reference", self.pdb_path)
structure_model = parser.get_structure("model", other.pdb_path)
# Extract C-alpha coordinates by (chain_id, residue_number)
ref_coords = {}
for chain in structure_ref.get_chains():
for residue in chain.get_residues():
if residue.has_id('CA'):
chain_id = chain.get_id()
res_id = residue.get_id()[1]
ref_coords[(chain_id, res_id)] = residue['CA'].get_vector()
model_coords = {}
for chain in structure_model.get_chains():
for residue in chain.get_residues():
if residue.has_id('CA'):
chain_id = chain.get_id()
res_id = residue.get_id()[1]
model_coords[(chain_id, res_id)] = residue['CA'].get_vector()
# Identify matching residues
common_keys = set(ref_coords.keys()).intersection(model_coords.keys())
if not common_keys:
raise ValueError("No matching residues found between structures")
# Calculate distances for each matching residue
distances = []
for key in common_keys:
dist = (ref_coords[key] - model_coords[key]).norm()
distances.append(dist)
# For each cutoff, compute fraction of residues within cutoff
gdt_scores = []
total_matched = len(distances)
for c in cutoffs:
within_cutoff = sum(d <= c for d in distances)
fraction = within_cutoff / total_matched
gdt_scores.append(fraction)
# GDT_TS is average of fractions * 100
gdt_ts = (sum(gdt_scores) / len(gdt_scores)) * 100.0
return gdt_ts
def superpose_onto(self, other: 'PDB', output_path: Optional[str] = None) -> str:
"""
Superpose this structure onto another structure
Args:
other: Reference PDB structure to superpose onto
output_path: Optional path to save superposed structure
Returns:
Path to superposed structure
"""
if not self.pdb_path or not other.pdb_path:
raise ValueError("Both PDB files must exist")
parser = BioPDB.PDBParser(QUIET=True)
ref_structure = parser.get_structure('ref', other.pdb_path)
mobile_structure = parser.get_structure('mobile', self.pdb_path)
# Get largest chains
def get_largest_chain(structure):
largest_chain = None
max_residues = 0
for model in structure:
for chain in model:
num_residues = len([r for r in chain if BioPDB.is_aa(r)])
if num_residues > max_residues:
max_residues = num_residues
largest_chain = chain
return largest_chain
ref_chain = get_largest_chain(ref_structure)
mobile_chain = get_largest_chain(mobile_structure)
# Align chains
super_imposer = BioPDB.Superimposer()
ref_atoms = [r['CA'] for r in ref_chain if 'CA' in r]
mobile_atoms = [r['CA'] for r in mobile_chain if 'CA' in r]
super_imposer.set_atoms(ref_atoms, mobile_atoms)
# Apply transformation
for model in mobile_structure:
for chain in model:
super_imposer.apply(chain.get_atoms())
# Save superposed structure
if not output_path:
output_path = self.pdb_path.replace('.pdb', '_aligned.pdb')
io = BioPDB.PDBIO()
io.set_structure(mobile_structure)
io.save(output_path)
return output_path
def find_pdb_files(pdb_id: str) -> List[str]:
"""
Find all PDB files matching the given ID in standard locations.
Useful for when multiple files exist for the same PDB ID.
"""
pdb_files = []
# Patterns to check
patterns = [
# Experimental structures
os.path.join(experimental_structures, f'exp_{pdb_id.lower()}.pdb'),
os.path.join(experimental_structures, pdb_id.lower(), f'exp_{pdb_id.lower()}.pdb'),
os.path.join(experimental_structures, pdb_id.upper(), f'exp_{pdb_id.upper()}.pdb'),
# Computational structures
os.path.join(computational_structures, f'comp_{pdb_id.lower()}.pdb'),
os.path.join(computational_structures, f'comp_{pdb_id.upper()}.pdb'),
# General PDB files
os.path.join(PDB_FILES, f'{pdb_id.lower()}.pdb'),
os.path.join(PDB_FILES, f'{pdb_id.upper()}.pdb')
]
# Collect all existing files
for pattern in patterns:
if os.path.exists(pattern):
pdb_files.append(pattern)
return pdb_files