CSP_Rank/PDB.py
import os
from typing import List, Optional, Tuple, Dict
from Sequence import Sequence
from paths import *
import numpy as np
from Bio import PDB as BioPDB
import subprocess
import re
import pandas as pd
from util import compute_DockQ_score, compute_structure_similarity
class PDB:
    """Class for handling PDB files and their associated sequences"""

    def __init__(self, pdb_id: str = None, pdb_path: str = None):
        """
        Initialize a PDB object from either a PDB ID or a file path
        Args:
            pdb_id: PDB identifier (e.g., '2jw1')
            pdb_path: Direct path to PDB file
        """
        self.pdb_id = pdb_id.lower() if pdb_id else None
        self.pdb_path = pdb_path
        self.sequence = None
        self.ph = None  # Lazily populated by get_ph()
        if pdb_id and not pdb_path:
            self.pdb_path = self._find_pdb_file()
        if self.pdb_path:
            self.sequence = Sequence.from_pdb(self.pdb_path)

    def _find_pdb_file(self) -> str:
        """Find a PDB file in standard locations based on the PDB ID"""
        if not self.pdb_id:
            raise ValueError("No PDB ID provided")
        # Standard locations to check
        possible_paths = [
            # Experimental structures
            os.path.join(experimental_structures, f'exp_{self.pdb_id}.pdb'),
            os.path.join(experimental_structures, self.pdb_id, f'exp_{self.pdb_id}.pdb'),
            # Computational structures
            os.path.join(computational_structures, f'comp_{self.pdb_id}.pdb'),
            # General PDB files
            os.path.join(PDB_FILES, f'{self.pdb_id}.pdb'),
        ]
        # Return the first existing file
        for path in possible_paths:
            if os.path.exists(path):
                return path
        raise FileNotFoundError(f"No PDB file found for {self.pdb_id}")

    @classmethod
    def from_id(cls, pdb_id: str) -> 'PDB':
        """Create a PDB object from a PDB ID"""
        return cls(pdb_id=pdb_id)

    @classmethod
    def from_path(cls, pdb_path: str) -> 'PDB':
        """Create a PDB object from a file path"""
        if not os.path.exists(pdb_path):
            raise FileNotFoundError(f"PDB file not found: {pdb_path}")
        # Try to extract the PDB ID from the filename, matching patterns
        # like 'pdb1abc.ent', '1abc.pdb', 'exp_1abc.pdb', etc.
        filename = os.path.basename(pdb_path)
        pdb_id_match = re.search(r'(?:pdb|exp_|comp_)?([0-9a-zA-Z]{4})(?:\.pdb|\.ent)', filename)
        pdb_id = pdb_id_match.group(1).lower() if pdb_id_match else None
        return cls(pdb_path=pdb_path, pdb_id=pdb_id)

    def save(self, output_path: str = None) -> str:
        """
        Save the PDB file to the specified path or a default location.
        Returns the path where the file was saved.
        """
        if not output_path and not self.pdb_id:
            raise ValueError("Must provide output_path or initialize with pdb_id")
        if not output_path:
            # Default path in the PDB_FILES directory
            output_path = os.path.join(PDB_FILES, f'{self.pdb_id}.pdb')
        # Ensure the output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Copy the file to the new location
        if self.pdb_path != output_path:
            with open(self.pdb_path, 'r') as src, open(output_path, 'w') as dst:
                dst.write(src.read())
        return output_path

    def get_chain_ids(self) -> List[str]:
        """Get the list of chain IDs in the PDB"""
        if not self.sequence:
            raise ValueError("No sequence loaded")
        return [seq.chain_id for seq in self.sequence.subsequences]

    def get_protein_chains(self) -> List[str]:
        """Get the list of protein chain IDs"""
        if not self.sequence:
            raise ValueError("No sequence loaded")
        return [seq.chain_id for seq in self.sequence.protein_sequences]

    def get_peptide_chains(self) -> List[str]:
        """Get the list of peptide chain IDs"""
        if not self.sequence:
            raise ValueError("No sequence loaded")
        return [seq.chain_id for seq in self.sequence.peptide_sequences]

    def __str__(self) -> str:
        """String representation showing PDB ID/path and sequences"""
        parts = []
        if self.pdb_id:
            parts.append(f"PDB ID: {self.pdb_id}")
        parts.append(f"Path: {self.pdb_path}")
        if self.sequence:
            parts.append("\nSequences:")
            parts.append(str(self.sequence))
        return "\n".join(parts)

    def get_ph(self) -> float:
        """Get the pH value for this PDB ID from db_holo_cond.txt, falling
        back to csp_stats_consensus.csv if that file is absent"""
        if not os.path.exists('./db_holo_cond.txt'):
            # Fall back to csp_stats_consensus.csv
            if not os.path.exists('./csp_stats_consensus.csv'):
                raise FileNotFoundError("Could not find csp_stats_consensus.csv")
            df = pd.read_csv('./csp_stats_consensus.csv')
            # Try to match holo_pdb first, then apo_pdb, and read the pH
            # column corresponding to whichever matched
            match = df[df['holo_pdb'].str.lower() == self.pdb_id.lower()]
            ph_column = 'holo ph'
            if len(match) == 0:
                match = df[df['apo_pdb'].str.lower() == self.pdb_id.lower()]
                ph_column = 'apo ph'
            if len(match) == 0:
                raise ValueError(f"Could not find pH value for {self.pdb_id}")
            try:
                self.ph = float(match[ph_column].iloc[0])
                return self.ph
            except (ValueError, IndexError):
                raise ValueError(f"Invalid pH value for {self.pdb_id}")
        else:
            # Each line of db_holo_cond.txt is expected to hold a file path
            # and a pH value, e.g. "path/to/<pdb_id>.pdb <ph>"
            with open('./db_holo_cond.txt', 'r') as inf:
                for line in inf:
                    this_pdb = line.split(' ')[0]
                    this_pdb = this_pdb[this_pdb.rfind('/') + 1:this_pdb.rfind('.')]
                    if this_pdb == self.pdb_id:
                        self.ph = float(line.split(' ')[1].strip())
                        return self.ph
            raise ValueError(f"Could not find pH value for {self.pdb_id}")

    def prep_for_CSpred(self, out_directory: str) -> str:
        """
        Prepare the PDB file for CSpred by converting all chains to chain 'A'
        and renumbering residues continuously.
        Args:
            out_directory: Directory to save the modified PDB file
        Returns:
            Condition string for CSpred in the format: "path/to/file.pdb ph_value"
        """
        # Ensure the output directory exists
        os.makedirs(out_directory, exist_ok=True)
        # Write the modified file under the original basename
        basename = os.path.basename(self.pdb_path)
        output_path = os.path.join(out_directory, basename)
        # Process the PDB file
        last_residue_id = 0
        current_residue_id = None
        prev_residue_id = None
        output_content = ""
        with open(self.pdb_path, 'r') as infile:
            for line in infile:
                if line.startswith('ATOM') or line.startswith('HETATM'):
                    residue_id = int(line[22:26].strip())
                    # Update the numbering when a new residue starts
                    if residue_id != current_residue_id:
                        current_residue_id = residue_id
                        # Non-consecutive residue IDs indicate a chain break,
                        # so leave a one-residue gap in the new numbering
                        if prev_residue_id is not None and residue_id != prev_residue_id + 1:
                            last_residue_id += 2
                        else:
                            last_residue_id += 1
                        prev_residue_id = residue_id
                    # Reconstruct the line with chain A and the new residue number
                    new_residue_id_str = str(last_residue_id).rjust(4)
                    updated_line = line[:21] + "A" + new_residue_id_str + line[26:]
                    output_content += updated_line
                elif not line.startswith('TER'):
                    output_content += line
        # Write the modified file
        with open(output_path, 'w') as outfile:
            outfile.write(output_content)
        # Relative path used in the condition string
        relative_outdir = os.path.basename(out_directory.rstrip('/')) + '/'
        # If the pH is not yet known, try to look it up; fall back to 7.0
        if self.ph is None and self.pdb_id:
            try:
                self.get_ph()
            except ValueError as e:
                print(f"Warning: {e}")
        condition_string = f"{relative_outdir}{basename} {self.ph if self.ph is not None else '7.0'}"
        return condition_string.strip()
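
    # A hypothetical usage sketch for prep_for_CSpred (the PDB ID, directory
    # name, and pH value here are assumptions, not values from this repository):
    #   pdb = PDB.from_id('2jw1')
    #   cond = pdb.prep_for_CSpred('./CSpred_inputs')
    #   # cond == "CSpred_inputs/exp_2jw1.pdb 6.5"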

    def calc_dockq(self, other: 'PDB') -> Tuple[float, float, float, float, float, float, int]:
        """
        Calculate DockQ scores using this structure as the reference (native)
        and `other` as the model
        Args:
            other: PDB object to compare against
        Returns:
            Tuple of (iRMS, LRMS, DockQ, Fnat, Fnonnat, F1, clashes) where:
            - iRMS: interface root-mean-square deviation
            - LRMS: ligand root-mean-square deviation
            - DockQ: overall DockQ score
            - Fnat: fraction of native contacts reproduced
            - Fnonnat: fraction of non-native contacts
            - F1: F1 score
            - clashes: number of clashes
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")
        return compute_DockQ_score(other.pdb_path, self.pdb_path)

    def calc_tm(self, other: 'PDB', multimer: bool = True) -> float:
        """
        Calculate the TM-score using this structure as the reference and
        `other` as the model
        Args:
            other: PDB object to compare against
            multimer: Whether to use multimer mode for the TM-score calculation
        Returns:
            TM-score between the structures
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")
        return compute_structure_similarity(self.pdb_path, other.pdb_path, multimer=multimer)

    def calc_gdt_ts(self, other: 'PDB', cutoffs: List[float] = [1.0, 2.0, 4.0, 8.0]) -> float:
        """
        Calculate the GDT_TS score using this structure as the reference and
        `other` as the model
        Args:
            other: PDB object to compare against
            cutoffs: Distance cutoffs in angstroms for the GDT calculation
        Returns:
            GDT_TS score between the structures
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")
        parser = BioPDB.PDBParser(QUIET=True)
        structure_ref = parser.get_structure("reference", self.pdb_path)
        structure_model = parser.get_structure("model", other.pdb_path)
        # Extract C-alpha coordinates keyed by (chain_id, residue_number)
        ref_coords = {}
        for chain in structure_ref.get_chains():
            for residue in chain.get_residues():
                if residue.has_id('CA'):
                    chain_id = chain.get_id()
                    res_id = residue.get_id()[1]
                    ref_coords[(chain_id, res_id)] = residue['CA'].get_vector()
        model_coords = {}
        for chain in structure_model.get_chains():
            for residue in chain.get_residues():
                if residue.has_id('CA'):
                    chain_id = chain.get_id()
                    res_id = residue.get_id()[1]
                    model_coords[(chain_id, res_id)] = residue['CA'].get_vector()
        # Identify matching residues
        common_keys = set(ref_coords.keys()).intersection(model_coords.keys())
        if not common_keys:
            raise ValueError("No matching residues found between structures")
        # Distance between each matched pair of C-alpha atoms
        distances = []
        for key in common_keys:
            dist = (ref_coords[key] - model_coords[key]).norm()
            distances.append(dist)
        # For each cutoff, compute the fraction of residues within the cutoff
        gdt_scores = []
        total_matched = len(distances)
        for c in cutoffs:
            within_cutoff = sum(d <= c for d in distances)
            gdt_scores.append(within_cutoff / total_matched)
        # GDT_TS is the average of those fractions times 100
        gdt_ts = (sum(gdt_scores) / len(gdt_scores)) * 100.0
        return gdt_ts

    def superpose_onto(self, other: 'PDB', output_path: Optional[str] = None) -> str:
        """
        Superpose this structure onto another structure
        Args:
            other: Reference PDB structure to superpose onto
            output_path: Optional path to save the superposed structure
        Returns:
            Path to the superposed structure
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")
        parser = BioPDB.PDBParser(QUIET=True)
        ref_structure = parser.get_structure('ref', other.pdb_path)
        mobile_structure = parser.get_structure('mobile', self.pdb_path)

        # Align on the largest chain of each structure
        def get_largest_chain(structure):
            largest_chain = None
            max_residues = 0
            for model in structure:
                for chain in model:
                    num_residues = len([r for r in chain if BioPDB.is_aa(r)])
                    if num_residues > max_residues:
                        max_residues = num_residues
                        largest_chain = chain
            return largest_chain

        ref_chain = get_largest_chain(ref_structure)
        mobile_chain = get_largest_chain(mobile_structure)
        # Superimposer requires equal-length atom lists, so truncate both
        # C-alpha lists to the shorter one
        ref_atoms = [r['CA'] for r in ref_chain if 'CA' in r]
        mobile_atoms = [r['CA'] for r in mobile_chain if 'CA' in r]
        n_atoms = min(len(ref_atoms), len(mobile_atoms))
        super_imposer = BioPDB.Superimposer()
        super_imposer.set_atoms(ref_atoms[:n_atoms], mobile_atoms[:n_atoms])
        # Apply the transformation to every atom of the mobile structure
        super_imposer.apply(mobile_structure.get_atoms())
        # Save the superposed structure
        if not output_path:
            output_path = self.pdb_path.replace('.pdb', '_aligned.pdb')
        io = BioPDB.PDBIO()
        io.set_structure(mobile_structure)
        io.save(output_path)
        return output_path
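
# A hypothetical structure-comparison workflow (the PDB ID and model path
# are assumptions; the files must exist in the locations configured in
# the `paths` module):
#   ref = PDB.from_id('2jw1')
#   model = PDB.from_path('./predictions/comp_2jw1.pdb')
#   tm = ref.calc_tm(model)
#   gdt = ref.calc_gdt_ts(model)
#   iRMS, LRMS, dockq, fnat, fnonnat, f1, clashes = ref.calc_dockq(model)
#   aligned_path = model.superpose_onto(ref)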

def find_pdb_files(pdb_id: str) -> List[str]:
    """
    Find all PDB files matching the given ID in standard locations.
    Useful when multiple files exist for the same PDB ID.
    """
    pdb_files = []
    # Patterns to check
    patterns = [
        # Experimental structures
        os.path.join(experimental_structures, f'exp_{pdb_id.lower()}.pdb'),
        os.path.join(experimental_structures, pdb_id.lower(), f'exp_{pdb_id.lower()}.pdb'),
        os.path.join(experimental_structures, pdb_id.upper(), f'exp_{pdb_id.upper()}.pdb'),
        # Computational structures
        os.path.join(computational_structures, f'comp_{pdb_id.lower()}.pdb'),
        os.path.join(computational_structures, f'comp_{pdb_id.upper()}.pdb'),
        # General PDB files
        os.path.join(PDB_FILES, f'{pdb_id.lower()}.pdb'),
        os.path.join(PDB_FILES, f'{pdb_id.upper()}.pdb'),
    ]
    # Collect all existing files
    for pattern in patterns:
        if os.path.exists(pattern):
            pdb_files.append(pattern)
    return pdb_files
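
# A minimal, hypothetical demo of this module. The PDB ID is an assumption,
# and the lookup only succeeds in an environment where the directories from
# the `paths` module exist and contain matching files.
if __name__ == '__main__':
    matches = find_pdb_files('2jw1')
    print(f"Found {len(matches)} file(s) for 2jw1: {matches}")
    if matches:
        # Build a PDB object from the first hit and show its chains/sequences
        pdb = PDB.from_path(matches[0])
        print(pdb)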