CSP_Rank/PDB.py
import os
from typing import List, Optional, Tuple, Dict
from Sequence import Sequence
from paths import *
import numpy as np
from Bio import PDB as BioPDB
import subprocess
import re
import pandas as pd
from util import compute_DockQ_score, compute_structure_similarity
class PDB:
    """Class for handling PDB files and their associated sequences"""

    def __init__(self, pdb_id: str = None, pdb_path: str = None):
        """
        Initialize a PDB object from either a PDB ID or a file path
        Args:
            pdb_id: PDB identifier (e.g., '2jw1')
            pdb_path: Direct path to PDB file
        """
        self.pdb_id = pdb_id.lower() if pdb_id else None
        self.pdb_path = pdb_path
        self.sequence = None
        self.ph = None  # Lazily populated by get_ph()
        if pdb_id and not pdb_path:
            self.pdb_path = self._find_pdb_file()
        if self.pdb_path:
            self.sequence = Sequence.from_pdb(self.pdb_path)

    def _find_pdb_file(self) -> str:
        """Find a PDB file in standard locations based on the PDB ID"""
        if not self.pdb_id:
            raise ValueError("No PDB ID provided")
        # Standard locations to check
        possible_paths = [
            # Experimental structures
            os.path.join(experimental_structures, f'exp_{self.pdb_id}.pdb'),
            os.path.join(experimental_structures, self.pdb_id, f'exp_{self.pdb_id}.pdb'),
            # Computational structures
            os.path.join(computational_structures, f'comp_{self.pdb_id}.pdb'),
            # General PDB files
            os.path.join(PDB_FILES, f'{self.pdb_id}.pdb'),
        ]
        # Return the first existing file
        for path in possible_paths:
            if os.path.exists(path):
                return path
        raise FileNotFoundError(f"No PDB file found for {self.pdb_id}")

    @classmethod
    def from_id(cls, pdb_id: str) -> 'PDB':
        """Create a PDB object from a PDB ID"""
        return cls(pdb_id=pdb_id)

    @classmethod
    def from_path(cls, pdb_path: str) -> 'PDB':
        """Create a PDB object from a file path"""
        if not os.path.exists(pdb_path):
            raise FileNotFoundError(f"PDB file not found: {pdb_path}")
        # Try to extract the PDB ID from the filename, matching patterns
        # like 'pdb1abc.ent', '1abc.pdb', 'exp_1abc.pdb', etc.
        filename = os.path.basename(pdb_path)
        pdb_id_match = re.search(r'(?:pdb|exp_|comp_)?([0-9a-zA-Z]{4})(?:\.pdb|\.ent)', filename)
        pdb_id = pdb_id_match.group(1).lower() if pdb_id_match else None
        return cls(pdb_path=pdb_path, pdb_id=pdb_id)

    def save(self, output_path: str = None) -> str:
        """
        Save the PDB file to the specified path or a default location.
        Returns the path where the file was saved.
        """
        if not output_path and not self.pdb_id:
            raise ValueError("Must provide output_path or initialize with pdb_id")
        if not output_path:
            # Default path in the PDB_FILES directory
            output_path = os.path.join(PDB_FILES, f'{self.pdb_id}.pdb')
        # Ensure the output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Copy the file to the new location
        if self.pdb_path != output_path:
            with open(self.pdb_path, 'r') as src, open(output_path, 'w') as dst:
                dst.write(src.read())
        return output_path

    def get_chain_ids(self) -> List[str]:
        """Get the list of chain IDs in the PDB"""
        if not self.sequence:
            raise ValueError("No sequence loaded")
        return [seq.chain_id for seq in self.sequence.subsequences]

    def get_protein_chains(self) -> List[str]:
        """Get the list of protein chain IDs"""
        if not self.sequence:
            raise ValueError("No sequence loaded")
        return [seq.chain_id for seq in self.sequence.protein_sequences]

    def get_peptide_chains(self) -> List[str]:
        """Get the list of peptide chain IDs"""
        if not self.sequence:
            raise ValueError("No sequence loaded")
        return [seq.chain_id for seq in self.sequence.peptide_sequences]

    def __str__(self) -> str:
        """String representation showing PDB ID/path and sequences"""
        parts = []
        if self.pdb_id:
            parts.append(f"PDB ID: {self.pdb_id}")
        parts.append(f"Path: {self.pdb_path}")
        if self.sequence:
            parts.append("\nSequences:")
            parts.append(str(self.sequence))
        return "\n".join(parts)

    def get_ph(self) -> float:
        """Get the pH value for this PDB ID from db_holo_cond.txt, falling
        back to csp_stats_consensus.csv if that file is absent"""
        if not os.path.exists('./db_holo_cond.txt'):
            # Fall back to csp_stats_consensus.csv
            if not os.path.exists('./csp_stats_consensus.csv'):
                raise FileNotFoundError("Could not find csp_stats_consensus.csv")
            df = pd.read_csv('./csp_stats_consensus.csv')
            # Try to match holo_pdb first, then apo_pdb, and read the pH
            # column corresponding to whichever matched
            match = df[df['holo_pdb'].str.lower() == self.pdb_id.lower()]
            ph_column = 'holo ph'
            if len(match) == 0:
                match = df[df['apo_pdb'].str.lower() == self.pdb_id.lower()]
                ph_column = 'apo ph'
            if len(match) == 0:
                raise ValueError(f"Could not find pH value for {self.pdb_id}")
            try:
                self.ph = float(match[ph_column].iloc[0])
                return self.ph
            except (ValueError, IndexError):
                raise ValueError(f"Invalid pH value for {self.pdb_id}")
        else:
            # Each line of db_holo_cond.txt is expected to hold a file path
            # and a pH value, e.g. "path/to/<pdb_id>.pdb <ph>"
            with open('./db_holo_cond.txt', 'r') as inf:
                for line in inf:
                    this_pdb = line.split(' ')[0]
                    this_pdb = this_pdb[this_pdb.rfind('/') + 1:this_pdb.rfind('.')]
                    if this_pdb == self.pdb_id:
                        self.ph = float(line.split(' ')[1].strip())
                        return self.ph
            raise ValueError(f"Could not find pH value for {self.pdb_id}")

    def prep_for_CSpred(self, out_directory: str) -> str:
        """
        Prepare the PDB file for CSpred by converting all chains to chain 'A'
        and renumbering residues continuously.
        Args:
            out_directory: Directory to save the modified PDB file
        Returns:
            Condition string for CSpred in the format: "path/to/file.pdb ph_value"
        """
        # Ensure the output directory exists
        os.makedirs(out_directory, exist_ok=True)
        # Write the modified file under the original basename
        basename = os.path.basename(self.pdb_path)
        output_path = os.path.join(out_directory, basename)
        # Process the PDB file
        last_residue_id = 0
        current_residue_id = None
        prev_residue_id = None
        output_content = ""
        with open(self.pdb_path, 'r') as infile:
            for line in infile:
                if line.startswith('ATOM') or line.startswith('HETATM'):
                    residue_id = int(line[22:26].strip())
                    # Update the numbering when a new residue starts
                    if residue_id != current_residue_id:
                        current_residue_id = residue_id
                        # Non-consecutive residue IDs indicate a chain break,
                        # so leave a one-residue gap in the new numbering
                        if prev_residue_id is not None and residue_id != prev_residue_id + 1:
                            last_residue_id += 2
                        else:
                            last_residue_id += 1
                        prev_residue_id = residue_id
                    # Reconstruct the line with chain A and the new residue number
                    new_residue_id_str = str(last_residue_id).rjust(4)
                    updated_line = line[:21] + "A" + new_residue_id_str + line[26:]
                    output_content += updated_line
                elif not line.startswith('TER'):
                    output_content += line
        # Write the modified file
        with open(output_path, 'w') as outfile:
            outfile.write(output_content)
        # Relative path used in the condition string
        relative_outdir = os.path.basename(out_directory.rstrip('/')) + '/'
        # If the pH is not yet known, try to look it up; fall back to 7.0
        if self.ph is None and self.pdb_id:
            try:
                self.get_ph()
            except ValueError as e:
                print(f"Warning: {e}")
        condition_string = f"{relative_outdir}{basename} {self.ph if self.ph is not None else '7.0'}"
        return condition_string.strip()
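
    # A hypothetical usage sketch for prep_for_CSpred (the PDB ID, directory
    # name, and pH value here are assumptions, not values from this repository):
    #   pdb = PDB.from_id('2jw1')
    #   cond = pdb.prep_for_CSpred('./CSpred_inputs')
    #   # cond == "CSpred_inputs/exp_2jw1.pdb 6.5"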

    def calc_dockq(self, other: 'PDB') -> Tuple[float, float, float, float, float, float, int]:
        """
        Calculate DockQ scores using this structure as the reference (native)
        and `other` as the model
        Args:
            other: PDB object to compare against
        Returns:
            Tuple of (iRMS, LRMS, DockQ, Fnat, Fnonnat, F1, clashes) where:
            - iRMS: interface root-mean-square deviation
            - LRMS: ligand root-mean-square deviation
            - DockQ: overall DockQ score
            - Fnat: fraction of native contacts reproduced
            - Fnonnat: fraction of non-native contacts
            - F1: F1 score
            - clashes: number of clashes
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")
        return compute_DockQ_score(other.pdb_path, self.pdb_path)

    def calc_tm(self, other: 'PDB', multimer: bool = True) -> float:
        """
        Calculate the TM-score using this structure as the reference and
        `other` as the model
        Args:
            other: PDB object to compare against
            multimer: Whether to use multimer mode for the TM-score calculation
        Returns:
            TM-score between the structures
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")
        return compute_structure_similarity(self.pdb_path, other.pdb_path, multimer=multimer)

    def calc_gdt_ts(self, other: 'PDB', cutoffs: List[float] = [1.0, 2.0, 4.0, 8.0]) -> float:
        """
        Calculate the GDT_TS score using this structure as the reference and
        `other` as the model
        Args:
            other: PDB object to compare against
            cutoffs: Distance cutoffs in angstroms for the GDT calculation
        Returns:
            GDT_TS score between the structures
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")
        parser = BioPDB.PDBParser(QUIET=True)
        structure_ref = parser.get_structure("reference", self.pdb_path)
        structure_model = parser.get_structure("model", other.pdb_path)
        # Extract C-alpha coordinates keyed by (chain_id, residue_number)
        ref_coords = {}
        for chain in structure_ref.get_chains():
            for residue in chain.get_residues():
                if residue.has_id('CA'):
                    chain_id = chain.get_id()
                    res_id = residue.get_id()[1]
                    ref_coords[(chain_id, res_id)] = residue['CA'].get_vector()
        model_coords = {}
        for chain in structure_model.get_chains():
            for residue in chain.get_residues():
                if residue.has_id('CA'):
                    chain_id = chain.get_id()
                    res_id = residue.get_id()[1]
                    model_coords[(chain_id, res_id)] = residue['CA'].get_vector()
        # Identify matching residues
        common_keys = set(ref_coords.keys()).intersection(model_coords.keys())
        if not common_keys:
            raise ValueError("No matching residues found between structures")
        # Distance between each matched pair of C-alpha atoms
        distances = []
        for key in common_keys:
            dist = (ref_coords[key] - model_coords[key]).norm()
            distances.append(dist)
        # For each cutoff, compute the fraction of residues within the cutoff
        gdt_scores = []
        total_matched = len(distances)
        for c in cutoffs:
            within_cutoff = sum(d <= c for d in distances)
            gdt_scores.append(within_cutoff / total_matched)
        # GDT_TS is the average of those fractions times 100
        gdt_ts = (sum(gdt_scores) / len(gdt_scores)) * 100.0
        return gdt_ts

    def superpose_onto(self, other: 'PDB', output_path: Optional[str] = None) -> str:
        """
        Superpose this structure onto another structure
        Args:
            other: Reference PDB structure to superpose onto
            output_path: Optional path to save the superposed structure
        Returns:
            Path to the superposed structure
        """
        if not self.pdb_path or not other.pdb_path:
            raise ValueError("Both PDB files must exist")
        parser = BioPDB.PDBParser(QUIET=True)
        ref_structure = parser.get_structure('ref', other.pdb_path)
        mobile_structure = parser.get_structure('mobile', self.pdb_path)

        # Align on the largest chain of each structure
        def get_largest_chain(structure):
            largest_chain = None
            max_residues = 0
            for model in structure:
                for chain in model:
                    num_residues = len([r for r in chain if BioPDB.is_aa(r)])
                    if num_residues > max_residues:
                        max_residues = num_residues
                        largest_chain = chain
            return largest_chain

        ref_chain = get_largest_chain(ref_structure)
        mobile_chain = get_largest_chain(mobile_structure)
        # Superimposer requires equal-length atom lists, so truncate both
        # C-alpha lists to the shorter one
        ref_atoms = [r['CA'] for r in ref_chain if 'CA' in r]
        mobile_atoms = [r['CA'] for r in mobile_chain if 'CA' in r]
        n_atoms = min(len(ref_atoms), len(mobile_atoms))
        super_imposer = BioPDB.Superimposer()
        super_imposer.set_atoms(ref_atoms[:n_atoms], mobile_atoms[:n_atoms])
        # Apply the transformation to every atom of the mobile structure
        super_imposer.apply(mobile_structure.get_atoms())
        # Save the superposed structure
        if not output_path:
            output_path = self.pdb_path.replace('.pdb', '_aligned.pdb')
        io = BioPDB.PDBIO()
        io.set_structure(mobile_structure)
        io.save(output_path)
        return output_path
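
# A hypothetical structure-comparison workflow (the PDB ID and model path
# are assumptions; the files must exist in the locations configured in
# the `paths` module):
#   ref = PDB.from_id('2jw1')
#   model = PDB.from_path('./predictions/comp_2jw1.pdb')
#   tm = ref.calc_tm(model)
#   gdt = ref.calc_gdt_ts(model)
#   iRMS, LRMS, dockq, fnat, fnonnat, f1, clashes = ref.calc_dockq(model)
#   aligned_path = model.superpose_onto(ref)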

def find_pdb_files(pdb_id: str) -> List[str]:
    """
    Find all PDB files matching the given ID in standard locations.
    Useful when multiple files exist for the same PDB ID.
    """
    pdb_files = []
    # Patterns to check
    patterns = [
        # Experimental structures
        os.path.join(experimental_structures, f'exp_{pdb_id.lower()}.pdb'),
        os.path.join(experimental_structures, pdb_id.lower(), f'exp_{pdb_id.lower()}.pdb'),
        os.path.join(experimental_structures, pdb_id.upper(), f'exp_{pdb_id.upper()}.pdb'),
        # Computational structures
        os.path.join(computational_structures, f'comp_{pdb_id.lower()}.pdb'),
        os.path.join(computational_structures, f'comp_{pdb_id.upper()}.pdb'),
        # General PDB files
        os.path.join(PDB_FILES, f'{pdb_id.lower()}.pdb'),
        os.path.join(PDB_FILES, f'{pdb_id.upper()}.pdb'),
    ]
    # Collect all existing files
    for pattern in patterns:
        if os.path.exists(pattern):
            pdb_files.append(pattern)
    return pdb_files
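
# A minimal, hypothetical demo of this module. The PDB ID is an assumption,
# and the lookup only succeeds in an environment where the directories from
# the `paths` module exist and contain matching files.
if __name__ == '__main__':
    matches = find_pdb_files('2jw1')
    print(f"Found {len(matches)} file(s) for 2jw1: {matches}")
    if matches:
        # Build a PDB object from the first hit and show its chains/sequences
        pdb = PDB.from_path(matches[0])
        print(pdb)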