Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
from dataclasses import dataclass
from typing import List
import math
@dataclass
class SubSequence:
"""Class representing a subsequence (protein or peptide chain)"""
sequence: str
chain_id: str
is_protein: bool
is_peptide: bool
residue_numbers: List[int] = None
@property
def length(self) -> int:
return len(self.sequence)
def __str__(self) -> str:
if not self.residue_numbers:
return f"Chain {self.chain_id} ({'protein' if self.is_protein else 'peptide'}):\n{self.sequence}"
# Get the maximum number of digits
max_digits = int(math.log10(max(self.residue_numbers))) + 1
# Create position markers for each digit place
position_lines = []
for place in range(max_digits-1, -1, -1):
line = []
for num in self.residue_numbers:
digit = (num // (10 ** place)) % 10
line.append(str(digit))
position_lines.append("".join(line))
# Combine all lines
return (f"Chain {self.chain_id} ({'protein' if self.is_protein else 'peptide'}):\n" +
"\n".join(position_lines) + "\n" +
self.sequence)
class Sequence:
"""Class representing a protein-peptide complex sequence"""
def __init__(self, subsequences: List[SubSequence] = None):
self.subsequences = subsequences or []
@classmethod
def from_pdb(cls, pdb_path: str, protein_chains: List[str] = None, peptide_chains: List[str] = None):
"""
Create Sequence object from PDB file
Args:
pdb_path: Path to PDB file
protein_chains: List of chain IDs that are proteins
peptide_chains: List of chain IDs that are peptides
If neither is specified:
- For single chain: assume it's a protein
- For multiple chains: assume longest is protein, others are peptides
"""
# Parse PDB file to get sequences by chain
chain_sequences = {}
chain_residue_numbers = {}
prev_res_id = None
prev_chain = None
with open(pdb_path, "r") as pdb_file:
for line in pdb_file:
if line.startswith("ATOM"):
chain = line[21]
res_name = line[17:20].strip()
res_id = int(line[22:26])
if chain not in chain_sequences:
chain_sequences[chain] = []
chain_residue_numbers[chain] = []
if prev_chain != chain or prev_res_id != res_id:
chain_sequences[chain].append(convert_aa_name(res_name))
chain_residue_numbers[chain].append(res_id)
prev_chain = chain
prev_res_id = res_id
# Convert sequence lists to strings
chain_sequences = {k: "".join(v) for k, v in chain_sequences.items()}
# If chains not specified, make automatic classification
if not protein_chains and not peptide_chains:
if len(chain_sequences) == 1:
# Single chain case - assume it's a protein
protein_chains = list(chain_sequences.keys())
peptide_chains = []
else:
# Multiple chain case - assume longest is protein
longest_chain = max(chain_sequences.items(), key=lambda x: len(x[1]))[0]
protein_chains = [longest_chain]
peptide_chains = [c for c in chain_sequences.keys() if c != longest_chain]
# Create subsequences
subsequences = []
for chain_id, sequence in chain_sequences.items():
is_protein = chain_id in (protein_chains or [])
is_peptide = chain_id in (peptide_chains or [])
# Validate chain classification
if not is_protein and not is_peptide:
print(f"Warning: Chain {chain_id} not classified as protein or peptide")
continue
if is_protein and is_peptide:
print(f"Warning: Chain {chain_id} classified as both protein and peptide")
continue
subsequences.append(SubSequence(
sequence=sequence,
chain_id=chain_id,
is_protein=is_protein,
is_peptide=is_peptide,
residue_numbers=chain_residue_numbers[chain_id]
))
return cls(subsequences)
@property
def protein_sequences(self) -> List[SubSequence]:
"""Get list of protein subsequences"""
return [s for s in self.subsequences if s.is_protein]
@property
def peptide_sequences(self) -> List[SubSequence]:
"""Get list of peptide subsequences"""
return [s for s in self.subsequences if s.is_peptide]
def get_chain(self, chain_id: str) -> SubSequence:
"""Get subsequence by chain ID"""
for seq in self.subsequences:
if seq.chain_id == chain_id:
return seq
raise KeyError(f"No chain found with ID {chain_id}")
def __str__(self) -> str:
parts = []
if self.protein_sequences:
parts.extend(str(s) for s in self.protein_sequences)
if self.peptide_sequences:
parts.extend(str(s) for s in self.peptide_sequences)
return "\n\n".join(parts)
def convert_aa_name(three_letter_code):
"""Convert three-letter amino acid codes to one-letter codes."""
aa_dict = {
'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}
return aa_dict.get(three_letter_code, '?')