Sequence.py

from dataclasses import dataclass
from typing import List
import math

@dataclass
class SubSequence:
    """Class representing a subsequence (protein or peptide chain)"""
    sequence: str
    chain_id: str
    is_protein: bool
    is_peptide: bool
    residue_numbers: List[int] = None

    @property
    def length(self) -> int:
        return len(self.sequence)

    def __str__(self) -> str:
        if not self.residue_numbers:
            return f"Chain {self.chain_id} ({'protein' if self.is_protein else 'peptide'}):\n{self.sequence}"

        # Get the maximum number of digits
        max_digits = int(math.log10(max(self.residue_numbers))) + 1

        # Create position markers for each digit place
        position_lines = []
        for place in range(max_digits-1, -1, -1):
            line = []
            for num in self.residue_numbers:
                digit = (num // (10 ** place)) % 10
                line.append(str(digit))
            position_lines.append("".join(line))

        # Combine all lines
        return (f"Chain {self.chain_id} ({'protein' if self.is_protein else 'peptide'}):\n" +
                "\n".join(position_lines) + "\n" +
                self.sequence)

class Sequence:
    """Class representing a protein-peptide complex sequence"""
    def __init__(self, subsequences: List[SubSequence] = None):
        self.subsequences = subsequences or []

    @classmethod
    def from_pdb(cls, pdb_path: str, protein_chains: List[str] = None, peptide_chains: List[str] = None):
        """
        Create Sequence object from PDB file

        Args:
            pdb_path: Path to PDB file
            protein_chains: List of chain IDs that are proteins
            peptide_chains: List of chain IDs that are peptides
            If neither is specified:
            - For single chain: assume it's a protein
            - For multiple chains: assume longest is protein, others are peptides
        """
        # Parse PDB file to get sequences by chain
        chain_sequences = {}
        chain_residue_numbers = {}
        prev_res_id = None
        prev_chain = None

        with open(pdb_path, "r") as pdb_file:
            for line in pdb_file:
                if line.startswith("ATOM"):
                    chain = line[21]
                    res_name = line[17:20].strip()
                    res_id = int(line[22:26])

                    if chain not in chain_sequences:
                        chain_sequences[chain] = []
                        chain_residue_numbers[chain] = []

                    if prev_chain != chain or prev_res_id != res_id:
                        chain_sequences[chain].append(convert_aa_name(res_name))
                        chain_residue_numbers[chain].append(res_id)

                    prev_chain = chain
                    prev_res_id = res_id

        # Convert sequence lists to strings
        chain_sequences = {k: "".join(v) for k, v in chain_sequences.items()}

        # If chains not specified, make automatic classification
        if not protein_chains and not peptide_chains:
            if len(chain_sequences) == 1:
                # Single chain case - assume it's a protein
                protein_chains = list(chain_sequences.keys())
                peptide_chains = []
            else:
                # Multiple chain case - assume longest is protein
                longest_chain = max(chain_sequences.items(), key=lambda x: len(x[1]))[0]
                protein_chains = [longest_chain]
                peptide_chains = [c for c in chain_sequences.keys() if c != longest_chain]

        # Create subsequences
        subsequences = []
        for chain_id, sequence in chain_sequences.items():
            is_protein = chain_id in (protein_chains or [])
            is_peptide = chain_id in (peptide_chains or [])

            # Validate chain classification
            if not is_protein and not is_peptide:
                print(f"Warning: Chain {chain_id} not classified as protein or peptide")
                continue
            if is_protein and is_peptide:
                print(f"Warning: Chain {chain_id} classified as both protein and peptide")
                continue

            subsequences.append(SubSequence(
                sequence=sequence,
                chain_id=chain_id,
                is_protein=is_protein,
                is_peptide=is_peptide,
                residue_numbers=chain_residue_numbers[chain_id]
            ))

        return cls(subsequences)

    @property
    def protein_sequences(self) -> List[SubSequence]:
        """Get list of protein subsequences"""
        return [s for s in self.subsequences if s.is_protein]

    @property
    def peptide_sequences(self) -> List[SubSequence]:
        """Get list of peptide subsequences"""
        return [s for s in self.subsequences if s.is_peptide]

    def get_chain(self, chain_id: str) -> SubSequence:
        """Get subsequence by chain ID"""
        for seq in self.subsequences:
            if seq.chain_id == chain_id:
                return seq
        raise KeyError(f"No chain found with ID {chain_id}")

    def __str__(self) -> str:
        parts = []
        if self.protein_sequences:
            parts.extend(str(s) for s in self.protein_sequences)
        if self.peptide_sequences:
            parts.extend(str(s) for s in self.peptide_sequences)
        return "\n\n".join(parts)

def convert_aa_name(three_letter_code):
    """Convert three-letter amino acid codes to one-letter codes."""
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
        'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
        'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }
    return aa_dict.get(three_letter_code, '?')
	from dataclasses import dataclass
	from typing import List
	import math

	@dataclass
	class SubSequence:
	"""Class representing a subsequence (protein or peptide chain)"""
	sequence: str
	chain_id: str
	is_protein: bool
	is_peptide: bool
	residue_numbers: List[int] = None

	@property
	def length(self) -> int:
	return len(self.sequence)

	def __str__(self) -> str:
	if not self.residue_numbers:
	return f"Chain {self.chain_id} ({'protein' if self.is_protein else 'peptide'}):\n{self.sequence}"

	# Get the maximum number of digits
	max_digits = int(math.log10(max(self.residue_numbers))) + 1

	# Create position markers for each digit place
	position_lines = []
	for place in range(max_digits-1, -1, -1):
	line = []
	for num in self.residue_numbers:
	digit = (num // (10 ** place)) % 10
	line.append(str(digit))
	position_lines.append("".join(line))

	# Combine all lines
	return (f"Chain {self.chain_id} ({'protein' if self.is_protein else 'peptide'}):\n" +
	"\n".join(position_lines) + "\n" +
	self.sequence)

	class Sequence:
	"""Class representing a protein-peptide complex sequence"""
	def __init__(self, subsequences: List[SubSequence] = None):
	self.subsequences = subsequences or []

	@classmethod
	def from_pdb(cls, pdb_path: str, protein_chains: List[str] = None, peptide_chains: List[str] = None):
	"""
	Create Sequence object from PDB file

	Args:
	pdb_path: Path to PDB file
	protein_chains: List of chain IDs that are proteins
	peptide_chains: List of chain IDs that are peptides
	If neither is specified:
	- For single chain: assume it's a protein
	- For multiple chains: assume longest is protein, others are peptides
	"""
	# Parse PDB file to get sequences by chain
	chain_sequences = {}
	chain_residue_numbers = {}
	prev_res_id = None
	prev_chain = None

	with open(pdb_path, "r") as pdb_file:
	for line in pdb_file:
	if line.startswith("ATOM"):
	chain = line[21]
	res_name = line[17:20].strip()
	res_id = int(line[22:26])

	if chain not in chain_sequences:
	chain_sequences[chain] = []
	chain_residue_numbers[chain] = []

	if prev_chain != chain or prev_res_id != res_id:
	chain_sequences[chain].append(convert_aa_name(res_name))
	chain_residue_numbers[chain].append(res_id)

	prev_chain = chain
	prev_res_id = res_id

	# Convert sequence lists to strings
	chain_sequences = {k: "".join(v) for k, v in chain_sequences.items()}

	# If chains not specified, make automatic classification
	if not protein_chains and not peptide_chains:
	if len(chain_sequences) == 1:
	# Single chain case - assume it's a protein
	protein_chains = list(chain_sequences.keys())
	peptide_chains = []
	else:
	# Multiple chain case - assume longest is protein
	longest_chain = max(chain_sequences.items(), key=lambda x: len(x[1]))[0]
	protein_chains = [longest_chain]
	peptide_chains = [c for c in chain_sequences.keys() if c != longest_chain]

	# Create subsequences
	subsequences = []
	for chain_id, sequence in chain_sequences.items():
	is_protein = chain_id in (protein_chains or [])
	is_peptide = chain_id in (peptide_chains or [])

	# Validate chain classification
	if not is_protein and not is_peptide:
	print(f"Warning: Chain {chain_id} not classified as protein or peptide")
	continue
	if is_protein and is_peptide:
	print(f"Warning: Chain {chain_id} classified as both protein and peptide")
	continue

	subsequences.append(SubSequence(
	sequence=sequence,
	chain_id=chain_id,
	is_protein=is_protein,
	is_peptide=is_peptide,
	residue_numbers=chain_residue_numbers[chain_id]
	))

	return cls(subsequences)

	@property
	def protein_sequences(self) -> List[SubSequence]:
	"""Get list of protein subsequences"""
	return [s for s in self.subsequences if s.is_protein]

	@property
	def peptide_sequences(self) -> List[SubSequence]:
	"""Get list of peptide subsequences"""
	return [s for s in self.subsequences if s.is_peptide]

	def get_chain(self, chain_id: str) -> SubSequence:
	"""Get subsequence by chain ID"""
	for seq in self.subsequences:
	if seq.chain_id == chain_id:
	return seq
	raise KeyError(f"No chain found with ID {chain_id}")

	def __str__(self) -> str:
	parts = []
	if self.protein_sequences:
	parts.extend(str(s) for s in self.protein_sequences)
	if self.peptide_sequences:
	parts.extend(str(s) for s in self.peptide_sequences)
	return "\n\n".join(parts)

	def convert_aa_name(three_letter_code):
	"""Convert three-letter amino acid codes to one-letter codes."""
	aa_dict = {
	'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
	'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
	'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
	'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
	'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
	}
	return aa_dict.get(three_letter_code, '?')