Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
CSP_Rank/Sequence.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
154 lines (130 sloc)
5.95 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dataclasses import dataclass | |
from typing import List | |
import math | |
@dataclass | |
class SubSequence: | |
"""Class representing a subsequence (protein or peptide chain)""" | |
sequence: str | |
chain_id: str | |
is_protein: bool | |
is_peptide: bool | |
residue_numbers: List[int] = None | |
@property | |
def length(self) -> int: | |
return len(self.sequence) | |
def __str__(self) -> str: | |
if not self.residue_numbers: | |
return f"Chain {self.chain_id} ({'protein' if self.is_protein else 'peptide'}):\n{self.sequence}" | |
# Get the maximum number of digits | |
max_digits = int(math.log10(max(self.residue_numbers))) + 1 | |
# Create position markers for each digit place | |
position_lines = [] | |
for place in range(max_digits-1, -1, -1): | |
line = [] | |
for num in self.residue_numbers: | |
digit = (num // (10 ** place)) % 10 | |
line.append(str(digit)) | |
position_lines.append("".join(line)) | |
# Combine all lines | |
return (f"Chain {self.chain_id} ({'protein' if self.is_protein else 'peptide'}):\n" + | |
"\n".join(position_lines) + "\n" + | |
self.sequence) | |
class Sequence: | |
"""Class representing a protein-peptide complex sequence""" | |
def __init__(self, subsequences: List[SubSequence] = None): | |
self.subsequences = subsequences or [] | |
@classmethod | |
def from_pdb(cls, pdb_path: str, protein_chains: List[str] = None, peptide_chains: List[str] = None): | |
""" | |
Create Sequence object from PDB file | |
Args: | |
pdb_path: Path to PDB file | |
protein_chains: List of chain IDs that are proteins | |
peptide_chains: List of chain IDs that are peptides | |
If neither is specified: | |
- For single chain: assume it's a protein | |
- For multiple chains: assume longest is protein, others are peptides | |
""" | |
# Parse PDB file to get sequences by chain | |
chain_sequences = {} | |
chain_residue_numbers = {} | |
prev_res_id = None | |
prev_chain = None | |
with open(pdb_path, "r") as pdb_file: | |
for line in pdb_file: | |
if line.startswith("ATOM"): | |
chain = line[21] | |
res_name = line[17:20].strip() | |
res_id = int(line[22:26]) | |
if chain not in chain_sequences: | |
chain_sequences[chain] = [] | |
chain_residue_numbers[chain] = [] | |
if prev_chain != chain or prev_res_id != res_id: | |
chain_sequences[chain].append(convert_aa_name(res_name)) | |
chain_residue_numbers[chain].append(res_id) | |
prev_chain = chain | |
prev_res_id = res_id | |
# Convert sequence lists to strings | |
chain_sequences = {k: "".join(v) for k, v in chain_sequences.items()} | |
# If chains not specified, make automatic classification | |
if not protein_chains and not peptide_chains: | |
if len(chain_sequences) == 1: | |
# Single chain case - assume it's a protein | |
protein_chains = list(chain_sequences.keys()) | |
peptide_chains = [] | |
else: | |
# Multiple chain case - assume longest is protein | |
longest_chain = max(chain_sequences.items(), key=lambda x: len(x[1]))[0] | |
protein_chains = [longest_chain] | |
peptide_chains = [c for c in chain_sequences.keys() if c != longest_chain] | |
# Create subsequences | |
subsequences = [] | |
for chain_id, sequence in chain_sequences.items(): | |
is_protein = chain_id in (protein_chains or []) | |
is_peptide = chain_id in (peptide_chains or []) | |
# Validate chain classification | |
if not is_protein and not is_peptide: | |
print(f"Warning: Chain {chain_id} not classified as protein or peptide") | |
continue | |
if is_protein and is_peptide: | |
print(f"Warning: Chain {chain_id} classified as both protein and peptide") | |
continue | |
subsequences.append(SubSequence( | |
sequence=sequence, | |
chain_id=chain_id, | |
is_protein=is_protein, | |
is_peptide=is_peptide, | |
residue_numbers=chain_residue_numbers[chain_id] | |
)) | |
return cls(subsequences) | |
@property | |
def protein_sequences(self) -> List[SubSequence]: | |
"""Get list of protein subsequences""" | |
return [s for s in self.subsequences if s.is_protein] | |
@property | |
def peptide_sequences(self) -> List[SubSequence]: | |
"""Get list of peptide subsequences""" | |
return [s for s in self.subsequences if s.is_peptide] | |
def get_chain(self, chain_id: str) -> SubSequence: | |
"""Get subsequence by chain ID""" | |
for seq in self.subsequences: | |
if seq.chain_id == chain_id: | |
return seq | |
raise KeyError(f"No chain found with ID {chain_id}") | |
def __str__(self) -> str: | |
parts = [] | |
if self.protein_sequences: | |
parts.extend(str(s) for s in self.protein_sequences) | |
if self.peptide_sequences: | |
parts.extend(str(s) for s in self.peptide_sequences) | |
return "\n\n".join(parts) | |
def convert_aa_name(three_letter_code): | |
"""Convert three-letter amino acid codes to one-letter codes.""" | |
aa_dict = { | |
'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', | |
'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G', | |
'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', | |
'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S', | |
'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V' | |
} | |
return aa_dict.get(three_letter_code, '?') |