Skip to content
Permalink
5ab4ed8e4a
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
179 lines (146 sloc) 6.34 KB
from Bio.PDB import PDBParser, PDBIO
from Bio.PDB import PDBParser, PDBIO, Select
from Bio import PDB
def get_pdb_sequence_from_fasta(fasta_path):
sequence = []
with open(fasta_path, "r") as fasta_file:
first = True
for line in fasta_file:
if line.startswith(">"):
continue
else:
if first:
sequence.append(line.strip())
first = False
else:
sequence.append(':')
sequence.append(line.strip())
# Convert residue numbers into the desired format
formatted_sequence = "".join(sequence)
return formatted_sequence
class GlycineLinkerRemover(PDB.Select):
def __init__(self, gly_start, gly_end):
self.gly_start = gly_start
self.gly_end = gly_end
self.in_glycine_linker = False
self.residue_offset = None
def accept_residue(self, residue):
# Skip glycine linker residues
if self.gly_start <= residue.id[1] <= self.gly_end:
return 0
return 1
def accept_chain(self, chain):
if chain.id == 'A':
return 1
return 0
def convert_aa_name(three_letter_code):
"""Convert three-letter amino acid codes to one-letter codes. Placeholder function."""
aa_dict = {
'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}
return aa_dict.get(three_letter_code, '?')
def find_glycine_linker(chain, length, sequence_to_match):
gly_runs = []
current_run = []
sequence = ""
for residue in chain:
sequence += convert_aa_name(residue.get_resname())
if residue.get_resname() == 'GLY':
current_run.append(residue.id[1])
if len(current_run) == length:
gly_runs.append((current_run[0]-1, current_run[-1]))
current_run.pop(0) # Move the window
else:
current_run = [] # Reset if a non-glycine residue is encountered
for gly_run in gly_runs:
t_sequence = sequence[:gly_run[0]] + ':' + sequence[gly_run[1]:]
if t_sequence.strip() == sequence_to_match.strip():
print("FOUND MATCHING SEQUENCE = \n" + t_sequence)
print(" SEQUENCE = \n" + sequence_to_match)
return [gly_run]
else:
t_sequence = sequence[:gly_run[0]] + ':' + sequence[gly_run[1]+1:]
if t_sequence.strip() == sequence_to_match.strip():
print("FOUND MATCHING SEQUENCE = \n" + t_sequence)
print(" SEQUENCE = \n" + sequence_to_match)
return [gly_run[0], gly_run[1]+1]
# print(t_sequence)
#print(sequence)
#print(gly_runs)
#raise
print("FOUND NO EXACT SEQUENCE MATCH FOR ANY GLY TRIM OPTION.")
#print(gly_runs)
return gly_runs
def modify_pdb(input_pdb, output_pdb, trimmed_sequence, gly_length=30):
# Define start and end of the glycine linker
gly_start, gly_end = None, None
current_residue_number = None
new_residue_number = 1
chain_changed = False
parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure('structure', input_pdb)
model = structure[0] # Assuming single model
chain_a = model['A']
gly_linker_segments = find_glycine_linker(chain_a, gly_length, trimmed_sequence)
if not gly_linker_segments:
print("No glycine linker segment of specified length found.")
return
gly_start, gly_end = gly_linker_segments[0][0], gly_linker_segments[0][1]
previous_residue_number = None # Initialize variable to track the previous residue number
chain_changed = False
with open(input_pdb, 'r') as infile, open(output_pdb, 'w') as outfile:
for line in infile:
if line.startswith("ATOM") or line.startswith("HETATM"):
# Extract chain ID and residue number from the line
chain_id = line[21]
residue_number = int(line[22:26].strip())
# Set gly_start and gly_end based on the sequence of glycines
if chain_id == 'A':
current_residue_number = residue_number
# Before the glycine linker
if residue_number <= gly_start:
outfile.write(line)
# After the glycine linker
elif residue_number > gly_end:
if not chain_changed: # Initialize new residue numbering and previous residue tracking
new_residue_number = 1
previous_residue_number = residue_number # Start tracking from the first residue after gly_end
chain_changed = True
else:
# Only increment new_residue_number if the current residue number is different from the previous one
if residue_number != previous_residue_number:
new_residue_number += 1
previous_residue_number = residue_number # Update previous_residue_number for the next iteration
# Adjust the line with the new chain ID and new residue number
line = line[:21] + 'B' + line[22:26].replace(line[22:26], f"{new_residue_number:>4}") + line[26:]
outfile.write(line)
# Non-ATOM/HETATM lines are copied as-is
else:
outfile.write(line)
import os
from os import listdir
from os.path import isdir, exists
import sys
if len(sys.argv) < 3:
print("python3 AFAlt_remove_linker.py ./inDir/ ./outDir/")
raise
indir = sys.argv[1]
outdir = sys.argv[2]
pdb = indir[indir.find('/')+1:indir.find('/')+5]
sequence = get_pdb_sequence_from_fasta('./trimmed_fastas/'+pdb+'_trimmed.fasta')
#print(pdb)
#print(sequence)
#raise
if not(exists(outdir)):
print('mkdir ' + outdir)
os.system('mkdir ' + outdir)
for pdb in listdir(indir):
if pdb.endswith('.pdb'):
pdb_filename = indir + pdb
output_filename = outdir + pdb
modify_pdb(pdb_filename, output_filename, sequence)
print(output_filename)