Source code for conkit.io.PdbIO

"""
Parser module specific to Pdb files
"""

from __future__ import division

__author__ = "Felix Simkovic"
__date__ = "27 Sep 2016"
__version__ = 0.1

import collections
import itertools
import warnings

from Bio.PDB import PDBParser

from conkit import constants
from conkit.core import Contact
from conkit.core import ContactFile
from conkit.core import ContactMap
from conkit.core import Sequence
from conkit.io._ParserIO import _ContactFileParser


[docs]class PdbParser(_ContactFileParser): """ Class to parse a PDB file and extract distance restraints as residue-residue contacts """ def __init__(self): super(PdbParser, self).__init__() def _build_sequence(self, chain): """Build a peptide using Biopython to extract the sequence""" return Sequence(chain.id + '_seq', ''.join(constants.THREE_TO_ONE[residue.resname] for residue in chain)) def _chain_contacts(self, chain1, chain2): """Determine the contact pairs intra- or inter-molecular Parameters ---------- chain1 : :obj:`Bio.PDB.Chain` A first chain object chain2 : :obj:`Bio.PDB.Chain` A second chain object Yields ------ atom_comb : list A list of tuples containing the contact information """ Atom = collections.namedtuple('Atom', 'resname resseq resseq_alt reschain') if chain1 == chain2: range1 = range2 = list(range(1, len(chain1) + 1)) else: range1 = list(range(1, len(chain1) + 1)) range2 = list(range(len(chain1) + 1, len(chain1)+len(chain2) + 1)) assert len(range1) == len(chain1) assert len(range2) == len(chain2) for (resseq1_alt, residue1), (resseq2_alt, residue2) in itertools.product(list(zip(range1, chain1)), list(zip(range2, chain2))): for atom1, atom2 in itertools.product(residue1, residue2): # Ignore duplicates if int(residue1.id[1]) > int(residue2.id[1]): continue elif int(residue1.id[1]) == int(residue2.id[1]): continue # Biopython implementation to calculate distance between atoms distance = atom1 - atom2 construct1 = Atom( resname=residue1.resname, resseq=int(residue1.id[1]), resseq_alt=resseq1_alt, reschain=chain1.id, ) construct2 = Atom( resname=residue2.resname, resseq=int(residue2.id[1]), resseq_alt=resseq2_alt, reschain=chain2.id, ) yield (construct1, construct2, distance) def _remove_atom(self, chain, type): """Tidy up a chain removing all HETATM entries""" for residue in chain.copy(): for atom in residue.copy(): if atom.is_disordered(): chain[residue.id].detach_child(atom.id) elif residue.resname == 'GLY' and type == 'CB' and atom.id == 'CA': continue elif atom.id != type: chain[residue.id].detach_child(atom.id) def _remove_hetatm(self, chain): """Tidy up a chain removing all HETATM entries""" for residue in chain.copy(): if residue.id[0].strip() and residue.resname not in constants.THREE_TO_ONE: chain.detach_child(residue.id)
[docs] def read(self, f_handle, f_id="pdb", distance_cutoff=8, atom_type='CB'): """Read a contact file Parameters ---------- f_handle Open file handle [read permissions] f_id : str, optional Unique contact file identifier distance_cutoff : int, optional Distance cutoff for which to determine contacts [default: 8] atom_type : str, optional Atom type between which distances are calculated [default: CB] Returns ------- :obj:`ContactFile <conkit.core.ContactFile>` """ structure = PDBParser(QUIET=True).get_structure("pdb", f_handle) hierarchies = [] for model in structure: # Create a per-model hierarchy hierarchy = ContactFile(f_id + '_' + str(model.id)) chains = list(chain for chain in model) # Tidy the chains of this model for chain in chains: self._remove_hetatm(chain) self._remove_atom(chain, atom_type) for chain1, chain2 in itertools.product(chains, chains): # Define the ContactMap Id if chain1.id == chain2.id: # intra contact_map = ContactMap(chain1.id) else: # inter contact_map = ContactMap(chain1.id + chain2.id) # Find all contacts between the two chains for (atom1, atom2, distance) in self._chain_contacts(chain1, chain2): contact = Contact( atom1.resseq, atom2.resseq, round(1.0-(distance/100), 6), distance_bound=(0, distance_cutoff) ) contact.res1_altseq = atom1.resseq_alt contact.res2_altseq = atom2.resseq_alt contact.res1 = atom1.resname contact.res2 = atom2.resname contact.res1_chain = atom1.reschain contact.res2_chain = atom2.reschain if distance < distance_cutoff: contact.define_true_positive() else: contact.define_false_positive() contact_map.add(contact) # Tidy up empty maps if len(contact_map) > 0: # Get the full length of the peptide sequence and store it if len(contact_map.id) == 1: # INTRA !!! contact_map.sequence = self._build_sequence(chain1) assert len(contact_map.sequence.seq) == len(chain1) # Check that ncon analyzed == len_chains else: # INTER !!! contact_map.sequence = self._build_sequence(chain1) + self._build_sequence(chain2) assert len(contact_map.sequence.seq) == len(chain1) + len(chain2) # Check that ncon analyzed == len_chains hierarchy.add(contact_map) # Save the map into the hierarchy else: del contact_map # Delete the empty contact map hierarchy.method = 'Contact map extracted from PDB {0}'.format(model.id) hierarchy.remark = ['The model id is the chain identifier, i.e XY equates to chain X and chain Y.', 'Residue numbers in column 1 are chain X, and numbers in column 2 are chain Y.'] hierarchies.append(hierarchy) if len(hierarchies) > 1: msg = "Super-level to contact file not yet implemented. " \ "Parser returns hierarchy for top model only!" warnings.warn(msg, FutureWarning) return hierarchies[0]
[docs] def write(self, f_handle, hierarchy): """Write a contact file instance to to file Parameters ---------- f_handle Open file handle [write permissions] hierarchy : :obj:`ContactFile <conkit.core.ContactFile>`, :obj:`ContactMap <conkit.core.ContactMap>` or :obj:`ContactMap <conkit.core.Contact>` Raises ------ RuntimeError Not available """ raise RuntimeError("Not available")