Source code for conkit.io.PdbIO

"""
Parser module specific to Pdb files
"""

from __future__ import division

__author__ = "Felix Simkovic"
__date__ = "27 Sep 2016"
__version__ = "0.1"

import collections
import itertools
import warnings

from Bio.PDB import MMCIFParser
from Bio.PDB import PDBParser

from conkit.core import Contact
from conkit.core import ContactFile
from conkit.core import ContactMap
from conkit.core import Sequence
from conkit.core import THREE_TO_ONE
from conkit.io._ParserIO import _ContactFileParser


class _GenericStructureParser(_ContactFileParser):
    """
    Parent class to parse a PDB file and extract distance restraints
    as residue-residue contacts
    """
    def __init__(self):
        super(_GenericStructureParser, self).__init__()

    def _build_sequence(self, chain):
        """Build a peptide using Biopython to extract the sequence"""
        return Sequence(chain.id + '_seq', ''.join(THREE_TO_ONE[residue.resname] for residue in chain))

    def _chain_contacts(self, chain1, chain2):
        """Determine the contact pairs intra- or inter-molecular

        Parameters
        ----------
        chain1 : :obj:`Bio.PDB.Chain`
           A first chain object
        chain2 : :obj:`Bio.PDB.Chain`
           A second chain object

        Yields
        ------
        atom_comb : list
           A list of tuples containing the contact information

        """
        Atom = collections.namedtuple('Atom', 'resname resseq resseq_alt reschain')

        if chain1 == chain2:
            range1 = range2 = list(range(1, len(chain1) + 1))
        else:
            range1 = list(range(1, len(chain1) + 1))
            range2 = list(range(len(chain1) + 1, len(chain1)+len(chain2) + 1))

        assert len(range1) == len(chain1)
        assert len(range2) == len(chain2)

        for (resseq1_alt, residue1), (resseq2_alt, residue2) in itertools.product(list(zip(range1, chain1)),
                                                                                  list(zip(range2, chain2))):
            for atom1, atom2 in itertools.product(residue1, residue2):

                # Ignore duplicates
                if chain1 == chain2 and int(residue1.id[1]) >= int(residue2.id[1]):
                    continue

                # Biopython implementation to calculate distance between atoms
                distance = atom1 - atom2

                construct1 = Atom(
                    resname=residue1.resname,
                    resseq=int(residue1.id[1]),
                    resseq_alt=resseq1_alt,
                    reschain=chain1.id,
                )
                construct2 = Atom(
                    resname=residue2.resname,
                    resseq=int(residue2.id[1]),
                    resseq_alt=resseq2_alt,
                    reschain=chain2.id,
                )
                yield (construct1, construct2, distance)

    def _remove_atom(self, chain, type):
        """Tidy up a chain removing all HETATM entries"""
        for residue in chain.copy():
            for atom in residue.copy():
                if atom.is_disordered():
                    chain[residue.id].detach_child(atom.id)
                elif residue.resname == 'GLY' and type == 'CB' and atom.id == 'CA':
                    continue
                elif atom.id != type:
                    chain[residue.id].detach_child(atom.id)

    def _remove_hetatm(self, chain):
        """Tidy up a chain removing all HETATM entries"""
        for residue in chain.copy():
            if residue.id[0].strip() and residue.resname not in THREE_TO_ONE:
                chain.detach_child(residue.id)

    def _read(self, structure, f_id, distance_cutoff, atom_type):
        """Read a contact file

        Parameters
        ----------
        structure
           A :obj:`Structure <Bio.PDB.Structure.Structure>` instance
        f_id : str
           Unique contact file identifier
        distance_cutoff : int
           Distance cutoff for which to determine contacts
        atom_type : str
           Atom type between which distances are calculated

        Returns
        -------
        :obj:`ContactFile <conkit.core.ContactFile>`

        """
        hierarchies = []
        for model in structure:

            # Create a per-model hierarchy
            hierarchy = ContactFile(f_id + '_' + str(model.id))
            chains = list(chain for chain in model)

            # Tidy the chains of this model
            for chain in chains:
                self._remove_hetatm(chain)
                self._remove_atom(chain, atom_type)

            for chain1, chain2 in itertools.product(chains, chains):

                # Define the ContactMap Id
                if chain1.id == chain2.id:                          # intra
                    contact_map = ContactMap(chain1.id)
                else:                                               # inter
                    contact_map = ContactMap(chain1.id + chain2.id)

                # Find all contacts between the two chains
                for (atom1, atom2, distance) in self._chain_contacts(chain1, chain2):
                    contact = Contact(
                        atom1.resseq,
                        atom2.resseq,
                        round(1.0-(distance/100), 6),
                        distance_bound=(0, distance_cutoff)
                    )

                    contact.res1_altseq = atom1.resseq_alt
                    contact.res2_altseq = atom2.resseq_alt
                    contact.res1 = atom1.resname
                    contact.res2 = atom2.resname
                    contact.res1_chain = atom1.reschain
                    contact.res2_chain = atom2.reschain

                    if distance < distance_cutoff:
                        contact.define_match()
                        contact_map.add(contact)

                # Tidy up empty maps
                if len(contact_map) > 0:
                    # Get the full length of the peptide sequence and store it
                    if len(contact_map.id) == 1:                                                # INTRA !!!
                        contact_map.sequence = self._build_sequence(chain1)
                        assert len(contact_map.sequence.seq) == len(chain1)                     # Check that ncon analyzed == len_chains
                    else:                                                                       # INTER !!!
                        contact_map.sequence = self._build_sequence(chain1) + self._build_sequence(chain2)
                        assert len(contact_map.sequence.seq) == len(chain1) + len(chain2)       # Check that ncon analyzed == len_chains
                    hierarchy.add(contact_map)                                                  # Save the map into the hierarchy
                else:
                    del contact_map                                                             # Delete the empty contact map

            hierarchy.method = 'Contact map extracted from PDB {0}'.format(model.id)
            hierarchy.remark = ['The model id is the chain identifier, i.e XY equates to chain X and chain Y.',
                                'Residue numbers in column 1 are chain X, and numbers in column 2 are chain Y.']
            hierarchies.append(hierarchy)

        if len(hierarchies) > 1:
            msg = "Super-level to contact file not yet implemented. " \
                  "Parser returns hierarchy for top model only!"
            warnings.warn(msg, FutureWarning)
        return hierarchies[0]

    def _write(self):
        """Write a contact file instance to to file

        Raises
        ------
        RuntimeError
           Not available

        """
        raise RuntimeError("Not available")


[docs]class MmCifParser(_GenericStructureParser): """ Class to parse a mmCIF file and extract distance restraints as residue-residue contacts """ def __init__(self): super(MmCifParser, self).__init__()
[docs] def read(self, f_handle, f_id="mmcif", distance_cutoff=8, atom_type='CB'): """Read a contact file Parameters ---------- f_handle Open file handle [read permissions] f_id : str, optional Unique contact file identifier distance_cutoff : int, optional Distance cutoff for which to determine contacts [default: 8] atom_type : str, optional Atom type between which distances are calculated [default: CB] Returns ------- :obj:`ContactFile <conkit.core.ContactFile>` """ structure = MMCIFParser(QUIET=True).get_structure("mmcif", f_handle) return self._read(structure, f_id, distance_cutoff, atom_type)
[docs] def write(self, f_handle, hierarchy): """Write a contact file instance to to file Parameters ---------- f_handle Open file handle [write permissions] hierarchy : :obj:`ContactFile <conkit.core.ContactFile>`, :obj:`ContactMap <conkit.core.ContactMap>` or :obj:`ContactMap <conkit.core.Contact>` Raises ------ RuntimeError Not available """ self._write()
[docs]class PdbParser(_GenericStructureParser): """ Class to parse a PDB file and extract distance restraints as residue-residue contacts """ def __init__(self): super(PdbParser, self).__init__()
[docs] def read(self, f_handle, f_id="pdb", distance_cutoff=8, atom_type='CB'): """Read a contact file Parameters ---------- f_handle Open file handle [read permissions] f_id : str, optional Unique contact file identifier distance_cutoff : int, optional Distance cutoff for which to determine contacts [default: 8] atom_type : str, optional Atom type between which distances are calculated [default: CB] Returns ------- :obj:`ContactFile <conkit.core.ContactFile>` """ structure = PDBParser(QUIET=True).get_structure("pdb", f_handle) return self._read(structure, f_id, distance_cutoff, atom_type)
[docs] def write(self, f_handle, hierarchy): """Write a contact file instance to to file Parameters ---------- f_handle Open file handle [write permissions] hierarchy : :obj:`ContactFile <conkit.core.ContactFile>`, :obj:`ContactMap <conkit.core.ContactMap>` or :obj:`ContactMap <conkit.core.Contact>` Raises ------ RuntimeError Not available """ self._write()