Source code for conkit.plot.modelvalidation

# BSD 3-Clause License
#
# Copyright (c) 2016-21, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""A module to produce a model validation plot

It uses one external program:

   map_align for contact map alignment

*** This program needs to be installed separately from https://github.com/sokrypton/map_align***
"""

from __future__ import division
from __future__ import print_function

import os
from Bio.PDB.DSSP import DSSP
import numpy as np
import pandas as pd
import tempfile

from conkit.applications import MapAlignCommandline
from conkit.core.distance import Distance
import conkit.io
from conkit.misc import load_validation_model, SELECTED_VALIDATION_FEATURES, ALL_VALIDATION_FEATURES
from conkit.plot.figure import Figure
import conkit.plot.tools as tools

LINEKWARGS = dict(linestyle="--", linewidth=1.0, alpha=0.5, color=tools.ColorDefinitions.MISMATCH, zorder=1)
MARKERKWARGS = dict(marker='|', linestyle='None')
_MARKERKWARGS = dict(marker='s', linestyle='None')


[docs]class ModelValidationFigure(Figure):
    """A Figure object specifc for a model validation. This figure represents the proabbility that each given residue
    in the model is involved in a model error. This is donw by feeding a trained classfier the differences observed
    between the predicted distogram and the observed inter-residue contacts and distances at the PDB model.

    Attributes
    ----------
    model: :obj:`~conkit.core.distogram.Distogram`
       The PDB model that will be validated
    prediction: :obj:`~conkit.core.distogram.Distogram`
       The distogram with the residue distance predictions
    sequence: :obj:`~conkit.core.sequence.Sequence`
       The sequence of the structure
    dssp: :obj:`Bio.PDB.DSSP.DSSP`
        The DSSP output for the PDB model that will be validated
    map_align_exe: str
        The path to map_align executable [default: None]
    dist_bins: list, tuple
        A list of tuples with the boundaries of the distance bins to use in the calculation [default: CASP2 bins]
    l_factor: float
        The L/N factor used to filter the contacts before finding the False Negatives [default: 0.5]
    absent_residues: set
        The residues not observed in the model that will be validated (only if in PDB format)

    Examples
    --------
    >>> from Bio.PDB import PDBParser
    >>> from Bio.PDB.DSSP import DSSP
    >>> p = PDBParser()
    >>> structure = p.get_structure('TOXD', 'toxd/toxd.pdb')[0]
    >>> dssp = DSSP(structure, 'toxd/toxd.pdb', dssp='mkdssp', acc_array='Wilke')
    >>> import conkit
    >>> sequence = conkit.io.read('toxd/toxd.fasta', 'fasta').top
    >>> model = conkit.io.read('toxd/toxd.pdb', 'pdb').top_map
    >>> prediction = conkit.io.read('toxd/toxd.npz', 'rosettanpz').top_map
    >>> conkit.plot.ModelValidationFigure(model, prediction, sequence, dssp)

    """

    def __init__(self, model, prediction, sequence, dssp, map_align_exe=None, dist_bins=None, l_factor=0.5, **kwargs):
        """A new model validation plot

        Parameters
        ----------
        model: :obj:`~conkit.core.distogram.Distogram`
            The PDB model that will be validated
        prediction: :obj:`~conkit.core.distogram.Distogram`
            The distogram with the residue distance predictions
        sequence: :obj:`~conkit.core.sequence.Sequence`
            The sequence of the structure
        dssp: :obj:`Bio.PDB.DSSP.DSSP`
            The DSSP output for the PDB model that will be validated
        map_align_exe: str
            The path to map_align executable [default: None]
        dist_bins: list, tuple
            A list of tuples with the boundaries of the distance bins to use in the calculation [default: CASP2 bins]
        l_factor: float
            The L/N factor used to filter the contacts before finding the False Negatives [default: 0.5]

        **kwargs
           General :obj:`~conkit.plot.figure.Figure` keyword arguments

        """
        super(ModelValidationFigure, self).__init__(**kwargs)
        self._model = None
        self._prediction = None
        self._sequence = None
        self._distance_bins = None
        self.data = None
        self.alignment = {}
        self.sorted_scores = None
        self.smooth_scores = None

        if len(sequence) < 5:
            raise ValueError('Cannot validate a model with less than 5 residues')

        self.map_align_exe = map_align_exe
        self.l_factor = l_factor
        self.dist_bins = dist_bins
        self.model = model
        self.prediction = prediction
        self.sequence = sequence
        self.classifier, self.scaler = load_validation_model()
        self.absent_residues = self._get_absent_residues()
        self.dssp = self._parse_dssp(dssp)

        self.draw()

    def __repr__(self):
        return self.__class__.__name__

    @property
    def dist_bins(self):
        return self._dist_bins

    @dist_bins.setter
    def dist_bins(self, dist_bins):
        if dist_bins is None:
            self._dist_bins = ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14),
                               (14, 16), (16, 18), (18, 20), (20, np.inf))
        else:
            Distance._assert_valid_bins(dist_bins)
            self._dist_bins = dist_bins

    @property
    def sequence(self):
        return self._sequence

    @sequence.setter
    def sequence(self, sequence):
        if sequence and tools._isinstance(sequence, "Sequence"):
            self._sequence = sequence
        else:
            raise TypeError("Invalid hierarchy type for sequence: %s" % sequence.__class__.__name__)

    @property
    def prediction(self):
        return self._prediction

    @prediction.setter
    def prediction(self, prediction):
        if prediction and tools._isinstance(prediction, "Distogram"):
            self._prediction = prediction
        else:
            raise TypeError("Invalid hierarchy type for prediction: %s" % prediction.__class__.__name__)

    @property
    def model(self):
        return self._model

    @model.setter
    def model(self, model):
        if model and tools._isinstance(model, "Distogram"):
            self._model = model
        else:
            raise TypeError("Invalid hierarchy type for model: %s" % model.__class__.__name__)

    def _get_absent_residues(self):
        """Get a set of residues absent from the :attr:`~conkit.plot.ModelValidationFigure.model` and
        :attr:`~conkit.plot.ModelValidationFigure.prediction`. Only distograms originating from PDB files
        are considered."""

        absent_residues = []
        if self.model.original_file_format == "pdb":
            absent_residues += self.model.get_absent_residues(len(self.sequence))
        if self.prediction.original_file_format == "pdb":
            absent_residues += self.prediction.get_absent_residues(len(self.sequence))
        return set(absent_residues)

    def _prepare_distogram(self, distogram):
        """General operations to prepare a :obj:`~conkit.core.distogram.Distogram` instance before plotting."""
        distogram.get_unique_distances(inplace=True)
        distogram.sequence = self.sequence
        distogram.set_sequence_register()

        if distogram.original_file_format != "pdb":
            distogram.reshape_bins(self.dist_bins)

        return distogram

    def _prepare_contactmap(self, distogram):
        """General operations to prepare a :obj:`~conkit.core.contactmap.ContactMap` instance before plotting."""
        contactmap = distogram.as_contactmap()
        contactmap.sequence = self.sequence
        contactmap.set_sequence_register()
        contactmap.remove_neighbors(inplace=True)

        if distogram.original_file_format != "pdb":
            contactmap.sort("raw_score", reverse=True, inplace=True)
            contactmap.slice_map(seq_len=len(self.sequence), l_factor=self.l_factor, inplace=True)

        return contactmap

    def _parse_dssp(self, dssp):
        """Parse :obj:`Bio.PDB.DSSP.DSSP` into a :obj:`pandas.DataFrame` with secondary structure information
        about the model"""

        if not tools._isinstance(dssp, DSSP):
            raise TypeError("Invalid hierarchy type for dssp: %s" % dssp.__class__.__name__)

        _dssp_list = []
        for residue in sorted(dssp.keys(), key=lambda x: x[1][1]):
            resnum = residue[1][1]
            if resnum in self.absent_residues:
                _dssp_list.append((resnum, np.nan, np.nan, np.nan, np.nan))
                continue
            acc = dssp[residue][3]
            if dssp[residue][2] in ('-', 'T', 'S'):
                ss2 = (1, 0, 0)
            elif dssp[residue][2] in ('H', 'G', 'I'):
                ss2 = (0, 1, 0)
            else:
                ss2 = (0, 0, 1)
            _dssp_list.append((resnum, *ss2, acc))

        dssp = pd.DataFrame(_dssp_list)
        dssp.columns = ['RESNUM', 'COIL', 'HELIX', 'SHEET', 'ACC']
        return dssp

    def _get_cmap_alignment(self):
        """Obtain a contact map alignment between :attr:`~conkit.plot.ModelValidationFigure.model` and
        :attr:`~conkit.plot.ModelValidationFigure.prediction` and get the misaligned residues"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            contact_map_a = os.path.join(tmpdirname, 'contact_map_a.mapalign')
            contact_map_b = os.path.join(tmpdirname, 'contact_map_b.mapalign')
            conkit.io.write(contact_map_a, 'mapalign', self.prediction)
            conkit.io.write(contact_map_b, 'mapalign', self.model)

            map_align_cline = MapAlignCommandline(
                cmd=self.map_align_exe,
                contact_map_a=contact_map_a,
                contact_map_b=contact_map_b)

            stdout, stderr = map_align_cline()
            self.alignment = tools.parse_map_align_stdout(stdout)

    def _parse_data(self, predicted_dict, *metrics):
        """Create a :obj:`pandas.DataFrame` with the features of the residues in the model"""
        _features = []
        for residue_features in zip(sorted(predicted_dict.keys()), *metrics):
            _features.append((*residue_features,))

        self.data = pd.DataFrame(_features)
        self.data.columns = ALL_VALIDATION_FEATURES
        self.data = self.data.merge(self.dssp, how='inner', on=['RESNUM'])

        if self.map_align_exe is not None:
            self._get_cmap_alignment()
            self.data['MISALIGNED'] = self.data.RESNUM.isin(self.alignment.keys())
        else:
            self.data['MISALIGNED'] = False

    def _add_legend(self):
        """Adds legend to the :obj:`~conkit.plot.ModelValidationFigure`"""
        _error = self.ax.plot([], [], c=tools.ColorDefinitions.ERROR, label='Predicted Error', **_MARKERKWARGS)
        _correct = self.ax.plot([], [], c=tools.ColorDefinitions.CORRECT, label='Predicted Correct', **_MARKERKWARGS)
        _threshold_line = [self.ax.axvline(0, ymin=0, ymax=0, label="Score Threshold", **LINEKWARGS)]
        _score_plot = self.ax.plot([], [], color=tools.ColorDefinitions.SCORE, label='Smoothed Score')
        plots = _score_plot + _threshold_line + _correct + _error

        if self.map_align_exe is not None:
            _misaligned = self.ax.plot([], [], c=tools.ColorDefinitions.MISALIGNED, label='Misaligned', **_MARKERKWARGS)
            _aligned = self.ax.plot([], [], c=tools.ColorDefinitions.ALIGNED, label='Aligned', **_MARKERKWARGS)
            plots += _misaligned + _aligned

        labels = [l.get_label() for l in plots]
        self.ax.legend(plots, labels, bbox_to_anchor=(0.0, 1.02, 1.0, 0.102), loc=3,
                       ncol=3, mode="expand", borderaxespad=0.0, scatterpoints=1)

    def _predict_score(self, resnum):
        """Predict whether a given residue is part of a model error or not"""
        residue_features = self.data.loc[self.data.RESNUM == resnum][SELECTED_VALIDATION_FEATURES]
        if (self.absent_residues and resnum in self.absent_residues) or residue_features.isnull().values.any():
            return np.nan
        scaled_features = self.scaler.transform(residue_features.values)
        return self.classifier.predict_proba(scaled_features)[0, 1]

[docs]    def draw(self):
        model_distogram = self._prepare_distogram(self.model.copy())
        prediction_distogram = self._prepare_distogram(self.prediction.copy())
        model_cmap = self._prepare_contactmap(self.model.copy())
        model_dict = model_cmap.as_dict()
        prediction_cmap = self._prepare_contactmap(self.prediction.copy())
        predicted_dict = prediction_cmap.as_dict()

        cmap_metrics, cmap_metrics_smooth = tools.get_cmap_validation_metrics(model_dict, predicted_dict,
                                                                              self.sequence, self.absent_residues)
        rmsd, rmsd_smooth = tools.get_rmsd(prediction_distogram, model_distogram)
        zscore_metrics = tools.get_zscores(model_distogram, predicted_dict, self.absent_residues, rmsd, *cmap_metrics)
        self._parse_data(predicted_dict, rmsd_smooth, *cmap_metrics, *cmap_metrics_smooth, *zscore_metrics)

        scores = {}
        misaligned_residues = set(self.alignment.keys())

        for resnum in sorted(predicted_dict.keys()):
            _score = self._predict_score(resnum)
            scores[resnum] = _score
            color = tools.ColorDefinitions.ERROR if _score > 0.5 else tools.ColorDefinitions.CORRECT
            self.ax.plot(resnum - 1, -0.01, mfc=color, c=color, **MARKERKWARGS)
            if self.map_align_exe is not None:
                if resnum in misaligned_residues:
                    color = tools.ColorDefinitions.MISALIGNED
                else:
                    color = tools.ColorDefinitions.ALIGNED
                self.ax.plot(resnum - 1, -0.05, mfc=color, c=color, **MARKERKWARGS)

        self.data['SCORE'] = self.data['RESNUM'].apply(lambda x: scores.get(x))
        self.sorted_scores = np.nan_to_num([scores[resnum] for resnum in sorted(scores.keys())])
        self.smooth_scores = tools.convolution_smooth_values(self.sorted_scores)
        self.ax.axhline(0.5, **LINEKWARGS)
        self.ax.plot(self.smooth_scores, color=tools.ColorDefinitions.SCORE)
        self.ax.set_xlabel('Residue Number')
        self.ax.set_ylabel('Smoothed score')

        if self.legend:
            self._add_legend()

        # TODO: deprecate this in 0.14
        if self._file_name:
            self.savefig(self._file_name, dpi=self._dpi)