Source code for conkit.plot.modelvalidation
# BSD 3-Clause License
#
# Copyright (c) 2016-21, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""A module to produce a model validation plot
It uses one external program:
map_align for contact map alignment
*** This program needs to be installed separately from https://github.com/sokrypton/map_align***
"""
from __future__ import division
from __future__ import print_function
import os
from Bio.PDB.DSSP import DSSP
import numpy as np
import pandas as pd
import tempfile
from conkit.applications import MapAlignCommandline
from conkit.core.distance import Distance
import conkit.io
from conkit.misc import load_validation_model, SELECTED_VALIDATION_FEATURES, ALL_VALIDATION_FEATURES
from conkit.plot.figure import Figure
import conkit.plot.tools as tools
LINEKWARGS = dict(linestyle="--", linewidth=1.0, alpha=0.5, color=tools.ColorDefinitions.MISMATCH, zorder=1)
MARKERKWARGS = dict(marker='|', linestyle='None')
_MARKERKWARGS = dict(marker='s', linestyle='None')
[docs]class ModelValidationFigure(Figure):
"""A Figure object specifc for a model validation. This figure represents the proabbility that each given residue
in the model is involved in a model error. This is donw by feeding a trained classfier the differences observed
between the predicted distogram and the observed inter-residue contacts and distances at the PDB model.
Attributes
----------
model: :obj:`~conkit.core.distogram.Distogram`
The PDB model that will be validated
prediction: :obj:`~conkit.core.distogram.Distogram`
The distogram with the residue distance predictions
sequence: :obj:`~conkit.core.sequence.Sequence`
The sequence of the structure
dssp: :obj:`Bio.PDB.DSSP.DSSP`
The DSSP output for the PDB model that will be validated
map_align_exe: str
The path to map_align executable [default: None]
dist_bins: list, tuple
A list of tuples with the boundaries of the distance bins to use in the calculation [default: CASP2 bins]
l_factor: float
The L/N factor used to filter the contacts before finding the False Negatives [default: 0.5]
absent_residues: set
The residues not observed in the model that will be validated (only if in PDB format)
Examples
--------
>>> from Bio.PDB import PDBParser
>>> from Bio.PDB.DSSP import DSSP
>>> p = PDBParser()
>>> structure = p.get_structure('TOXD', 'toxd/toxd.pdb')[0]
>>> dssp = DSSP(structure, 'toxd/toxd.pdb', dssp='mkdssp', acc_array='Wilke')
>>> import conkit
>>> sequence = conkit.io.read('toxd/toxd.fasta', 'fasta').top
>>> model = conkit.io.read('toxd/toxd.pdb', 'pdb').top_map
>>> prediction = conkit.io.read('toxd/toxd.npz', 'rosettanpz').top_map
>>> conkit.plot.ModelValidationFigure(model, prediction, sequence, dssp)
"""
def __init__(self, model, prediction, sequence, dssp, map_align_exe=None, dist_bins=None, l_factor=0.5, **kwargs):
"""A new model validation plot
Parameters
----------
model: :obj:`~conkit.core.distogram.Distogram`
The PDB model that will be validated
prediction: :obj:`~conkit.core.distogram.Distogram`
The distogram with the residue distance predictions
sequence: :obj:`~conkit.core.sequence.Sequence`
The sequence of the structure
dssp: :obj:`Bio.PDB.DSSP.DSSP`
The DSSP output for the PDB model that will be validated
map_align_exe: str
The path to map_align executable [default: None]
dist_bins: list, tuple
A list of tuples with the boundaries of the distance bins to use in the calculation [default: CASP2 bins]
l_factor: float
The L/N factor used to filter the contacts before finding the False Negatives [default: 0.5]
**kwargs
General :obj:`~conkit.plot.figure.Figure` keyword arguments
"""
super(ModelValidationFigure, self).__init__(**kwargs)
self._model = None
self._prediction = None
self._sequence = None
self._distance_bins = None
self.data = None
self.alignment = {}
self.sorted_scores = None
self.smooth_scores = None
if len(sequence) < 5:
raise ValueError('Cannot validate a model with less than 5 residues')
self.map_align_exe = map_align_exe
self.l_factor = l_factor
self.dist_bins = dist_bins
self.model = model
self.prediction = prediction
self.sequence = sequence
self.classifier, self.scaler = load_validation_model()
self.absent_residues = self._get_absent_residues()
self.dssp = self._parse_dssp(dssp)
self.draw()
def __repr__(self):
return self.__class__.__name__
@property
def dist_bins(self):
return self._dist_bins
@dist_bins.setter
def dist_bins(self, dist_bins):
if dist_bins is None:
self._dist_bins = ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14),
(14, 16), (16, 18), (18, 20), (20, np.inf))
else:
Distance._assert_valid_bins(dist_bins)
self._dist_bins = dist_bins
@property
def sequence(self):
return self._sequence
@sequence.setter
def sequence(self, sequence):
if sequence and tools._isinstance(sequence, "Sequence"):
self._sequence = sequence
else:
raise TypeError("Invalid hierarchy type for sequence: %s" % sequence.__class__.__name__)
@property
def prediction(self):
return self._prediction
@prediction.setter
def prediction(self, prediction):
if prediction and tools._isinstance(prediction, "Distogram"):
self._prediction = prediction
else:
raise TypeError("Invalid hierarchy type for prediction: %s" % prediction.__class__.__name__)
@property
def model(self):
return self._model
@model.setter
def model(self, model):
if model and tools._isinstance(model, "Distogram"):
self._model = model
else:
raise TypeError("Invalid hierarchy type for model: %s" % model.__class__.__name__)
def _get_absent_residues(self):
"""Get a set of residues absent from the :attr:`~conkit.plot.ModelValidationFigure.model` and
:attr:`~conkit.plot.ModelValidationFigure.prediction`. Only distograms originating from PDB files
are considered."""
absent_residues = []
if self.model.original_file_format == "pdb":
absent_residues += self.model.get_absent_residues(len(self.sequence))
if self.prediction.original_file_format == "pdb":
absent_residues += self.prediction.get_absent_residues(len(self.sequence))
return set(absent_residues)
def _prepare_distogram(self, distogram):
"""General operations to prepare a :obj:`~conkit.core.distogram.Distogram` instance before plotting."""
distogram.get_unique_distances(inplace=True)
distogram.sequence = self.sequence
distogram.set_sequence_register()
if distogram.original_file_format != "pdb":
distogram.reshape_bins(self.dist_bins)
return distogram
def _prepare_contactmap(self, distogram):
"""General operations to prepare a :obj:`~conkit.core.contactmap.ContactMap` instance before plotting."""
contactmap = distogram.as_contactmap()
contactmap.sequence = self.sequence
contactmap.set_sequence_register()
contactmap.remove_neighbors(inplace=True)
if distogram.original_file_format != "pdb":
contactmap.sort("raw_score", reverse=True, inplace=True)
contactmap.slice_map(seq_len=len(self.sequence), l_factor=self.l_factor, inplace=True)
return contactmap
def _parse_dssp(self, dssp):
"""Parse :obj:`Bio.PDB.DSSP.DSSP` into a :obj:`pandas.DataFrame` with secondary structure information
about the model"""
if not tools._isinstance(dssp, DSSP):
raise TypeError("Invalid hierarchy type for dssp: %s" % dssp.__class__.__name__)
_dssp_list = []
for residue in sorted(dssp.keys(), key=lambda x: x[1][1]):
resnum = residue[1][1]
if resnum in self.absent_residues:
_dssp_list.append((resnum, np.nan, np.nan, np.nan, np.nan))
continue
acc = dssp[residue][3]
if dssp[residue][2] in ('-', 'T', 'S'):
ss2 = (1, 0, 0)
elif dssp[residue][2] in ('H', 'G', 'I'):
ss2 = (0, 1, 0)
else:
ss2 = (0, 0, 1)
_dssp_list.append((resnum, *ss2, acc))
dssp = pd.DataFrame(_dssp_list)
dssp.columns = ['RESNUM', 'COIL', 'HELIX', 'SHEET', 'ACC']
return dssp
def _get_cmap_alignment(self):
"""Obtain a contact map alignment between :attr:`~conkit.plot.ModelValidationFigure.model` and
:attr:`~conkit.plot.ModelValidationFigure.prediction` and get the misaligned residues"""
with tempfile.TemporaryDirectory() as tmpdirname:
contact_map_a = os.path.join(tmpdirname, 'contact_map_a.mapalign')
contact_map_b = os.path.join(tmpdirname, 'contact_map_b.mapalign')
conkit.io.write(contact_map_a, 'mapalign', self.prediction)
conkit.io.write(contact_map_b, 'mapalign', self.model)
map_align_cline = MapAlignCommandline(
cmd=self.map_align_exe,
contact_map_a=contact_map_a,
contact_map_b=contact_map_b)
stdout, stderr = map_align_cline()
self.alignment = tools.parse_map_align_stdout(stdout)
def _parse_data(self, predicted_dict, *metrics):
"""Create a :obj:`pandas.DataFrame` with the features of the residues in the model"""
_features = []
for residue_features in zip(sorted(predicted_dict.keys()), *metrics):
_features.append((*residue_features,))
self.data = pd.DataFrame(_features)
self.data.columns = ALL_VALIDATION_FEATURES
self.data = self.data.merge(self.dssp, how='inner', on=['RESNUM'])
if self.map_align_exe is not None:
self._get_cmap_alignment()
self.data['MISALIGNED'] = self.data.RESNUM.isin(self.alignment.keys())
else:
self.data['MISALIGNED'] = False
def _add_legend(self):
"""Adds legend to the :obj:`~conkit.plot.ModelValidationFigure`"""
_error = self.ax.plot([], [], c=tools.ColorDefinitions.ERROR, label='Predicted Error', **_MARKERKWARGS)
_correct = self.ax.plot([], [], c=tools.ColorDefinitions.CORRECT, label='Predicted Correct', **_MARKERKWARGS)
_threshold_line = [self.ax.axvline(0, ymin=0, ymax=0, label="Score Threshold", **LINEKWARGS)]
_score_plot = self.ax.plot([], [], color=tools.ColorDefinitions.SCORE, label='Smoothed Score')
plots = _score_plot + _threshold_line + _correct + _error
if self.map_align_exe is not None:
_misaligned = self.ax.plot([], [], c=tools.ColorDefinitions.MISALIGNED, label='Misaligned', **_MARKERKWARGS)
_aligned = self.ax.plot([], [], c=tools.ColorDefinitions.ALIGNED, label='Aligned', **_MARKERKWARGS)
plots += _misaligned + _aligned
labels = [l.get_label() for l in plots]
self.ax.legend(plots, labels, bbox_to_anchor=(0.0, 1.02, 1.0, 0.102), loc=3,
ncol=3, mode="expand", borderaxespad=0.0, scatterpoints=1)
def _predict_score(self, resnum):
"""Predict whether a given residue is part of a model error or not"""
residue_features = self.data.loc[self.data.RESNUM == resnum][SELECTED_VALIDATION_FEATURES]
if (self.absent_residues and resnum in self.absent_residues) or residue_features.isnull().values.any():
return np.nan
scaled_features = self.scaler.transform(residue_features.values)
return self.classifier.predict_proba(scaled_features)[0, 1]
[docs] def draw(self):
model_distogram = self._prepare_distogram(self.model.copy())
prediction_distogram = self._prepare_distogram(self.prediction.copy())
model_cmap = self._prepare_contactmap(self.model.copy())
model_dict = model_cmap.as_dict()
prediction_cmap = self._prepare_contactmap(self.prediction.copy())
predicted_dict = prediction_cmap.as_dict()
cmap_metrics, cmap_metrics_smooth = tools.get_cmap_validation_metrics(model_dict, predicted_dict,
self.sequence, self.absent_residues)
rmsd, rmsd_smooth = tools.get_rmsd(prediction_distogram, model_distogram)
zscore_metrics = tools.get_zscores(model_distogram, predicted_dict, self.absent_residues, rmsd, *cmap_metrics)
self._parse_data(predicted_dict, rmsd_smooth, *cmap_metrics, *cmap_metrics_smooth, *zscore_metrics)
scores = {}
misaligned_residues = set(self.alignment.keys())
for resnum in sorted(predicted_dict.keys()):
_score = self._predict_score(resnum)
scores[resnum] = _score
color = tools.ColorDefinitions.ERROR if _score > 0.5 else tools.ColorDefinitions.CORRECT
self.ax.plot(resnum - 1, -0.01, mfc=color, c=color, **MARKERKWARGS)
if self.map_align_exe is not None:
if resnum in misaligned_residues:
color = tools.ColorDefinitions.MISALIGNED
else:
color = tools.ColorDefinitions.ALIGNED
self.ax.plot(resnum - 1, -0.05, mfc=color, c=color, **MARKERKWARGS)
self.data['SCORE'] = self.data['RESNUM'].apply(lambda x: scores.get(x))
self.sorted_scores = np.nan_to_num([scores[resnum] for resnum in sorted(scores.keys())])
self.smooth_scores = tools.convolution_smooth_values(self.sorted_scores)
self.ax.axhline(0.5, **LINEKWARGS)
self.ax.plot(self.smooth_scores, color=tools.ColorDefinitions.SCORE)
self.ax.set_xlabel('Residue Number')
self.ax.set_ylabel('Smoothed score')
if self.legend:
self._add_legend()
# TODO: deprecate this in 0.14
if self._file_name:
self.savefig(self._file_name, dpi=self._dpi)