Source code for conkit.plot.tools

# BSD 3-Clause License
#
# Copyright (c) 2016-21, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Internal utility functions"""

__author__ = "Felix Simkovic"
__date__ = "16 Feb 2017"
__version__ = "0.13.3"

import numpy as np
import os

from conkit.core.contact import Contact
from conkit.core.contactmap import ContactMap
from conkit.core.contactfile import ContactFile
from conkit.core.sequence import Sequence
from conkit.core.sequencefile import SequenceFile
from conkit.core.distogram import Distogram
from conkit.core.distancefile import DistanceFile

HierarchyIndex = {
    "Contact": Contact,
    "ContactMap": ContactMap,
    "ContactFile": ContactFile,
    "Sequence": Sequence,
    "SequenceFile": SequenceFile,
    "Distogram": Distogram,
    "DistanceFile": DistanceFile
}


[docs]class ColorDefinitions(object): """A class storing all color definitions for the various plots for fast and easy handling """ GENERAL = "#000000" MATCH = "#0F0B2C" MISMATCH = "#DC4869" STRUCTURAL = "#D8D6D6" L5CUTOFF = "#3F4587" L20CUTOFF = "#B5DD2B" PRECISION50 = L5CUTOFF FACTOR1 = L20CUTOFF SCORE = '#3299a8' ERROR = '#f54242' CORRECT = '#40eef7' ALIGNED = '#3d8beb' MISALIGNED = '#f7ba40' AA_ENCODING = { "A": "#882D17", "C": "#F3C300", "D": "#875692", "E": "#F38400", "F": "#A1CAF1", "G": "#BE0032", "H": "#C2B280", "I": "#848482", "K": "#008856", "L": "#E68FAC", "M": "#0067A5", "N": "#F99379", "P": "#604E97", "Q": "#F6A600", "R": "#B3446C", "S": "#DCD300", "T": "#8DB600", "V": "#654522", "W": "#E25822", "Y": "#2B3D26", "X": "#000000", }
[docs]def find_minima(data, order=1): """Find the minima in a 1-D list Parameters ---------- data : list, tuple A list of values order : int, optional The order, i.e. number of points next to point to consider Returns ------- list A list of indices for minima Warning ------- For multi-dimensional problems, see :func:`~scipy.signal.argrelmin`. Raises ------ :exc:`ValueError` Order needs to be >= 1! :exc:`ValueError` More than two elements required! """ if order < 1: raise ValueError("Order needs to be >= 1!") data = np.asarray(data) nelements = data.shape[0] if nelements < 2: raise ValueError("More than two elements required!") results = np.zeros(nelements, dtype=np.bool_) for i in np.arange(1, nelements - 1): start = 0 if i - order < 0 else i - order end = nelements if i + order + 1 > nelements else i + order + 1 results[i] = np.all(data[start:i] > data[i]) and np.all(data[i] < data[i + 1: end]) return np.where(results)[0].tolist()
[docs]def get_adjusted_aspect(ax, aspect_ratio): """Adjust the aspect ratio Parameters ---------- ax : :obj:`~matplotlib.axes.Axes` A :obj:`~matplotlib.axes.Axes` instance aspect_ratio : float The desired aspect ratio for :obj:`~matplotlib.axes.Axes` Returns ------- float The required aspect ratio to achieve the desired one Warning ------- This function only works for non-logarithmic axes. """ default_ratio = (ax.get_xlim()[1] - ax.get_xlim()[0]) / (ax.get_ylim()[1] - ax.get_ylim()[0]) return float(default_ratio * aspect_ratio)
[docs]def get_points_on_circle(radius, h=0, k=0): """Calculate points on a circle with even spacing Parameters ---------- radius : int The radius of the circle h : int, optional The x coordinate of the origin k : int, optional The y coordinate of the origin Returns ------- list The list of coordinates for each point """ if radius == 0: return [[]] else: space = 2 * np.pi / radius coords = np.zeros((radius, 2)) for i in np.arange(radius): coords[i] = [round(h + radius * np.cos(space * i), 6), round(k + radius * np.sin(space * i), 6)] return coords.tolist()
[docs]def get_radius_around_circle(p1, p2): """Obtain the radius around a given circle Parameters ---------- p1 : list, tuple Point 1 p2 : list, tuple Point 2 adjacent `p1` Returns ------- float The radius for points so p1 and p2 do not intersect """ dist = np.linalg.norm(np.array(p1) - np.array(p2)) return dist / 2.0 - dist * 0.1
def _isinstance(hierarchy, hierarchy_type): """Confirm the data structure to be a ConKit definition""" if isinstance(hierarchy_type, str) and hierarchy_type in HierarchyIndex: return isinstance(hierarchy, HierarchyIndex[hierarchy_type]) else: return isinstance(hierarchy, hierarchy_type)
[docs]def is_executable(executable): """Check if a given program can be executed Parameters ---------- executable : str The path or name for an executable Returns ------- str The absolute path to the executable Raises ------ ValueError The executable cannot be accessed Credits ------- https://stackoverflow.com/a/377028/3046533 """ if executable is None: return fpath, fname = os.path.split(executable) if fpath: if os.path.isfile(executable) and os.access(executable, os.X_OK): return executable else: for path in os.environ["PATH"].split(os.pathsep): exe_file = os.path.join(path, executable) if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK): return exe_file raise ValueError('Executable {} cannot be accessed'.format(executable))
[docs]def convolution_smooth_values(x, window=5): """Use convolutions to smooth a list of numeric values Parameters ---------- x : list, tuple A list with the numeric values to be smoothed window : int The residue window to be used to smooth values [default: 5] Returns ------- list A list with the smoothed numeric values """ box = np.ones(window) / window x_smooth = np.convolve(x, box, mode='same') return x_smooth
[docs]def get_rmsd(distogram_1, distogram_2, calculate_wrmsd=True): """Calculate the RMSD between two different distograms Parameters ---------- distogram_1 : :obj:`~conkit.core.distogram.Distogram` Distogram 1 distogram_2 : :obj:`~conkit.core.distogram.Distogram` Distogram 2 calculate_wrmsd: bool If True then the WRMSD is calculated using the confidence scores from distogram 1 Returns ------- tuple Two lists with the raw/smoothed RMSD values at each residue position """ rmsd_raw = Distogram.calculate_rmsd(distogram_1, distogram_2, calculate_wrmsd=calculate_wrmsd) rmsd_smooth = convolution_smooth_values(np.nan_to_num(rmsd_raw), 10) return rmsd_raw, rmsd_smooth
[docs]def get_cmap_validation_metrics(model_cmap_dict, predicted_cmap_dict, sequence, absent_residues): """For a given observed contact map and predicted contact map calculate a series of validation metrics at each residue position (Accuracy, FN, FNR, FP, FPR, Sensitivity, Specificity) Parameters ---------- model_cmap_dict : dict Dictionary representation of the contact map observed in the model predicted_cmap_dict : dict Dictionary representation of the predicted contact map sequence: :obj:`~conkit.core.sequence.Sequence` The sequence of the model to be validated absent_residues: list, tuple, set The residues that are missing from the model Returns ------- tuple Two lists with the raw/smoothed values of each validation metric at each residue position """ def accuracy(tp, fp, tn, fn): if (tp + fp + tn + fn) > 0: return (tp + tn) / (tp + fp + tn + fn) return 0 def fn(*args): return args[-1] def fn_rate(tp, fp, tn, fn): if (fn + tn) > 0: return fn / (fn + tn) return 0 def fp(*args): return args[1] def fp_rate(tp, fp, tn, fn): if (fp + tp) > 0: return fp / (fp + tp) return 0 def sensitivity(tp, fp, tn, fn): if (fn + tp) > 0: return tp / (fn + tp) return 0 def specificity(tp, fp, tn, fn): if (fp + tn) > 0: return tn / (fp + tn) return 0 nresidues = len(sequence) - len(absent_residues) cmap_metrics = [[] for i in range(7)] metrics_list = (accuracy, fn, fn_rate, fp, fp_rate, sensitivity, specificity) for resnum in sorted(predicted_cmap_dict.keys()): if absent_residues and resnum in absent_residues: for metric in cmap_metrics: metric.append(np.nan) continue predicted_contact_set = {c for c in predicted_cmap_dict[resnum] if c[0] not in absent_residues and c[1] not in absent_residues} model_contact_set = {c for c in model_cmap_dict[resnum] if c[0] not in absent_residues and c[1] not in absent_residues} _fn = len(predicted_contact_set - model_contact_set) _tp = len(predicted_contact_set & model_contact_set) _fp = len(model_contact_set - predicted_contact_set) _tn = nresidues - _fn - _tp - _fp for idx, metric in enumerate(metrics_list): cmap_metrics[idx].append(metric(_tp, _fp, _tn, _fn)) smooth_cmap_metrics = [] for metric in cmap_metrics: smooth_cmap_metrics.append(convolution_smooth_values(np.nan_to_num(metric), 5)) return cmap_metrics, smooth_cmap_metrics
[docs]def get_zscores(model_distogram, predicted_cmap_dict, absent_residues, *metrics): """Calculate the Z-Scores for a series of metrics at each residue position using the population of residues within 10A Parameters ---------- model_distogram : :obj:`~conkit.core.distogram.Distogram` Distogram of the model that will be validated predicted_cmap_dict : dict Dictionary representation of the predicted contact map absent_residues: list, tuple, set The residues that are missing from the model *metrics: list The mertics for which the Z-Scores will be calculated Returns ------- list A list of lists where each sublist contains the Z-Scores for the input metrics across all the residues. The sublists containing the Z-Scores are ordered in the same original order as in the input *metrics """ zscore_cmap_metrics = [[] for i in metrics] for resnum in sorted(predicted_cmap_dict.keys()): if absent_residues and resnum in absent_residues: for zscore_metric in zscore_cmap_metrics: zscore_metric.append(np.nan) continue neighbour_residues = model_distogram.find_residues_within(resnum, 10) for cmap_metric, zscore_metric in zip(metrics, zscore_cmap_metrics): population_scores = [cmap_metric[resid - 1] for resid in neighbour_residues] observed_score = cmap_metric[resnum - 1] zscore_metric.append(calculate_zscore(observed_score, population_scores)) return zscore_cmap_metrics
[docs]def calculate_zscore(observed_score, population_scores): """Calculate the Z-Score for a given population of values. Z-Score = (score - mean) / stdev Parameters ---------- observed_score : float The observed score used to calculate the Z-Score population_scores : list A list containing the scores observed across the samples in the population Returns ------- float The calculated Z-Score """ if len(population_scores) < 2: return 0 stdev = np.std(population_scores).astype(float) if stdev == 0: return 0 mean = np.mean(population_scores).astype(float) zscore = (observed_score - mean) / stdev return zscore
[docs]def get_residue_ranges(numbers): """Given a list of integers, creates a list of ranges with the consecutive numbers found in the list. Parameters ---------- numbers: list A list of integers Returns ------ list A list with the ranges of consecutive numbers found in the list """ nums = sorted(set(numbers)) gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 3 < e] edges = iter(nums[:1] + sum(gaps, []) + nums[-1:]) return list(zip(edges, edges))
[docs]def parse_map_align_stdout(stdout): """Parse the stdout of map_align and extract the alignment of residues. Parameters ---------- stdout : str Standard output created with map_align Returns ------ dict A dictionary where aligned residue numbers in map_b are the keys and residue numbers in map_a values. Only misaligned regions are included. """ alignment_dict = {} for line in stdout.split('\n'): if line and line.split()[0] == "MAX": line = line.rstrip().lstrip().split() for residue_pair in line[8:]: residue_pair = residue_pair.split(":") if residue_pair[0] != residue_pair[1]: alignment_dict[int(residue_pair[1])] = int(residue_pair[0]) return alignment_dict