Source code for conkit.plot.tools

# BSD 3-Clause License
#
# Copyright (c) 2016-21, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Internal utility functions"""

__author__ = "Felix Simkovic"
__date__ = "16 Feb 2017"
__version__ = "0.13.3"

import numpy as np
import os

from conkit.core.contact import Contact
from conkit.core.contactmap import ContactMap
from conkit.core.contactfile import ContactFile
from conkit.core.sequence import Sequence
from conkit.core.sequencefile import SequenceFile
from conkit.core.distogram import Distogram
from conkit.core.distancefile import DistanceFile

HierarchyIndex = {
    "Contact": Contact,
    "ContactMap": ContactMap,
    "ContactFile": ContactFile,
    "Sequence": Sequence,
    "SequenceFile": SequenceFile,
    "Distogram": Distogram,
    "DistanceFile": DistanceFile
}


[docs]class ColorDefinitions(object):
    """A class storing all color definitions for the various plots
    for fast and easy handling
    """

    GENERAL = "#000000"
    MATCH = "#0F0B2C"
    MISMATCH = "#DC4869"
    STRUCTURAL = "#D8D6D6"
    L5CUTOFF = "#3F4587"
    L20CUTOFF = "#B5DD2B"
    PRECISION50 = L5CUTOFF
    FACTOR1 = L20CUTOFF
    SCORE = '#3299a8'
    ERROR = '#f54242'
    CORRECT = '#40eef7'
    ALIGNED = '#3d8beb'
    MISALIGNED = '#f7ba40'
    AA_ENCODING = {
        "A": "#882D17",
        "C": "#F3C300",
        "D": "#875692",
        "E": "#F38400",
        "F": "#A1CAF1",
        "G": "#BE0032",
        "H": "#C2B280",
        "I": "#848482",
        "K": "#008856",
        "L": "#E68FAC",
        "M": "#0067A5",
        "N": "#F99379",
        "P": "#604E97",
        "Q": "#F6A600",
        "R": "#B3446C",
        "S": "#DCD300",
        "T": "#8DB600",
        "V": "#654522",
        "W": "#E25822",
        "Y": "#2B3D26",
        "X": "#000000",
    }


[docs]def find_minima(data, order=1):
    """Find the minima in a 1-D list

    Parameters
    ----------
    data : list, tuple
       A list of values
    order : int, optional
       The order, i.e. number of points next to point to consider

    Returns
    -------
    list
       A list of indices for minima

    Warning
    -------
    For multi-dimensional problems, see :func:`~scipy.signal.argrelmin`.

    Raises
    ------
    :exc:`ValueError`
       Order needs to be >= 1!
    :exc:`ValueError`
       More than two elements required!

    """
    if order < 1:
        raise ValueError("Order needs to be >= 1!")
    data = np.asarray(data)
    nelements = data.shape[0]
    if nelements < 2:
        raise ValueError("More than two elements required!")
    results = np.zeros(nelements, dtype=np.bool_)
    for i in np.arange(1, nelements - 1):
        start = 0 if i - order < 0 else i - order
        end = nelements if i + order + 1 > nelements else i + order + 1
        results[i] = np.all(data[start:i] > data[i]) and np.all(data[i] < data[i + 1: end])
    return np.where(results)[0].tolist()


[docs]def get_adjusted_aspect(ax, aspect_ratio):
    """Adjust the aspect ratio

    Parameters
    ----------
    ax : :obj:`~matplotlib.axes.Axes`
       A :obj:`~matplotlib.axes.Axes` instance
    aspect_ratio : float
       The desired aspect ratio for :obj:`~matplotlib.axes.Axes`

    Returns
    -------
    float
       The required aspect ratio to achieve the desired one

    Warning
    -------
    This function only works for non-logarithmic axes.

    """
    default_ratio = (ax.get_xlim()[1] - ax.get_xlim()[0]) / (ax.get_ylim()[1] - ax.get_ylim()[0])
    return float(default_ratio * aspect_ratio)


[docs]def get_points_on_circle(radius, h=0, k=0):
    """Calculate points on a circle with even spacing

    Parameters
    ----------
    radius : int
       The radius of the circle
    h : int, optional
       The x coordinate of the origin
    k : int, optional
       The y coordinate of the origin

    Returns
    -------
    list
       The list of coordinates for each point

    """
    if radius == 0:
        return [[]]
    else:
        space = 2 * np.pi / radius
        coords = np.zeros((radius, 2))
        for i in np.arange(radius):
            coords[i] = [round(h + radius * np.cos(space * i), 6), round(k + radius * np.sin(space * i), 6)]
        return coords.tolist()


[docs]def get_radius_around_circle(p1, p2):
    """Obtain the radius around a given circle

    Parameters
    ----------
    p1 : list, tuple
       Point 1
    p2 : list, tuple
       Point 2 adjacent `p1`

    Returns
    -------
    float
       The radius for points so p1 and p2 do not intersect
    """
    dist = np.linalg.norm(np.array(p1) - np.array(p2))
    return dist / 2.0 - dist * 0.1


def _isinstance(hierarchy, hierarchy_type):
    """Confirm the data structure to be a ConKit definition"""
    if isinstance(hierarchy_type, str) and hierarchy_type in HierarchyIndex:
        return isinstance(hierarchy, HierarchyIndex[hierarchy_type])
    else:
        return isinstance(hierarchy, hierarchy_type)


[docs]def is_executable(executable):
    """Check if a given program can be executed

    Parameters
    ----------
    executable : str
       The path or name for an executable

    Returns
    -------
    str
       The absolute path to the executable

    Raises
    ------
    ValueError
        The executable cannot be accessed

    Credits
    -------
    https://stackoverflow.com/a/377028/3046533
    """
    if executable is None:
        return

    fpath, fname = os.path.split(executable)

    if fpath:
        if os.path.isfile(executable) and os.access(executable, os.X_OK):
            return executable

    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, executable)
            if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK):
                return exe_file

    raise ValueError('Executable {} cannot be accessed'.format(executable))


[docs]def convolution_smooth_values(x, window=5):
    """Use convolutions to smooth a list of numeric values

    Parameters
    ----------
    x : list, tuple
       A list with the numeric values to be smoothed
    window : int
       The residue window to be used to smooth values [default: 5]

    Returns
    -------
    list
       A list with the smoothed numeric values
    """
    box = np.ones(window) / window
    x_smooth = np.convolve(x, box, mode='same')
    return x_smooth


[docs]def get_rmsd(distogram_1, distogram_2, calculate_wrmsd=True):
    """Calculate the RMSD between two different distograms

    Parameters
    ----------
    distogram_1 : :obj:`~conkit.core.distogram.Distogram`
       Distogram 1
    distogram_2 : :obj:`~conkit.core.distogram.Distogram`
       Distogram 2
    calculate_wrmsd: bool
        If True then the WRMSD is calculated using the confidence scores from distogram 1

    Returns
    -------
    tuple
       Two lists with the raw/smoothed RMSD values at each residue position
    """
    rmsd_raw = Distogram.calculate_rmsd(distogram_1, distogram_2, calculate_wrmsd=calculate_wrmsd)
    rmsd_smooth = convolution_smooth_values(np.nan_to_num(rmsd_raw), 10)
    return rmsd_raw, rmsd_smooth


[docs]def get_cmap_validation_metrics(model_cmap_dict, predicted_cmap_dict, sequence, absent_residues):
    """For a given observed contact map and predicted contact map calculate a series of validation metrics at each
    residue position (Accuracy, FN, FNR, FP, FPR, Sensitivity, Specificity)

    Parameters
    ----------
    model_cmap_dict : dict
       Dictionary representation of the contact map observed in the model
    predicted_cmap_dict : dict
       Dictionary representation of the predicted contact map
    sequence: :obj:`~conkit.core.sequence.Sequence`
        The sequence of the model to be validated
    absent_residues: list, tuple, set
        The residues that are missing from the model
    Returns
    -------
    tuple
        Two lists with the raw/smoothed values of each validation metric at each residue position
    """

    def accuracy(tp, fp, tn, fn):
        if (tp + fp + tn + fn) > 0:
            return (tp + tn) / (tp + fp + tn + fn)
        return 0

    def fn(*args):
        return args[-1]

    def fn_rate(tp, fp, tn, fn):
        if (fn + tn) > 0:
            return fn / (fn + tn)
        return 0

    def fp(*args):
        return args[1]

    def fp_rate(tp, fp, tn, fn):
        if (fp + tp) > 0:
            return fp / (fp + tp)
        return 0

    def sensitivity(tp, fp, tn, fn):
        if (fn + tp) > 0:
            return tp / (fn + tp)
        return 0

    def specificity(tp, fp, tn, fn):
        if (fp + tn) > 0:
            return tn / (fp + tn)
        return 0

    nresidues = len(sequence) - len(absent_residues)
    cmap_metrics = [[] for i in range(7)]
    metrics_list = (accuracy, fn, fn_rate, fp, fp_rate, sensitivity, specificity)

    for resnum in sorted(predicted_cmap_dict.keys()):
        if absent_residues and resnum in absent_residues:
            for metric in cmap_metrics:
                metric.append(np.nan)
            continue

        predicted_contact_set = {c for c in predicted_cmap_dict[resnum] if
                                 c[0] not in absent_residues and c[1] not in absent_residues}
        model_contact_set = {c for c in model_cmap_dict[resnum] if
                             c[0] not in absent_residues and c[1] not in absent_residues}

        _fn = len(predicted_contact_set - model_contact_set)
        _tp = len(predicted_contact_set & model_contact_set)
        _fp = len(model_contact_set - predicted_contact_set)
        _tn = nresidues - _fn - _tp - _fp

        for idx, metric in enumerate(metrics_list):
            cmap_metrics[idx].append(metric(_tp, _fp, _tn, _fn))

    smooth_cmap_metrics = []
    for metric in cmap_metrics:
        smooth_cmap_metrics.append(convolution_smooth_values(np.nan_to_num(metric), 5))

    return cmap_metrics, smooth_cmap_metrics


[docs]def get_zscores(model_distogram, predicted_cmap_dict, absent_residues, *metrics):
    """Calculate the Z-Scores for a series of metrics at each residue position
    using the population of residues within 10A

    Parameters
    ----------
    model_distogram : :obj:`~conkit.core.distogram.Distogram`
       Distogram of the model that will be validated
    predicted_cmap_dict : dict
       Dictionary representation of the predicted contact map
    absent_residues: list, tuple, set
        The residues that are missing from the model
    *metrics: list
        The mertics for which the Z-Scores will be calculated

    Returns
    -------
    list
       A list of lists where each sublist contains the Z-Scores for the input metrics across all the residues. The
       sublists containing the Z-Scores are ordered in the same original order as in the input *metrics
    """
    zscore_cmap_metrics = [[] for i in metrics]

    for resnum in sorted(predicted_cmap_dict.keys()):
        if absent_residues and resnum in absent_residues:
            for zscore_metric in zscore_cmap_metrics:
                zscore_metric.append(np.nan)
            continue

        neighbour_residues = model_distogram.find_residues_within(resnum, 10)
        for cmap_metric, zscore_metric in zip(metrics, zscore_cmap_metrics):
            population_scores = [cmap_metric[resid - 1] for resid in neighbour_residues]
            observed_score = cmap_metric[resnum - 1]
            zscore_metric.append(calculate_zscore(observed_score, population_scores))

    return zscore_cmap_metrics


[docs]def calculate_zscore(observed_score, population_scores):
    """Calculate the Z-Score for a given population of values. Z-Score = (score - mean) / stdev

    Parameters
    ----------
    observed_score : float
       The observed score used to calculate the Z-Score
    population_scores : list
       A list containing the scores observed across the samples in the population

    Returns
    -------
    float
       The calculated Z-Score
    """

    if len(population_scores) < 2:
        return 0
    stdev = np.std(population_scores).astype(float)
    if stdev == 0:
        return 0
    mean = np.mean(population_scores).astype(float)
    zscore = (observed_score - mean) / stdev
    return zscore


[docs]def get_residue_ranges(numbers):
    """Given a list of integers, creates a list of ranges with the consecutive numbers found in the list.

    Parameters
    ----------
    numbers: list
       A list of integers

    Returns
    ------
    list
        A list with the ranges of consecutive numbers found in the list
    """
    nums = sorted(set(numbers))
    gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 3 < e]
    edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
    return list(zip(edges, edges))


[docs]def parse_map_align_stdout(stdout):
    """Parse the stdout of map_align and extract the alignment of residues.

    Parameters
    ----------
    stdout : str
       Standard output created with map_align

    Returns
    ------
    dict
        A dictionary where aligned residue numbers in map_b are the keys and residue numbers in map_a values. Only
        misaligned regions are included.
    """

    alignment_dict = {}

    for line in stdout.split('\n'):
        if line and line.split()[0] == "MAX":
            line = line.rstrip().lstrip().split()
            for residue_pair in line[8:]:
                residue_pair = residue_pair.split(":")
                if residue_pair[0] != residue_pair[1]:
                    alignment_dict[int(residue_pair[1])] = int(residue_pair[0])

    return alignment_dict