Source code for conkit.core.SequenceFile

"""
Storage space for a sequence file
"""

from __future__ import division
from __future__ import print_function

__author__ = "Felix Simkovic"
__date__ = "07 Sep 2016"
__version__ = 0.1

from conkit.core.Entity import Entity

import numpy
try:
    import scipy.spatial
    SCIPY = True
except ImportError:
    SCIPY = False


[docs]class SequenceFile(Entity):
    """A sequence file object representing a single sequence file

    Description
    -----------
    The :obj:`conkit.core.SequenceFile` class represents a data structure to hold 
    :obj:`conkit.core.Sequence` instances in a single sequence file. It contains 
    functions to store and analyze sequences.

    Attributes
    ----------
    id : str
       A unique identifier
    is_alignment : bool
       A boolean status for the alignment
    nseqs : int
       The number of sequences in the :obj:`conkit.core.SequenceFile`
    remark : list
       The :obj:`conkit.core.SequenceFile`-specific remarks
    status : int
       An indication of the sequence file, i.e alignment, no alignment, or unknown
    top_sequence : :obj:`conkit.core.Sequence`, None
       The first :obj:`conkit.core.Sequence` entry in the file


    Examples
    --------
    >>> from conkit.core import Sequence, SequenceFile
    >>> sequence_file = SequenceFile("example")
    >>> sequence_file.add(Sequence("foo", "ABCDEF"))
    >>> sequence_file.add(Sequence("bar", "ZYXWVU"))
    >>> print(sequence_file)
    SequenceFile(id="example" nseqs=2)

    """
    __slots__ = ['_remark', '_status']

    _UNKNOWN = 0
    _NO_ALIGNMENT = -1
    _YES_ALIGNMENT = 1

    def __init__(self, id):
        """Initialise a new sequence file

        Parameters
        ----------
        id : str
           A unique identifier for the sequence file

        """
        self._remark = []
        self._status = SequenceFile._UNKNOWN
        super(SequenceFile, self).__init__(id)

    def __repr__(self):
        return "SequenceFile(id=\"{0}\" nseqs={1})".format(self.id, self.nseqs)

    @property
    def is_alignment(self):
        """A boolean status for the alignment

        Returns
        -------
        is_alignment : bool
           A boolean status for the alignment
        """
        seq_length = self.top_sequence.seq_len
        self.status = SequenceFile._YES_ALIGNMENT
        for sequence in self:
            if sequence.seq_len != seq_length:
                self.status = SequenceFile._NO_ALIGNMENT
                break
        return True if self.status == SequenceFile._YES_ALIGNMENT else False

    @property
    def nseqs(self):
        """The number of :obj:`conkit.core.Sequence` instances in the :obj:`conkit.core.SequenceFile`

        Returns
        -------
        nseqs : int
           The number of sequences in the :obj:`conkit.core.SequenceFile`

        """
        return len(self)

    @property
    def remark(self):
        """The :obj:`conkit.core.SequenceFile`-specific remarks"""
        return self._remark

    @remark.setter
    def remark(self, remark):
        """Set the :obj:`conkit.core.SequenceFile` remark

        Parameters
        ----------
        remark : str, list
           The remark will be added to the list of remarks

        """
        if isinstance(remark, list):
            self._remark += remark
        elif isinstance(remark, tuple):
            self._remark += list(remark)
        else:
            self._remark += [remark]

    @property
    def status(self):
        """An indication of the residue status, i.e true positive, false positive, or unknown"""
        return self._status

    @status.setter
    def status(self, status):
        """Set the status

        Parameters
        ----------
        status : int
           [0] for `unknown`, [-1] for `no alignment`, or [1] for `alignment`

        Raises
        ------
        ValueError
           Unknown status

        """
        if any(i == status for i in [SequenceFile._UNKNOWN, SequenceFile._NO_ALIGNMENT, SequenceFile._YES_ALIGNMENT]):
            self._status = status
        else:
            raise ValueError("Unknown status")

    @property
    def top_sequence(self):
        """The first :obj:`conkit.core.Sequence` entry in :obj:`conkit.core.SequenceFile`

        Returns
        -------
        top_sequence : :obj:`conkit.core.Sequence`, None
           The first :obj:`conkit.core.Sequence` entry in :obj:`conkit.core.SequenceFile`

        """
        if len(self) > 0:
            return self[0]
        else:
            return None

[docs]    def calculate_meff(self, identity=0.7):
        """Calculate the number of effective sequences

        This function calculates the number of effective
        sequences (`Meff`) in the Multiple Sequence Alignment.

        The mathematical function used to calculate `Meff` is

        .. math::

           M_{eff}=\\sum_{i}\\frac{1}{\\sum_{j}S_{i,j}}

        Parameters
        ----------
        identity : float, optional
           The sequence identity to use for similarity decision [default: 0.7]

        Returns
        -------
        meff : int
           The number of effective sequences

        Raises
        ------
        MemoryError
           Too many sequences in the alignment for Hamming distance calculation
        RuntimeError
           SciPy package not installed
        ValueError
           :obj:`conkit.core.SequenceFile` is not an alignment
        ValueError
           Sequence Identity needs to be between 0 and 1

        """
        if not SCIPY:
            raise RuntimeError('Cannot find SciPy package')

        if not self.is_alignment:
            raise ValueError('This is not an alignment')

        if identity < 0 or identity > 1:
            raise ValueError("Sequence Identity needs to be between 0 and 1")

        # Alignment to unsigned integer matrix
        msa_mat = numpy.asarray(
            [numpy.fromstring(sequence_entry.seq, dtype=numpy.uint8) for sequence_entry in self], dtype=numpy.uint8
        )

        # Pre-define some variables
        n = msa_mat.shape[0]                        # size of the data
        batch_size = min(n, 250)                    # size of the batches
        hamming = numpy.zeros(n, dtype=numpy.int)   # storage for data

        # Separate the distance calculations into batches to avoid MemoryError exceptions.
        # This answer was provided by a StackOverflow user. The corresponding suggestion by
        # user @WarrenWeckesser: http://stackoverflow.com/a/41090953/3046533
        num_full_batches, last_batch = divmod(n, batch_size)
        batches = [batch_size] * num_full_batches
        if last_batch != 0:
            batches.append(last_batch)
        for k, batch in enumerate(batches):
            i = batch_size * k
            dists = scipy.spatial.distance.cdist(msa_mat[i:i+batch], msa_mat, metric='hamming')
            hamming[i:i+batch] = (dists < (1 - identity)).sum(axis=1)

        return (1. / hamming).sum().astype(int).item()

[docs]    def calculate_freq(self):
        """Calculate the gap frequency in each alignment column

        This function calculates the frequency of gaps at each
        position in the Multiple Sequence Alignment.

        Returns
        -------
        frequency : list
           A list containing the per alignment-column amino acid
           frequency count

        Raises
        ------
        MemoryError
           Too many sequences in the alignment
        RuntimeError
           :obj:`conkit.core.SequenceFile` is not an alignment

        """
        if not self.is_alignment:
            raise ValueError('This is not an alignment')

        msa_mat = numpy.asarray(
            [numpy.fromstring(sequence_entry.seq, dtype='uint8') for sequence_entry in self], dtype='uint8'
        )
        # matrix of 0s and 1s; 1 if char is '-'
        frequencies = numpy.where(msa_mat != 45, 1, 0)
        # sum all values per row
        gap_counts = numpy.sum(frequencies, axis=0)
        # divide all by sequence length
        return (gap_counts / len(msa_mat.T[0])).tolist()

[docs]    def sort(self, kword, reverse=False, inplace=False):
        """Sort the :obj:`conkit.core.SequenceFile`

        Parameters
        ----------
        kword : str
           The dictionary key to sort contacts by
        reverse : bool, optional
           Sort the contact pairs in descending order [default: False]
        inplace : bool, optional
           Replace the saved order of contacts [default: False]

        Returns
        -------
        contact_map : :obj:`conkit.core.ContactMap`
           The reference to the :obj:`conkit.core.ContactMap`, regardless of inplace

        Raises
        ------
        ValueError
           ``kword`` not in :obj:`conkit.core.SequenceFile`

        """
        sequence_file = self._inplace(inplace)
        sequence_file._sort(kword, reverse)
        return sequence_file