Source code for conkit.core.sequencefile

# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-21, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""SequenceFile container used throughout ConKit"""

from __future__ import division

__author__ = "Felix Simkovic"
__date__ = "03 Aug 2016"
__version__ = "0.13.3"

import numpy as np
import sys

from conkit.core.entity import Entity
from conkit.core.mappings import AminoAcidMapping, SequenceAlignmentState


[docs]class SequenceFile(Entity): """A sequence file object representing a single sequence file The :obj:`~conkit.core.sequencefile.SequenceFile` class represents a data structure to hold :obj:`~conkit.core.sequence.Sequence` instances in a single sequence file. It contains functions to store and analyze sequences. Attributes ---------- id : str A unique identifier is_alignment : bool A boolean status for the alignment meff : int The number of effective sequences in the :obj:`~conkit.core.sequencefile.SequenceFile` nseq : int The number of sequences in the :obj:`~conkit.core.sequencefile.SequenceFile` remark : list The :obj:`~conkit.core.sequencefile.SequenceFile`-specific remarks status : int An indication of the sequence file, i.e alignment, no alignment, or unknown top_sequence : :obj:`~conkit.core.sequence.Sequence`, None The first :obj:`~conkit.core.sequence.Sequence` entry in the file Examples -------- >>> from conkit.core import Sequence, SequenceFile >>> sequence_file = SequenceFile("example") >>> sequence_file.add(Sequence("foo", "ABCDEF")) >>> sequence_file.add(Sequence("bar", "ZYXWVU")) >>> print(sequence_file) SequenceFile(id="example" nseq=2) """ __slots__ = ["_remark", "_status"] def __init__(self, id): """Initialise a new :obj:`~conkit.core.sequencefile.SequenceFile` Parameters ---------- id : str A unique identifier for the sequence file """ self._remark = [] self._status = SequenceAlignmentState.unknown super(SequenceFile, self).__init__(id) def __repr__(self): return '{}(id="{}" nseq={})'.format(self.__class__.__name__, self.id, self.nseq) @property def ascii_matrix(self): """The alignment encoded in a 2-D ASCII matrix""" return [list(seq.seq_ascii) for seq in self] @property def encoded_matrix(self): """The alignment encoded for contact prediction""" return [list(seq.seq_encoded) for seq in self] @property def is_alignment(self): """A boolean status for the alignment Returns ------- bool A boolean status for the alignment """ seq_length = self.top_sequence.seq_len self._status = SequenceAlignmentState.aligned for sequence in self: if sequence.seq_len != seq_length: self._status = SequenceAlignmentState.unaligned break return self._status == SequenceAlignmentState.aligned @property def diversity(self): """The diversity of an alignment defined by :math:`\\sqrt{N}/L`. ``N`` equals the number of sequences in the alignment and ``L`` the sequence length """ if self.empty: return 0.0 elif self.is_alignment: return (np.sqrt(len(self)) / float(self.top.seq_len)).round(decimals=3).item() else: raise ValueError("This is not an alignment") @property def empty(self): """Status of emptiness of sequencefile""" return len(self) < 1 @property def meff(self): """The number of effective sequences""" return int(sum(self.get_weights())) @property def nseq(self): """The number of sequences""" return len(self) @property def remark(self): """The :obj:`~conkit.core.sequencefile.SequenceFile`-specific remarks""" return self._remark @remark.setter def remark(self, remark): """Set the :obj:`~conkit.core.sequencefile.SequenceFile` remark Parameters ---------- remark : str, list The remark will be added to the list of remarks """ if isinstance(remark, list): self._remark += remark elif isinstance(remark, tuple): self._remark += list(remark) else: self._remark += [remark] @property def status(self): """An indication of the residue status, i.e true positive, false positive, or unknown""" return self._status.value @status.setter def status(self, status): """Set the status Parameters ---------- status : int [0] for `unknown`, [-1] for `no alignment`, or [1] for `alignment` Raises ------ :exc:`ValueError` Cannot determine if your sequence file is an alignment or not """ self._status = SequenceAlignmentState(status) @property def top_sequence(self): """The first :obj:`~conkit.core.sequence.Sequence` entry in :obj:`~conkit.core.sequencefile.SequenceFile` Returns ------- :obj:`~conkit.core.sequence.Sequence` The first :obj:`~conkit.core.sequence.Sequence` entry in :obj:`~conkit.core.sequencefile.SequenceFile` """ return self.top
[docs] def get_meff_with_id(self, identity): """Calculate the number of effective sequences with specified sequence identity See Also -------- meff, get_weights """ return int(sum(self.get_weights(identity=identity)))
[docs] def get_weights(self, identity=0.8): """Calculate the sequence weights This function calculates the sequence weights in the the Multiple Sequence Alignment. The mathematical function used to calculate `Meff` is .. math:: M_{eff}=\\sum_{i}\\frac{1}{\\sum_{j}S_{i,j}} Parameters ---------- identity : float, optional The sequence identity to use for similarity decision [default: 0.8] Returns ------- list A list of the sequence weights in the alignment Raises ------ :exc:`ValueError` :obj:`~conkit.core.sequencefile.SequenceFile` is not an alignment :exc:`ValueError` Sequence Identity needs to be between 0 and 1 """ if identity < 0 or identity > 1: raise ValueError("Sequence Identity needs to be between 0 and 1") if self.is_alignment: from conkit.core.ext.c_sequencefile import c_get_weights X = np.array(self.ascii_matrix, dtype=np.int64) hamming = np.zeros(X.shape[0], dtype=np.float64) c_get_weights(X, identity, hamming) return hamming.tolist() else: raise ValueError("This is not an alignment")
[docs] def get_frequency(self, symbol): """Calculate the frequency of an amino acid (symbol) in each Multiple Sequence Alignment column Returns ------- list A list containing the per alignment-column amino acid frequency count Raises ------ :exc:`RuntimeError` :obj:`~conkit.core.sequencefile.SequenceFile` is not an alignment """ if self.is_alignment: from conkit.core.ext.c_sequencefile import c_get_frequency X = np.array(self.encoded_matrix, dtype=np.int64) symbol = getattr(AminoAcidMapping, symbol, AminoAcidMapping["X"]).value frequencies = np.zeros(X.shape[1], dtype=np.int64) c_get_frequency(X, symbol, frequencies) return frequencies.tolist() else: raise ValueError("This is not an alignment")
[docs] def filter(self, min_id=0.3, max_id=0.9, inplace=False): """Filter sequences from an alignment according to the minimum and maximum identity between the sequences Parameters ---------- min_id : float, optional Minimum sequence identity max_id : float, optional Maximum sequence identity inplace : bool, optional Replace the saved order of sequences [default: False] Returns ------- :obj:`~conkit.core.sequencefile.SequenceFile` The reference to the :obj:`~conkit.core.sequencefile.SequenceFile`, regardless of inplace Raises ------ :exc:`ValueError` :obj:`~conkit.core.sequencefile.SequenceFile` is not an alignment :exc:`ValueError` Minimum sequence identity needs to be between 0 and 1 :exc:`ValueError` Maximum sequence identity needs to be between 0 and 1 """ if 0 > min_id > 1: raise ValueError("Minimum sequence identity needs to be between 0 and 1") elif 0 > max_id > 1: raise ValueError("Maximum sequence identity needs to be between 0 and 1") if self.is_alignment: from conkit.core.ext.c_sequencefile import c_filter X = np.array(self.ascii_matrix, dtype=np.int64) throwables = np.full(X.shape[0], False, dtype=np.bool) c_filter(X, min_id, max_id, throwables) filtered = self._inplace(inplace) for i, sequence in enumerate(self): if throwables[i]: filtered.remove(sequence.id) return filtered else: raise ValueError("This is not an alignment")
[docs] def filter_gapped(self, min_prop=0.0, max_prop=0.9, inplace=True): """Filter all sequences a gap proportion greater than the limit Parameters ---------- min_prop : float, optional Minimum allowed gap proportion [default: 0.0] max_prop : float, optional Maximum allowed gap proportion [default: 0.9] inplace : bool, optional Replace the saved order of sequences [default: False] Returns ------- :obj:`~conkit.core.sequencefile.SequenceFile` The reference to the :obj:`~conkit.core.sequencefile.SequenceFile`, regardless of inplace Raises ------ :exc:`ValueError` :obj:`~conkit.core.sequencefile.SequenceFile` is not an alignment :exc:`ValueError` Minimum gap proportion needs to be between 0 and 1 :exc:`ValueError` Maximum gap proportion needs to be between 0 and 1 """ if 0.0 > min_prop > 1.0: raise ValueError("Minimum gap proportion needs to be between 0 and 1") elif 0.0 > max_prop > 1.0: raise ValueError("Maximum gap proportion needs to be between 0 and 1") symbol = "X" if self.is_alignment: from conkit.core.ext.c_sequencefile import c_filter_symbol X = np.array(self.encoded_matrix, dtype=np.int64) symbol = getattr(AminoAcidMapping, symbol, AminoAcidMapping["X"]).value throwables = np.full(X.shape[0], False, dtype=np.bool) c_filter_symbol(X, min_prop, max_prop, symbol, throwables) filtered = self._inplace(inplace) for i, sequence in enumerate(self): if throwables[i]: filtered.remove(sequence.id) return filtered else: raise ValueError("This is not an alignment")
[docs] def sort(self, kword, reverse=False, inplace=False): """Sort the :obj:`~conkit.core.sequencefile.SequenceFile` Parameters ---------- kword : str The dictionary key to sort sequences by reverse : bool, optional Sort the sequences in reverse order [default: False] inplace : bool, optional Replace the saved order of sequences [default: False] Returns ------- :obj:`~conkit.core.sequencefile.SequenceFile` The reference to the :obj:`~conkit.core.sequencefile.SequenceFile`, regardless of inplace Raises ------ :exc:`ValueError` ``kword`` not in :obj:`~conkit.core.sequencefile.SequenceFile` """ sequence_file = self._inplace(inplace) sequence_file._sort(kword, reverse) return sequence_file
[docs] def to_string(self): """Return the :obj:`~conkit.core.sequencefile.SequenceFile` as :obj:`str`""" content = [s.seq for s in self] return "\n".join(content)
[docs] def trim(self, start, end, inplace=False): """Trim the :obj:`~conkit.core.sequencefile.SequenceFile` Parameters ---------- start : int First residue to include end : int Final residue to include inplace : bool, optional Replace the saved order of sequences [default: False] Returns ------- :obj:`~conkit.core.sequencefile.SequenceFile` The reference to the :obj:`~conkit.core.sequencefile.SequenceFile`, regardless of inplace """ sequence_file = self._inplace(inplace) if self.is_alignment: i = start - 1 j = end for sequence in sequence_file: sequence.seq = sequence.seq[i:j] return sequence_file else: raise ValueError("This is not an alignment")
[docs] def summary(self): """Generate a summary for the :obj:`~conkit.core.sequencefile.SequenceFile` Returns ------- str """ sstream = "Summary for {id}{nline}" sstream += "-------------------------------{nline}" sstream += "Alignment:{tab}{tab}{is_alignment}" sstream += "Number of sequences:{tab}{nseq}{nline}" % self.nseq sstream += "Alignment depth (0.8):{tab}{meff}{nline}" % self.meff return sstream.format( id=self.id, is_alignment=self.is_alignment, tab="\t", nline="\n", nseq=self.nseq, meff=self.meff )