# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-17, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""SequenceFile container used throughout ConKit"""
from __future__ import division
from __future__ import print_function
__author__ = "Felix Simkovic"
__date__ = "03 Aug 2016"
__version__ = "1.0"
import numpy as np
import sys
if sys.version_info.major < 3:
from itertools import izip as zip
from enum import Enum
from conkit.core._entity import _Entity
[docs]class SequenceAlignmentState(Enum):
"""Alignment states"""
unknown = 0
unaligned = 1
aligned = 2
[docs]class SequenceFile(_Entity):
"""A sequence file object representing a single sequence file
The :obj:`SequenceFile <conkit.core.SequenceFile>` class represents a data structure to hold
:obj:`Sequence <conkit.core.Sequence>` instances in a single sequence file. It contains
functions to store and analyze sequences.
Attributes
----------
id : str
A unique identifier
is_alignment : bool
A boolean status for the alignment
neff : int
The number of effective sequences in the :obj:`SequenceFile <conkit.core.SequenceFile>`
nseq : int
The number of sequences in the :obj:`SequenceFile <conkit.core.SequenceFile>`
remark : list
The :obj:`SequenceFile <conkit.core.SequenceFile>`-specific remarks
status : int
An indication of the sequence file, i.e alignment, no alignment, or unknown
top_sequence : :obj:`Sequence <conkit.core.Sequence>`, None
The first :obj:`Sequence <conkit.core.Sequence>` entry in the file
Examples
--------
>>> from conkit.core import Sequence, SequenceFile
>>> sequence_file = SequenceFile("example")
>>> sequence_file.add(Sequence("foo", "ABCDEF"))
>>> sequence_file.add(Sequence("bar", "ZYXWVU"))
>>> print(sequence_file)
SequenceFile(id="example" nseq=2)
"""
__slots__ = ['_remark', '_status']
def __init__(self, id):
"""Initialise a new :obj:`SequenceFile <conkit.core.SequenceFile>`
Parameters
----------
id : str
A unique identifier for the sequence file
"""
self._remark = []
self._status = SequenceAlignmentState.unknown
super(SequenceFile, self).__init__(id)
def __repr__(self):
return "{0}(id=\"{1}\" nseq={2})".format(self.__class__.__name__, self.id, self.nseq)
@property
def ascii_matrix(self):
"""The alignment encoded in a 2-D ASCII matrix"""
return [list(seq.seq_ascii) for seq in self]
@property
def is_alignment(self):
"""A boolean status for the alignment
Returns
-------
bool
A boolean status for the alignment
"""
seq_length = self.top_sequence.seq_len
self._status = SequenceAlignmentState.aligned
for sequence in self:
if sequence.seq_len != seq_length:
self._status = SequenceAlignmentState.unaligned
break
return self._status == SequenceAlignmentState.aligned
@property
def empty(self):
"""Status of emptiness of sequencefile"""
return len(self) < 1
@property
def neff(self):
"""The number of effective sequences"""
return int(sum(self.calculate_weights()))
@property
def nseq(self):
"""The number of sequences"""
return len(self)
@property
def remark(self):
"""The :obj:`SequenceFile <conkit.core.SequenceFile>`-specific remarks"""
return self._remark
@remark.setter
def remark(self, remark):
"""Set the :obj:`SequenceFile <conkit.core.SequenceFile>` remark
Parameters
----------
remark : str, list
The remark will be added to the list of remarks
"""
if isinstance(remark, list):
self._remark += remark
elif isinstance(remark, tuple):
self._remark += list(remark)
else:
self._remark += [remark]
@property
def status(self):
"""An indication of the residue status, i.e true positive, false positive, or unknown"""
return self._status.value
@status.setter
def status(self, status):
"""Set the status
Parameters
----------
status : int
[0] for `unknown`, [-1] for `no alignment`, or [1] for `alignment`
Raises
------
ValueError
Cannot determine if your sequence file is an alignment or not
"""
self._status = SequenceAlignmentState(status)
@property
def top_sequence(self):
"""The first :obj:`Sequence <conkit.core.Sequence>` entry in :obj:`SequenceFile <conkit.core.SequenceFile>`
Returns
-------
obj
The first :obj:`Sequence <conkit.core.Sequence>` entry in :obj:`SequenceFile <conkit.core.SequenceFile>`
"""
return self.top
[docs] def calculate_meff(self, identity=0.8):
"""Calculate the number of effective sequences
See Also
--------
neff
"""
import warnings
warnings.warn("This function will be deprecated in a future release! Use calculate_neff_with_identity instead!")
return self.calculate_neff_with_identity(identity)
[docs] def calculate_neff_with_identity(self, identity):
"""Calculate the number of effective sequences with specified sequence identity
See Also
--------
neff, calculate_weights
"""
return int(sum(self.calculate_weights(identity=identity)))
[docs] def calculate_weights(self, identity=0.8):
"""Calculate the sequence weights
This function calculates the sequence weights in the
the Multiple Sequence Alignment.
The mathematical function used to calculate `Meff` is
.. math::
M_{eff}=\\sum_{i}\\frac{1}{\\sum_{j}S_{i,j}}
Parameters
----------
identity : float, optional
The sequence identity to use for similarity decision [default: 0.8]
Returns
-------
list
A list of the sequence weights in the alignment
Raises
------
MemoryError
Too many sequences in the alignment for Hamming distance calculation
RuntimeError
SciPy package not installed
ValueError
:obj:`SequenceFile <conkit.core.SequenceFile>` is not an alignment
ValueError
Sequence Identity needs to be between 0 and 1
"""
try:
import scipy.spatial
except ImportError:
raise RuntimeError('Cannot find SciPy package')
if not self.is_alignment:
raise ValueError('This is not an alignment')
if identity < 0 or identity > 1:
raise ValueError("Sequence Identity needs to be between 0 and 1")
msa_mat = np.array(self.ascii_matrix)
n = msa_mat.shape[0] # size of the data
batch_size = min(n, 250) # size of the batches
hamming = np.zeros(n, dtype=np.int) # storage for data
# Separate the distance calculations into batches to avoid MemoryError exceptions.
# This answer was provided by a StackOverflow user. The corresponding suggestion by
# user @WarrenWeckesser: http://stackoverflow.com/a/41090953/3046533
num_full_batches, last_batch = divmod(n, batch_size)
batches = [batch_size] * num_full_batches
if last_batch != 0:
batches.append(last_batch)
for k, batch in enumerate(batches):
i = batch_size * k
dists = scipy.spatial.distance.cdist(msa_mat[i:i + batch], msa_mat, metric='hamming')
hamming[i:i + batch] = (dists < (1 - identity)).sum(axis=1)
return (1. / hamming).tolist()
[docs] def calculate_freq(self):
"""Calculate the gap frequency in each alignment column
This function calculates the frequency of gaps at each
position in the Multiple Sequence Alignment.
Returns
-------
list
A list containing the per alignment-column amino acid
frequency count
Raises
------
MemoryError
Too many sequences in the alignment
RuntimeError
:obj:`SequenceFile <conkit.core.SequenceFile>` is not an alignment
"""
if self.is_alignment:
msa_mat = np.array(self.ascii_matrix)
aa_frequencies = np.where(msa_mat != 45, 1, 0)
aa_counts = np.sum(aa_frequencies, axis=0)
return (aa_counts / len(msa_mat[:, 0])).tolist()
else:
raise ValueError('This is not an alignment')
[docs] def filter(self, min_id=0.3, max_id=0.9, inplace=False):
"""Filter an alignment
Parameters
----------
min_id : float, optional
max_id : float, optional
inplace : bool, optional
Replace the saved order of sequences [default: False]
Returns
-------
obj
The reference to the :obj:`SequenceFile <conkit.core.SequenceFile>`, regardless of inplace
Raises
------
MemoryError
Too many sequences in the alignment for Hamming distance calculation
RuntimeError
SciPy package not installed
ValueError
:obj:`SequenceFile <conkit.core.SequenceFile>` is not an alignment
ValueError
Minimum sequence Identity needs to be between 0 and 1
ValueError
Maximum sequence Identity needs to be between 0 and 1
"""
try:
import scipy.spatial
except ImportError:
raise RuntimeError('Cannot find SciPy package')
if not self.is_alignment:
raise ValueError('This is not an alignment')
if 0 > min_id > 1:
raise ValueError("Minimum sequence Identity needs to be between 0 and 1")
elif 0 > max_id > 1:
raise ValueError("Maximum sequence Identity needs to be between 0 and 1")
# Alignment to ASCII matrix
msa_mat = np.array(self.ascii_matrix)
# Find all throwable sequences
throw = set()
for i in np.arange(len(self)):
ident = 1 - scipy.spatial.distance.cdist([msa_mat[i]], msa_mat[i + 1:], metric='hamming')[0]
throw.update((1 + i + np.argwhere((ident < min_id) | (ident > max_id)).flatten()).tolist())
# Throw the previously selected sequences
sequence_file = self._inplace(inplace)
for i in reversed(list(throw)):
sequence_file.remove(self[i].id)
return sequence_file
[docs] def sort(self, kword, reverse=False, inplace=False):
"""Sort the :obj:`SequenceFile <conkit.core.SequenceFile>`
Parameters
----------
kword : str
The dictionary key to sort sequences by
reverse : bool, optional
Sort the sequences in reverse order [default: False]
inplace : bool, optional
Replace the saved order of sequences [default: False]
Returns
-------
obj
The reference to the :obj:`SequenceFile <conkit.core.SequenceFile>`, regardless of inplace
Raises
------
ValueError
``kword`` not in :obj:`SequenceFile <conkit.core.SequenceFile>`
"""
sequence_file = self._inplace(inplace)
sequence_file._sort(kword, reverse)
return sequence_file
[docs] def trim(self, start, end, inplace=False):
"""Trim the :obj:`SequenceFile <conkit.core.SequenceFile>`
Parameters
----------
start : int
First residue to include
end : int
Final residue to include
inplace : bool, optional
Replace the saved order of sequences [default: False]
Returns
-------
obj
The reference to the :obj:`SequenceFile <conkit.core.SequenceFile>`, regardless of inplace
"""
sequence_file = self._inplace(inplace)
if self.is_alignment:
i = start - 1
j = end
for sequence in sequence_file:
sequence.seq = sequence.seq[i:j]
return sequence_file
else:
raise ValueError("This is not an alignment")