Source code for conkit.applications.jackhmmer

# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-17, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Command line object for Jackhmmer Multiple Sequence Alignment generation
"""

__author__ = "Felix Simkovic"
__date__ = "01 June 2016"
__version__ = "0.1"

from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline


[docs]class JackhmmerCommandline(AbstractCommandline):
    """
    Command line object for Jackhmmer [#]_ alignment generation

    http://hmmer.org/

    Jackhmmer is an algorithm that uses iterative searches a protein sequence
    against a protein sequence database to find sequence homologs.

    .. [#] Johnson L. S., Eddy S. R., Portugaly E. (2010). Hidden Markov
       Model Speed Heuristic and Iterative HMM Search Procedure. BMC Bioinformatics 11, 431.

    Examples
    --------
    To generate a Multiple Sequence Alignment, use:

    >>> from conkit.applications import JackhmmerCommandline
    >>> jackhmmer_cline = JackhmmerCommandline(
    ...     input="test.fasta", database="uniref100.fasta"
    ... )
    >>> print(jackhmmer_cline)
    jackhmmer test.fasta uniref100.fasta

    You would typically run the command line with :func:`jackhmmer_cline` or via
    the Python subprocess module.

    """

    def __init__(self, cmd='jackhmmer', **kwargs):
        self.parameters = [
            _Option(['-N', 'niterations'],
                    'set maximum number of iterations [default: 5]',
                    equate=False),

            # Options directing output
            _Option(['-o', 'output'],
                    'direct output to file <f>, not stdout',
                    filename=True,
                    equate=False),
            _Option(['-A', 'msa_hits'],
                    'save multiple alignment of hits to file <f>',
                    filename=True,
                    equate=False),
            _Option(['--tblout', 'per_sequence_hits'],
                    'save parseable table of per-sequence hits to file <f>',
                    filename=True,
                    equate=False),
            _Option(['--domtblout', 'per_domain_hits'],
                    'save parseable table of per-domain hits to file <f>',
                    filename=True,
                    equate=False),
            _Option(['--chkhmm', 'hmm_checkpoints'],
                    'save HMM checkpoints to files <f>-<iteration>.hmm',
                    filename=True,
                    equate=False),
            _Option(['--chkali', 'alignment_checkpoints'],
                    'save alignment checkpoints to files <f>-<iteration>.sto',
                    filename=True,
                    equate=False),
            _Switch(['--acc', 'accession'],
                    'prefer accessions over names in output'),
            _Switch(['--noali', 'no_alignment'],
                    'don\'t output alignments, so output is smaller'),
            _Switch(['--notextw', 'notextw'],
                    'unlimit ASCII text output line width'),
            _Switch(['--textw', 'textw'],
                    'set max width of ASCII text output lines [default: 120] (n>=120)'),

            # Options controlling scoring system in first iteration
            _Option(['--popen', 'gap_open_probability'],
                    'gap open probability',
                    equate=False),
            _Option(['--pextend', 'gap_extend_probability'],
                    'gap extend probability',
                    equate=False),
            _Option(['--mx', 'matrix_choice'],
                    'substitution score matrix choice (of some built-in matrices)',
                    equate=False),
            _Option(['--mxfile', 'matrix_option'],
                    'read substitution score matrix from file <f>',
                    filename=True,
                    equate=False),

            # Options controlling reporting thresholds
            _Option(['-E', 'evalue'],
                    'report sequences <= this E-value threshold in output [default: 10.0] (x>0)',
                    equate=False),
            _Option(['-T', 'score_threshold'],
                    'report sequences >= this score threshold in output',
                    equate=False),
            _Option(['--domE', 'domain_evalue'],
                    'report domains <= this E-value threshold in output [default: 10.0] (x>0)',
                    equate=False),
            _Option(['--domT', 'domain_score_threshold'],
                    'report domains >= this score cutoff in output',
                    equate=False),

            # Options controlling significance thresholds for inclusion in next round
            _Option(['--incE', 'inclusion_evalue'],
                    'consider sequences <= this E-value threshold as significant',
                    equate=False),
            _Option(['--incT', 'inclusion_score_threshold'],
                    'consider sequences >= this score threshold as significant',
                    equate=False),
            _Option(['--incdomE', 'inclusion_domain_evalue'],
                    'consider domains <= this E-value threshold as significant',
                    equate=False),
            _Option(['--incdomT', 'inclusion_domain_score_threshold'],
                    'consider domains >= this score threshold as significant',
                    equate=False),

            # Options controlling acceleration heuristics
            _Switch(['--max', 'no_heuristics'],
                    'Turn all heuristic filters off (less speed, more power)'),
            _Option(['--F1', 'stage1_threshold'],
                    'Stage 1 (MSV) threshold: promote hits w/ P <= F1 [default: 0.02]',
                    equate=False),
            _Option(['--F2', 'stage2_threshold'],
                    'Stage 2 (Vit) threshold: promote hits w/ P <= F2 [default: 1e-3]',
                    equate=False),
            _Option(['--F3', 'stage3_threshold'],
                    'Stage 3 (Fwd) threshold: promote hits w/ P <= F3 [default: 1e-5]',
                    equate=False),
            _Switch(['--nobias', 'nobias'],
                    'turn off composition bias filter'),

            # Options controlling model construction after first iteration
            _Switch(['--fast', 'fast'],
                    'assign cols w/ >= symfrac residues as consensus'),
            _Switch(['--hand', 'hand'],
                    'manual construction (requires reference annotation)'),
            _Option(['--symfrac', 'symfrac'],
                    'sets sym fraction controlling --fast construction',
                    equate=False),
            _Option(['--fragthres', 'fragthres'],
                    'if L <= x*alen, tag sequence as a fragment',
                    equate=False),

            # Options controlling relative weights in models after first iteration
            _Switch(['--wpb', 'henikoff_pb_weights'],
                    'Henikoff position-based weights  [default]'),
            _Switch(['--wgsc', 'GSC_weights'],
                    'Gerstein/Sonnhammer/Chothia tree weights'),
            _Switch(['--wblosum', 'henikoff_sf_weights'],
                    'Henikoff simple filter weights'),
            _Switch(['--wnone', 'no_weight'],
                    'don\'t do any relative weighting; set all to 1'),
            _Option(['--wid', 'wblosum_cutoff'],
                    'for --wblosum: set identity cutoff [default: 0.62] (0<=x<=1)',
                    equate=False),

            # Options controlling effective seq number in models after first iteration
            _Switch(['--eent', 'eent'],
                    'adjust eff seq # to achieve relative entropy target [default]'),
            _Switch(['--eclust', 'ecluse'],
                    'eff seq # is # of single linkage clusters'),
            _Switch(['--enone', 'enone'],
                    'no effective seq # weighting: just use nseq'),
            _Option(['--eset', 'eset'],
                    'set eff seq # for all models to <x>',
                    equate=False),
            _Option(['--ere', 'ere'],
                    'for --eent: set minimum rel entropy/position to <x>',
                    equate=False),
            _Option(['--esigma', 'esigma'],
                    'for --eent: set sigma param to <x> [default: 45.0]',
                    equate=False),
            _Option(['--eid', 'eid'],
                    'for --eclust: set fractional identity cutoff to <x> [default: 0.62]',
                    equate=False),

            # Options controlling prior strategy in models after first iteration
            _Switch(['--pnone', 'pnone'],
                    'don\'t use any prior; parameters are frequencies'),
            _Switch(['--plaplace', 'plaplace'],
                    'use a Laplace +1 prior'),

            # Options controlling E value calibration
            _Option(['--EmL', 'eml'],
                    'length of sequences for MSV Gumbel mu fit [default: 200] (n>0)',
                    equate=False),
            _Option(['--EmN', 'emn'],
                    'number of sequences for MSV Gumbel mu fit [default: 200] (n>0)',
                    equate=False),
            _Option(['--EvL', 'evl'],
                    'length of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)',
                    equate=False),
            _Option(['--EvN', 'evn'],
                    'number of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)',
                    equate=False),
            _Option(['--EfL', 'efl'],
                    'length of sequences for Forward exp tail tau fit [default: 100] (n>0)',
                    equate=False),
            _Option(['--EfN', 'efn'],
                    'number of sequences for Forward exp tail tau fit [default: 200] (n>0)',
                    equate=False),
            _Option(['--Eft', 'eft'],
                    'tail mass for Forward exponential tail tau fit [default: 0.04] (0<x<1)',
                    equate=False),

            # Other expert options
            _Switch(['--nonull2', 'nonull2'],
                    'turn off biased composition score corrections'),
            _Option(['-Z', 'ncomparison'],
                    'set # of comparisons done, for E-value calculation',
                    equate=False),
            _Option(['--domZ', 'domz'],
                    'set # of significant seqs, for domain E-value calculation',
                    equate=False),
            _Option(['--seed', 'seed'],
                    'set RNG seed to <n> (if 0: one-time arbitrary seed) [default: 42]',
                    equate=False),
            _Option(['--qformat', 'qformat'],
                    'assert query <seqfile> is in format <s>: no autodetection',
                    equate=False),
            _Option(['--tformat', 'tformat'],
                    'assert target < seqdb > is in format < s >>: no autodetection',
                    equate=False),
            _Option(['--cpu', 'cpu'],
                    'number of parallel CPU workers to use for multithreads',
                    equate=False),

            # Required arguments
            _Argument(['input'],
                      'sequence containing file',
                      filename=True,
                      is_required=True),
            _Argument(['database'],
                      'sequence database',
                      filename=True,
                      is_required=True),
        ]

        AbstractCommandline.__init__(self, cmd, **kwargs)