Source code for conkit.applications.jackhmmer

# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-18, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Command line object for Jackhmmer Multiple Sequence Alignment generation
"""

# Module metadata: author, creation date and version of this wrapper module.
__author__ = "Felix Simkovic"
__date__ = "01 June 2016"
__version__ = "0.1"

from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline


class JackhmmerCommandline(AbstractCommandline):
    """
    Command line object for Jackhmmer [#]_ alignment generation

    http://hmmer.org/

    Jackhmmer is an algorithm that iteratively searches a protein sequence
    against a protein sequence database to find sequence homologs.

    .. [#] Johnson L. S., Eddy S. R., Portugaly E. (2010). Hidden Markov
       Model Speed Heuristic and Iterative HMM Search Procedure. BMC Bioinformatics 11, 431.

    Parameters
    ----------
    cmd : str, optional
       The executable name or path [default: ``jackhmmer``]
    **kwargs
       Keyword arguments forwarded unchanged to
       :class:`~Bio.Application.AbstractCommandline`. Each keyword is the
       last name in one of the parameter definitions below
       (e.g. ``input``, ``database``, ``niterations``, ``cpu``).

    Examples
    --------
    To generate a Multiple Sequence Alignment, use:

    >>> from conkit.applications import JackhmmerCommandline
    >>> jackhmmer_cline = JackhmmerCommandline(
    ...     input="test.fasta", database="uniref100.fasta"
    ... )
    >>> print(jackhmmer_cline)
    jackhmmer test.fasta uniref100.fasta

    You would typically run the command line with :func:`jackhmmer_cline` or via
    the :mod:`~subprocess` module.

    """

    def __init__(self, cmd='jackhmmer', **kwargs):
        # The order of this list is the order in which parameters are emitted
        # on the command line, so the two positional arguments stay last.
        self.parameters = [
            _Option(['-N', 'niterations'], 'set maximum number of iterations [default: 5]', equate=False),

            # Options directing output
            _Option(['-o', 'output'], 'direct output to file <f>, not stdout', filename=True, equate=False),
            _Option(['-A', 'msa_hits'], 'save multiple alignment of hits to file <f>', filename=True, equate=False),
            _Option(
                ['--tblout', 'per_sequence_hits'],
                'save parseable table of per-sequence hits to file <f>',
                filename=True,
                equate=False),
            _Option(
                ['--domtblout', 'per_domain_hits'],
                'save parseable table of per-domain hits to file <f>',
                filename=True,
                equate=False),
            _Option(
                ['--chkhmm', 'hmm_checkpoints'],
                'save HMM checkpoints to files <f>-<iteration>.hmm',
                filename=True,
                equate=False),
            _Option(
                ['--chkali', 'alignment_checkpoints'],
                'save alignment checkpoints to files <f>-<iteration>.sto',
                filename=True,
                equate=False),
            _Switch(['--acc', 'accession'], 'prefer accessions over names in output'),
            _Switch(['--noali', 'no_alignment'], 'don\'t output alignments, so output is smaller'),
            _Switch(['--notextw', 'notextw'], 'unlimit ASCII text output line width'),
            # --textw takes a value <n>, so it has to be an option and not a
            # bare switch; otherwise the width can never be handed over.
            _Option(
                ['--textw', 'textw'],
                'set max width of ASCII text output lines [default: 120] (n>=120)',
                equate=False),

            # Options controlling scoring system in first iteration
            _Option(['--popen', 'gap_open_probability'], 'gap open probability', equate=False),
            _Option(['--pextend', 'gap_extend_probability'], 'gap extend probability', equate=False),
            _Option(
                ['--mx', 'matrix_choice'], 'substitution score matrix choice (of some built-in matrices)',
                equate=False),
            _Option(
                ['--mxfile', 'matrix_option'],
                'read substitution score matrix from file <f>',
                filename=True,
                equate=False),

            # Options controlling reporting thresholds
            _Option(
                ['-E', 'evalue'],
                'report sequences <= this E-value threshold in output [default: 10.0] (x>0)',
                equate=False),
            _Option(['-T', 'score_threshold'], 'report sequences >= this score threshold in output', equate=False),
            _Option(
                ['--domE', 'domain_evalue'],
                'report domains <= this E-value threshold in output [default: 10.0] (x>0)',
                equate=False),
            _Option(
                ['--domT', 'domain_score_threshold'], 'report domains >= this score cutoff in output', equate=False),

            # Options controlling significance thresholds for inclusion in next round
            _Option(
                ['--incE', 'inclusion_evalue'],
                'consider sequences <= this E-value threshold as significant',
                equate=False),
            _Option(
                ['--incT', 'inclusion_score_threshold'],
                'consider sequences >= this score threshold as significant',
                equate=False),
            _Option(
                ['--incdomE', 'inclusion_domain_evalue'],
                'consider domains <= this E-value threshold as significant',
                equate=False),
            _Option(
                ['--incdomT', 'inclusion_domain_score_threshold'],
                'consider domains >= this score threshold as significant',
                equate=False),

            # Options controlling acceleration heuristics
            _Switch(['--max', 'no_heuristics'], 'Turn all heuristic filters off (less speed, more power)'),
            _Option(
                ['--F1', 'stage1_threshold'],
                'Stage 1 (MSV) threshold: promote hits w/ P <= F1 [default: 0.02]',
                equate=False),
            _Option(
                ['--F2', 'stage2_threshold'],
                'Stage 2 (Vit) threshold: promote hits w/ P <= F2 [default: 1e-3]',
                equate=False),
            _Option(
                ['--F3', 'stage3_threshold'],
                'Stage 3 (Fwd) threshold: promote hits w/ P <= F3 [default: 1e-5]',
                equate=False),
            _Switch(['--nobias', 'nobias'], 'turn off composition bias filter'),

            # Options controlling model construction after first iteration
            _Switch(['--fast', 'fast'], 'assign cols w/ >= symfrac residues as consensus'),
            _Switch(['--hand', 'hand'], 'manual construction (requires reference annotation)'),
            _Option(['--symfrac', 'symfrac'], 'sets sym fraction controlling --fast construction', equate=False),
            # The documented flag is --fragthresh. The first name is what gets
            # written to the command line; the previous spelling is retained
            # as a legacy alias and the keyword name stays 'fragthres'.
            _Option(
                ['--fragthresh', '--fragthres', 'fragthres'],
                'if L <= x*alen, tag sequence as a fragment',
                equate=False),

            # Options controlling relative weights in models after first iteration
            _Switch(['--wpb', 'henikoff_pb_weights'], 'Henikoff position-based weights  [default]'),
            _Switch(['--wgsc', 'GSC_weights'], 'Gerstein/Sonnhammer/Chothia tree weights'),
            _Switch(['--wblosum', 'henikoff_sf_weights'], 'Henikoff simple filter weights'),
            _Switch(['--wnone', 'no_weight'], 'don\'t do any relative weighting; set all to 1'),
            _Option(
                ['--wid', 'wblosum_cutoff'],
                'for --wblosum: set identity cutoff [default: 0.62] (0<=x<=1)',
                equate=False),

            # Options controlling effective seq number in models after first iteration
            _Switch(['--eent', 'eent'], 'adjust eff seq # to achieve relative entropy target [default]'),
            # NOTE(review): the keyword 'ecluse' looks like a typo for
            # 'eclust'; it is kept because renaming it would break callers.
            _Switch(['--eclust', 'ecluse'], 'eff seq # is # of single linkage clusters'),
            _Switch(['--enone', 'enone'], 'no effective seq # weighting: just use nseq'),
            _Option(['--eset', 'eset'], 'set eff seq # for all models to <x>', equate=False),
            _Option(['--ere', 'ere'], 'for --eent: set minimum rel entropy/position to <x>', equate=False),
            _Option(['--esigma', 'esigma'], 'for --eent: set sigma param to <x> [default: 45.0]', equate=False),
            _Option(
                ['--eid', 'eid'], 'for --eclust: set fractional identity cutoff to <x> [default: 0.62]', equate=False),

            # Options controlling prior strategy in models after first iteration
            _Switch(['--pnone', 'pnone'], 'don\'t use any prior; parameters are frequencies'),
            _Switch(['--plaplace', 'plaplace'], 'use a Laplace +1 prior'),

            # Options controlling E value calibration
            _Option(['--EmL', 'eml'], 'length of sequences for MSV Gumbel mu fit [default: 200] (n>0)', equate=False),
            _Option(['--EmN', 'emn'], 'number of sequences for MSV Gumbel mu fit [default: 200] (n>0)', equate=False),
            _Option(
                ['--EvL', 'evl'], 'length of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)', equate=False),
            _Option(
                ['--EvN', 'evn'], 'number of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)', equate=False),
            _Option(
                ['--EfL', 'efl'], 'length of sequences for Forward exp tail tau fit [default: 100] (n>0)',
                equate=False),
            _Option(
                ['--EfN', 'efn'], 'number of sequences for Forward exp tail tau fit [default: 200] (n>0)',
                equate=False),
            _Option(
                ['--Eft', 'eft'],
                'tail mass for Forward exponential tail tau fit [default: 0.04] (0<x<1)',
                equate=False),

            # Other expert options
            _Switch(['--nonull2', 'nonull2'], 'turn off biased composition score corrections'),
            _Option(['-Z', 'ncomparison'], 'set # of comparisons done, for E-value calculation', equate=False),
            _Option(['--domZ', 'domz'], 'set # of significant seqs, for domain E-value calculation', equate=False),
            _Option(
                ['--seed', 'seed'], 'set RNG seed to <n> (if 0: one-time arbitrary seed) [default: 42]', equate=False),
            _Option(
                ['--qformat', 'qformat'], 'assert query <seqfile> is in format <s>: no autodetection', equate=False),
            _Option(
                ['--tformat', 'tformat'], 'assert target <seqdb> is in format <s>: no autodetection',
                equate=False),
            _Option(['--cpu', 'cpu'], 'number of parallel CPU workers to use for multithreads', equate=False),

            # Required arguments
            _Argument(['input'], 'sequence containing file', filename=True, is_required=True),
            _Argument(['database'], 'sequence database', filename=True, is_required=True),
        ]

        # The base class reads self.parameters, so it must be set before this call.
        AbstractCommandline.__init__(self, cmd, **kwargs)