Source code for conkit.applications.jackhmmer

# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-21, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Command line object for Jackhmmer Multiple Sequence Alignment generation

This module defines :class:`JackhmmerCommandline`, a subclass of
``Bio.Application.AbstractCommandline`` that declares the options and
positional arguments of the ``jackhmmer`` executable.
"""

# Module-level metadata (author, date and version strings).
__author__ = "Felix Simkovic"
__date__ = "01 June 2016"
__version__ = "0.13.3"

from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline


class JackhmmerCommandline(AbstractCommandline):
    """
    Command line object for Jackhmmer [#]_ alignment generation

    http://hmmer.org/

    Jackhmmer is an algorithm that iteratively searches a protein sequence
    against a protein sequence database to find sequence homologs.

    .. [#] Johnson L. S., Eddy S. R., Portugaly E. (2010). Hidden Markov
       Model Speed Heuristic and Iterative HMM Search Procedure. BMC Bioinformatics 11, 431.

    Every entry of :attr:`parameters` is exposed as a keyword argument of the
    constructor. Entries declared as switches are treated as booleans, entries
    declared as options carry a value that is written after the flag
    (separated by a space), and the two positional arguments ``input`` and
    ``database`` are required.

    Examples
    --------
    To generate a Multiple Sequence Alignment, use:

    >>> from conkit.applications import JackhmmerCommandline
    >>> jackhmmer_cline = JackhmmerCommandline(
    ...     input="test.fasta", database="uniref100.fasta"
    ... )
    >>> print(jackhmmer_cline)
    jackhmmer test.fasta uniref100.fasta

    You would typically run the command line with :func:`jackhmmer_cline` or via
    the :mod:`~subprocess` module.

    """

    def __init__(self, cmd="jackhmmer", **kwargs):
        """Declare the Jackhmmer parameters and initialise the command line

        Parameters
        ----------
        cmd : str, optional
           The name of (or path to) the executable [default: ``jackhmmer``]
        **kwargs
           Values for any of the parameters declared below, keyed by the
           last name in each parameter's name list (e.g. ``niterations=3``)

        """
        self.parameters = [
            _Option(["-N", "niterations"], "set maximum number of iterations [default: 5]", equate=False),
            # Options directing output
            _Option(["-o", "output"], "direct output to file <f>, not stdout", filename=True, equate=False),
            _Option(["-A", "msa_hits"], "save multiple alignment of hits to file <f>", filename=True, equate=False),
            _Option(
                ["--tblout", "per_sequence_hits"],
                "save parseable table of per-sequence hits to file <f>",
                filename=True,
                equate=False,
            ),
            _Option(
                ["--domtblout", "per_domain_hits"],
                "save parseable table of per-domain hits to file <f>",
                filename=True,
                equate=False,
            ),
            _Option(
                ["--chkhmm", "hmm_checkpoints"],
                "save HMM checkpoints to files <f>-<iteration>.hmm",
                filename=True,
                equate=False,
            ),
            _Option(
                ["--chkali", "alignment_checkpoints"],
                "save alignment checkpoints to files <f>-<iteration>.sto",
                filename=True,
                equate=False,
            ),
            _Switch(["--acc", "accession"], "prefer accessions over names in output"),
            _Switch(["--noali", "no_alignment"], "don't output alignments, so output is smaller"),
            _Switch(["--notextw", "notextw"], "unlimit ASCII text output line width"),
            # --textw carries a width value, so it has to be an option; as a
            # switch the value was discarded and a bare "--textw" was emitted.
            _Option(
                ["--textw", "textw"],
                "set max width of ASCII text output lines [default: 120] (n>=120)",
                equate=False,
            ),
            # Options controlling scoring system in first iteration
            _Option(["--popen", "gap_open_probability"], "gap open probability", equate=False),
            _Option(["--pextend", "gap_extend_probability"], "gap extend probability", equate=False),
            _Option(
                ["--mx", "matrix_choice"], "substitution score matrix choice (of some built-in matrices)", equate=False
            ),
            _Option(
                ["--mxfile", "matrix_option"],
                "read substitution score matrix from file <f>",
                filename=True,
                equate=False,
            ),
            # Options controlling reporting thresholds
            _Option(
                ["-E", "evalue"],
                "report sequences <= this E-value threshold in output [default: 10.0] (x>0)",
                equate=False,
            ),
            _Option(["-T", "score_threshold"], "report sequences >= this score threshold in output", equate=False),
            _Option(
                ["--domE", "domain_evalue"],
                "report domains <= this E-value threshold in output [default: 10.0] (x>0)",
                equate=False,
            ),
            _Option(
                ["--domT", "domain_score_threshold"], "report domains >= this score cutoff in output", equate=False
            ),
            # Options controlling significance thresholds for inclusion in next round
            _Option(
                ["--incE", "inclusion_evalue"],
                "consider sequences <= this E-value threshold as significant",
                equate=False,
            ),
            _Option(
                ["--incT", "inclusion_score_threshold"],
                "consider sequences >= this score threshold as significant",
                equate=False,
            ),
            _Option(
                ["--incdomE", "inclusion_domain_evalue"],
                "consider domains <= this E-value threshold as significant",
                equate=False,
            ),
            _Option(
                ["--incdomT", "inclusion_domain_score_threshold"],
                "consider domains >= this score threshold as significant",
                equate=False,
            ),
            # Options controlling acceleration heuristics
            _Switch(["--max", "no_heuristics"], "Turn all heuristic filters off (less speed, more power)"),
            _Option(
                ["--F1", "stage1_threshold"],
                "Stage 1 (MSV) threshold: promote hits w/ P <= F1 [default: 0.02]",
                equate=False,
            ),
            _Option(
                ["--F2", "stage2_threshold"],
                "Stage 2 (Vit) threshold: promote hits w/ P <= F2 [default: 1e-3]",
                equate=False,
            ),
            _Option(
                ["--F3", "stage3_threshold"],
                "Stage 3 (Fwd) threshold: promote hits w/ P <= F3 [default: 1e-5]",
                equate=False,
            ),
            _Switch(["--nobias", "nobias"], "turn off composition bias filter"),
            # Options controlling model construction after first iteration
            _Switch(["--fast", "fast"], "assign cols w/ >= symfrac residues as consensus"),
            _Switch(["--hand", "hand"], "manual construction (requires reference annotation)"),
            _Option(["--symfrac", "symfrac"], "sets sym fraction controlling --fast construction", equate=False),
            _Option(["--fragthres", "fragthres"], "if L <= x*alen, tag sequence as a fragment", equate=False),
            # Options controlling relative weights in models after first iteration
            _Switch(["--wpb", "henikoff_pb_weights"], "Henikoff position-based weights  [default]"),
            _Switch(["--wgsc", "GSC_weights"], "Gerstein/Sonnhammer/Chothia tree weights"),
            _Switch(["--wblosum", "henikoff_sf_weights"], "Henikoff simple filter weights"),
            _Switch(["--wnone", "no_weight"], "don't do any relative weighting; set all to 1"),
            _Option(
                ["--wid", "wblosum_cutoff"],
                "for --wblosum: set identity cutoff [default: 0.62] (0<=x<=1)",
                equate=False,
            ),
            # Options controlling effective seq number in models after first iteration
            _Switch(["--eent", "eent"], "adjust eff seq # to achieve relative entropy target [default]"),
            # NOTE(review): the keyword is spelled "ecluse" although the flag is
            # "--eclust"; kept unchanged so existing callers keep working.
            _Switch(["--eclust", "ecluse"], "eff seq # is # of single linkage clusters"),
            _Switch(["--enone", "enone"], "no effective seq # weighting: just use nseq"),
            _Option(["--eset", "eset"], "set eff seq # for all models to <x>", equate=False),
            _Option(["--ere", "ere"], "for --eent: set minimum rel entropy/position to <x>", equate=False),
            _Option(["--esigma", "esigma"], "for --eent: set sigma param to <x> [default: 45.0]", equate=False),
            _Option(
                ["--eid", "eid"], "for --eclust: set fractional identity cutoff to <x> [default: 0.62]", equate=False
            ),
            # Options controlling prior strategy in models after first iteration
            _Switch(["--pnone", "pnone"], "don't use any prior; parameters are frequencies"),
            _Switch(["--plaplace", "plaplace"], "use a Laplace +1 prior"),
            # Options controlling E value calibration
            _Option(["--EmL", "eml"], "length of sequences for MSV Gumbel mu fit [default: 200] (n>0)", equate=False),
            _Option(["--EmN", "emn"], "number of sequences for MSV Gumbel mu fit [default: 200] (n>0)", equate=False),
            _Option(
                ["--EvL", "evl"], "length of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)", equate=False
            ),
            _Option(
                ["--EvN", "evn"], "number of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)", equate=False
            ),
            _Option(
                ["--EfL", "efl"], "length of sequences for Forward exp tail tau fit [default: 100] (n>0)", equate=False
            ),
            _Option(
                ["--EfN", "efn"], "number of sequences for Forward exp tail tau fit [default: 200] (n>0)", equate=False
            ),
            _Option(
                ["--Eft", "eft"], "tail mass for Forward exponential tail tau fit [default: 0.04] (0<x<1)", equate=False
            ),
            # Other expert options
            _Switch(["--nonull2", "nonull2"], "turn off biased composition score corrections"),
            _Option(["-Z", "ncomparison"], "set # of comparisons done, for E-value calculation", equate=False),
            _Option(["--domZ", "domz"], "set # of significant seqs, for domain E-value calculation", equate=False),
            _Option(
                ["--seed", "seed"], "set RNG seed to <n> (if 0: one-time arbitrary seed) [default: 42]", equate=False
            ),
            _Option(
                ["--qformat", "qformat"], "assert query <seqfile> is in format <s>: no autodetection", equate=False
            ),
            _Option(
                ["--tformat", "tformat"], "assert target <seqdb> is in format <s>: no autodetection", equate=False
            ),
            _Option(["--cpu", "cpu"], "number of parallel CPU workers to use for multithreads", equate=False),
            # Required arguments
            _Argument(["input"], "sequence containing file", filename=True, is_required=True),
            _Argument(["database"], "sequence database", filename=True, is_required=True),
        ]

        # The base class reads self.parameters, so it must be assigned first.
        AbstractCommandline.__init__(self, cmd, **kwargs)