# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-18, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Command line object for Jackhmmer Multiple Sequence Alignment generation

This module defines :class:`JackhmmerCommandline`, a command line wrapper
derived from :class:`Bio.Application.AbstractCommandline`.
"""
__author__ = "Felix Simkovic"
__date__ = "01 June 2016"
__version__ = "0.1"
from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class JackhmmerCommandline(AbstractCommandline):
    """
    Command line object for Jackhmmer [#]_ alignment generation

    http://hmmer.org/

    Jackhmmer is an algorithm that iteratively searches a protein sequence
    against a protein sequence database to find sequence homologs.

    .. [#] Johnson L. S., Eddy S. R., Portugaly E. (2010). Hidden Markov
       Model Speed Heuristic and Iterative HMM Search Procedure. BMC Bioinformatics 11, 431.

    Examples
    --------
    To generate a Multiple Sequence Alignment, use:

    >>> from conkit.applications import JackhmmerCommandline
    >>> jackhmmer_cline = JackhmmerCommandline(
    ...     input="test.fasta", database="uniref100.fasta"
    ... )
    >>> print(jackhmmer_cline)
    jackhmmer test.fasta uniref100.fasta

    You would typically run the command line with :func:`jackhmmer_cline` or via
    the :mod:`~subprocess` module.

    """

    def __init__(self, cmd='jackhmmer', **kwargs):
        """Declare the supported parameters and initialise the command line

        Parameters
        ----------
        cmd : str, optional
           The program name handed to :class:`AbstractCommandline` [default: jackhmmer]
        **kwargs
           Parameter values, forwarded unchanged to :class:`AbstractCommandline`.
           The keyword for each parameter is one of the names listed in its
           declaration below.

        """
        # Each declaration is ``([flag, ..., keyword], description)``: the first
        # name is the flag written to the command line and the last one is the
        # Python keyword/property name. ``equate=False`` makes an option render
        # as ``flag value`` rather than ``flag=value``.
        self.parameters = [
            _Option(['-N', 'niterations'], 'set maximum number of iterations [default: 5]', equate=False),
            # Options directing output
            _Option(['-o', 'output'], 'direct output to file <f>, not stdout', filename=True, equate=False),
            _Option(['-A', 'msa_hits'], 'save multiple alignment of hits to file <f>', filename=True, equate=False),
            _Option(
                ['--tblout', 'per_sequence_hits'],
                'save parseable table of per-sequence hits to file <f>',
                filename=True,
                equate=False),
            _Option(
                ['--domtblout', 'per_domain_hits'],
                'save parseable table of per-domain hits to file <f>',
                filename=True,
                equate=False),
            _Option(
                ['--chkhmm', 'hmm_checkpoints'],
                'save HMM checkpoints to files <f>-<iteration>.hmm',
                filename=True,
                equate=False),
            _Option(
                ['--chkali', 'alignment_checkpoints'],
                'save alignment checkpoints to files <f>-<iteration>.sto',
                filename=True,
                equate=False),
            _Switch(['--acc', 'accession'], 'prefer accessions over names in output'),
            _Switch(['--noali', 'no_alignment'], 'don\'t output alignments, so output is smaller'),
            _Switch(['--notextw', 'notextw'], 'unlimit ASCII text output line width'),
            # --textw carries a numeric value, so it has to be an _Option; a
            # _Switch can only emit the bare flag without its argument.
            _Option(
                ['--textw', 'textw'],
                'set max width of ASCII text output lines [default: 120] (n>=120)',
                equate=False),
            # Options controlling scoring system in first iteration
            _Option(['--popen', 'gap_open_probability'], 'gap open probability', equate=False),
            _Option(['--pextend', 'gap_extend_probability'], 'gap extend probability', equate=False),
            _Option(
                ['--mx', 'matrix_choice'], 'substitution score matrix choice (of some built-in matrices)',
                equate=False),
            _Option(
                ['--mxfile', 'matrix_option'],
                'read substitution score matrix from file <f>',
                filename=True,
                equate=False),
            # Options controlling reporting thresholds
            _Option(
                ['-E', 'evalue'],
                'report sequences <= this E-value threshold in output [default: 10.0] (x>0)',
                equate=False),
            _Option(['-T', 'score_threshold'], 'report sequences >= this score threshold in output', equate=False),
            _Option(
                ['--domE', 'domain_evalue'],
                'report domains <= this E-value threshold in output [default: 10.0] (x>0)',
                equate=False),
            _Option(
                ['--domT', 'domain_score_threshold'], 'report domains >= this score cutoff in output', equate=False),
            # Options controlling significance thresholds for inclusion in next round
            _Option(
                ['--incE', 'inclusion_evalue'],
                'consider sequences <= this E-value threshold as significant',
                equate=False),
            _Option(
                ['--incT', 'inclusion_score_threshold'],
                'consider sequences >= this score threshold as significant',
                equate=False),
            _Option(
                ['--incdomE', 'inclusion_domain_evalue'],
                'consider domains <= this E-value threshold as significant',
                equate=False),
            _Option(
                ['--incdomT', 'inclusion_domain_score_threshold'],
                'consider domains >= this score threshold as significant',
                equate=False),
            # Options controlling acceleration heuristics
            _Switch(['--max', 'no_heuristics'], 'Turn all heuristic filters off (less speed, more power)'),
            _Option(
                ['--F1', 'stage1_threshold'],
                'Stage 1 (MSV) threshold: promote hits w/ P <= F1 [default: 0.02]',
                equate=False),
            _Option(
                ['--F2', 'stage2_threshold'],
                'Stage 2 (Vit) threshold: promote hits w/ P <= F2 [default: 1e-3]',
                equate=False),
            _Option(
                ['--F3', 'stage3_threshold'],
                'Stage 3 (Fwd) threshold: promote hits w/ P <= F3 [default: 1e-5]',
                equate=False),
            _Switch(['--nobias', 'nobias'], 'turn off composition bias filter'),
            # Options controlling model construction after first iteration
            _Switch(['--fast', 'fast'], 'assign cols w/ >= symfrac residues as consensus'),
            _Switch(['--hand', 'hand'], 'manual construction (requires reference annotation)'),
            _Option(['--symfrac', 'symfrac'], 'sets sym fraction controlling --fast construction', equate=False),
            _Option(['--fragthres', 'fragthres'], 'if L <= x*alen, tag sequence as a fragment', equate=False),
            # Options controlling relative weights in models after first iteration
            _Switch(['--wpb', 'henikoff_pb_weights'], 'Henikoff position-based weights [default]'),
            _Switch(['--wgsc', 'GSC_weights'], 'Gerstein/Sonnhammer/Chothia tree weights'),
            _Switch(['--wblosum', 'henikoff_sf_weights'], 'Henikoff simple filter weights'),
            _Switch(['--wnone', 'no_weight'], 'don\'t do any relative weighting; set all to 1'),
            _Option(
                ['--wid', 'wblosum_cutoff'],
                'for --wblosum: set identity cutoff [default: 0.62] (0<=x<=1)',
                equate=False),
            # Options controlling effective seq number in models after first iteration
            _Switch(['--eent', 'eent'], 'adjust eff seq # to achieve relative entropy target [default]'),
            # 'ecluse' is a historical misspelling; it stays last so the existing
            # keyword/property keeps working, and 'eclust' is accepted as an alias.
            _Switch(['--eclust', 'eclust', 'ecluse'], 'eff seq # is # of single linkage clusters'),
            _Switch(['--enone', 'enone'], 'no effective seq # weighting: just use nseq'),
            _Option(['--eset', 'eset'], 'set eff seq # for all models to <x>', equate=False),
            _Option(['--ere', 'ere'], 'for --eent: set minimum rel entropy/position to <x>', equate=False),
            _Option(['--esigma', 'esigma'], 'for --eent: set sigma param to <x> [default: 45.0]', equate=False),
            _Option(
                ['--eid', 'eid'], 'for --eclust: set fractional identity cutoff to <x> [default: 0.62]', equate=False),
            # Options controlling prior strategy in models after first iteration
            _Switch(['--pnone', 'pnone'], 'don\'t use any prior; parameters are frequencies'),
            _Switch(['--plaplace', 'plaplace'], 'use a Laplace +1 prior'),
            # Options controlling E value calibration
            _Option(['--EmL', 'eml'], 'length of sequences for MSV Gumbel mu fit [default: 200] (n>0)', equate=False),
            _Option(['--EmN', 'emn'], 'number of sequences for MSV Gumbel mu fit [default: 200] (n>0)', equate=False),
            _Option(
                ['--EvL', 'evl'], 'length of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)', equate=False),
            _Option(
                ['--EvN', 'evn'], 'number of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)', equate=False),
            _Option(
                ['--EfL', 'efl'], 'length of sequences for Forward exp tail tau fit [default: 100] (n>0)',
                equate=False),
            _Option(
                ['--EfN', 'efn'], 'number of sequences for Forward exp tail tau fit [default: 200] (n>0)',
                equate=False),
            _Option(
                ['--Eft', 'eft'],
                'tail mass for Forward exponential tail tau fit [default: 0.04] (0<x<1)',
                equate=False),
            # Other expert options
            _Switch(['--nonull2', 'nonull2'], 'turn off biased composition score corrections'),
            _Option(['-Z', 'ncomparison'], 'set # of comparisons done, for E-value calculation', equate=False),
            _Option(['--domZ', 'domz'], 'set # of significant seqs, for domain E-value calculation', equate=False),
            _Option(
                ['--seed', 'seed'], 'set RNG seed to <n> (if 0: one-time arbitrary seed) [default: 42]', equate=False),
            _Option(
                ['--qformat', 'qformat'], 'assert query <seqfile> is in format <s>: no autodetection', equate=False),
            _Option(
                ['--tformat', 'tformat'], 'assert target <seqdb> is in format <s>: no autodetection',
                equate=False),
            _Option(['--cpu', 'cpu'], 'number of parallel CPU workers to use for multithreads', equate=False),
            # Required arguments
            _Argument(['input'], 'sequence containing file', filename=True, is_required=True),
            _Argument(['database'], 'sequence database', filename=True, is_required=True),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)