"""
Command line object for Jackhmmer Multiple Sequence Alignment generation
"""
__author__ = "Felix Simkovic"
__date__ = "01 June 2016"
__version__ = 0.1
from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
[docs]class JackhmmerCommandLine(AbstractCommandline):
"""
Command line object for Jackhmmer alignment generation
http://hmmer.org/
Jackhmmer is an algorithm that uses iterative searches a protein sequence
against a protein sequence database to find sequence homologs.
Examples
--------
To generate a Multiple Sequence Alignment, use:
>>> from conkit.applications import JackhmmerCommandLine
>>> jackhmmer_cline = JackhmmerCommandLine(
... input="test.fasta", database="uniref100.fasta"
... )
>>> print(jackhmmer_cline)
jackhmmer test.fasta uniref100.fasta
You would typically run the command line with :func:`jackhmmer_cline` or via
the Python subprocess module.
Citations
---------
Johnson L. S., Eddy S. R., Portugaly E. (2010). Hidden Markov
Model Speed Heuristic and Iterative HMM Search Procedure. BMC Bioinformatics 11, 431.
"""
def __init__(self, cmd='jackhmmer', **kwargs):
self.parameters = [
_Option(['-N', 'niterations'],
'set maximum number of iterations [default: 5]',
equate=False),
# Options directing output
_Option(['-o', 'output'],
'direct output to file <f>, not stdout',
filename=True,
equate=False),
_Option(['-A', 'msa_hits'],
'save multiple alignment of hits to file <f>',
filename=True,
equate=False),
_Option(['--tblout', 'per_sequence_hits'],
'save parseable table of per-sequence hits to file <f>',
filename=True,
equate=False),
_Option(['--domtblout', 'per_domain_hits'],
'save parseable table of per-domain hits to file <f>',
filename=True,
equate=False),
_Option(['--chkhmm', 'hmm_checkpoints'],
'save HMM checkpoints to files <f>-<iteration>.hmm',
filename=True,
equate=False),
_Option(['--chkali', 'alignment_checkpoints'],
'save alignment checkpoints to files <f>-<iteration>.sto',
filename=True,
equate=False),
_Switch(['--acc', 'accession'],
'prefer accessions over names in output'),
_Switch(['--noali', 'no_alignment'],
'don\'t output alignments, so output is smaller'),
_Switch(['--notextw', 'notextw'],
'unlimit ASCII text output line width'),
_Switch(['--textw', 'textw'],
'set max width of ASCII text output lines [default: 120] (n>=120)'),
# Options controlling scoring system in first iteration
_Option(['--popen', 'gap_open_probability'],
'gap open probability',
equate=False),
_Option(['--pextend', 'gap_extend_probability'],
'gap extend probability',
equate=False),
_Option(['--mx', 'matrix_choice'],
'substitution score matrix choice (of some built-in matrices)',
equate=False),
_Option(['--mxfile', 'matrix_option'],
'read substitution score matrix from file <f>',
filename=True,
equate=False),
# Options controlling reporting thresholds
_Option(['-E', 'evalue'],
'report sequences <= this E-value threshold in output [default: 10.0] (x>0)',
equate=False),
_Option(['-T', 'score_threshold'],
'report sequences >= this score threshold in output',
equate=False),
_Option(['--domE', 'domain_evalue'],
'report domains <= this E-value threshold in output [default: 10.0] (x>0)',
equate=False),
_Option(['--domT', 'domain_score_threshold'],
'report domains >= this score cutoff in output',
equate=False),
# Options controlling significance thresholds for inclusion in next round
_Option(['--incE', 'inclusion_evalue'],
'consider sequences <= this E-value threshold as significant',
equate=False),
_Option(['--incT', 'inclusion_score_threshold'],
'consider sequences >= this score threshold as significant',
equate=False),
_Option(['--incdomE', 'inclusion_domain_evalue'],
'consider domains <= this E-value threshold as significant',
equate=False),
_Option(['--incdomT', 'inclusion_domain_score_threshold'],
'consider domains >= this score threshold as significant',
equate=False),
# Options controlling acceleration heuristics
_Switch(['--max', 'no_heuristics'],
'Turn all heuristic filters off (less speed, more power)'),
_Option(['--F1', 'stage1_threshold'],
'Stage 1 (MSV) threshold: promote hits w/ P <= F1 [default: 0.02]',
equate=False),
_Option(['--F2', 'stage2_threshold'],
'Stage 2 (Vit) threshold: promote hits w/ P <= F2 [default: 1e-3]',
equate=False),
_Option(['--F3', 'stage3_threshold'],
'Stage 3 (Fwd) threshold: promote hits w/ P <= F3 [default: 1e-5]',
equate=False),
_Switch(['--nobias', 'nobias'],
'turn off composition bias filter'),
# Options controlling model construction after first iteration
_Switch(['--fast', 'fast'],
'assign cols w/ >= symfrac residues as consensus'),
_Switch(['--hand', 'hand'],
'manual construction (requires reference annotation)'),
_Option(['--symfrac', 'symfrac'],
'sets sym fraction controlling --fast construction',
equate=False),
_Option(['--fragthres', 'fragthres'],
'if L <= x*alen, tag sequence as a fragment',
equate=False),
# Options controlling relative weights in models after first iteration
_Switch(['--wpb', 'henikoff_pb_weights'],
'Henikoff position-based weights [default]'),
_Switch(['--wgsc', 'GSC_weights'],
'Gerstein/Sonnhammer/Chothia tree weights'),
_Switch(['--wblosum', 'henikoff_sf_weights'],
'Henikoff simple filter weights'),
_Switch(['--wnone', 'no_weight'],
'don\'t do any relative weighting; set all to 1'),
_Option(['--wid', 'wblosum_cutoff'],
'for --wblosum: set identity cutoff [default: 0.62] (0<=x<=1)',
equate=False),
# Options controlling effective seq number in models after first iteration
_Switch(['--eent', 'eent'],
'adjust eff seq # to achieve relative entropy target [default]'),
_Switch(['--eclust', 'ecluse'],
'eff seq # is # of single linkage clusters'),
_Switch(['--enone', 'enone'],
'no effective seq # weighting: just use nseq'),
_Option(['--eset', 'eset'],
'set eff seq # for all models to <x>',
equate=False),
_Option(['--ere', 'ere'],
'for --eent: set minimum rel entropy/position to <x>',
equate=False),
_Option(['--esigma', 'esigma'],
'for --eent: set sigma param to <x> [default: 45.0]',
equate=False),
_Option(['--eid', 'eid'],
'for --eclust: set fractional identity cutoff to <x> [default: 0.62]',
equate=False),
# Options controlling prior strategy in models after first iteration
_Switch(['--pnone', 'pnone'],
'don\'t use any prior; parameters are frequencies'),
_Switch(['--plaplace', 'plaplace'],
'use a Laplace +1 prior'),
# Options controlling E value calibration
_Option(['--EmL', 'eml'],
'length of sequences for MSV Gumbel mu fit [default: 200] (n>0)',
equate=False),
_Option(['--EmN', 'emn'],
'number of sequences for MSV Gumbel mu fit [default: 200] (n>0)',
equate=False),
_Option(['--EvL', 'evl'],
'length of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)',
equate=False),
_Option(['--EvN', 'evn'],
'number of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)',
equate=False),
_Option(['--EfL', 'efl'],
'length of sequences for Forward exp tail tau fit [default: 100] (n>0)',
equate=False),
_Option(['--EfN', 'efn'],
'number of sequences for Forward exp tail tau fit [default: 200] (n>0)',
equate=False),
_Option(['--Eft', 'eft'],
'tail mass for Forward exponential tail tau fit [default: 0.04] (0<x<1)',
equate=False),
# Other expert options
_Switch(['--nonull2', 'nonull2'],
'turn off biased composition score corrections'),
_Option(['-Z', 'ncomparison'],
'set # of comparisons done, for E-value calculation',
equate=False),
_Option(['--domZ', 'domz'],
'set # of significant seqs, for domain E-value calculation',
equate=False),
_Option(['--seed', 'seed'],
'set RNG seed to <n> (if 0: one-time arbitrary seed) [default: 42]',
equate=False),
_Option(['--qformat', 'qformat'],
'assert query <seqfile> is in format <s>: no autodetection',
equate=False),
_Option(['--tformat', 'tformat'],
'assert target < seqdb > is in format < s >>: no autodetection',
equate=False),
_Option(['--cpu', 'cpu'],
'number of parallel CPU workers to use for multithreads',
equate=False),
# Required arguments
_Argument(['input'],
'sequence containing file',
filename=True,
is_required=True),
_Argument(['database'],
'sequence database',
filename=True,
is_required=True),
]
AbstractCommandline.__init__(self, cmd, **kwargs)