Source code for conkit.applications.jackhmmer

# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-17, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Command line object for Jackhmmer Multiple Sequence Alignment generation
"""

__author__ = "Felix Simkovic"
__date__ = "01 June 2016"
__version__ = "0.1"

from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline


[docs]class JackhmmerCommandline(AbstractCommandline): """ Command line object for Jackhmmer [#]_ alignment generation http://hmmer.org/ Jackhmmer is an algorithm that uses iterative searches a protein sequence against a protein sequence database to find sequence homologs. .. [#] Johnson L. S., Eddy S. R., Portugaly E. (2010). Hidden Markov Model Speed Heuristic and Iterative HMM Search Procedure. BMC Bioinformatics 11, 431. Examples -------- To generate a Multiple Sequence Alignment, use: >>> from conkit.applications import JackhmmerCommandline >>> jackhmmer_cline = JackhmmerCommandline( ... input="test.fasta", database="uniref100.fasta" ... ) >>> print(jackhmmer_cline) jackhmmer test.fasta uniref100.fasta You would typically run the command line with :func:`jackhmmer_cline` or via the Python subprocess module. """ def __init__(self, cmd='jackhmmer', **kwargs): self.parameters = [ _Option(['-N', 'niterations'], 'set maximum number of iterations [default: 5]', equate=False), # Options directing output _Option(['-o', 'output'], 'direct output to file <f>, not stdout', filename=True, equate=False), _Option(['-A', 'msa_hits'], 'save multiple alignment of hits to file <f>', filename=True, equate=False), _Option(['--tblout', 'per_sequence_hits'], 'save parseable table of per-sequence hits to file <f>', filename=True, equate=False), _Option(['--domtblout', 'per_domain_hits'], 'save parseable table of per-domain hits to file <f>', filename=True, equate=False), _Option(['--chkhmm', 'hmm_checkpoints'], 'save HMM checkpoints to files <f>-<iteration>.hmm', filename=True, equate=False), _Option(['--chkali', 'alignment_checkpoints'], 'save alignment checkpoints to files <f>-<iteration>.sto', filename=True, equate=False), _Switch(['--acc', 'accession'], 'prefer accessions over names in output'), _Switch(['--noali', 'no_alignment'], 'don\'t output alignments, so output is smaller'), _Switch(['--notextw', 'notextw'], 'unlimit ASCII text output line width'), _Switch(['--textw', 'textw'], 'set max width of ASCII text output lines [default: 120] (n>=120)'), # Options controlling scoring system in first iteration _Option(['--popen', 'gap_open_probability'], 'gap open probability', equate=False), _Option(['--pextend', 'gap_extend_probability'], 'gap extend probability', equate=False), _Option(['--mx', 'matrix_choice'], 'substitution score matrix choice (of some built-in matrices)', equate=False), _Option(['--mxfile', 'matrix_option'], 'read substitution score matrix from file <f>', filename=True, equate=False), # Options controlling reporting thresholds _Option(['-E', 'evalue'], 'report sequences <= this E-value threshold in output [default: 10.0] (x>0)', equate=False), _Option(['-T', 'score_threshold'], 'report sequences >= this score threshold in output', equate=False), _Option(['--domE', 'domain_evalue'], 'report domains <= this E-value threshold in output [default: 10.0] (x>0)', equate=False), _Option(['--domT', 'domain_score_threshold'], 'report domains >= this score cutoff in output', equate=False), # Options controlling significance thresholds for inclusion in next round _Option(['--incE', 'inclusion_evalue'], 'consider sequences <= this E-value threshold as significant', equate=False), _Option(['--incT', 'inclusion_score_threshold'], 'consider sequences >= this score threshold as significant', equate=False), _Option(['--incdomE', 'inclusion_domain_evalue'], 'consider domains <= this E-value threshold as significant', equate=False), _Option(['--incdomT', 'inclusion_domain_score_threshold'], 'consider domains >= this score threshold as significant', equate=False), # Options controlling acceleration heuristics _Switch(['--max', 'no_heuristics'], 'Turn all heuristic filters off (less speed, more power)'), _Option(['--F1', 'stage1_threshold'], 'Stage 1 (MSV) threshold: promote hits w/ P <= F1 [default: 0.02]', equate=False), _Option(['--F2', 'stage2_threshold'], 'Stage 2 (Vit) threshold: promote hits w/ P <= F2 [default: 1e-3]', equate=False), _Option(['--F3', 'stage3_threshold'], 'Stage 3 (Fwd) threshold: promote hits w/ P <= F3 [default: 1e-5]', equate=False), _Switch(['--nobias', 'nobias'], 'turn off composition bias filter'), # Options controlling model construction after first iteration _Switch(['--fast', 'fast'], 'assign cols w/ >= symfrac residues as consensus'), _Switch(['--hand', 'hand'], 'manual construction (requires reference annotation)'), _Option(['--symfrac', 'symfrac'], 'sets sym fraction controlling --fast construction', equate=False), _Option(['--fragthres', 'fragthres'], 'if L <= x*alen, tag sequence as a fragment', equate=False), # Options controlling relative weights in models after first iteration _Switch(['--wpb', 'henikoff_pb_weights'], 'Henikoff position-based weights [default]'), _Switch(['--wgsc', 'GSC_weights'], 'Gerstein/Sonnhammer/Chothia tree weights'), _Switch(['--wblosum', 'henikoff_sf_weights'], 'Henikoff simple filter weights'), _Switch(['--wnone', 'no_weight'], 'don\'t do any relative weighting; set all to 1'), _Option(['--wid', 'wblosum_cutoff'], 'for --wblosum: set identity cutoff [default: 0.62] (0<=x<=1)', equate=False), # Options controlling effective seq number in models after first iteration _Switch(['--eent', 'eent'], 'adjust eff seq # to achieve relative entropy target [default]'), _Switch(['--eclust', 'ecluse'], 'eff seq # is # of single linkage clusters'), _Switch(['--enone', 'enone'], 'no effective seq # weighting: just use nseq'), _Option(['--eset', 'eset'], 'set eff seq # for all models to <x>', equate=False), _Option(['--ere', 'ere'], 'for --eent: set minimum rel entropy/position to <x>', equate=False), _Option(['--esigma', 'esigma'], 'for --eent: set sigma param to <x> [default: 45.0]', equate=False), _Option(['--eid', 'eid'], 'for --eclust: set fractional identity cutoff to <x> [default: 0.62]', equate=False), # Options controlling prior strategy in models after first iteration _Switch(['--pnone', 'pnone'], 'don\'t use any prior; parameters are frequencies'), _Switch(['--plaplace', 'plaplace'], 'use a Laplace +1 prior'), # Options controlling E value calibration _Option(['--EmL', 'eml'], 'length of sequences for MSV Gumbel mu fit [default: 200] (n>0)', equate=False), _Option(['--EmN', 'emn'], 'number of sequences for MSV Gumbel mu fit [default: 200] (n>0)', equate=False), _Option(['--EvL', 'evl'], 'length of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)', equate=False), _Option(['--EvN', 'evn'], 'number of sequences for Viterbi Gumbel mu fit [default: 200] (n>0)', equate=False), _Option(['--EfL', 'efl'], 'length of sequences for Forward exp tail tau fit [default: 100] (n>0)', equate=False), _Option(['--EfN', 'efn'], 'number of sequences for Forward exp tail tau fit [default: 200] (n>0)', equate=False), _Option(['--Eft', 'eft'], 'tail mass for Forward exponential tail tau fit [default: 0.04] (0<x<1)', equate=False), # Other expert options _Switch(['--nonull2', 'nonull2'], 'turn off biased composition score corrections'), _Option(['-Z', 'ncomparison'], 'set # of comparisons done, for E-value calculation', equate=False), _Option(['--domZ', 'domz'], 'set # of significant seqs, for domain E-value calculation', equate=False), _Option(['--seed', 'seed'], 'set RNG seed to <n> (if 0: one-time arbitrary seed) [default: 42]', equate=False), _Option(['--qformat', 'qformat'], 'assert query <seqfile> is in format <s>: no autodetection', equate=False), _Option(['--tformat', 'tformat'], 'assert target < seqdb > is in format < s >>: no autodetection', equate=False), _Option(['--cpu', 'cpu'], 'number of parallel CPU workers to use for multithreads', equate=False), # Required arguments _Argument(['input'], 'sequence containing file', filename=True, is_required=True), _Argument(['database'], 'sequence database', filename=True, is_required=True), ] AbstractCommandline.__init__(self, cmd, **kwargs)