Source code for conkit.applications.cdhit

# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-17, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Command line object for the CD-HIT sequence clustering application
"""

__author__ = "Felix Simkovic"
__date__ = "04 Aug 2016"
__version__ = "0.1"

from Bio.Application import _Option
from Bio.Application import AbstractCommandline


class CdhitCommandline(AbstractCommandline):
    """
    Command line object for Cd-hit [#]_ [#]_

    http://cd-hit.org

    CD-HIT is a very widely used program for clustering and comparing
    protein or nucleotide sequences. CD-HIT was originally developed by
    Dr. Weizhong Li at Dr. Adam Godzik's Lab at the Burnham Institute
    (now Sanford-Burnham Medical Research Institute).

    CD-HIT is very fast and can handle extremely large databases. CD-HIT
    helps to significantly reduce the computational and manual efforts in
    many sequence analysis tasks and aids in understanding the data
    structure and correct the bias within a dataset.

    .. [#] Li W, Jaroszewski L, Godzik A (2001). Clustering of highly
       homologous sequences to reduce the size of large protein database.
       Bioinformatics 17, 282-283.

    .. [#] Li W, Jaroszewski L, Godzik A (2002). Tolerating some redundancy
       significantly speeds up clustering of large protein databases.
       Bioinformatics 18, 77-82.

    Examples
    --------

    The ``input`` and ``output`` keywords are required; converting the
    command line to a string without them raises a :exc:`ValueError`.

    >>> from conkit.applications import CdhitCommandline
    >>> cdhit_cline = CdhitCommandline(input="seqs.fasta", output="seqs.cdhit")
    >>> print(cdhit_cline)
    cd-hit -i seqs.fasta -o seqs.cdhit

    You would typically run the command line with :func:`cdhit_cline` or via
    the Python subprocess module.

    """

    def __init__(self, cmd="cd-hit", **kwargs):
        """Declare the supported cd-hit options and initialise the wrapper.

        Parameters
        ----------
        cmd : str, optional
           Name of (or path to) the cd-hit executable [default: cd-hit]
        **kwargs
           Option values, keyed by the second name of each option declared
           below (e.g. ``input``, ``output``, ``seq_id_thres``). They are
           forwarded unchanged to ``AbstractCommandline.__init__``.

        """
        # Every option is declared with ``equate=False``, i.e. flag and value
        # are written as separate tokens rather than joined by ``=``.
        self.parameters = [
            _Option(['-i', 'input'],
                    'input filename in fasta format, required',
                    filename=True,
                    equate=False,
                    is_required=True),
            _Option(['-o', 'output'],
                    'output filename, required',
                    filename=True,
                    equate=False,
                    is_required=True),
            _Option(['-c', 'seq_id_thres'],
                    "sequence identity threshold, default 0.9 "
                    "this is the default cd-hit's 'global sequence identity' calculated as: "
                    "number of identical amino acids in alignment divided by "
                    "the full length of the shorter sequence",
                    equate=False),
            _Option(['-G', 'global_seq_id'],
                    "use global sequence identity, default 1 "
                    "if set to 0, then use local sequence identity, calculated as : "
                    "number of identical amino acids in alignment "
                    "divided by the length of the alignment "
                    "NOTE!!! don't use -G 0 unless you use alignment coverage controls "
                    "see options -aL (kwarg: `cov_alignment_long`), -AL (kwarg: `cov_alignment_long_control`),"
                    " -aS (kwarg: `cov_alignment_short`), -AS (kwarg: `cov_alignment_short_control`)",
                    equate=False),
            _Option(['-b', 'band_width'],
                    'band_width of alignment, default 20',
                    equate=False),
            _Option(['-M', 'memory_limit'],
                    'memory limit (in MB) for the program, default 800; 0 for unlimited',
                    equate=False),
            _Option(['-T', 'num_threads'],
                    'number of threads, default 1; with 0, all CPUs will be used',
                    equate=False),
            _Option(['-n', 'word_length'],
                    "word_length, default 5, see user's guide for choosing it",
                    equate=False),
            _Option(['-l', 'len_throw_away_seqs'],
                    "length of throw_away_sequences, default 10",
                    equate=False),
            _Option(['-t', 'tol_4_redundance'],
                    "tolerance for redundance, default 2",
                    equate=False),
            # The help text previously ended with a stray "-s length
            # difference cutoff" line that belongs to the next option.
            _Option(['-d', 'len_desc'],
                    "length of description in .clstr file, default 20 "
                    "if set to 0, it takes the fasta defline and stops at first space",
                    equate=False),
            _Option(['-s', 'len_diff_cutoff'],
                    "length difference cutoff, default 0.0 "
                    "if set to 0.9, the shorter sequences need to be "
                    "at least 90% length of the representative of the cluster",
                    equate=False),
            _Option(['-S', 'len_diff_cutoff_aa'],
                    "length difference cutoff in amino acid, default 999999 "
                    "if set to 60, the length difference between the shorter sequences "
                    "and the representative of the cluster can not be bigger than 60",
                    equate=False),
            _Option(['-aL', 'cov_alignment_long'],
                    "alignment coverage for the longer sequence, default 0.0 "
                    "if set to 0.9, the alignment must covers 90% of the sequence",
                    equate=False),
            _Option(['-AL', 'cov_alignment_long_control'],
                    "alignment coverage control for the longer sequence, default 99999999 "
                    "if set to 60, and the length of the sequence is 400, "
                    "then the alignment must be >= 340 (400-60) residues",
                    equate=False),
            _Option(['-aS', 'cov_alignment_short'],
                    "alignment coverage for the shorter sequence, default 0.0 "
                    "if set to 0.9, the alignment must covers 90% of the sequence",
                    equate=False),
            _Option(['-AS', 'cov_alignment_short_control'],
                    "alignment coverage control for the shorter sequence, default 99999999 "
                    "if set to 60, and the length of the sequence is 400, "
                    "then the alignment must be >= 340 (400-60) residues",
                    equate=False),
            _Option(['-A', 'cov_alignment'],
                    "minimal alignment coverage control for the both sequences, default 0 "
                    "alignment must cover >= this value for both sequences",
                    equate=False),
            _Option(['-uL', 'max_unmatched_percentage_long'],
                    "maximum unmatched percentage for the longer sequence, default 1.0 "
                    "if set to 0.1, the unmatched region (excluding leading and tailing gaps) "
                    "must not be more than 10% of the sequence",
                    equate=False),
            _Option(['-uS', 'max_unmatched_percentage_short'],
                    "maximum unmatched percentage for the shorter sequence, default 1.0 "
                    "if set to 0.1, the unmatched region (excluding leading and tailing gaps) "
                    "must not be more than 10% of the sequence",
                    equate=False),
            _Option(['-U', 'len_max_unmatched'],
                    "maximum unmatched length, default 99999999 "
                    "if set to 10, the unmatched region (excluding leading and tailing gaps) "
                    "must not be more than 10 bases",
                    equate=False),
            _Option(['-B', 'hdd_storage'],
                    "1 or 0, default 0, by default, sequences are stored in RAM "
                    "if set to 1, sequence are stored on hard drive "
                    "it is recommended to use -B 1 for huge databases",
                    equate=False),
            _Option(['-p', 'aln_overlap_2_file'],
                    "1 or 0, default 0 "
                    "if set to 1, print alignment overlap in .clstr file",
                    equate=False),
            _Option(['-g', 'accurate_mode'],
                    "1 or 0, default 0 "
                    "by cd-hit's default algorithm, a sequence is clustered to the first "
                    "cluster that meet the threshold (fast cluster). If set to 1, the program "
                    "will cluster it into the most similar cluster that meet the threshold "
                    "(accurate but slow mode) "
                    "but either 1 or 0 won't change the representatives of final clusters",
                    equate=False),
            _Option(['-bak', 'backup'],
                    "write backup cluster file (1 or 0, default 0)",
                    equate=False),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)