# Source code for conkit.applications.cdhit

# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-17, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Command line object for the CD-HIT sequence clustering application
"""

# Module metadata: original author, creation date and version of this wrapper.
__author__ = "Felix Simkovic"
__date__ = "04 Aug 2016"
__version__ = "0.1"

from Bio.Application import _Option
from Bio.Application import AbstractCommandline


class CdhitCommandline(AbstractCommandline):
    """
    Command line object for Cd-hit [#]_ [#]_

    http://cd-hit.org

    CD-HIT is a very widely used program for clustering and comparing
    protein or nucleotide sequences. CD-HIT was originally developed
    by Dr. Weizhong Li at Dr. Adam Godzik's Lab at the Burnham Institute
    (now Sanford-Burnham Medical Research Institute).

    CD-HIT is very fast and can handle extremely large databases. CD-HIT
    helps to significantly reduce the computational and manual efforts in
    many sequence analysis tasks and aids in understanding the data
    structure and correct the bias within a dataset.

    .. [#] Li W, Jaroszewski L, Godzik A (2001). Clustering of highly homologous sequences
       to reduce the size of large protein database. Bioinformatics 17, 282-283.

    .. [#] Li W, Jaroszewski L, Godzik A (2002). Tolerating some redundancy significantly
       speeds up clustering of large protein databases. Bioinformatics 18, 77-82.

    Parameters
    ----------
    cmd : str, optional
       Name of (or path to) the Cd-hit executable [default: ``cd-hit``]
    **kwargs
       Option values, keyed by the descriptive name registered for each
       switch below (e.g. ``input`` for ``-i``, ``seq_id_thres`` for ``-c``).
       ``input`` and ``output`` are flagged as required.

    Examples
    --------
    >>> from conkit.applications import CdhitCommandline
    >>> cdhit_cline = CdhitCommandline(input='test.fasta', output='test.out')
    >>> print(cdhit_cline)
    cd-hit -i test.fasta -o test.out

    You would typically run the command line with ``cdhit_cline()`` or via
    the Python subprocess module.

    """

    def __init__(self, cmd="cd-hit", **kwargs):
        """Register the supported Cd-hit switches and forward ``kwargs`` to the base class."""
        # Every option is declared with ``equate=False`` so that switch and
        # value are emitted as separate tokens (``-c 0.9``, not ``-c=0.9``).
        self.parameters = [
            _Option(['-i', 'input'],
                    'input filename in fasta format, required',
                    filename=True,
                    equate=False,
                    is_required=True),
            _Option(['-o', 'output'],
                    'output filename, required',
                    filename=True,
                    equate=False,
                    is_required=True),
            _Option(['-c', 'seq_id_thres'],
                    "sequence identity threshold, default 0.9 "
                    "this is the default cd-hit's 'global sequence identity' calculated as: "
                    "number of identical amino acids in alignment divided by "
                    "the full length of the shorter sequence",
                    equate=False),
            _Option(['-G', 'global_seq_id'],
                    "use global sequence identity, default 1 "
                    "if set to 0, then use local sequence identity, calculated as : "
                    "number of identical amino acids in alignment "
                    "divided by the length of the alignment "
                    "NOTE!!! don't use -G 0 unless you use alignment coverage controls "
                    "see options -aL (kwarg: `cov_alignment_long`), -AL (kwarg: `cov_alignment_long_control`),"
                    "            -aS (kwarg: `cov_alignment_short`), -AS (kwarg: `cov_alignment_short_control`)",
                    equate=False),
            _Option(['-b', 'band_width'],
                    'band_width of alignment, default 20',
                    equate=False),
            _Option(['-M', 'memory_limit'],
                    'memory limit (in MB) for the program, default 800; 0 for unlimited',
                    equate=False),
            _Option(['-T', 'num_threads'],
                    'number of threads, default 1; with 0, all CPUs will be used',
                    equate=False),
            _Option(['-n', 'word_length'],
                    "word_length, default 5, see user's guide for choosing it",
                    equate=False),
            _Option(['-l', 'len_throw_away_seqs'],
                    "length of throw_away_sequences, default 10",
                    equate=False),
            _Option(['-t', 'tol_4_redundance'],
                    "tolerance for redundance, default 2",
                    equate=False),
            # The help text here used to end with a stray copy of the ``-s``
            # summary line; ``-s`` is documented by its own option below.
            _Option(['-d', 'len_desc'],
                    "length of description in .clstr file, default 20 "
                    "if set to 0, it takes the fasta defline and stops at first space",
                    equate=False),
            _Option(['-s', 'len_diff_cutoff'],
                    "length difference cutoff, default 0.0 "
                    "if set to 0.9, the shorter sequences need to be "
                    "at least 90% length of the representative of the cluster",
                    equate=False),
            _Option(['-S', 'len_diff_cutoff_aa'],
                    "length difference cutoff in amino acid, default 999999 "
                    "if set to 60, the length difference between the shorter sequences "
                    "and the representative of the cluster can not be bigger than 60",
                    equate=False),
            _Option(['-aL', 'cov_alignment_long'],
                    "alignment coverage for the longer sequence, default 0.0 "
                    "if set to 0.9, the alignment must covers 90% of the sequence",
                    equate=False),
            _Option(['-AL', 'cov_alignment_long_control'],
                    "alignment coverage control for the longer sequence, default 99999999 "
                    "if set to 60, and the length of the sequence is 400, "
                    "then the alignment must be >= 340 (400-60) residues",
                    equate=False),
            _Option(['-aS', 'cov_alignment_short'],
                    "alignment coverage for the shorter sequence, default 0.0 "
                    "if set to 0.9, the alignment must covers 90% of the sequence",
                    equate=False),
            _Option(['-AS', 'cov_alignment_short_control'],
                    "alignment coverage control for the shorter sequence, default 99999999 "
                    "if set to 60, and the length of the sequence is 400, "
                    "then the alignment must be >= 340 (400-60) residues",
                    equate=False),
            _Option(['-A', 'cov_alignment'],
                    "minimal alignment coverage control for the both sequences, default 0 "
                    "alignment must cover >= this value for both sequences",
                    equate=False),
            _Option(['-uL', 'max_unmatched_percentage_long'],
                    "maximum unmatched percentage for the longer sequence, default 1.0 "
                    "if set to 0.1, the unmatched region (excluding leading and tailing gaps) "
                    "must not be more than 10% of the sequence",
                    equate=False),
            _Option(['-uS', 'max_unmatched_percentage_short'],
                    "maximum unmatched percentage for the shorter sequence, default 1.0 "
                    "if set to 0.1, the unmatched region (excluding leading and tailing gaps) "
                    "must not be more than 10% of the sequence",
                    equate=False),
            _Option(['-U', 'len_max_unmatched'],
                    "maximum unmatched length, default 99999999 "
                    "if set to 10, the unmatched region (excluding leading and tailing gaps) "
                    "must not be more than 10 bases",
                    equate=False),
            _Option(['-B', 'hdd_storage'],
                    "1 or 0, default 0, by default, sequences are stored in RAM "
                    "if set to 1, sequence are stored on hard drive "
                    "it is recommended to use -B 1 for huge databases",
                    equate=False),
            _Option(['-p', 'aln_overlap_2_file'],
                    "1 or 0, default 0 "
                    "if set to 1, print alignment overlap in .clstr file",
                    equate=False),
            _Option(['-g', 'accurate_mode'],
                    "1 or 0, default 0 "
                    "by cd-hit's default algorithm, a sequence is clustered to the first "
                    "cluster that meet the threshold (fast cluster). If set to 1, the program "
                    "will cluster it into the most similar cluster that meet the threshold "
                    "(accurate but slow mode) "
                    "but either 1 or 0 won't change the representatives of final clusters",
                    equate=False),
            _Option(['-bak', 'backup'],
                    "write backup cluster file (1 or 0, default 0)",
                    equate=False),
        ]
        # ``self.parameters`` is assigned before delegating so the base-class
        # initialiser has the option list available when it applies ``kwargs``.
        AbstractCommandline.__init__(self, cmd, **kwargs)