Source code for conkit.applications.cdhit

# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-19, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Command line object for the CD-HIT sequence clustering application
"""

__author__ = "Felix Simkovic"
__date__ = "04 Aug 2016"
__version__ = "0.1"

from Bio.Application import _Option
from Bio.Application import AbstractCommandline


class CdhitCommandline(AbstractCommandline):
    """
    Command line object for Cd-hit [#]_ [#]_

    http://cd-hit.org

    CD-HIT is a very widely used program for clustering and comparing
    protein or nucleotide sequences. CD-HIT was originally developed
    by Dr. Weizhong Li at Dr. Adam Godzik's Lab at the Burnham Institute
    (now Sanford-Burnham Medical Research Institute).

    CD-HIT is very fast and can handle extremely large databases. CD-HIT
    helps to significantly reduce the computational and manual efforts in
    many sequence analysis tasks and aids in understanding the data
    structure and correct the bias within a dataset.

    .. [#] Li W, Jaroszewski L, Godzik A (2001). Clustering of highly homologous sequences
       to reduce the size of large protein database. Bioinformatics 17, 282-283.

    .. [#] Li W, Jaroszewski L, Godzik A (2002). Tolerating some redundancy significantly
       speeds up clustering of large protein databases. Bioinformatics 18, 77-82.

    Parameters
    ----------
    cmd : str, optional
       The name of (or path to) the cd-hit executable [default: ``cd-hit``]
    **kwargs
       Option values, keyed by the keyword name of each option declared in
       :meth:`__init__` (e.g. ``input``, ``output``, ``seq_id_thres``). The
       ``input`` and ``output`` options are declared as required.

    Examples
    --------
    >>> from conkit.applications import CdhitCommandline
    >>> cdhit_cline = CdhitCommandline(input="seqs.fasta", output="seqs.cdhit")
    >>> print(cdhit_cline)
    cd-hit -i seqs.fasta -o seqs.cdhit

    You would typically run the command line with :func:`cdhit_cline` or via
    the :mod:`~subprocess` module.

    """

    def __init__(self, cmd="cd-hit", **kwargs):
        """Declare the cd-hit options and hand over to the Biopython base class.

        Parameters
        ----------
        cmd : str, optional
           The name of (or path to) the cd-hit executable [default: ``cd-hit``]
        **kwargs
           Option values forwarded unchanged to
           :class:`~Bio.Application.AbstractCommandline`

        """
        # Every option is declared with ``equate=False`` so the flag and its
        # value are space-separated (``-c 0.9``) instead of ``-c=0.9``.
        # The option descriptions follow the wording of cd-hit's own help text.
        self.parameters = [
            _Option(
                ["-i", "input"],
                "input filename in fasta format, required",
                filename=True,
                equate=False,
                is_required=True,
            ),
            _Option(["-o", "output"], "output filename, required", filename=True, equate=False, is_required=True),
            _Option(
                ["-c", "seq_id_thres"],
                "sequence identity threshold, default 0.9 "
                "this is the default cd-hit's 'global sequence identity' calculated as: "
                "number of identical amino acids in alignment divided by "
                "the full length of the shorter sequence",
                equate=False,
            ),
            _Option(
                ["-G", "global_seq_id"],
                "use global sequence identity, default 1 "
                "if set to 0, then use local sequence identity, calculated as : "
                "number of identical amino acids in alignment "
                "divided by the length of the alignment "
                "NOTE!!! don't use -G 0 unless you use alignment coverage controls "
                "see options -aL (kwarg: `cov_alignment_long`), -AL (kwarg: `cov_alignment_long_control`),"
                "            -aS (kwarg: `cov_alignment_short`), -AS (kwarg: `cov_alignment_short_control`)",
                equate=False,
            ),
            _Option(["-b", "band_width"], "band_width of alignment, default 20", equate=False),
            _Option(
                ["-M", "memory_limit"],
                "memory limit (in MB) for the program, default 800; 0 for unlimited",
                equate=False,
            ),
            _Option(["-T", "num_threads"], "number of threads, default 1; with 0, all CPUs will be used", equate=False),
            _Option(["-n", "word_length"], "word_length, default 5, see user's guide for choosing it", equate=False),
            _Option(["-l", "len_throw_away_seqs"], "length of throw_away_sequences, default 10", equate=False),
            _Option(["-t", "tol_4_redundance"], "tolerance for redundance, default 2", equate=False),
            _Option(
                ["-d", "len_desc"],
                # The stray "-s ..." fragment that used to trail this text
                # belonged to the next option and has been removed.
                "length of description in .clstr file, default 20 "
                "if set to 0, it takes the fasta defline and stops at first space",
                equate=False,
            ),
            _Option(
                ["-s", "len_diff_cutoff"],
                "length difference cutoff, default 0.0 "
                "if set to 0.9, the shorter sequences need to be "
                "at least 90% length of the representative of the cluster",
                equate=False,
            ),
            _Option(
                ["-S", "len_diff_cutoff_aa"],
                "length difference cutoff in amino acid, default 999999 "
                "if set to 60, the length difference between the shorter sequences "
                "and the representative of the cluster can not be bigger than 60",
                equate=False,
            ),
            _Option(
                ["-aL", "cov_alignment_long"],
                "alignment coverage for the longer sequence, default 0.0 "
                "if set to 0.9, the alignment must covers 90% of the sequence",
                equate=False,
            ),
            _Option(
                ["-AL", "cov_alignment_long_control"],
                "alignment coverage control for the longer sequence, default 99999999 "
                "if set to 60, and the length of the sequence is 400, "
                "then the alignment must be >= 340 (400-60) residues",
                equate=False,
            ),
            _Option(
                ["-aS", "cov_alignment_short"],
                "alignment coverage for the shorter sequence, default 0.0 "
                "if set to 0.9, the alignment must covers 90% of the sequence",
                equate=False,
            ),
            _Option(
                ["-AS", "cov_alignment_short_control"],
                "alignment coverage control for the shorter sequence, default 99999999 "
                "if set to 60, and the length of the sequence is 400, "
                "then the alignment must be >= 340 (400-60) residues",
                equate=False,
            ),
            _Option(
                ["-A", "cov_alignment"],
                "minimal alignment coverage control for the both sequences, default 0 "
                "alignment must cover >= this value for both sequences",
                equate=False,
            ),
            _Option(
                ["-uL", "max_unmatched_percentage_long"],
                "maximum unmatched percentage for the longer sequence, default 1.0 "
                "if set to 0.1, the unmatched region (excluding leading and tailing gaps) "
                "must not be more than 10% of the sequence",
                equate=False,
            ),
            _Option(
                ["-uS", "max_unmatched_percentage_short"],
                "maximum unmatched percentage for the shorter sequence, default 1.0 "
                "if set to 0.1, the unmatched region (excluding leading and tailing gaps) "
                "must not be more than 10% of the sequence",
                equate=False,
            ),
            _Option(
                ["-U", "len_max_unmatched"],
                "maximum unmatched length, default 99999999 "
                "if set to 10, the unmatched region (excluding leading and tailing gaps) "
                "must not be more than 10 bases",
                equate=False,
            ),
            _Option(
                ["-B", "hdd_storage"],
                "1 or 0, default 0, by default, sequences are stored in RAM "
                "if set to 1, sequence are stored on hard drive "
                "it is recommended to use -B 1 for huge databases",
                equate=False,
            ),
            _Option(
                ["-p", "aln_overlap_2_file"],
                "1 or 0, default 0 if set to 1, print alignment overlap in .clstr file",
                equate=False,
            ),
            _Option(
                ["-g", "accurate_mode"],
                "1 or 0, default 0 "
                "by cd-hit's default algorithm, a sequence is clustered to the first "
                "cluster that meet the threshold (fast cluster). If set to 1, the program "
                "will cluster it into the most similar cluster that meet the threshold "
                "(accurate but slow mode) "
                "but either 1 or 0 won't change the representatives of final clusters",
                equate=False,
            ),
            _Option(["-bak", "backup"], "write backup cluster file (1 or 0, default 0)", equate=False),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)