Source code for conkit.applications.cdhit

# coding=utf-8
#
# BSD 3-Clause License
#
# Copyright (c) 2016-19, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Command line object for the CD-HIT sequence clustering application
"""

__author__ = "Felix Simkovic"
__date__ = "04 Aug 2016"
__version__ = "0.1"

from Bio.Application import _Option
from Bio.Application import AbstractCommandline


class CdhitCommandline(AbstractCommandline):
    """Command line object for Cd-hit [#]_ [#]_

    http://cd-hit.org

    CD-HIT is a very widely used program for clustering and comparing
    protein or nucleotide sequences. CD-HIT was originally developed by
    Dr. Weizhong Li at Dr. Adam Godzik's Lab at the Burnham Institute
    (now Sanford-Burnham Medical Research Institute).

    CD-HIT is very fast and can handle extremely large databases. CD-HIT
    helps to significantly reduce the computational and manual efforts in
    many sequence analysis tasks and aids in understanding the data
    structure and correct the bias within a dataset.

    .. [#] Li W, Jaroszewski L, Godzik A (2001). Clustering of highly
       homologous sequences to reduce the size of large protein database.
       Bioinformatics 17, 282-283.

    .. [#] Li W, Jaroszewski L, Godzik A (2002). Tolerating some redundancy
       significantly speeds up clustering of large protein databases.
       Bioinformatics 18, 77-82.

    Examples
    --------
    The ``input`` and ``output`` options are declared as required, so they
    are supplied here before the command line is rendered.

    >>> from conkit.applications import CdhitCommandline
    >>> cdhit_cline = CdhitCommandline(input="in.fasta", output="out.fasta")
    >>> print(cdhit_cline)
    cd-hit -i in.fasta -o out.fasta

    You would typically run the command line with :func:`cdhit_cline` or via
    the :mod:`~subprocess` module.

    """

    def __init__(self, cmd="cd-hit", **kwargs):
        """Declare the supported Cd-hit options and initialise the base class.

        Parameters
        ----------
        cmd : str, optional
           Name of (or path to) the Cd-hit executable [default: ``cd-hit``]
        **kwargs
           Forwarded unchanged to :obj:`AbstractCommandline`; expected to be
           option values keyed by the long option names declared below
           (e.g. ``input``, ``output``, ``seq_id_thres``).

        """
        # Every option uses ``equate=False`` so that flag and value are
        # rendered as two separate tokens (``-c 0.9``) rather than ``-c=0.9``.
        self.parameters = [
            _Option(
                ["-i", "input"],
                "input filename in fasta format, required",
                filename=True,
                equate=False,
                is_required=True,
            ),
            _Option(
                ["-o", "output"],
                "output filename, required",
                filename=True,
                equate=False,
                is_required=True,
            ),
            _Option(
                ["-c", "seq_id_thres"],
                "sequence identity threshold, default 0.9 "
                "this is the default cd-hit's 'global sequence identity' calculated as: "
                "number of identical amino acids in alignment divided by "
                "the full length of the shorter sequence",
                equate=False,
            ),
            _Option(
                ["-G", "global_seq_id"],
                "use global sequence identity, default 1 "
                "if set to 0, then use local sequence identity, calculated as : "
                "number of identical amino acids in alignment "
                "divided by the length of the alignment "
                "NOTE!!! don't use -G 0 unless you use alignment coverage controls "
                "see options -aL (kwarg: `cov_alignment_long`), -AL (kwarg: `cov_alignment_long_control`),"
                " -aS (kwarg: `cov_alignment_short`), -AS (kwarg: `cov_alignment_short_control`)",
                equate=False,
            ),
            _Option(
                ["-b", "band_width"],
                "band_width of alignment, default 20",
                equate=False,
            ),
            _Option(
                ["-M", "memory_limit"],
                "memory limit (in MB) for the program, default 800; 0 for unlimited",
                equate=False,
            ),
            _Option(
                ["-T", "num_threads"],
                "number of threads, default 1; with 0, all CPUs will be used",
                equate=False,
            ),
            _Option(
                ["-n", "word_length"],
                "word_length, default 5, see user's guide for choosing it",
                equate=False,
            ),
            _Option(
                ["-l", "len_throw_away_seqs"],
                "length of throw_away_sequences, default 10",
                equate=False,
            ),
            _Option(
                ["-t", "tol_4_redundance"],
                "tolerance for redundance, default 2",
                equate=False,
            ),
            # The help text previously ended with a stray "-s length difference
            # cutoff, default 0.0", which belongs to the -s option declared next.
            _Option(
                ["-d", "len_desc"],
                "length of description in .clstr file, default 20 "
                "if set to 0, it takes the fasta defline and stops at first space",
                equate=False,
            ),
            _Option(
                ["-s", "len_diff_cutoff"],
                "length difference cutoff, default 0.0 "
                "if set to 0.9, the shorter sequences need to be "
                "at least 90% length of the representative of the cluster",
                equate=False,
            ),
            _Option(
                ["-S", "len_diff_cutoff_aa"],
                "length difference cutoff in amino acid, default 999999 "
                "if set to 60, the length difference between the shorter sequences "
                "and the representative of the cluster can not be bigger than 60",
                equate=False,
            ),
            _Option(
                ["-aL", "cov_alignment_long"],
                "alignment coverage for the longer sequence, default 0.0 "
                "if set to 0.9, the alignment must covers 90% of the sequence",
                equate=False,
            ),
            _Option(
                ["-AL", "cov_alignment_long_control"],
                "alignment coverage control for the longer sequence, default 99999999 "
                "if set to 60, and the length of the sequence is 400, "
                "then the alignment must be >= 340 (400-60) residues",
                equate=False,
            ),
            _Option(
                ["-aS", "cov_alignment_short"],
                "alignment coverage for the shorter sequence, default 0.0 "
                "if set to 0.9, the alignment must covers 90% of the sequence",
                equate=False,
            ),
            _Option(
                ["-AS", "cov_alignment_short_control"],
                "alignment coverage control for the shorter sequence, default 99999999 "
                "if set to 60, and the length of the sequence is 400, "
                "then the alignment must be >= 340 (400-60) residues",
                equate=False,
            ),
            _Option(
                ["-A", "cov_alignment"],
                "minimal alignment coverage control for the both sequences, default 0 "
                "alignment must cover >= this value for both sequences",
                equate=False,
            ),
            _Option(
                ["-uL", "max_unmatched_percentage_long"],
                "maximum unmatched percentage for the longer sequence, default 1.0 "
                "if set to 0.1, the unmatched region (excluding leading and tailing gaps) "
                "must not be more than 10% of the sequence",
                equate=False,
            ),
            _Option(
                ["-uS", "max_unmatched_percentage_short"],
                "maximum unmatched percentage for the shorter sequence, default 1.0 "
                "if set to 0.1, the unmatched region (excluding leading and tailing gaps) "
                "must not be more than 10% of the sequence",
                equate=False,
            ),
            _Option(
                ["-U", "len_max_unmatched"],
                "maximum unmatched length, default 99999999 "
                "if set to 10, the unmatched region (excluding leading and tailing gaps) "
                "must not be more than 10 bases",
                equate=False,
            ),
            _Option(
                ["-B", "hdd_storage"],
                "1 or 0, default 0, by default, sequences are stored in RAM "
                "if set to 1, sequence are stored on hard drive "
                "it is recommended to use -B 1 for huge databases",
                equate=False,
            ),
            _Option(
                ["-p", "aln_overlap_2_file"],
                "1 or 0, default 0 "
                "if set to 1, print alignment overlap in .clstr file",
                equate=False,
            ),
            _Option(
                ["-g", "accurate_mode"],
                "1 or 0, default 0 "
                "by cd-hit's default algorithm, a sequence is clustered to the first "
                "cluster that meet the threshold (fast cluster). If set to 1, the program "
                "will cluster it into the most similar cluster that meet the threshold "
                "(accurate but slow mode) "
                "but either 1 or 0 won't change the representatives of final clusters",
                equate=False,
            ),
            _Option(
                ["-bak", "backup"],
                "write backup cluster file (1 or 0, default 0)",
                equate=False,
            ),
        ]
        # ``self.parameters`` must exist before the base initialiser runs,
        # hence the explicit call at the end rather than at the top.
        AbstractCommandline.__init__(self, cmd, **kwargs)