Source code for conkit.command_line.conkit_predict

#!/usr/bin/env python
#
# BSD 3-Clause License
#
# Copyright (c) 2016-21, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""This script provides a simplified contact prediction pipeline. It will
take either a sequence or alignment as starting point and predict, analyse
and evaluate the final prediction and any intermediate results.

It uses two external programs to perform this task.

   - HHblits for Sequence Alignment generation, and
   - CCMpred for Direct Coupling Analysis.

*** The two programs above need to be installed separately ***

"""

__author__ = "Felix Simkovic"
__date__ = "01 June 2016"
__version__ = "0.13.3"

import argparse
import os
import time

import conkit.applications
import conkit.command_line
import conkit.io
import conkit.plot
import conkit.plot.tools

logger = None


[docs]def add_default_args(parser): """Define default arguments""" parser.add_argument("-prefix", default="conkit", help="Job ID") parser.add_argument("-wdir", default=os.getcwd(), help="Working directory") parser.add_argument("--demo", default=False, action="store_true", help=argparse.SUPPRESS)
[docs]def add_alignment_args(subparsers): """Parser with alignment as starting point""" from_alignment_subparser = subparsers.add_parser("aln") add_default_args(from_alignment_subparser) from_alignment_subparser.add_argument("ccmpred", help="Path to the CCMpred executable") from_alignment_subparser.add_argument("alignment_file", help="Path to alignment file") from_alignment_subparser.add_argument("alignment_format", help="Alignment format") from_alignment_subparser.set_defaults(which="alignment")
[docs]def add_sequence_args(subparsers): """Parser with sequence as starting point""" from_sequence_subparser = subparsers.add_parser("seq") add_default_args(from_sequence_subparser) from_sequence_subparser.add_argument("--nodca", default=False, action="store_true", help=argparse.SUPPRESS) from_sequence_subparser.add_argument("ccmpred", help="Path to the CCMpred executable") from_sequence_subparser.add_argument("hhblits", help="Path to the HHblits executable") from_sequence_subparser.add_argument("hhblitsdb", help="Path to HHblits database") from_sequence_subparser.add_argument("sequence_file", help="Path to sequence file") from_sequence_subparser.add_argument("sequence_format", help="Sequence format") from_sequence_subparser.set_defaults(which="sequence")
[docs]def main(argl=None): """The main routine for conkit-predict functionality Parameters ---------- argl : list, tuple, optional A list containing the command line flags """ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) subparsers = parser.add_subparsers() add_alignment_args(subparsers) add_sequence_args(subparsers) args = parser.parse_args(argl) global logger logger = conkit.command_line.setup_logging(level="info") logger.info("Prefix: %s", args.prefix) logger.info("Working dir: %s", args.wdir) if args.which == "alignment": aln_fname = os.path.abspath(args.alignment_file) aln_format = args.alignment_format.lower() logger.info("Input alignment file: %s", aln_fname) logger.info("Input alignment format: %s", aln_format) if aln_format not in conkit.io.SEQUENCE_FILE_PARSERS: msg = "Sequence format not yet implemented: %s" % aln_fname logger.critical(msg) raise ValueError(msg) if aln_format != "jones": jon_fname = os.path.join(args.wdir, args.prefix + ".jones") conkit.io.convert(aln_fname, aln_format, jon_fname, "jones") else: jon_fname = aln_fname elif args.which == "sequence": hhblits = os.path.abspath(args.hhblits) hhblitsdb = os.path.abspath(args.hhblitsdb) seq_fname = os.path.abspath(args.sequence_file) seq_format = args.sequence_format.lower() a3m_fname = os.path.join(args.wdir, args.prefix + ".a3m") hhr_fname = os.path.join(args.wdir, args.prefix + ".hhr") logger.info("HHblits DB: %s", hhblitsdb) logger.info("Input sequence file: %s", seq_fname) logger.info("Input sequence format: %s", seq_format) # Check that we can handle the sequence file if seq_format not in conkit.io.SEQUENCE_FILE_PARSERS: msg = "Sequence format not yet implemented: %s" % seq_format logger.critical(msg) raise ValueError(msg) # Convert our sequence file to FASTA format if seq_format != "fasta": seq_fname_tmp = seq_fname.rsplit(".", 1)[0] + ".fasta" conkit.io.convert(seq_fname, seq_format, seq_fname_tmp, "fasta") seq_fname = seq_fname_tmp # Generate a multiple sequence alignment hhblits_cline = conkit.applications.HHblitsCommandline( cmd=hhblits, input=seq_fname, output=hhr_fname, database=hhblitsdb, oa3m=a3m_fname, niterations=3, id=99, show_all=True, cov=60, diff="inf", maxfilt=500000, ) logger.info("Executing: %s", hhblits_cline) if args.demo: assert os.path.isfile(a3m_fname) time.sleep(5) else: hhblits_cline() jon_fname = os.path.join(args.wdir, args.prefix + ".jones") conkit.io.convert(a3m_fname, "a3m", jon_fname, "jones") else: raise RuntimeError("Error reading cli arguments - please report this bug") # CCMpred requires alignments to be in the *jones* format - i.e. the format created # and used by David Jones in PSICOV msa_h = conkit.io.read(jon_fname, "jones") freq_plot_fname = os.path.join(args.wdir, args.prefix + "freq.png") figure = conkit.plot.SequenceCoverageFigure(msa_h, legend=True) figure.ax.set_aspect(conkit.plot.tools.get_adjusted_aspect(figure.ax, 0.3)) figure.savefig(freq_plot_fname) logger.info("Final alignment file: %s", jon_fname) logger.info("|- Total Number of sequences: %d", msa_h.nseq) logger.info("|- Number of effective sequences: %d", msa_h.meff) logger.info("|- Plotted sequence coverage: %s", freq_plot_fname) if args.which == "sequence" and args.nodca: return ccmpred = args.ccmpred matrix_fname = os.path.join(args.wdir, args.prefix + ".mat") ccmpred_cline = conkit.applications.CCMpredCommandline( cmd=ccmpred, alnfile=jon_fname, matfile=matrix_fname, threads=2, renormalize=True ) logger.info("Executing: %s", ccmpred_cline) if args.demo: assert os.path.isfile(matrix_fname) time.sleep(5) else: ccmpred_cline() dtn = 5 dfactor = 1.0 cmap = conkit.io.read(matrix_fname, "ccmpred").top_map cmap.sequence = conkit.io.read(jon_fname, "jones").top_sequence cmap.remove_neighbors(min_distance=dtn, inplace=True) cmap.sort("raw_score", reverse=True, inplace=True) cmap = cmap[: cmap.sequence.seq_len] contact_map_fname = os.path.join(args.wdir, args.prefix + "cmap.png") figure = conkit.plot.ContactMapFigure(cmap, legend=True) figure.ax.set_aspect(conkit.plot.tools.get_adjusted_aspect(figure.ax, 1.0)) figure.savefig(contact_map_fname) logger.info("Plotted contact map: %s", contact_map_fname) logger.info("|- Min sequence separation for contacting residues: %d", dtn) logger.info("|- Contact list cutoff factor: %f * L", dfactor) casprr_fname = os.path.join(args.wdir, args.prefix + ".rr") conkit.io.convert(matrix_fname, "ccmpred", casprr_fname, "casprr") logger.info("Final prediction file: %s", casprr_fname)
if __name__ == "__main__": import sys import traceback try: main() sys.exit(0) except Exception as e: if not isinstance(e, SystemExit): msg = "".join(traceback.format_exception(*sys.exc_info())) logger.critical(msg) sys.exit(1)