Source code for conkit.command_line.conkit_predict

#!/usr/bin/env python
#
# BSD 3-Clause License
#
# Copyright (c) 2016-17, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""This script provides a simplified contact prediction pipeline. It will
take either a sequence or alignment as starting point and predict, analyse
and evaluate the final prediction and any intermediate results.

It uses two external programs to perform this task.

   - HHblits for Sequence Alignment generation, and
   - CCMpred for Direct Coupling Analysis.

*** The two programs above need to be installed separately ***

"""

__author__ = "Felix Simkovic"
__date__ = "01 June 2016"
__version__ = "0.1"

import argparse
import os
import time

import conkit.applications
import conkit.command_line
import conkit.io
import conkit.plot

logger = None


[docs]def add_default_args(parser): """Define default arguments""" parser.add_argument('-prefix', default='conkit', help='Job ID') parser.add_argument('-wdir', default=os.getcwd(), help='Working directory') parser.add_argument('--demo', default=False, action="store_true", help=argparse.SUPPRESS)
[docs]def add_alignment_args(subparsers): """Parser with alignment as starting point""" from_alignment_subparser = subparsers.add_parser('aln') add_default_args(from_alignment_subparser) from_alignment_subparser.add_argument( 'ccmpred', help='Path to the CCMpred executable') from_alignment_subparser.add_argument( 'alignment_file', help='Path to alignment file') from_alignment_subparser.add_argument( 'alignment_format', help='Alignment format') from_alignment_subparser.set_defaults(which='alignment')
[docs]def add_sequence_args(subparsers): """Parser with sequence as starting point""" from_sequence_subparser = subparsers.add_parser('seq') add_default_args(from_sequence_subparser) from_sequence_subparser.add_argument('--nodca', default=False, action='store_true', help=argparse.SUPPRESS) from_sequence_subparser.add_argument( 'ccmpred', help='Path to the CCMpred executable') from_sequence_subparser.add_argument( 'hhblits', help='Path to the HHblits executable') from_sequence_subparser.add_argument( 'hhblitsdb', help='Path to HHblits database') from_sequence_subparser.add_argument( 'sequence_file', help='Path to sequence file') from_sequence_subparser.add_argument( 'sequence_format', help='Sequence format') from_sequence_subparser.set_defaults(which='sequence')
[docs]def main(argl=None): """The main routine for conkit-predict functionality Parameters ---------- argl : list, tuple, optional A list containing the command line flags """ parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) subparsers = parser.add_subparsers() # Add the subparsers add_alignment_args(subparsers) add_sequence_args(subparsers) # Parse all arguments args = parser.parse_args(argl) # Setup the logger global logger logger = conkit.command_line.setup_logging(level='info') logger.info('Prefix: %s', args.prefix) logger.info('Working dir: %s', args.wdir) if args.which == 'alignment': aln_fname = os.path.abspath(args.alignment_file) aln_format = args.alignment_format.lower() logger.info('Input alignment file: %s', aln_fname) logger.info('Input alignment format: %s', aln_format) # Check that we can handle the alignment file if aln_format not in conkit.io.SEQUENCE_FILE_PARSERS: msg = 'Sequence format not yet implemented: %s' % aln_fname logger.critical(msg) raise ValueError(msg) # Convert our alignment file to JONES format if aln_format != 'jones': jon_fname = os.path.join(args.wdir, args.prefix + '.jones') conkit.io.convert(aln_fname, aln_format, jon_fname, 'jones') else: jon_fname = aln_fname elif args.which == 'sequence': hhblits = os.path.abspath(args.hhblits) hhblitsdb = os.path.abspath(args.hhblitsdb) seq_fname = os.path.abspath(args.sequence_file) seq_format = args.sequence_format.lower() a3m_fname = os.path.join(args.wdir, args.prefix + '.a3m') hhr_fname = os.path.join(args.wdir, args.prefix + '.hhr') logger.info('HHblits DB: %s', hhblitsdb) logger.info('Input sequence file: %s', seq_fname) logger.info('Input sequence format: %s', seq_format) # Check that we can handle the sequence file if seq_format not in conkit.io.SEQUENCE_FILE_PARSERS: msg = 'Sequence format not yet implemented: %s' % seq_format logger.critical(msg) raise ValueError(msg) # Convert our sequence file to FASTA format if seq_format != 'fasta': seq_fname_tmp = seq_fname.rsplit('.', 1)[0] + '.fasta' conkit.io.convert(seq_fname, seq_format, seq_fname_tmp, 'fasta') seq_fname = seq_fname_tmp # Generate a multiple sequence alignment hhblits_cline = conkit.applications.HHblitsCommandline(cmd=hhblits, input=seq_fname, output=hhr_fname, database=hhblitsdb, oa3m=a3m_fname, niterations=3, id=99, show_all=True, cov=60, diff='inf', maxfilt=500000) logger.info('Executing: %s', hhblits_cline) if args.demo: assert os.path.isfile(a3m_fname) time.sleep(5) else: hhblits_cline() jon_fname = os.path.join(args.wdir, args.prefix + '.jones') conkit.io.convert(a3m_fname, 'a3m', jon_fname, 'jones') else: raise RuntimeError('Should never get to here') # CCMpred requires alignments to be in the *jones* format - i.e. the format created # and used by David Jones in PSICOV logger.info('Final alignment file: %s', jon_fname) msa_h = conkit.io.read(jon_fname, 'jones') logger.info('|- Total Number of sequences: %d', msa_h.nseq) logger.info('|- Number of effective sequences: %d', msa_h.neff) freq_plot_fname = os.path.join(args.wdir, args.prefix + 'freq.pdf') conkit.plot.SequenceCoverageFigure(msa_h, file_name=freq_plot_fname) logger.info('|- Plotted sequence coverage: %s', freq_plot_fname) # Kill switch to not run CCMpred DCA if args.which == 'sequence' and args.nodca: return # Use the re-formatted alignment for contact prediction ccmpred = args.ccmpred matrix_fname = os.path.join(args.wdir, args.prefix + '.mat') ccmpred_cline = conkit.applications.CCMpredCommandline(cmd=ccmpred, alnfile=jon_fname, matfile=matrix_fname, threads=2, renormalize=True) logger.info('Executing: %s', ccmpred_cline) if args.demo: assert os.path.isfile(matrix_fname) time.sleep(5) else: ccmpred_cline() # Add sequence information to contact hierarchy dtn = 5 dfactor = 1. cmap = conkit.io.read(matrix_fname, 'ccmpred').top_map cmap.sequence = conkit.io.read(jon_fname, 'jones').top_sequence cmap.remove_neighbors(min_distance=dtn, inplace=True) cmap.sort('raw_score', reverse=True, inplace=True) cmap = cmap[:cmap.sequence.seq_len] # subset the selection contact_map_fname = os.path.join(args.wdir, args.prefix + 'cmap.pdf') conkit.plot.ContactMapFigure(cmap, file_name=contact_map_fname) logger.info('Plotted contact map: %s', contact_map_fname) logger.info('|- Min sequence separation for contacting residues: %d', dtn) logger.info('|- Contact list cutoff factor: %f * L', dfactor) # Use the ccmpred parser to write a contact file casprr_fname = os.path.join(args.wdir, args.prefix + '.rr') conkit.io.convert(matrix_fname, 'ccmpred', casprr_fname, 'casprr') logger.info('Final prediction file: %s', casprr_fname) return
if __name__ == "__main__": import sys import traceback try: main() sys.exit(0) except Exception as e: if not isinstance(e, SystemExit): msg = "".join(traceback.format_exception(*sys.exc_info())) logger.critical(msg) sys.exit(1)