# Source code for conkit.io.gremlin

# BSD 3-Clause License
#
# Copyright (c) 2016-17, University of Liverpool
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Parser module specific to Gremlin predictions
"""

# Module metadata: author, date and version strings.
__author__ = "Felix Simkovic"
__date__ = "04 Oct 2016"
__version__ = "0.1"

import os
import re

from conkit.io._parser import ContactFileParser
from conkit.core.contact import Contact
from conkit.core.contactmap import ContactMap
from conkit.core.contactfile import ContactFile

# Header line of a table WITHOUT a chain column: the seven whitespace-separated
# column titles ``i j i_id j_id r_sco s_sco prob`` and nothing else on the line.
RE_HEADER_INTRA = re.compile(r'^i\s+j\s+i_id\s+j_id\s+r_sco\s+s_sco\s+prob$')
# Header line of a table WITH a chain column: the nine column titles
# ``i j gene i_id j_id r_sco s_sco prob I_prob`` and nothing else on the line.
RE_HEADER_INTER = re.compile(r'^i\s+j\s+gene\s+i_id\s+j_id\s+r_sco\s+s_sco\s+prob\s+I_prob$')
# Comment line: one or more leading '#'; group 1 captures the rest of the line.
RE_COMMENT = re.compile(r'^#+(.*)$')
# Field separator for data rows: any run of whitespace.
RE_SPLIT = re.compile(r'\s+')


class GremlinParser(ContactFileParser):
    """Parser class for GREMLIN contact prediction file

    Two table layouts are understood, selected by the header line that
    precedes the data rows:

    * ``i j i_id j_id r_sco s_sco prob`` -- seven columns, no chain column
    * ``i j gene i_id j_id r_sco s_sco prob I_prob`` -- nine columns, where
      the ``gene`` column holds one or two chain characters

    """

    def __init__(self):
        super(GremlinParser, self).__init__()

    def read(self, f_handle, f_id="gremlin"):
        """Read a contact file

        Blank lines are skipped and surrounding whitespace is ignored. Lines
        starting with ``#`` are stored as remarks on the returned contact
        file. Rows that appear before any header line are parsed with the
        seven-column layout.

        Contacts are grouped into one contact map per chain pair. If no row
        carries chain information, a single map with the id ``1`` is created.
        If a file mixes rows with and without chain information, the rows
        without it are collected in a map with the id ``UNK``.

        Parameters
        ----------
        f_handle
           Open file handle [read permissions]
        f_id : str, optional
           Unique contact file identifier

        Returns
        -------
        :obj:`ContactFile <conkit.core.ContactFile>`

        Raises
        ------
        ValueError
           A data row does not have the number of columns announced by the
           most recent header line
        ValueError
           The chain column of a row holds more than two characters

        """
        hierarchy = ContactFile(f_id)

        inter = False
        # Maps a chain key -- either the string 'UNK' or a tuple
        # (res1_chain, res2_chain) -- to the contacts parsed for it. Grouping
        # while parsing avoids re-scanning every contact once per chain pair.
        grouped = {}

        for raw_line in f_handle:
            line = raw_line.strip()
            if not line:
                continue

            comment = RE_COMMENT.match(line)
            if comment:
                hierarchy.remark = comment.group(1)
            elif RE_HEADER_INTRA.match(line):
                inter = False
            elif RE_HEADER_INTER.match(line):
                inter = True
            else:
                contact, key = self._parse_row(line, inter)
                grouped.setdefault(key, []).append(contact)

        if list(grouped) == ['UNK']:
            # No chain information anywhere in the file: one map called '1'
            groups = [('1', grouped['UNK'])]
        else:
            groups = [(self._map_id(key), contacts) for key, contacts in grouped.items()]

        for map_id, contacts in groups:
            contact_map = ContactMap(map_id)
            for contact in contacts:
                contact_map.add(contact)
            hierarchy.add(contact_map)

        # The grouping dictionary has no meaningful order; sorting by id makes
        # the order of the contact maps deterministic.
        hierarchy.sort('id', inplace=True)
        return hierarchy

    @staticmethod
    def _parse_row(line, inter):
        """Convert one data row to a contact and its chain key

        Parameters
        ----------
        line : str
           A data row without leading or trailing whitespace
        inter : bool
           True if the row uses the nine-column layout with a chain column

        Returns
        -------
        tuple
           The :obj:`Contact <conkit.core.Contact>` and its chain key, which
           is ``'UNK'`` or a tuple ``(res1_chain, res2_chain)``

        Raises
        ------
        ValueError
           The row has an unexpected number of columns
        ValueError
           The chain column holds more than two characters

        """
        fields = RE_SPLIT.split(line)
        expected = 9 if inter else 7
        if len(fields) != expected:
            msg = 'Expected {0} columns but found {1} in line: {2}'
            raise ValueError(msg.format(expected, len(fields), line))

        if inter:
            res1_seq, res2_seq, chain = fields[:3]
            raw_score, scalar_score = fields[5:7]
        else:
            res1_seq, res2_seq = fields[:2]
            raw_score, scalar_score = fields[4:6]
            chain = 'UNK'

        contact = Contact(int(res1_seq), int(res2_seq), float(raw_score))
        contact.scalar_score = float(scalar_score)

        if chain == 'UNK':
            return contact, 'UNK'
        if len(chain) > 2:
            raise ValueError('Cannot distinguish between chains')

        # A single character names the chain of both residues; two characters
        # name the chain of the first and of the second residue respectively.
        contact.res1_chain = chain[0]
        contact.res2_chain = chain[-1]
        return contact, (contact.res1_chain, contact.res2_chain)

    @staticmethod
    def _map_id(key):
        """Derive the contact map id from a chain key"""
        if key == 'UNK':
            return 'UNK'
        if key[0] == key[1]:
            return key[0]
        return "".join(key)

    def write(self, f_handle, hierarchy):
        """Write a contact file instance to file

        The nine-column layout (with the ``gene`` column) is written if both
        chains are set on the top contact of the top contact map, otherwise
        the seven-column layout is written. The ``prob`` column is always
        written as ``1.0`` and the ``I_prob`` column as ``N/A``.

        Lines are terminated with ``\\n``. A handle opened in text mode
        translates this to the platform line ending, whereas writing
        ``os.linesep`` to such a handle would double the carriage return on
        Windows.

        Parameters
        ----------
        f_handle
           Open file handle [write permissions]
        hierarchy : :obj:`ContactFile <conkit.core.ContactFile>`, :obj:`ContactMap <conkit.core.ContactMap>`
                    or :obj:`Contact <conkit.core.Contact>`

        """
        # Double check the type of hierarchy and reconstruct if necessary
        contact_file = self._reconstruct(hierarchy)

        top_contact = contact_file.top_map.top_contact
        if top_contact.res1_chain and top_contact.res2_chain:
            header_args = ['i', 'j', 'gene', 'i_id', 'j_id', 'r_sco', 's_sco', 'prob', 'I_prob']
            out_kwargs = ['{res1_seq}', '{res2_seq}', '{chains}', '{res1_code}', '{res2_code}',
                          '{raw_score}', '{scalar_score}', '1.0', 'N/A']
        else:
            header_args = ['i', 'j', 'i_id', 'j_id', 'r_sco', 's_sco', 'prob']
            out_kwargs = ['{res1_seq}', '{res2_seq}', '{res1_code}', '{res2_code}',
                          '{raw_score}', '{scalar_score}', '1.0']

        template = '\t'.join(out_kwargs)
        lines = ['\t'.join(header_args)]

        for contact_map in contact_file:
            contact_map.calculate_scalar_score()
            for c in contact_map:
                res1_code = str(c.res1_seq) + '_' + c.res1
                res2_code = str(c.res2_seq) + '_' + c.res2
                if c.res1_chain == c.res2_chain:
                    chains = c.res1_chain
                else:
                    chains = "{0}{1}".format(c.res1_chain, c.res2_chain)
                # 'chains' is simply ignored by the seven-column template
                lines.append(template.format(res1_seq=c.res1_seq, res2_seq=c.res2_seq,
                                             res1_code=res1_code, res2_code=res2_code,
                                             chains=chains, raw_score=c.raw_score,
                                             scalar_score=round(c.scalar_score, 1)))

        f_handle.write('\n'.join(lines) + '\n')