Mercurial > repos > earlhaminst > t_coffee
diff t_coffee_to_cigar.py @ 1:b3833e5b50d4 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 011cabb2a2b3237bbbc4850ed26972816702a2ba-dirty
author | earlhaminst |
---|---|
date | Mon, 19 Dec 2016 17:47:31 -0500 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
"""Convert a gapped (aligned) FASTA file into CIGAR strings.

Usage: t_coffee_to_cigar.py ALIGNED_FASTA

For every record of the FASTA file given as first command-line argument, one
tab-separated line "<header> <cigar>" is printed to standard output.  In the
CIGAR string each run of non-gap characters is encoded as ``M`` and each run
of gap characters (``-``) as ``D``; runs longer than one character are
prefixed by their length, runs of length one are written without a count
(e.g. ``AC--G`` becomes ``2M2DM``).
"""
from __future__ import print_function

import re
import sys
from itertools import groupby


# Matches every character of an aligned sequence that is not a gap.
FASTA_MATCH_RE = re.compile(r'[^-]')


def sequence_to_cigar(sequence):
    """Return the CIGAR string describing an aligned sequence.

    :param sequence: aligned sequence, with ``-`` as the gap character
    :returns: the run-length encoded string of ``M`` (match) and ``D`` (gap)
        operations; an empty sequence gives an empty string
    """
    # Converts each match into M and each gap into D
    operations = FASTA_MATCH_RE.sub('M', sequence).replace('-', 'D')
    # Condense each run of identical letters, e.g. DDDD in 4D; the count is
    # omitted for runs of length 1
    parts = []
    for operation, run in groupby(operations):
        length = sum(1 for _ in run)
        parts.append(operation if length == 1 else '%d%s' % (length, operation))
    return ''.join(parts)


def convert_and_print(header, sequence):
    """Print ``header`` and the CIGAR string of ``sequence``, tab-separated.

    :param header: FASTA header of the record, without the leading ``>``
    :param sequence: aligned sequence, with ``-`` as the gap character
    """
    print("%s\t%s" % (header, sequence_to_cigar(sequence)))


def main():
    """Read the FASTA file named by ``sys.argv[1]`` and print its CIGARs.

    Exits with an error message if no file is given or if sequence data is
    found before the first FASTA header.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: %s ALIGNED_FASTA" % sys.argv[0])
    with open(sys.argv[1]) as fh:
        # header is None until the first '>' line is seen; an empty header
        # (a bare '>') is still a valid record, hence the `is not None` tests
        header = None
        chunks = []
        for line in fh:
            line = line.strip()
            if not line:
                # Blank lines carry no sequence data
                continue
            if line[0] == '>':
                if header is not None:
                    convert_and_print(header, ''.join(chunks))
                header = line[1:]
                chunks = []
            elif header is None:
                sys.exit("Error: sequence data found before the first FASTA header")
            else:
                chunks.append(line)
        # Flush the last record of the file
        if header is not None:
            convert_and_print(header, ''.join(chunks))


if __name__ == "__main__":
    main()