Mercurial > repos > galaxyp > metagene_annotator
diff convert_mga.py @ 0:b04960a7abf5 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/metagene_annotator commit 6d8b6e0fa2f1b47b337dbf21f5bc320586ccbd4c
author | galaxyp |
---|---|
date | Wed, 21 Mar 2018 17:15:25 -0400 |
parents | |
children | 17c7ab82bfbc |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/convert_mga.py Wed Mar 21 17:15:25 2018 -0400 @@ -0,0 +1,91 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import argparse +import math +import re +import sys + + +def __main__(): + parser = argparse.ArgumentParser( + description='Convert mga output to bed and tsv') + parser.add_argument( + 'input_mga', + help="mga output to convert, '-' for stdin") + parser.add_argument( + '-t', '--tsv', default=None, + help='Path to output mga.tsv') + parser.add_argument( + '-b', '--bed', default=None, + help='Path to output mga.bed') + parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') + args = parser.parse_args() + + input_rdr = open(args.input_mga, 'r')\ + if args.input_mga != '-' else sys.stdin + + bed_wtr = open(args.bed, 'w') if args.bed is not None else None + tsv_wtr = open(args.tsv, 'w') if args.bed is not None else None + if tsv_wtr: + tsv_wtr.write('#%s\n' % '\t'.join([ + 'seq_id', 'seq_model', 'seq_gc', 'seq_rbs', + 'gene ID', 'start pos', 'end pos', 'strand', 'frame', + 'complete/partial', 'gene score', 'used model', + 'rbs start', 'rbs end', 'rbs score'])) + + gc_rbs_pat = '# gc = (-?[0-9]*[.]?[0-9]+), rbs = (-?[0-9]*[.]?[0-9]+)' + seq_count = 0 + gene_count = 0 + for i, line in enumerate(input_rdr): + # 1317/1 + # gc = 0.272955, rbs = -1 + # self: - + if line.startswith('# gc'): + try: + m = re.match(gc_rbs_pat, line.strip()) + seq_gc, seq_rbs = m.groups() + except: + seq_gc = seq_rbs = '' + elif line.startswith('# self:'): + seq_type = re.sub('# self:', '', line.rstrip()) + elif line.startswith('# '): + seq_name = re.sub('# (\S+).*$', '\\1', line.rstrip()) + seq_count += 1 + else: + fields = line.split('\t') + if len(fields) == 11: + gene_count += 1 + start = int(fields[1]) - 1 + end = int(fields[2]) + if tsv_wtr: + tsv_wtr.write('%s\t%s\t%s\t%s\t%s' % ( + seq_name, + seq_type, + seq_gc, + seq_rbs, + line)) + if bed_wtr: + bed_wtr.write( + '%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n' % ( + seq_name, + start, + end, + '%s:%s' % (seq_name, fields[0]), + int(math.ceil(float(fields[6]))), + fields[3], + start, + end, + 0, + 1, + abs(end - start), + 0)) + + if args.verbose: + print("sequences: %d\tgenes: %d" + % (seq_count, gene_count), file=sys.stdout) + + +if __name__ == "__main__": + __main__()