view format_fasta_hairpins.py @ 13:b045c30fb768 draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit af48e9f6df2717ffd3731a974be1ec36e4eff779"
author artbio
date Fri, 18 Oct 2019 19:18:50 -0400
parents de227b7307cf
children
line wrap: on
line source

import argparse
import gzip


def Parser():
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument(
        '--hairpins_path', action="store", type=str,
        help="BASE url. ex: /pub/mirbase/22/")
    the_parser.add_argument(
        '--output', action="store", type=str,
        help="parsed hairpin output in fasta format")
    the_parser.add_argument(
        '--basename', action="store", type=str,
        help="genome basename of the parsed fasta")
    args = the_parser.parse_args()
    return args


def get_fasta_dic(gzipfile):
    '''
    gzipfile value example : 'mirbase/22/hairpin.fa.gz'
    '''
    item_dic = {}
    with gzip.open(gzipfile, 'rb') as f:
        current_item = ''
        stringlist = []
        for line in f:
            line = line.decode('utf-8').strip('\n')
            if (line[0] == ">"):
                # dump the sequence of the previous item
                if current_item and stringlist:
                    item_dic[current_item] = "".join(stringlist)
                # take first word of item '''
                current_item = line[1:].split()[0]
                stringlist = []
            else:
                stringlist.append(line)
        item_dic[current_item] = "".join(stringlist)  # for the last item
    return item_dic


def convert_and_print_hairpins(gzipfile, basename, fasta_output):
    raw_fasta_dict = get_fasta_dic(gzipfile)
    parsed_fasta_dict = {}
    for head in raw_fasta_dict:
        if basename in head:
            parsed_fasta_dict[head] = raw_fasta_dict[head]
            parsed_fasta_dict[head] = ''.join(
                [i if i != 'u' else 't' for i in parsed_fasta_dict[head]])
            parsed_fasta_dict[head] = ''.join(
                [i if i != 'U' else 'T' for i in parsed_fasta_dict[head]])
    with open(fasta_output, "w") as output:
        for head in sorted(parsed_fasta_dict):
            output.write('>%s\n%s\n' % (head, parsed_fasta_dict[head]))


def main(hairpins_path, basename, outfile):
    convert_and_print_hairpins(hairpins_path, basename, outfile)


if __name__ == "__main__":
    args = Parser()
    main(args.hairpins_path, args.basename, args.output)