Mercurial > repos > gpovysil > range2tag
view range2tag.py @ 4:e2ccee720583 draft
planemo upload for repository https://github.com/gpovysil/galaxy/tools/range2tag commit a7152b199688c32e1668c1b080efa7b7003b9fb3
author | gpovysil |
---|---|
date | Wed, 16 May 2018 09:43:08 -0400 |
parents | e66e151e07ab |
children |
line wrap: on
line source
"""range2tag.py Author -- Gundula Povysil Contact -- povysil@bioinf.jku.at Takes a SAM file, start and stop positions as input and prints all tags of reads that overlap with regions to user specified output file. ======= ========== ================= ================================ Version Date Author Description 0.0.2 2018-05-15 Gundula Povysil - ======= ========== ================= ================================ USAGE: python range2tag.py inputFile.sam ranges.txt outputFile.txt """ import numpy as np import re import argparse import sys import os def make_argparser(): parser = argparse.ArgumentParser(description='Takes a SAM file, start and stop positions as input and prints all tags of reads that overlap with regions to user specified output file.') parser.add_argument('inputFile', help='SAM file with aligned reads.') parser.add_argument('rangesFile', help='TXT file with start and stop positions.') parser.add_argument('outputFile', help='Output TXT file with tags that are within specified regions.') return parser def range2tag(argv): parser = make_argparser() args=parser.parse_args(argv[1:]) inputFile = args.inputFile rangesFile = args.rangesFile outputFile = args.outputFile if os.path.isfile(inputFile) is False: print("Error: Could not find '{}'".format(inputFile)) exit(0) if os.path.isfile(rangesFile) is False: print("Error: Could not find '{}'".format(rangesFile)) exit(0) with open(rangesFile, 'r') as regs: range_array = np.genfromtxt(regs, skip_header=0, delimiter='\t', comments='#') start_posList = range_array[:,0].astype(int) stop_posList = range_array[:,1].astype(int) if len(start_posList) == 0: print("Error: start_positions is empty") exit(2) if len(stop_posList) == 0: print("Error: end_positions is empty") exit(3) if len(start_posList) != len(stop_posList): print("start_positions and end_positions do not have the same length") exit(3) with open(inputFile, 'r') as sam: data_array = np.genfromtxt(sam, skip_header=0, delimiter='\t', usecols=range(11), comments='#', dtype='string') tags = np.array(data_array[:, 0]) ref_pos = np.array(data_array[:, 3]).astype(int) cigar = np.array(data_array[:, 5]) lst = [] ind = [] start_posList = np.array(start_posList).astype(int) stop_posList = np.array(stop_posList).astype(int) for start_pos, stop_pos in zip(start_posList, stop_posList): start_pos = start_pos - 3 stop_pos = stop_pos + 3 mut_tags = None for t in range(0, len(tags)): if cigar[t] != "*": c_split = re.split('([A-Z])', cigar[t]) cigar_long = None for i in range(1, len(c_split), 2): if cigar_long is None: cigar_long = np.repeat(c_split[i], c_split[i - 1]) else: cigar_long = np.concatenate((cigar_long, np.repeat(c_split[i], c_split[i - 1])), axis=0) pos = ref_pos[t] seq_pos = 0 # print(pos) if pos < stop_pos: for j in range(0, len(cigar_long)): if pos >= stop_pos: break if cigar_long[j] in ("M", "D", "N"): pos += 1 # print(pos) if pos > start_pos: if mut_tags is None: mut_tags = np.array((tags[t])) else: mut_tags = np.vstack((mut_tags, np.array(tags[t]))) index = np.repeat("{}_{}".format(start_pos, stop_pos), len(mut_tags)) ind.append(index) lst.append(mut_tags) index = np.concatenate((ind)) tags = np.concatenate((lst)) mut_tags = np.column_stack((index, tags)) np.savetxt(outputFile, mut_tags, fmt="%s") print("File saved under {} in {}!".format(outputFile, os.getcwd())) if __name__ == '__main__': sys.exit(range2tag(sys.argv))