Mercurial > repos > gpovysil > range2tag
comparison range2tag.py @ 0:0b486eaca66e draft
planemo upload for repository https://github.com/gpovysil/galaxy/tools/range2tag commit 16b1092dfe1052f9560188e67f39009f1e4b32bc
| author | gpovysil |
|---|---|
| date | Wed, 16 May 2018 04:53:11 -0400 |
| parents | |
| children | 7d4ec8f51253 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0b486eaca66e |
|---|---|
| 1 """range2tag.py | |
| 2 | |
| 3 Author -- Gundula Povysil | |
| 4 Contact -- povysil@bioinf.jku.at | |
| 5 | |
| 6 Takes a SAM file, start and stop positions as input and prints all tags | |
| 7 of reads that overlap with regions to user specified output file. | |
| 8 ======= ========== ================= ================================ | |
| 9 Version Date Author Description | |
| 10 0.0.2 2018-05-15 Gundula Povysil - | |
| 11 ======= ========== ================= ================================ | |
| 12 | |
| 13 USAGE: python range2tag.py inputFile.sam ranges.txt outputFile.txt | |
| 14 """ | |
| 15 | |
| 16 | |
| 17 import numpy as np | |
| 18 import re | |
| 19 import argparse | |
| 20 import sys | |
| 21 import os | |
| 22 | |
| 23 def make_argparser(): | |
| 24 parser = argparse.ArgumentParser(description='Takes a SAM file, start and stop positions as input and prints all tags of reads that overlap with regions to user specified output file.') | |
| 25 parser.add_argument('inputFile', | |
| 26 help='SAM file with aligned reads.') | |
| 27 parser.add_argument('rangesFile', | |
| 28 help='TXT file with start and stop positions.') | |
| 29 parser.add_argument('outputFile', | |
| 30 help='Output TXT file with tags that are within specified regions.') | |
| 31 return parser | |
| 32 | |
| 33 def range2tag(argv): | |
| 34 parser = make_argparser() | |
| 35 args=parser.parse_args(argv[1:]) | |
| 36 | |
| 37 inputFile = args.inputFile | |
| 38 rangesFile = args.rangesFile | |
| 39 outputFile = args.outputFile | |
| 40 | |
| 41 if os.path.isfile(inputFile) is False: | |
| 42 print("Error: Could not find '{}'".format(inputFile)) | |
| 43 exit(0) | |
| 44 | |
| 45 if os.path.isfile(rangesFile) is False: | |
| 46 print("Error: Could not find '{}'".format(rangesFile)) | |
| 47 exit(0) | |
| 48 | |
| 49 with open(rangesFile, 'r') as regs: | |
| 50 range_array = np.genfromtxt(regs, skip_header=0, delimiter='\t', comments='#') | |
| 51 | |
| 52 start_posList = range_array[:,0].astype(int) | |
| 53 stop_posList = range_array[:,1].astype(int) | |
| 54 | |
| 55 print(start_posList) | |
| 56 print(stop_posList) | |
| 57 | |
| 58 if len(start_posList) == 0: | |
| 59 print("Error: start_positions is empty") | |
| 60 exit(2) | |
| 61 | |
| 62 if len(stop_posList) == 0: | |
| 63 print("Error: end_positions is empty") | |
| 64 exit(3) | |
| 65 | |
| 66 if len(start_posList) != len(stop_posList): | |
| 67 print("start_positions and end_positions do not have the same length") | |
| 68 exit(3) | |
| 69 | |
| 70 with open(inputFile, 'r') as sam: | |
| 71 data_array = np.genfromtxt(sam, skip_header=0, delimiter='\t', usecols=range(11), comments='#', dtype='string') | |
| 72 | |
| 73 tags = np.array(data_array[:, 0]) | |
| 74 ref_pos = np.array(data_array[:, 3]).astype(int) | |
| 75 cigar = np.array(data_array[:, 5]) | |
| 76 | |
| 77 lst = [] | |
| 78 ind = [] | |
| 79 start_posList = np.array(start_posList).astype(int) | |
| 80 stop_posList = np.array(stop_posList).astype(int) | |
| 81 | |
| 82 for start_pos, stop_pos in zip(start_posList, stop_posList): | |
| 83 start_pos = start_pos - 3 | |
| 84 stop_pos = stop_pos + 3 | |
| 85 mut_tags = None | |
| 86 for t in range(0, len(tags)): | |
| 87 c_split = re.split('([A-Z])', cigar[t]) | |
| 88 cigar_long = None | |
| 89 | |
| 90 for i in range(1, len(c_split), 2): | |
| 91 if cigar_long is None: | |
| 92 cigar_long = np.repeat(c_split[i], c_split[i - 1]) | |
| 93 else: | |
| 94 cigar_long = np.concatenate((cigar_long, np.repeat(c_split[i], c_split[i - 1])), axis=0) | |
| 95 | |
| 96 pos = ref_pos[t] | |
| 97 seq_pos = 0 | |
| 98 # print(pos) | |
| 99 if pos < stop_pos: | |
| 100 for j in range(0, len(cigar_long)): | |
| 101 if pos >= stop_pos: | |
| 102 break | |
| 103 if cigar_long[j] in ("M", "D", "N"): | |
| 104 pos += 1 | |
| 105 # print(pos) | |
| 106 if pos > start_pos: | |
| 107 if mut_tags is None: | |
| 108 mut_tags = np.array((tags[t])) | |
| 109 else: | |
| 110 mut_tags = np.vstack((mut_tags, np.array(tags[t]))) | |
| 111 | |
| 112 index = np.repeat("{}_{}".format(start_pos, stop_pos), len(mut_tags)) | |
| 113 ind.append(index) | |
| 114 lst.append(mut_tags) | |
| 115 | |
| 116 index = np.concatenate((ind)) | |
| 117 tags = np.concatenate((lst)) | |
| 118 mut_tags = np.column_stack((index, tags)) | |
| 119 | |
| 120 np.savetxt(outputFile, mut_tags, fmt="%s") | |
| 121 print("File saved under {} in {}!".format(outputFile, os.getcwd())) | |
| 122 | |
| 123 | |
| 124 if __name__ == '__main__': | |
| 125 sys.exit(range2tag(sys.argv)) |
