diff range2tag.py @ 0:0b486eaca66e draft

planemo upload for repository https://github.com/gpovysil/galaxy/tools/range2tag commit 16b1092dfe1052f9560188e67f39009f1e4b32bc
author gpovysil
date Wed, 16 May 2018 04:53:11 -0400
parents
children 7d4ec8f51253
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/range2tag.py	Wed May 16 04:53:11 2018 -0400
@@ -0,0 +1,125 @@
+"""range2tag.py
+
+Author -- Gundula Povysil
+Contact -- povysil@bioinf.jku.at
+
+Takes a SAM file, start and stop positions as input and prints all tags
+of reads that overlap with regions to user specified output file.
+=======  ==========  =================  ================================
+Version  Date        Author             Description
+0.0.2    2018-05-15  Gundula Povysil    -
+=======  ==========  =================  ================================
+
+USAGE: python range2tag.py inputFile.sam ranges.txt outputFile.txt
+"""
+
+
+import numpy as np
+import re
+import argparse
+import sys
+import os
+
+def make_argparser():
+    parser = argparse.ArgumentParser(description='Takes a SAM file, start and stop positions as input and prints all tags of reads that overlap with regions to user specified output file.')
+    parser.add_argument('inputFile',
+                        help='SAM file with aligned reads.')
+    parser.add_argument('rangesFile',
+                        help='TXT file with start and stop positions.')
+    parser.add_argument('outputFile',
+                        help='Output TXT file with tags that are within specified regions.')
+    return parser
+
+def range2tag(argv):
+    parser = make_argparser()
+    args=parser.parse_args(argv[1:])
+
+    inputFile = args.inputFile
+    rangesFile = args.rangesFile
+    outputFile = args.outputFile
+
+    if os.path.isfile(inputFile) is False:
+        print("Error: Could not find '{}'".format(inputFile))
+        exit(0)
+
+    if os.path.isfile(rangesFile) is False:
+        print("Error: Could not find '{}'".format(rangesFile))
+        exit(0)
+
+    with open(rangesFile, 'r') as regs:
+        range_array = np.genfromtxt(regs, skip_header=0, delimiter='\t', comments='#')
+
+    start_posList = range_array[:,0].astype(int)
+    stop_posList = range_array[:,1].astype(int)
+
+    print(start_posList)
+    print(stop_posList)
+
+    if len(start_posList) == 0:
+        print("Error: start_positions is empty")
+        exit(2)
+
+    if len(stop_posList) == 0:
+        print("Error: end_positions is empty")
+        exit(3)
+
+    if len(start_posList) != len(stop_posList):
+        print("start_positions and end_positions do not have the same length")
+        exit(3)
+
+    with open(inputFile, 'r') as sam:
+        data_array = np.genfromtxt(sam, skip_header=0, delimiter='\t', usecols=range(11), comments='#', dtype='string')
+
+    tags = np.array(data_array[:, 0])
+    ref_pos = np.array(data_array[:, 3]).astype(int)
+    cigar = np.array(data_array[:, 5])
+
+    lst = []
+    ind = []
+    start_posList = np.array(start_posList).astype(int)
+    stop_posList = np.array(stop_posList).astype(int)
+
+    for start_pos, stop_pos in zip(start_posList, stop_posList):
+        start_pos = start_pos - 3
+        stop_pos = stop_pos + 3
+        mut_tags = None
+        for t in range(0, len(tags)):
+            c_split = re.split('([A-Z])', cigar[t])
+            cigar_long = None
+
+            for i in range(1, len(c_split), 2):
+                if cigar_long is None:
+                    cigar_long = np.repeat(c_split[i], c_split[i - 1])
+                else:
+                    cigar_long = np.concatenate((cigar_long, np.repeat(c_split[i], c_split[i - 1])), axis=0)
+
+            pos = ref_pos[t]
+            seq_pos = 0
+            #    print(pos)
+            if pos < stop_pos:
+                for j in range(0, len(cigar_long)):
+                    if pos >= stop_pos:
+                        break
+                    if cigar_long[j] in ("M", "D", "N"):
+                        pos += 1
+                        #        print(pos)
+                if pos > start_pos:
+                    if mut_tags is None:
+                        mut_tags = np.array((tags[t]))
+                    else:
+                        mut_tags = np.vstack((mut_tags, np.array(tags[t])))
+
+        index = np.repeat("{}_{}".format(start_pos, stop_pos), len(mut_tags))
+        ind.append(index)
+        lst.append(mut_tags)
+
+    index = np.concatenate((ind))
+    tags = np.concatenate((lst))
+    mut_tags = np.column_stack((index, tags))
+
+    np.savetxt(outputFile, mut_tags, fmt="%s")
+    print("File saved under {} in {}!".format(outputFile, os.getcwd()))
+
+
+if __name__ == '__main__':
+    sys.exit(range2tag(sys.argv))