Mercurial > repos > gpovysil > range2tag

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/range2tag.py	Wed May 16 04:53:11 2018 -0400
@@ -0,0 +1,125 @@
+"""range2tag.py
+
+Author -- Gundula Povysil
+Contact -- povysil@bioinf.jku.at
+
+Takes a SAM file and a tab-delimited file of start and stop positions as input
+and writes the tags of all reads that overlap these regions to the output file.
+=======  ==========  =================  ================================
+Version  Date        Author             Description
+0.0.2    2018-05-15  Gundula Povysil    -
+=======  ==========  =================  ================================
+
+USAGE: python range2tag.py inputFile.sam ranges.txt outputFile.txt
+"""
+
+
+import numpy as np
+import re
+import argparse
+import sys
+import os
+
+def make_argparser():
+    parser = argparse.ArgumentParser(description='Takes a SAM file, start and stop positions as input and prints all tags of reads that overlap with regions to user specified output file.')
+    parser.add_argument('inputFile',
+                        help='SAM file with aligned reads.')
+    parser.add_argument('rangesFile',
+                        help='TXT file with start and stop positions.')
+    parser.add_argument('outputFile',
+                        help='Output TXT file with tags that are within specified regions.')
+    return parser
+
+def range2tag(argv):
+    parser = make_argparser()
+    args=parser.parse_args(argv[1:])
+
+    inputFile = args.inputFile
+    rangesFile = args.rangesFile
+    outputFile = args.outputFile
+
+    if os.path.isfile(inputFile) is False:
+        print("Error: Could not find '{}'".format(inputFile))
+        exit(0)
+
+    if os.path.isfile(rangesFile) is False:
+        print("Error: Could not find '{}'".format(rangesFile))
+        exit(0)
+
+    with open(rangesFile, 'r') as regs:
+        range_array = np.genfromtxt(regs, skip_header=0, delimiter='\t', comments='#')
+
+    start_posList = range_array[:,0].astype(int)
+    stop_posList = range_array[:,1].astype(int)
+
+    print(start_posList)
+    print(stop_posList)
+
+    if len(start_posList) == 0:
+        print("Error: start_positions is empty")
+        exit(2)
+
+    if len(stop_posList) == 0:
+        print("Error: end_positions is empty")
+        exit(3)
+
+    if len(start_posList) != len(stop_posList):
+        print("start_positions and end_positions do not have the same length")
+        exit(3)
+
+    with open(inputFile, 'r') as sam:
+        data_array = np.genfromtxt(sam, skip_header=0, delimiter='\t', usecols=range(11), comments='#', dtype='string')
+
+    tags = np.array(data_array[:, 0])
+    ref_pos = np.array(data_array[:, 3]).astype(int)
+    cigar = np.array(data_array[:, 5])
+
+    lst = []
+    ind = []
+    start_posList = np.array(start_posList).astype(int)
+    stop_posList = np.array(stop_posList).astype(int)
+
+    for start_pos, stop_pos in zip(start_posList, stop_posList):
+        start_pos = start_pos - 3
+        stop_pos = stop_pos + 3
+        mut_tags = None
+        for t in range(0, len(tags)):
+            c_split = re.split('([A-Z])', cigar[t])
+            cigar_long = None
+
+            for i in range(1, len(c_split), 2):
+                if cigar_long is None:
+                    cigar_long = np.repeat(c_split[i], c_split[i - 1])
+                else:
+                    cigar_long = np.concatenate((cigar_long, np.repeat(c_split[i], c_split[i - 1])), axis=0)
+
+            pos = ref_pos[t]
+            seq_pos = 0
+            #    print(pos)
+            if pos < stop_pos:
+                for j in range(0, len(cigar_long)):
+                    if pos >= stop_pos:
+                        break
+                    if cigar_long[j] in ("M", "D", "N"):
+                        pos += 1
+                        #        print(pos)
+                if pos > start_pos:
+                    if mut_tags is None:
+                        mut_tags = np.array((tags[t]))
+                    else:
+                        mut_tags = np.vstack((mut_tags, np.array(tags[t])))
+
+        index = np.repeat("{}_{}".format(start_pos, stop_pos), len(mut_tags))
+        ind.append(index)
+        lst.append(mut_tags)
+
+    index = np.concatenate((ind))
+    tags = np.concatenate((lst))
+    mut_tags = np.column_stack((index, tags))
+
+    np.savetxt(outputFile, mut_tags, fmt="%s")
+    print("File saved under {} in {}!".format(outputFile, os.getcwd()))
+
+
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+    sys.exit(range2tag(sys.argv))  # range2tag returns None, which sys.exit treats as success
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/range2tag.xml	Wed May 16 04:53:11 2018 -0400
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool id="range2tag" name="Duplex Sequencing Analysis: range2tag" version="0.0.1">
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+    </requirements>
+    <description>Tool that extracts tags of reads that are within user-specified regions</description>
+    <command>
+        python2 $__tool_directory__/range2tag.py "$file1" "$file2" "$output"
+    </command>
+    <inputs>
+        <param name="file1" type="data" format="sam" label="Dataset 1: SAM file with aligned reads." optional="false" help="SAM File with reads aligned to reference."/>
+        <param name="file2" type="data" format="txt" label="Dataset 2: TXT file with start and stop positions of regions." optional="false" help="Tab delimited TXT file with start and stop positions of regions."/>
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <help> <![CDATA[
+**What it does**
+
+    This tool takes a SAM file and a TXT file with start and stop positions as input and writes the tags of all reads that overlap these regions to the output file.
+
+
+**Input**
+
+    **Dataset 1:** SAM file of aligned reads.
+
+    **Dataset 2:** Tab delimited TXT file with start and stop positions.
+
+**Output**
+
+    The output is a tabular file of tags of all reads that overlap the user-specified regions with start_stop in the first column and the tag in the second column.
+
+
+**About Author**
+
+    Author: Gundula Povysil, MD, PhD
+
+    Department: Institute of Bioinformatics, Johannes Kepler University Linz, Austria
+
+    Contact: gpovysil@gmail.com
+
+        ]]>
+
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{duplex,
+            author = {Heinzl, Monika},
+            year = {2018},
+            title = {Development of algorithms for the analysis of duplex sequencing data}
+         }
+        </citation>
+    </citations>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ranges.txt	Wed May 16 04:53:11 2018 -0400
@@ -0,0 +1,6 @@
+90	633
+659	1140
+1144	1561
+1895	2395
+2396	2865
+2396	3017