Repository 'resize_coordinate_window'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/resize_coordinate_window

Changeset 1:0164d2edba9f (2016-02-16)
Previous changeset 0:08b6255afde7 (2016-01-19) Next changeset 2:541f300f322d (2016-11-14)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/resize_coordinate_window commit 7aa2429d3f53a14be7e44dc6021ed3e11dc2f080
modified:
resize_coordinate_window.py
resize_coordinate_window.xml
added:
test-data/output_discard.gff
b
diff -r 08b6255afde7 -r 0164d2edba9f resize_coordinate_window.py
--- a/resize_coordinate_window.py Tue Jan 19 09:34:56 2016 -0500
+++ b/resize_coordinate_window.py Tue Feb 16 04:05:23 2016 -0500
[
@@ -1,41 +1,88 @@
 import argparse
+import fileinput
 import sys
 
+# Maximum value of a signed 32 bit integer (2**31 - 1).
+MAX_CHROM_LEN = 2147483647
 
-def stop_err( msg ):
-    sys.stderr.write( msg )
+
+def stop_err(msg):
+    sys.stderr.write(msg)
     sys.exit(1)
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--input', dest='input', help="Input dataset")
+parser.add_argument('--start_coordinate', dest='start_coordinate', type=int, help='Chromosome start coordinate, either 0 or 1.')
 parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.')
 parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.')
-parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end rather or from computed midpoint.')
+parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end instead of from computed midpoint.')
+parser.add_argument('--chrom_len_file', dest='chrom_len_file', help="File names of .len files for chromosome lengths")
+parser.add_argument('--region_boundaries', dest='region_boundaries', help="Option for handling region boundaries")
 parser.add_argument('--output', dest='output', help="Output dataset")
 args = parser.parse_args()
 
 extend_existing = args.extend_existing == 'existing'
 out = open(args.output, 'wb')
 
-for line in open(args.input):
-    if line.startswith('#'):
-        continue
-    items = line.split('\t')
-    if len(items) != 9:
-        continue
-    start = int(items[3])
-    end = int(items[4])
-    if extend_existing:
-        start -= args.subtract_from_start
-        end += args.add_to_end
-    else:
-        midpoint = (start + end) // 2
-        start = midpoint - args.subtract_from_start
-        end = midpoint + args.add_to_end
-    if start < 1:
-        out.close()
-        stop_err('Requested expansion places region beyond chromosome bounds.')
-    new_line = '\t'.join([items[0], items[1], items[2], str(start), str(end), items[5], items[6], items[7], items[8]])
-    out.write(new_line)
+chrom_start = int(args.start_coordinate)
+chrom_lens = dict()
+# Determine the length of each chromosome and add it to the chrom_lens dictionary.
+len_file_missing = False
+len_file_error = None
+len_file = fileinput.FileInput(args.chrom_len_file)
+try:
+    for line in len_file:
+        fields = line.split("\t")
+        chrom_lens[fields[0]] = int(fields[1])
+except Exception, e:
+    len_file_error = str(e)
+
+with open(args.input) as fhi:
+    for line in fhi:
+        if line.startswith('#'):
+            # Skip comments.
+            continue
+        items = line.split('\t')
+        if len(items) != 9:
+            # Skip invalid gff data.
+            continue
+        chrom = items[0]
+        start = int(items[3])
+        end = int(items[4])
+        if extend_existing:
+            new_start = start - args.subtract_from_start
+            new_end = end + args.add_to_end
+        else:
+            midpoint = (start + end) // 2
+            new_start = midpoint - args.subtract_from_start
+            new_end = midpoint + args.add_to_end
+        # Check start boundary.
+        if new_start < chrom_start:
+            if args.region_boundaries == 'discard':
+                continue
+            elif args.region_boundaries == 'limit':
+                new_start = chrom_start
+            elif args.region_boundaries == 'error':
+                out.close()
+                stop_err('Requested expansion places region beyond chromosome start boundary of %d.' % chrom_start)
+        # Check end boundary.
+        chrom_len = chrom_lens.get(chrom, None)
+        if chrom_len is None:
+            len_file_missing = True
+            chrom_len = MAX_CHROM_LEN
+        if new_end > chrom_len:
+            if args.region_boundaries == 'discard':
+                continue
+            elif args.region_boundaries == 'limit':
+                new_end = chrom_len
+            elif args.region_boundaries == 'error':
+                out.close()
+                stop_err('Requested expansion places region beyond chromosome end boundary of %d.' % chrom_len)
+        new_line = '\t'.join([chrom, items[1], items[2], str(new_start), str(new_end), items[5], items[6], items[7], items[8]])
+        out.write(new_line)
 out.close()
 
+if len_file_error is not None:
+    print "All chrom lengths set to %d, error in chrom len file: %s" % (MAX_CHROM_LEN, len_file_error)
+if len_file_missing:
+    print "All chrom lengths set to %d, chrom len files are not installed." % MAX_CHROM_LEN
b
diff -r 08b6255afde7 -r 0164d2edba9f resize_coordinate_window.xml
--- a/resize_coordinate_window.xml Tue Jan 19 09:34:56 2016 -0500
+++ b/resize_coordinate_window.xml Tue Feb 16 04:05:23 2016 -0500
b
@@ -1,21 +1,34 @@
-<tool id="resize_coordinate_window" name="Resize coordinate window" version="1.0.0">
+<tool id="resize_coordinate_window" name="Resize coordinate window" version="1.0.1">
     <description>of GFF data</description>
     <command>
         python $__tool_directory__/resize_coordinate_window.py
         --input "$input"
+        --start_coordinate $start_coordinate
         --subtract_from_start $subtract_from_start
         --add_to_end $add_to_end
         --extend_existing $extend_existing
+        --chrom_len_file ${chromInfo}
+        --region_boundaries $region_boundaries
         --output "$output"
     </command>
     <inputs>
         <param name="input" type="data" format="gff" label="Gff file" />
+        <param name="start_coordinate" type="select" label="Start coordinate" help="Input data is 0-based or 1-based">
+            <option value="0" selected="True">0</option>
+            <option value="1">1</option>
+        </param>
         <param name="subtract_from_start" type="integer" value="30" min="0" label="Distance to subtract from the start coordinate"/>
         <param name="add_to_end" type="integer" value="30" min="0" label="Distance to add to the end coordinate"/>
         <param name="extend_existing" type="select" label="Resize window from" help="The midpoint is computed as (start + end) // 2">
             <option value="midpoint" selected="True">the midpoint of the start and end coordinates</option>
             <option value="existing">the start and end coordinates</option>
         </param>
+        <param name="region_boundaries" type="select" label="Handle chromosome boundaries by" help="Expanding the region may result in crossing chromosome start and end coordinate boundaries.">
+            <option value="discard" selected="True">discarding the region</option>
+            <option value="limit">keeping the region by limiting the expansion to not cross the start or end coordinate boundary</option>
+            <option value="nothing">keeping the region by allowing the expansion to cross the start or end coordinate boundary</option>
+            <option value="error">outputting an error</option>
+        </param>
     </inputs>
     <outputs>
         <data name="output" format="gff" />
@@ -23,20 +36,43 @@
     <tests>
         <test>
             <param name="input" value="input.gff" ftype="gff" />
+            <param name="start_coordinate" value="1" />
             <param name="subtract_from_start" value="13" />
             <param name="add_to_end" value="13" />
             <param name="extend_existing" value="midpoint" />
+            <param name="region_boundaries" value="error" />
             <output name="output" file="output.gff" ftype="gff" />
         </test>
+        <test>
+            <param name="input" value="input.gff" ftype="gff" />
+            <param name="start_coordinate" value="0" />
+            <param name="subtract_from_start" value="80" />
+            <param name="add_to_end" value="80" />
+            <param name="extend_existing" value="midpoint" />
+            <param name="region_boundaries" value="discard" />
+            <output name="output" file="output_discard.gff" ftype="gff" />
+        </test>
     </tests>
     <help>
 
 **What it does**
 
 Modifies the start and end coordinates of GFF data such that the new start and end positions are based on a
-specified window size that is computed either from the existing start and end coordinates or centered on
+specified region size that is computed either from the existing start and end coordinates or centered on
 the midpoint between them.
 
+Region expansion may result in the new start or end coordinates crossing the chromosome boundary.  The
+chromosome start is set to 0 or 1 using the **Start coordinate** parameter.  The end is retrieved from a
+file within the Galaxy environment that includes the length of chromosomes for all genome builds.  If these
+files are missing, the end coordinate is set to 2147483647, which is the maximum value of a signed 32 bit
+integer.  The **Handle chromosome boundaries by** parameter determines what happens when expanding a region
+crosses a chromosome boundary, using one of the following options.
+
+* **discarding the region** - the region will be discarded and processing will continue with the next line in the dataset.
+* **keeping the region by limiting the expansion to not cross the start or end coordinate boundary** - expansion will be restricted to not cross the chromosome's start or end coordinates for the current region.
+* **keeping the region by allowing the expansion to cross the start or end coordinate boundary** - the defined expansion is applied unchanged; crossing the start boundary can result in a negative start value.
+* **outputting an error** - processing will stop and an error will be displayed.
+
 -----
 
 **Example**
@@ -47,7 +83,7 @@
     chr1    genetrack       .       31      51      245     -       .       stddev=2.66582799529
     chr1    genetrack       .       40      60      2060    +       .       stddev=2.7859667372
 
-Resizing the coordinate window by 13 from the computed midpoint of the start and end coordinates produces::
+Setting start coordinate to 1 and resizing the coordinate window by 13 from the computed midpoint of the start and end coordinates produces::
 
     chr1    genetrack       .       14      40      918     +       .       stddev=5.96715849116
     chr1    genetrack       .       28      54      245     -       .       stddev=2.66582799529
@@ -57,7 +93,7 @@
     <citations>
         <citation type="bibtex">
             @unpublished{None,
-            author = {},
+            author = {Greg Von Kuster},
             title = {None},
             year = {None},
             eprint = {None},
b
diff -r 08b6255afde7 -r 0164d2edba9f test-data/output_discard.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_discard.gff Tue Feb 16 04:05:23 2016 -0500
b
@@ -0,0 +1,96 @@
+chr1 genetrack . 3 163 397 + . stddev=0.0
+chr1 genetrack . 19 179 521 + . stddev=0.747112137937
+chr1 genetrack . 53 213 5129 + . stddev=3.01025384354
+chr1 genetrack . 55 215 4659 - . stddev=3.8642622228
+chr1 genetrack . 85 245 897 - . stddev=3.22709952671
+chr1 genetrack . 101 261 956 - . stddev=4.95899971687
+chr1 genetrack . 110 270 1527 + . stddev=4.62574275346
+chr1 genetrack . 115 275 494 - . stddev=1.4255957
+chr1 genetrack . 122 282 2538 + . stddev=5.04731591122
+chr1 genetrack . 136 296 2087 - . stddev=3.6160253713
+chr1 genetrack . 168 328 2496 + . stddev=2.11105291581
+chr1 genetrack . 172 332 5047 - . stddev=3.62629343395
+chr1 genetrack . 184 344 1525 + . stddev=4.46082441647
+chr1 genetrack . 211 371 15 + . stddev=1.74610678049
+chr1 genetrack . 232 392 626 - . stddev=0.0
+chr1 genetrack . 238 398 1544 + . stddev=4.43066151722
+chr1 genetrack . 264 424 533 + . stddev=1.34355443899
+chr1 genetrack . 274 434 726 - . stddev=1.36767079956
+chr1 genetrack . 277 437 286 + . stddev=0.0
+chr1 genetrack . 288 448 792 - . stddev=1.47737416556
+chr1 genetrack . 304 464 608 + . stddev=1.44652711793
+chr1 genetrack . 319 479 126 - . stddev=0.471404520791
+chr1 genetrack . 369 529 618 - . stddev=5.47536569145
+chr1 genetrack . 371 531 1393 + . stddev=4.75587332865
+chr1 genetrack . 391 551 754 - . stddev=3.28891288785
+chr1 genetrack . 413 573 58 + . stddev=0.0
+chr1 genetrack . 468 628 1015 - . stddev=0.0
+chr1 genetrack . 658 818 39 - . stddev=0.0
+chr1 genetrack . 687 847 23 + . stddev=0.0
+chr1 genetrack . 729 889 607 + . stddev=0.0
+chr1 genetrack . 774 934 665 + . stddev=0.0
+chr1 genetrack . 807 967 468 + . stddev=0.0
+chr1 genetrack . 833 993 107 - . stddev=0.0
+chr1 genetrack . 874 1034 2 - . stddev=0.0
+chr1 genetrack . 1022 1182 740 + . stddev=0.0
+chr1 genetrack . 1057 1217 940 - . stddev=3.96036497305
+chr1 genetrack . 1113 1273 25 + . stddev=0.0
+chr1 genetrack . 1221 1381 454 - . stddev=0.0
+chr1 genetrack . 1259 1419 207 - . stddev=0.0
+chr1 genetrack . 1414 1574 584 + . stddev=0.0
+chr1 genetrack . 2005 2165 1181 + . stddev=0.0
+chr1 genetrack . 2032 2192 481 + . stddev=0.0455486534308
+chr1 genetrack . 2055 2215 199 - . stddev=0.0
+chr1 genetrack . 2382 2542 1246 + . stddev=0.0
+chr1 genetrack . 2532 2692 34 + . stddev=0.0
+chr1 genetrack . 2763 2923 1062 + . stddev=1.01561431542
+chr1 genetrack . 2768 2928 1144 - . stddev=1.09438744148
+chr1 genetrack . 2941 3101 1212 - . stddev=0.0
+chr1 genetrack . 3046 3206 555 - . stddev=0.0
+chr1 genetrack . 3060 3220 17 + . stddev=0.0
+chr1 genetrack . 3308 3468 525 - . stddev=0.0
+chr1 genetrack . 3599 3759 845 + . stddev=0.0
+chr1 genetrack . 3715 3875 23 - . stddev=0.0
+chr1 genetrack . 3777 3937 316 - . stddev=0.0
+chr1 genetrack . 3798 3958 491 + . stddev=0.0
+chr1 genetrack . 4027 4187 536 - . stddev=0.0
+chr1 genetrack . 4256 4416 482 + . stddev=0.0
+chr1 genetrack . 4325 4485 3 + . stddev=0.0
+chr1 genetrack . 4391 4551 1110 + . stddev=0.0
+chr1 genetrack . 4430 4590 125 - . stddev=0.0
+chr1 genetrack . 4550 4710 147 + . stddev=0.0
+chr1 genetrack . 4756 4916 1761 + . stddev=4.82408982772
+chr1 genetrack . 4832 4992 710 + . stddev=0.0
+chr1 genetrack . 5040 5200 828 + . stddev=0.0
+chr1 genetrack . 5332 5492 282 - . stddev=0.0
+chr1 genetrack . 5431 5591 75 + . stddev=0.0
+chr1 genetrack . 5637 5797 2 + . stddev=0.0
+chr1 genetrack . 5647 5807 737 - . stddev=0.36608362591
+chr1 genetrack . 6016 6176 646 + . stddev=0.039314009595
+chr1 genetrack . 6028 6188 230 - . stddev=0.0657945476105
+chr1 genetrack . 6117 6277 329 - . stddev=0.0
+chr1 genetrack . 6220 6380 5 + . stddev=0.0
+chr1 genetrack . 6286 6446 285 + . stddev=0.0
+chr1 genetrack . 6310 6470 34 - . stddev=0.0
+chr1 genetrack . 6331 6491 1587 + . stddev=5.61831543503
+chr1 genetrack . 6345 6505 953 - . stddev=3.52372902021
+chr1 genetrack . 6362 6522 742 + . stddev=0.0
+chr1 genetrack . 6426 6586 691 + . stddev=0.0
+chr1 genetrack . 6436 6596 61 - . stddev=1.5137105198
+chr1 genetrack . 6773 6933 28 + . stddev=0.0
+chr1 genetrack . 6988 7148 518 - . stddev=0.0
+chr1 genetrack . 7054 7214 654 + . stddev=0.0
+chr1 genetrack . 7695 7855 714 + . stddev=0.0
+chr1 genetrack . 7777 7937 3 + . stddev=0.0
+chr1 genetrack . 8139 8299 17 + . stddev=0.0
+chr1 genetrack . 8202 8362 2 - . stddev=0.0
+chr1 genetrack . 8389 8549 10 + . stddev=0.0
+chr1 genetrack . 8401 8561 5 - . stddev=0.0
+chr1 genetrack . 8645 8805 5 + . stddev=0.0
+chr1 genetrack . 8764 8924 332 + . stddev=0.0
+chr1 genetrack . 8769 8929 593 - . stddev=0.0
+chr1 genetrack . 8964 9124 24 + . stddev=0.0
+chr1 genetrack . 8988 9148 4 + . stddev=0.0
+chr1 genetrack . 9415 9575 36 + . stddev=0.0
+chr1 genetrack . 9640 9800 480 + . stddev=0.0
+chr1 genetrack . 9853 10013 606 - . stddev=0.0