Mercurial > repos > bgruening > column_arrange_by_header

--- a/columnArrange.xml	Fri Oct 16 14:31:13 2015 -0400
+++ b/columnArrange.xml	Fri Feb 15 07:45:03 2019 -0500
@@ -1,17 +1,15 @@
-<tool id="bg_column_arrange_by_header" name="Column arrange" version="0.1">
+<tool id="bg_column_arrange_by_header" name="Column arrange" version="0.2">
     <description>by header name</description>
-    <requirements>
-        <requirement type="package" version="0.14.1">pandas</requirement>
-    </requirements>
     <command interpreter="python">
     <![CDATA[
         column_arrange.py
           --columns
         #for token in $rep_param_columns:
-            "$token.param_column"
+            '$token.param_column'
         #end for
-        --in $param_input
-        --out $output
+        --in '$param_input'
+        --out '$output'
+        $discard_remaining
     ]]>
     </command>
     <inputs>
@@ -22,9 +20,15 @@
                     <valid initial="string.printable">
                         <remove value="&apos;"/>
                     </valid>
+                    <mapping initial="none">
+                        <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;" />
+                    </mapping>
                 </sanitizer>
             </param>
         </repeat>
+        <param name="discard_remaining" type="boolean" truevalue="--discard" falsevalue="" checked="false"
+        label="Discard unspecified columns?"
+        help="Columns not explicitly specified above for rearrangement can be appended after the last specified column in their original order (the default) or be discarded from the output." />
     </inputs>
     <outputs>
         <data format="tabular" name="output" />
@@ -38,7 +42,36 @@
             <repeat name="rep_param_columns">
                 <param name="param_column" value="age"/>
             </repeat>
-            <output name="out" file="columnarrange_result1.tab"/>
+            <output name="output" file="columnarrange_result1.tab"/>
+        </test>
+        <test>
+            <param name="param_input" value="columnarrange_input1.tab"/>
+            <repeat name="rep_param_columns">
+                <param name="param_column" value="fname"/>
+            </repeat>
+            <param name="discard_remaining" value="true" />
+            <output name="output">
+                <assert_contents>
+                    <has_n_columns n="1" />
+                    <has_line line="fname" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="param_input" value="columnarrange_input1.tab"/>
+            <repeat name="rep_param_columns">
+                <param name="param_column" value="fname"/>
+            </repeat>
+            <repeat name="rep_param_columns">
+                <param name="param_column" value="age"/>
+            </repeat>
+            <param name="discard_remaining" value="true" />
+            <output name="output">
+                <assert_contents>
+                    <has_n_columns n="2" />
+                    <has_line line="fname&#009;age" />
+                </assert_contents>
+            </output>
         </test>
         <test>
             <param name="param_input" value="columnarrange_input2.tab"/>
@@ -48,14 +81,16 @@
             <repeat name="rep_param_columns">
                 <param name="param_column" value="nationality"/>
             </repeat>
-            <output name="out" file="columnarrange_result2.tab"/>
+            <output name="output" file="columnarrange_result2.tab"/>
         </test>
     </tests>
     <help>
 **What it does**

-With this tool you can specify (by naming the header) which columns need to be leftmost.
-The columns which are not specified will be ordered as before, right of the columns which were specified.
+With this tool you can specify - by name - the order of columns for tabular
+data.
+Columns not specified will remain ordered as before and be moved to the right
+of the specified columns, as shown in the following example.

 Input file::

@@ -63,12 +98,15 @@
     a       b       c       d
     a       b       c       d

-Specifying **CHeader** and **BHeader**, as the columns that should be leftmost, generates::
+Specifying **CHeader** and **BHeader** as the columns that should be leftmost
+generates::

     CHeader BHeader AHeader DHeader
     c       b       a       d
     c       b       a       d

+Alternatively, you can choose to retain *only* the specified columns in their
+new arrangement and discard all other columns.
     </help>
     <citations>
         <citation type="bibtex">
--- a/column_arrange.py	Fri Oct 16 14:31:13 2015 -0400
+++ b/column_arrange.py	Fri Feb 15 07:45:03 2019 -0500
@@ -1,18 +1,45 @@
-#!/usr/bin/env python
-import pandas as pd
+#!/usr/bin/env python
+# Rearrange the columns of a tab-separated file by header name: the columns
+# named with --columns come first, in the order given; the remaining columns
+# follow in their original order unless --discard is set.
+
 import argparse

 parser = argparse.ArgumentParser()
 parser.add_argument('-i', '--input', help='Tabular Input File Name')
 parser.add_argument('-o','--output', help='Tabular Output File')
-parser.add_argument('-c', '--columns', nargs='+', help='Column Headers to Sort By')
+parser.add_argument(
+    '-c', '--columns', nargs='+', help='Column Headers to Sort By'
+)
+parser.add_argument(
+    '-d', '--discard', action='store_true',
+    help='Discard remaining columns'
+)
+
 args=parser.parse_args()

-cols = args.columns
-table = pd.read_csv(args.input, sep='\t')
-blist = list(table.columns)
-for token in cols:
-    blist.remove(token)
-sorted_table = table[args.columns + blist]
-# write without index, seperated by tabs
-sorted_table.to_csv(args.output, sep='\t', index=False)
+with open(args.input) as data:
+    # The first line is the header; an empty file has nothing to arrange.
+    hdr = next(data, None)
+    if hdr is None:
+        raise SystemExit('Input file is empty: no header line found')
+    # Strip '\r' too, so a CRLF file cannot leave a stray '\r' glued to the
+    # last column name.
+    columns = hdr.rstrip('\r\n').split('\t')
+    missing = [name for name in args.columns if name not in columns]
+    if missing:
+        raise SystemExit(
+            'Column(s) not found in header: ' + ', '.join(missing)
+        )
+    idx = [columns.index(name) for name in args.columns]
+    if not args.discard:
+        # Keep the unspecified columns, in their original order, to the right.
+        chosen = set(idx)
+        idx += [i for i in range(len(columns)) if i not in chosen]
+    with open(args.output, 'w') as out:
+        out.write('\t'.join(columns[i] for i in idx) + '\n')
+        for line in data:
+            fields = line.rstrip('\r\n').split('\t')
+            # Pad short rows with empty fields instead of raising IndexError.
+            fields += [''] * (len(columns) - len(fields))
+            out.write('\t'.join(fields[i] for i in idx) + '\n')
Binary file column_arrange_by_header.tar.gz has changed
--- a/tool_dependencies.xml	Fri Oct 16 14:31:13 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="pandas" version="0.14.1">
-        <repository changeset_revision="ac9f317487a9" name="package_pandas_0_14" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>