Mercurial > repos > iuc > column_remove_by_header

--- a/column_remove_by_header.py	Wed Apr 12 17:17:29 2017 -0400
+++ b/column_remove_by_header.py	Sat Jul 16 06:55:49 2022 +0000
@@ -1,35 +1,64 @@
 #!/usr/bin/env python

-import subprocess
-import sys
-
-AWK_CMD = """BEGIN{FS="%s"; OFS="%s";} {print %s;}"""
+import argparse

-input_filename = sys.argv[1]
-output_filename = sys.argv[2]
-delimiter = sys.argv[3]
-keep_columns = sys.argv[4]
-strip_characters = sys.argv[5]
-
-if keep_columns == "--keep":
-    keep_columns = True
-else:
-    keep_columns = False
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-i", "--input", required=True, help="Tabular Input File Name"
+)
+parser.add_argument(
+    "-o", "--output", required=True, help="Tabular Output File"
+)
+parser.add_argument(
+    "-c", "--columns", dest="names", nargs="+",
+    help="Column headers to operate on"
+)
+parser.add_argument("-d", "--delimiter", default='\t', help="Column delimiter")
+parser.add_argument(
+    "-k", "--keep", action="store_true",
+    help="Drop non-selected columns instead of selected ones"
+)
+parser.add_argument(
+    "-s", "--strip_chars", default=None,
+    help="Ignore these leading characters when extracting the name of the "
+         "first column from the header line"
+)
+parser.add_argument(
+    "--unicode-escaped-cols", action="store_true",
+    help="Indicate that the --columns names use unicode escape sequences "
+         "that should be decoded back before comparing them to the input file "
+         "header"
+)
+args = parser.parse_args()

-names = []
-for name in sys.argv[6:]:
-    names.append( name )
+# The data rows are split on the delimiter as raw bytes, so the delimiter
+# can only be matched reliably if it comes from the ASCII range of characters
+try:
+    bytes_delimiter = args.delimiter.encode(encoding="ascii")
+except UnicodeEncodeError:
+    raise ValueError("Only ASCII characters are allowed as column delimiters")
+# handle unicode escape sequences in --columns argument
+if args.unicode_escaped_cols:
+    names = [n.encode().decode('unicode_escape') for n in args.names]
+else:
+    names = args.names

-header = None
-with open( input_filename, 'r' ) as fh:
-    header = fh.readline().strip( '\r\n' )
-header = header.split( delimiter )
-columns = []
-for i, key in enumerate( header, 1 ):
-    if i == 1 and strip_characters:
-        key = key.lstrip( strip_characters )
-    if ( keep_columns and key in names ) or ( not keep_columns and key not in names ):
-        columns.append( i )
-print( "Kept", len( columns ), "of", len( header ), "columns." )
-awk_cmd = AWK_CMD % ( delimiter, delimiter, ",".join( map( lambda x: "$%s" % x, columns ) ) )
-sys.exit( subprocess.call( [ 'gawk', awk_cmd, input_filename ], stdout=open( output_filename, 'wb+' ), shell=False ) )
+with open(args.input, "r", encoding="utf-8", errors="surrogateescape") as fh:
+    header_cols = fh.readline().strip("\n").split(args.delimiter)
+columns = set()
+for i, key in enumerate(header_cols):
+    if i == 0 and args.strip_chars:
+        key = key.lstrip(args.strip_chars)
+    if (args.keep and key in names) or (not args.keep and key not in names):
+        columns.add(i)
+print("Kept", len(columns), "of", len(header_cols), "columns.")
+
+with open(args.input, "rb") as i:
+    with open(args.output, "wb") as o:
+        for line in i:
+            fields = [
+                f for idx, f in enumerate(
+                    line.rstrip(b"\r\n").split(bytes_delimiter)
+                ) if idx in columns
+            ]
+            o.write(bytes_delimiter.join(fields) + b"\n")
--- a/column_remove_by_header.xml	Wed Apr 12 17:17:29 2017 -0400
+++ b/column_remove_by_header.xml	Sat Jul 16 06:55:49 2022 +0000
@@ -1,13 +1,13 @@
-<tool id="column_remove_by_header" name="Remove columns" version="0.0.1">
+<tool id="column_remove_by_header" name="Remove columns" version="1.0">
     <description>
         by heading
     </description>
     <requirements>
-        <requirement type="package" version="3.6.1">python</requirement>
-        <requirement type="package" version="4.1.3">gawk</requirement>
+        <requirement type="package" version="3.10.4">python</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-        python '$__tool_directory__/column_remove_by_header.py' '${input_tabular}' '${output_tabular}' '${input_tabular.unsanitized.metadata.delimiter}' '${keep_columns}' '${strip_characters}'
+        python '$__tool_directory__/column_remove_by_header.py' -i '${input_tabular}' -o '${output_tabular}' -d '${input_tabular.unsanitized.metadata.delimiter}' ${keep_columns} -s '${strip_characters}' --unicode-escaped-cols
+        --columns
         #for $header in $headers:
             '${header.name}'
         #end for
@@ -15,26 +15,26 @@
     </command>
     <inputs>
         <param name="input_tabular" type="data" format="tabular" multiple="False" optional="False" label="Tabular file"/>
-        <repeat name="headers" title="Header" min="1" default="1">
+        <repeat name="headers" title="Select Columns" min="1" default="1">
             <param name="name" type="text" optional="False" label="Header name">
                 <sanitizer>
                     <valid initial="string.printable">
                         <remove value="&apos;"/>
                     </valid>
                     <mapping initial="none">
-                        <add source="&apos;" target=""/>
+                        <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;"/>
                     </mapping>
                 </sanitizer>
             </param>
         </repeat>
-        <param label="Keep named columns" name="keep_columns" type="boolean" truevalue="--keep" falsevalue="" checked="False"/>
+        <param name="keep_columns" type="boolean" truevalue="--keep" falsevalue="" checked="False" label="Keep named columns" help="When enabled, invert the tool's action: keep the columns selected above and drop all columns that are NOT selected." />
         <param name="strip_characters" type="text" optional="False" label="Characters to strip when doing name comparison in first column" value="#" help="Removes characters from the left of the first column only.">
             <sanitizer>
                 <valid initial="string.printable">
                     <remove value="&apos;"/>
                 </valid>
                 <mapping initial="none">
-                    <add source="&apos;" target=""/>
+                    <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;"/>
                 </mapping>
             </sanitizer>
         </param>
@@ -46,7 +46,8 @@
         <test>
             <param name="input_tabular" value="in_1.tabular" ftype="tabular"/>
             <param name="name" value="a"/>
-            <param name="keep_columns" value=""/>
+            <param name="name" value="\xf6"/>
+            <param name="keep_columns" value="false"/>
             <param name="strip_characters" value="#"/>
             <output name="output_tabular" file="out_1.tabular" ftype="tabular"/>
         </test>
@@ -54,16 +55,18 @@
             <param name="input_tabular" value="in_1.tabular" ftype="tabular"/>
             <param name="name" value="a"/>
             <param name="name" value="KEY"/>
-            <param name="keep_columns" value="--keep"/>
+            <param name="keep_columns" value="true"/>
             <param name="strip_characters" value="#"/>
             <output name="output_tabular" file="out_2.tabular" ftype="tabular"/>
         </test>
     </tests>
-    <help>
-        <![CDATA[
-        Removes or keeps columns based upon user provided values.
-        ]]>
-    </help>
+    <help><![CDATA[
+Removes or keeps columns based upon user provided values.
+
+Hint: If any of the column names you would like to specify contain special
+(non-ASCII) characters, you can enter them using their Unicode escape
+sequences, e.g. \xf6 for ö.
+    ]]></help>
     <citations>
     </citations>
-</tool>
\ No newline at end of file
+</tool>
--- a/test-data/in_1.tabular	Wed Apr 12 17:17:29 2017 -0400
+++ b/test-data/in_1.tabular	Sat Jul 16 06:55:49 2022 +0000
@@ -1,4 +1,4 @@
-#KEY	b	c	a
-one	1-1	1-2	1-3
-two	1-4	1-5	1-6
-three	1-7	1-8	1-9
+#KEY	b	c	a	ö
+one	1-1	1-2	1-3	ä-ö
+two	1-4	1-5	1-6	µ-î
+three	1-7	1-8	1-9	ß-é