Mercurial > repos > iuc > column_remove_by_header
changeset 1:2040e4c2750a draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 6a87239f5139390963930673f36b869cde76fdf9
author | iuc |
---|---|
date | Sat, 16 Jul 2022 06:55:49 +0000 |
parents | 372967836e98 |
children | |
files | column_remove_by_header.py column_remove_by_header.xml test-data/in_1.tabular |
diffstat | 3 files changed, 82 insertions(+), 50 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/column_remove_by_header.py Wed Apr 12 17:17:29 2017 -0400
+++ b/column_remove_by_header.py Sat Jul 16 06:55:49 2022 +0000
@@ -1,35 +1,64 @@
 #!/usr/bin/env python
 
-import subprocess
-import sys
-
-AWK_CMD = """BEGIN{FS="%s"; OFS="%s";} {print %s;}"""
+import argparse
 
-input_filename = sys.argv[1]
-output_filename = sys.argv[2]
-delimiter = sys.argv[3]
-keep_columns = sys.argv[4]
-strip_characters = sys.argv[5]
-
-if keep_columns == "--keep":
-    keep_columns = True
-else:
-    keep_columns = False
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-i", "--input", required=True, help="Tabular Input File Name"
+)
+parser.add_argument(
+    "-o", "--output", required=True, help="Tabular Output File"
+)
+parser.add_argument(
+    "-c", "--columns", dest="names", nargs="+",
+    help="Column headers to operate on"
+)
+parser.add_argument("-d", "--delimiter", default='\t', help="Column delimiter")
+parser.add_argument(
+    "-k", "--keep", action="store_true",
+    help="Drop non-selected columns instead of selected ones"
+)
+parser.add_argument(
+    "-s", "--strip_chars", default=None,
+    help="Ignore these leading characters when extracting the name of the "
+         "first line"
+)
+parser.add_argument(
+    "--unicode-escaped-cols", action="store_true",
+    help="Indicate that the --columns names use unicode escape sequences "
+         "that should be decoded back before comparing them to the input file "
+         "header"
+)
+args = parser.parse_args()
 
-names = []
-for name in sys.argv[6:]:
-    names.append( name )
+# The delimiter can only be parsed reliably from the input if it's from
+# the ASCII range of characters
+try:
+    bytes_delimiter = args.delimiter.encode(encoding="ascii")
+except UnicodeEncodeError:
+    raise ValueError("Only ASCII characters are allowed as column delimiters")
+# handle unicode escape sequences in --columns argument
+if args.unicode_escaped_cols:
+    names = [n.encode().decode('unicode_escape') for n in args.names]
+else:
+    names = args.names
 
-header = None
-with open( input_filename, 'r' ) as fh:
-    header = fh.readline().strip( '\r\n' )
-header = header.split( delimiter )
-columns = []
-for i, key in enumerate( header, 1 ):
-    if i == 1 and strip_characters:
-        key = key.lstrip( strip_characters )
-    if ( keep_columns and key in names ) or ( not keep_columns and key not in names ):
-        columns.append( i )
-print( "Kept", len( columns ), "of", len( header ), "columns." )
-awk_cmd = AWK_CMD % ( delimiter, delimiter, ",".join( map( lambda x: "$%s" % x, columns ) ) )
-sys.exit( subprocess.call( [ 'gawk', awk_cmd, input_filename ], stdout=open( output_filename, 'wb+' ), shell=False ) )
+with open(args.input, "r", encoding="utf-8", errors="surrogateescape") as fh:
+    header_cols = fh.readline().strip("\n").split(args.delimiter)
+columns = set()
+for i, key in enumerate(header_cols):
+    if i == 0 and args.strip_chars:
+        key = key.lstrip(args.strip_chars)
+    if (args.keep and key in names) or (not args.keep and key not in names):
+        columns.add(i)
+print("Kept", len(columns), "of", len(header_cols), "columns.")
+
+with open(args.input, "rb") as i:
+    with open(args.output, "wb") as o:
+        for line in i:
+            fields = [
+                f for idx, f in enumerate(
+                    line.rstrip(b"\r\n").split(bytes_delimiter)
+                ) if idx in columns
+            ]
+            o.write(bytes_delimiter.join(fields) + b"\n")
```
```diff
--- a/column_remove_by_header.xml Wed Apr 12 17:17:29 2017 -0400
+++ b/column_remove_by_header.xml Sat Jul 16 06:55:49 2022 +0000
@@ -1,13 +1,13 @@
-<tool id="column_remove_by_header" name="Remove columns" version="0.0.1">
+<tool id="column_remove_by_header" name="Remove columns" version="1.0">
     <description>
         by heading
     </description>
     <requirements>
-        <requirement type="package" version="3.6.1">python</requirement>
-        <requirement type="package" version="4.1.3">gawk</requirement>
+        <requirement type="package" version="3.10.4">python</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-        python '$__tool_directory__/column_remove_by_header.py' '${input_tabular}' '${output_tabular}' '${input_tabular.unsanitized.metadata.delimiter}' '${keep_columns}' '${strip_characters}'
+        python '$__tool_directory__/column_remove_by_header.py' -i '${input_tabular}' -o '${output_tabular}' -d '${input_tabular.unsanitized.metadata.delimiter}' ${keep_columns} -s '${strip_characters}' --unicode-escaped-cols
+        --columns
         #for $header in $headers:
             '${header.name}'
         #end for
@@ -15,26 +15,26 @@
     </command>
     <inputs>
         <param name="input_tabular" type="data" format="tabular" multiple="False" optional="False" label="Tabular file"/>
-        <repeat name="headers" title="Header" min="1" default="1">
+        <repeat name="headers" title="Select Columns" min="1" default="1">
             <param name="name" type="text" optional="False" label="Header name">
                 <sanitizer>
                     <valid initial="string.printable">
                         <remove value="'"/>
                     </valid>
                     <mapping initial="none">
-                        <add source="'" target=""/>
+                        <add source="'" target="'"'"'"/>
                     </mapping>
                 </sanitizer>
             </param>
         </repeat>
-        <param label="Keep named columns" name="keep_columns" type="boolean" truevalue="--keep" falsevalue="" checked="False"/>
+        <param name="keep_columns" type="boolean" truevalue="--keep" falsevalue="" checked="False" label="Keep named columns" help="When enabled revert the tool's action and drop all columns that are NOT selected above." />
         <param name="strip_characters" type="text" optional="False" label="Characters to strip when doing name comparison in first column" value="#" help="Removes characters from the left of the first column only.">
             <sanitizer>
                 <valid initial="string.printable">
                     <remove value="'"/>
                 </valid>
                 <mapping initial="none">
-                    <add source="'" target=""/>
+                    <add source="'" target="'"'"'"/>
                 </mapping>
             </sanitizer>
         </param>
@@ -46,7 +46,8 @@
         <test>
             <param name="input_tabular" value="in_1.tabular" ftype="tabular"/>
             <param name="name" value="a"/>
-            <param name="keep_columns" value=""/>
+            <param name="name" value="\xf6"/>
+            <param name="keep_columns" value="false"/>
             <param name="strip_characters" value="#"/>
             <output name="output_tabular" file="out_1.tabular" ftype="tabular"/>
         </test>
@@ -54,16 +55,18 @@
             <param name="input_tabular" value="in_1.tabular" ftype="tabular"/>
             <param name="name" value="a"/>
             <param name="name" value="KEY"/>
-            <param name="keep_columns" value="--keep"/>
+            <param name="keep_columns" value="true"/>
             <param name="strip_characters" value="#"/>
             <output name="output_tabular" file="out_2.tabular" ftype="tabular"/>
         </test>
     </tests>
-    <help>
-        <![CDATA[
-        Removes or keeps columns based upon user provided values.
-        ]]>
-    </help>
+    <help><![CDATA[
+Removes or keeps columns based upon user provided values.
+
+Hint: If any of the column names you would like to specify contains special
+(non-ASCII) characters, you can specify these using their Unicode escape
+sequences.
+    ]]></help>
     <citations>
     </citations>
-</tool>
\ No newline at end of file
+</tool>
```