Mercurial > repos > blankenberg > column_regex_substitution
changeset 0:12b740c4cbc1 draft default tip
planemo upload for repository https://github.com/blankenberg/tools-blankenberg/tree/master/tools/column_regex_substitution commit 78936dc6be1747303d4cbfd80d09e4cfd1cbf292
author | blankenberg |
---|---|
date | Fri, 07 Sep 2018 10:29:30 -0400 |
parents | |
children | |
files | column_regex_substitution.py column_regex_substitution.xml test-data/column_regex_substitution_in.tabular test-data/column_regex_substitution_out.tabular |
diffstat | 4 files changed, 170 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# Dan Blankenberg

"""
A script for using regex substitutions on columns.

Every input line is split into fields on an optional delimiter, the regular
expression substitution is applied to the selected fields (all fields when no
columns are given) and the re-joined line is written to the output file.
"""

import optparse
import re
import sys
import string

VERSION = "0.0.1"

# Characters trimmed from both ends of each requested column token: every
# printable character except the digits and the comma separator.
COLUMN_STRIP_VALUES = "".join(set(string.printable) - set(string.digits) - set(','))


def get_provided_columns(provided_value, column_offset):
    """Parse a comma separated column list into sorted integer indices.

    provided_value -- string such as "1,3,4"; non-digit characters around each
                      number are stripped before conversion.
    column_offset  -- integer added to every parsed column number.

    Returns the sorted list of shifted column numbers, or None when nothing
    was provided or the value cannot be parsed.
    """
    if not provided_value:
        return None
    try:
        columns = sorted(
            int(token.strip(COLUMN_STRIP_VALUES)) + column_offset
            for token in provided_value.split(',')
        )
    except (AttributeError, TypeError, ValueError):
        # Unparseable input keeps meaning "no explicit columns", but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return None
    return columns or None


def build_option_parser():
    """Return the optparse parser describing the command line interface."""
    parser = optparse.OptionParser()
    parser.add_option('--pattern', action='store', default=None,
                      help='pattern string')
    parser.add_option('--replacement', action='store', default=None,
                      help='replacement string')
    parser.add_option('--input', action='store', default=None,
                      help='Filename of input file')
    parser.add_option('--output', action='store', default=None,
                      help='Filename of output file')
    parser.add_option('--delimiter', action='store', default=None,
                      help='column delimiter')
    parser.add_option('--columns', action='store', default=None,
                      help='columns to operate on')
    # type='int' makes optparse convert (and validate) the numeric options;
    # without it a supplied --skip stays a string and the line-number
    # comparison in the main loop is not numeric.
    parser.add_option('--column_offset', action='store', type='int', default=0,
                      help='offset to apply to columns index to force to zero-based')
    parser.add_option('--skip', action='store', type='int', default=0,
                      help='Number of lines to skip')
    parser.add_option('--version', action='store_true', default=False,
                      help='Show version')
    return parser


def split_line(line, delimiter=None):
    """Strip the line terminator and split the line into fields.

    With delimiter None the whole line is returned as a single field.
    """
    stripped = line.rstrip('\n\r')
    if delimiter is None:
        return [stripped]
    return stripped.split(delimiter)


def substitute_fields(fields, pattern, replacement, columns=None):
    """Apply pattern.sub(replacement, ...) in place to the chosen fields.

    fields      -- list of field strings; it is modified and returned.
    pattern     -- compiled regular expression object.
    replacement -- replacement string handed to the substitution.
    columns     -- sorted zero-based indices to change; None or an empty list
                   selects every field.
    """
    field_count = len(fields)
    if not columns:
        columns = range(field_count)
    for j in columns:
        if j < 0:
            # An index left of the first column would otherwise wrap round
            # and silently modify a field counted from the end of the line.
            continue
        if j >= field_count:
            # Indices are sorted, so every remaining one is out of range too.
            break
        fields[j] = pattern.sub(replacement, fields[j])
    return fields


def __main__():
    """Parse the command line and run the substitution over the input file."""
    parser = build_option_parser()
    (options, args) = parser.parse_args()

    if options.version:
        print("blankenberg_python_regex_substitution %s" % (VERSION))
        sys.exit(0)

    # The input file is as mandatory as the other three: without it open()
    # would fail with a traceback instead of the usage message.
    if None in [options.pattern, options.replacement, options.input, options.output]:
        parser.print_help()
        sys.exit(1)

    pattern = options.pattern
    replacement = options.replacement
    column_offset = options.column_offset
    # Single-argument parenthesised prints are valid under both Python 2 and 3.
    print("Pattern: %s\nReplacement: %s" % (repr(pattern), repr(replacement)))
    pattern = re.compile(pattern)
    provided_columns = get_provided_columns(options.columns, column_offset)
    if provided_columns:
        column_str = ", ".join(str(x - column_offset) for x in provided_columns)
    else:
        column_str = 'all'
    print("With delimiter %s, on columns: %s" % (repr(options.delimiter), column_str))

    if options.delimiter is None:
        join_char = ""
    else:
        join_char = options.delimiter

    # Text mode keeps the lines the same type as the str pattern and
    # replacement on Python 3 as well as Python 2.
    with open(options.input, 'r') as fin, open(options.output, 'w') as fout:
        for i, line in enumerate(fin):
            if i < options.skip:
                continue
            fields = split_line(line, options.delimiter)
            substitute_fields(fields, pattern, replacement, provided_columns)
            fout.write("%s\n" % (join_char.join(fields)))


if __name__ == "__main__":
    __main__()
<tool id="column_regex_substitution" name="Column substitution" version="0.0.1">
    <description>by regular expressions</description>
    <requirements>
        <requirement type="package" version="2.7">python</requirement>
    </requirements>
    <stdio>
        <exit_code range="1:" />
        <exit_code range=":-1" />
    </stdio>
    <!-- The script ships with the tool, so it has to be addressed through
         $__tool_directory__ exactly as in the command block below; a bare
         file name is not resolved from the directory the command runs in. -->
    <version_command>python '$__tool_directory__/column_regex_substitution.py' --version</version_command>
    <command><![CDATA[
        #import pipes
        python '$__tool_directory__/column_regex_substitution.py'
        --input '${input_tabular}'
        --pattern ${pipes.quote( str( $pattern ).replace('\\','\\\\').decode( 'string_escape' ) ) or '""'}
        --replacement ${ pipes.quote( str( $replacement ).replace('\\','\\\\').decode( 'string_escape' ) ) or '""' }
        --columns '${ $columns or "" }'
        --output '${outfile}'
        #if $input_tabular.metadata.delimiter:
            --delimiter ${ pipes.quote( str( $input_tabular.unsanitized.metadata.delimiter ) ) }
        #end if
        --column_offset '-1'
    ]]>
    </command>
    <inputs>
        <param name="input_tabular" type="data" format="txt" label="File to perform substitution on" />
        <param name="pattern" type="text" label="Pattern to Match">
            <sanitizer sanitize="False" />
        </param>
        <param name="replacement" type="text" label="Replacement">
            <sanitizer sanitize="False" />
        </param>
        <param name="columns" label="Columns for substitution" type="data_column" data_ref="input_tabular" optional="True" multiple="True" help="Selecting no columns will target all columns"/>
    </inputs>
    <outputs>
        <data name="outfile" format_source="input_tabular" />
    </outputs>
    <tests>
        <!-- Replaces the empty first field of the header row with "." -->
        <test>
            <param name="input_tabular" ftype="tabular" value="column_regex_substitution_in.tabular"/>
            <param name="pattern" value="^$"/>
            <param name="replacement" value="."/>
            <param name="columns" value="1"/>
            <output name="outfile" ftype="tabular" file="column_regex_substitution_out.tabular" />
        </test>
    </tests>
    <help><![CDATA[
**What it does**

Use regular expressions (regex) to substitute field values. Datasets that do not have a metadata delimiter value will be treated as a single column per row.

This tool makes use of Python's `re.sub functionality <https://docs.python.org/2/library/re.html#re.sub>`_.

]]>
    </help>
<citations></citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/column_regex_substitution_in.tabular Fri Sep 07 10:29:30 2018 -0400 @@ -0,0 +1,10 @@ + Firmicutes Proteobacteria Actinobacteria Deinococcus-Thermus Bacteroidetes Synergistetes Plactomycetes +expected 50 30 10 5 5 0 0 +AA53D 99 61 2 2 16 0 0 +D08KJ/ABH4G 856 905 8 33 12 1 0 +AD891/AD87D 1315 1289 9 43 18 1 0 +AFJRV 268 219 0 4 7 0 1 +AG3A0 448 274 3 6 30 0 0 +AG22P 531 450 0 12 11 0 0 +AH4HV 363 336 2 5 16 0 0 +AHHBT 414 190 1 6 7 0 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/column_regex_substitution_out.tabular Fri Sep 07 10:29:30 2018 -0400 @@ -0,0 +1,10 @@ +. Firmicutes Proteobacteria Actinobacteria Deinococcus-Thermus Bacteroidetes Synergistetes Plactomycetes +expected 50 30 10 5 5 0 0 +AA53D 99 61 2 2 16 0 0 +D08KJ/ABH4G 856 905 8 33 12 1 0 +AD891/AD87D 1315 1289 9 43 18 1 0 +AFJRV 268 219 0 4 7 0 1 +AG3A0 448 274 3 6 30 0 0 +AG22P 531 450 0 12 11 0 0 +AH4HV 363 336 2 5 16 0 0 +AHHBT 414 190 1 6 7 0 0