Repository 'column_regex_substitution'
hg clone https://toolshed.g2.bx.psu.edu/repos/blankenberg/column_regex_substitution

Changeset 0:12b740c4cbc1 (2018-09-07)
Commit message:
planemo upload for repository https://github.com/blankenberg/tools-blankenberg/tree/master/tools/column_regex_substitution commit 78936dc6be1747303d4cbfd80d09e4cfd1cbf292
added:
column_regex_substitution.py
column_regex_substitution.xml
test-data/column_regex_substitution_in.tabular
test-data/column_regex_substitution_out.tabular
b
diff -r 000000000000 -r 12b740c4cbc1 column_regex_substitution.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/column_regex_substitution.py Fri Sep 07 10:29:30 2018 -0400
[
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+
+"""
+A script for using regex substitutions on columns.
+"""
+
+import optparse
+import re
+import sys
+import string
+
+# Version string printed by the --version option.
+VERSION = "0.0.1"
+
+# Characters removed from both ends of each requested column value before it
+# is converted with int(): every printable character except the digits and
+# the comma. Built from a set, so the order of characters is arbitrary; that
+# is fine because str.strip() treats its argument as a set of characters.
+COLUMN_STRIP_VALUES = "".join( set( string.printable ) - set( string.digits ) - set(',') )
+
+def get_provided_columns( provided_value, column_offset ):
+    try:
+        rval = sorted( map( lambda x: int( x.strip( COLUMN_STRIP_VALUES ) ) + column_offset, provided_value.split( ',' ) ) )
+    except:
+        rval = None
+    if rval:
+        return rval
+    return None
+
+
+def __main__():
+    parser = optparse.OptionParser()
+    parser.add_option('--pattern', action='store', default=None,
+                      help='pattern string')
+    parser.add_option('--replacement', action='store', default=None,
+                      help='replacement string')
+    parser.add_option('--input', action='store', default=None,
+                      help='Filename of input file')
+    parser.add_option('--output', action='store', default=None,
+                      help='Filename of output file')
+    parser.add_option('--delimiter', action='store', default=None,
+                      help='column delimiter')
+    parser.add_option('--columns', action='store', default=None,
+                      help='columns to operate on')
+    parser.add_option('--column_offset', action='store', default=0,
+                      help='offset to apply to columns index to force to zero-based')
+    parser.add_option('--skip', action='store', default=0,
+                      help='Number of lines to skip')
+    parser.add_option('--version', action='store_true', default=False,
+                      help='Show version')
+
+    (options, args) = parser.parse_args()
+
+    if options.version:
+        print "blankenberg_python_regex_substitution %s" % ( VERSION )
+        sys.exit(0)
+
+    if None in [ options.pattern, options.replacement, options.output ]:
+        parser.print_help()
+        sys.exit(1)
+
+    pattern = options.pattern
+    replacement = options.replacement
+    column_offset = int( options.column_offset )
+    print "Pattern: %s\nReplacement: %s" % ( repr( pattern ), repr( replacement ) )
+    pattern = re.compile( pattern )
+    provided_columns = get_provided_columns( options.columns, column_offset )
+    if provided_columns:
+        column_str = ", ".join( map( lambda x: str( x - column_offset ), provided_columns ) )
+    else:
+        column_str = 'all'
+    print "With delimiter %s, on columns: %s" % ( repr( options.delimiter ), column_str )
+    if options.delimiter is None:
+        split_func = lambda x: [ x.rstrip( '\n\r' ) ]
+        join_char = ""
+    else:
+        split_func = lambda x: x.rstrip( '\n\r' ).split( options.delimiter )
+        join_char = options.delimiter
+    with open( options.input, 'rb' ) as fin:
+        with open( options.output, 'w') as fout:
+            for i, line in enumerate( fin ):
+                if i < options.skip:
+                    continue
+                line = split_func( line )
+                field_count = len( line )
+                if provided_columns:
+                    columns = provided_columns
+                else:
+                    columns = range( field_count )
+                for j in columns:
+                    if j >= field_count:
+                        break
+                    line[ j ] = re.sub( pattern, replacement, line[ j ] )
+                fout.write( "%s\n" % ( join_char.join( line ) ) )
+
+# Run the command-line entry point only when executed as a script, not when
+# the module is imported.
+if __name__ == "__main__":
+    __main__()
b
diff -r 000000000000 -r 12b740c4cbc1 column_regex_substitution.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/column_regex_substitution.xml Fri Sep 07 10:29:30 2018 -0400
[
@@ -0,0 +1,57 @@
+<tool id="column_regex_substitution" name="Column substitution" version="0.0.1">
+    <description>by regular expressions</description>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
+    <!-- The script lives in the tool directory, not in the job working directory, so it must be addressed via $__tool_directory__ (as the command block does). -->
+    <version_command>python '$__tool_directory__/column_regex_substitution.py' --version</version_command>
+    <command><![CDATA[
+        #import pipes
+        python '$__tool_directory__/column_regex_substitution.py'
+        --input '${input_tabular}'
+        --pattern ${pipes.quote( str( $pattern ).replace('\\','\\\\').decode( 'string_escape' ) ) or '""'}
+        --replacement ${ pipes.quote( str( $replacement ).replace('\\','\\\\').decode( 'string_escape' ) ) or '""' }
+        --columns '${ $columns or "" }'
+        --output '${outfile}'
+        #if $input_tabular.metadata.delimiter:
+            --delimiter ${ pipes.quote( str( $input_tabular.unsanitized.metadata.delimiter ) ) }
+        #end if
+        --column_offset '-1'
+    ]]>
+    </command>
+    <inputs>
+        <param name="input_tabular" type="data" format="txt" label="File to perform substitution on" />
+        <param name="pattern" type="text" label="Pattern to Match">
+            <sanitizer sanitize="False" />
+        </param>
+        <param name="replacement" type="text" label="Replacement">
+            <sanitizer sanitize="False" />
+        </param>
+        <param name="columns" label="Columns for substitution" type="data_column" data_ref="input_tabular" optional="True" multiple="True" help="Selecting no columns will target all columns"/>
+    </inputs>
+    <outputs>
+        <data name="outfile" format_source="input_tabular" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_tabular" ftype="tabular" value="column_regex_substitution_in.tabular"/>
+            <param name="pattern" value="^$"/>
+            <param name="replacement" value="."/>
+            <param name="columns" value="1"/>
+            <output name="outfile" ftype="tabular" file="column_regex_substitution_out.tabular" />
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+Use regular expressions (regex) to substitute field values. Datasets that do not have a metadata delimiter value will be treated as a single column per row.
+
+This tool makes use of Python's `re.sub functionality <https://docs.python.org/2/library/re.html#re.sub>`_.
+
+]]>
+    </help>
+<citations></citations>
+</tool>
b
diff -r 000000000000 -r 12b740c4cbc1 test-data/column_regex_substitution_in.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/column_regex_substitution_in.tabular Fri Sep 07 10:29:30 2018 -0400
b
@@ -0,0 +1,10 @@
+ Firmicutes Proteobacteria Actinobacteria Deinococcus-Thermus Bacteroidetes Synergistetes Plactomycetes
+expected 50 30 10 5 5 0 0
+AA53D 99 61 2 2 16 0 0
+D08KJ/ABH4G 856 905 8 33 12 1 0
+AD891/AD87D 1315 1289 9 43 18 1 0
+AFJRV 268 219 0 4 7 0 1
+AG3A0 448 274 3 6 30 0 0
+AG22P 531 450 0 12 11 0 0
+AH4HV 363 336 2 5 16 0 0
+AHHBT 414 190 1 6 7 0 0
b
diff -r 000000000000 -r 12b740c4cbc1 test-data/column_regex_substitution_out.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/column_regex_substitution_out.tabular Fri Sep 07 10:29:30 2018 -0400
b
@@ -0,0 +1,10 @@
+. Firmicutes Proteobacteria Actinobacteria Deinococcus-Thermus Bacteroidetes Synergistetes Plactomycetes
+expected 50 30 10 5 5 0 0
+AA53D 99 61 2 2 16 0 0
+D08KJ/ABH4G 856 905 8 33 12 1 0
+AD891/AD87D 1315 1289 9 43 18 1 0
+AFJRV 268 219 0 4 7 0 1
+AG3A0 448 274 3 6 30 0 0
+AG22P 531 450 0 12 11 0 0
+AH4HV 363 336 2 5 16 0 0
+AHHBT 414 190 1 6 7 0 0