Mercurial > repos > blankenberg > column_regex_substitution

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/column_regex_substitution.py	Fri Sep 07 10:29:30 2018 -0400
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+
+"""
+A script for using regex substitutions on columns.
+"""
+
+import optparse
+import re
+import sys
+import string
+
+VERSION = "0.0.1"  # printed by the --version option
+
+COLUMN_STRIP_VALUES = "".join( set( string.printable ) - set( string.digits ) - set(',') )  # every printable character except digits and ','
+
+def get_provided_columns( provided_value, column_offset ):
+    try:
+        rval = sorted( map( lambda x: int( x.strip( COLUMN_STRIP_VALUES ) ) + column_offset, provided_value.split( ',' ) ) )
+    except:
+        rval = None
+    if rval:
+        return rval
+    return None
+
+
+def __main__():
+    parser = optparse.OptionParser()
+    parser.add_option('--pattern', action='store', default=None,
+                      help='pattern string')
+    parser.add_option('--replacement', action='store', default=None,
+                      help='replacement string')
+    parser.add_option('--input', action='store', default=None,
+                      help='Filename of input file')
+    parser.add_option('--output', action='store', default=None,
+                      help='Filename of output file')
+    parser.add_option('--delimiter', action='store', default=None,
+                      help='column delimiter')
+    parser.add_option('--columns', action='store', default=None,
+                      help='columns to operate on')
+    parser.add_option('--column_offset', action='store', default=0,
+                      help='offset to apply to columns index to force to zero-based')
+    parser.add_option('--skip', action='store', default=0,
+                      help='Number of lines to skip')
+    parser.add_option('--version', action='store_true', default=False,
+                      help='Show version')
+
+    (options, args) = parser.parse_args()
+
+    if options.version:
+        print "blankenberg_python_regex_substitution %s" % ( VERSION )
+        sys.exit(0)
+
+    if None in [ options.pattern, options.replacement, options.output ]:
+        parser.print_help()
+        sys.exit(1)
+
+    pattern = options.pattern
+    replacement = options.replacement
+    column_offset = int( options.column_offset )
+    print "Pattern: %s\nReplacement: %s" % ( repr( pattern ), repr( replacement ) )
+    pattern = re.compile( pattern )
+    provided_columns = get_provided_columns( options.columns, column_offset )
+    if provided_columns:
+        column_str = ", ".join( map( lambda x: str( x - column_offset ), provided_columns ) )
+    else:
+        column_str = 'all'
+    print "With delimiter %s, on columns: %s" % ( repr( options.delimiter ), column_str )
+    if options.delimiter is None:
+        split_func = lambda x: [ x.rstrip( '\n\r' ) ]
+        join_char = ""
+    else:
+        split_func = lambda x: x.rstrip( '\n\r' ).split( options.delimiter )
+        join_char = options.delimiter
+    with open( options.input, 'rb' ) as fin:
+        with open( options.output, 'w') as fout:
+            for i, line in enumerate( fin ):
+                if i < options.skip:
+                    continue
+                line = split_func( line )
+                field_count = len( line )
+                if provided_columns:
+                    columns = provided_columns
+                else:
+                    columns = range( field_count )
+                for j in columns:
+                    if j >= field_count:
+                        break
+                    line[ j ] = re.sub( pattern, replacement, line[ j ] )
+                fout.write( "%s\n" % ( join_char.join( line ) ) )
+
+if __name__ == "__main__":  # run the tool only when executed as a script, not when imported
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/column_regex_substitution.xml	Fri Sep 07 10:29:30 2018 -0400
@@ -0,0 +1,57 @@
+<tool id="column_regex_substitution" name="Column substitution" version="0.0.1">
+    <description>by regular expressions</description>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
+    <version_command>python '$__tool_directory__/column_regex_substitution.py' --version</version_command> <!-- locate the script via the tool directory, as the command block does -->
+    <command><![CDATA[
+        #import pipes
+        python '$__tool_directory__/column_regex_substitution.py'
+        --input '${input_tabular}'
+        --pattern ${pipes.quote( str( $pattern ).replace('\\','\\\\').decode( 'string_escape' ) ) or '""'}
+        --replacement ${ pipes.quote( str( $replacement ).replace('\\','\\\\').decode( 'string_escape' ) ) or '""' }
+        --columns '${ $columns or "" }'
+        --output '${outfile}'
+        #if $input_tabular.metadata.delimiter:
+            --delimiter ${ pipes.quote( str( $input_tabular.unsanitized.metadata.delimiter ) ) }
+        #end if
+        --column_offset '-1'
+    ]]>
+    </command>
+    <inputs>
+        <param name="input_tabular" type="data" format="txt" label="File to perform substitution on" />
+        <param name="pattern" type="text" label="Pattern to Match">
+            <sanitizer sanitize="False" />
+        </param>
+        <param name="replacement" type="text" label="Replacement">
+            <sanitizer sanitize="False" />
+        </param>
+        <param name="columns" label="Columns for substitution" type="data_column" data_ref="input_tabular" optional="True" multiple="True" help="Selecting no columns will target all columns"/>
+    </inputs>
+    <outputs>
+        <data name="outfile" format_source="input_tabular" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_tabular" ftype="tabular" value="column_regex_substitution_in.tabular"/>
+            <param name="pattern" value="^$"/>
+            <param name="replacement" value="."/>
+            <param name="columns" value="1"/>
+            <output name="outfile" ftype="tabular" file="column_regex_substitution_out.tabular" />
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+Use regular expressions (regex) to substitute field values. Datasets that do not have a metadata delimiter value will be treated as a single column per row.
+
+This tool makes use of Python's `re.sub functionality <https://docs.python.org/2/library/re.html#re.sub>`_.
+
+]]>
+    </help>
+<citations></citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/column_regex_substitution_in.tabular	Fri Sep 07 10:29:30 2018 -0400
@@ -0,0 +1,10 @@
+	Firmicutes	Proteobacteria	Actinobacteria	Deinococcus-Thermus	Bacteroidetes	Synergistetes	Plactomycetes
+expected	50	30	10	5	5	0	0
+AA53D	99	61	2	2	16	0	0
+D08KJ/ABH4G	856	905	8	33	12	1	0
+AD891/AD87D	1315	1289	9	43	18	1	0
+AFJRV	268	219	0	4	7	0	1
+AG3A0	448	274	3	6	30	0	0
+AG22P	531	450	0	12	11	0	0
+AH4HV	363	336	2	5	16	0	0
+AHHBT	414	190	1	6	7	0	0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/column_regex_substitution_out.tabular	Fri Sep 07 10:29:30 2018 -0400
@@ -0,0 +1,10 @@
+.	Firmicutes	Proteobacteria	Actinobacteria	Deinococcus-Thermus	Bacteroidetes	Synergistetes	Plactomycetes
+expected	50	30	10	5	5	0	0
+AA53D	99	61	2	2	16	0	0
+D08KJ/ABH4G	856	905	8	33	12	1	0
+AD891/AD87D	1315	1289	9	43	18	1	0
+AFJRV	268	219	0	4	7	0	1
+AG3A0	448	274	3	6	30	0	0
+AG22P	531	450	0	12	11	0	0
+AH4HV	363	336	2	5	16	0	0
+AHHBT	414	190	1	6	7	0	0