Mercurial > repos > blankenberg > column_regex_substitution
comparison column_regex_substitution.py @ 0:12b740c4cbc1 draft default tip
planemo upload for repository https://github.com/blankenberg/tools-blankenberg/tree/master/tools/column_regex_substitution commit 78936dc6be1747303d4cbfd80d09e4cfd1cbf292
author | blankenberg |
---|---|
date | Fri, 07 Sep 2018 10:29:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:12b740c4cbc1 |
---|---|
1 #!/usr/bin/env python | |
2 #Dan Blankenberg | |
3 | |
4 """ | |
5 A script for using regex substitutions on columns. | |
6 """ | |
7 | |
8 import optparse | |
9 import re | |
10 import sys | |
11 import string | |
12 | |
VERSION = "0.0.1"

# Every printable character except the digits and the comma separator.
# Stripped from both ends of each column token so that values such as
# "c1" or " 2 " are reduced to their numeric part before int() is applied.
COLUMN_STRIP_VALUES = "".join( set( string.printable ) - set( string.digits ) - set(',') )


def get_provided_columns( provided_value, column_offset ):
    """
    Parse a comma-separated column specification into sorted column indexes.

    provided_value -- string such as "1,3" or "c1,c2"; leading and trailing
                      non-digit characters of each token are ignored.
                      May be None or empty.
    column_offset  -- integer added to every parsed column number (for
                      example -1 to turn one-based columns into zero-based
                      indexes).

    Returns the sorted list of offset-adjusted column indexes, or None when
    no value was provided or when any token cannot be parsed as an integer.
    """
    # Nothing supplied: report "no explicit columns" instead of relying on
    # an exception being raised and swallowed.
    if not provided_value:
        return None
    try:
        rval = sorted( int( x.strip( COLUMN_STRIP_VALUES ) ) + column_offset for x in provided_value.split( ',' ) )
    except ( AttributeError, TypeError, ValueError ):
        # Malformed specification (non-string value, empty or non-numeric
        # token, non-integer offset). Only these errors are treated as
        # "no columns"; KeyboardInterrupt and SystemExit now propagate.
        return None
    return rval or None
25 | |
26 | |
def __main__():
    """
    Command-line entry point: apply a regex substitution to columns of a file.

    Reads --input line by line, drops the first --skip lines, splits each
    remaining line on --delimiter (the whole line is a single field when no
    delimiter is given), replaces every match of --pattern with --replacement
    in the selected --columns (all columns when none are given or the value
    cannot be parsed), and writes the re-joined line to --output.

    Exits with status 0 after printing the version when --version is given,
    and with status 1 after printing the help text when --pattern,
    --replacement, --input or --output is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option('--pattern', action='store', default=None,
                      help='pattern string')
    parser.add_option('--replacement', action='store', default=None,
                      help='replacement string')
    parser.add_option('--input', action='store', default=None,
                      help='Filename of input file')
    parser.add_option('--output', action='store', default=None,
                      help='Filename of output file')
    parser.add_option('--delimiter', action='store', default=None,
                      help='column delimiter')
    parser.add_option('--columns', action='store', default=None,
                      help='columns to operate on')
    # type='int' makes optparse convert (and validate) the value; without it
    # a command-line value stays a string.
    parser.add_option('--column_offset', action='store', type='int', default=0,
                      help='offset to apply to columns index to force to zero-based')
    # A string --skip would make the "i < options.skip" test below compare an
    # int with a str, which is always True on Python 2 (every line skipped)
    # and a TypeError on Python 3.
    parser.add_option('--skip', action='store', type='int', default=0,
                      help='Number of lines to skip')
    parser.add_option('--version', action='store_true', default=False,
                      help='Show version')

    options, _ = parser.parse_args()

    if options.version:
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print( "blankenberg_python_regex_substitution %s" % ( VERSION ) )
        sys.exit(0)

    # --input is required as well: without it open() below would fail with
    # an unhelpful TypeError instead of showing the usage text.
    if None in [ options.pattern, options.replacement, options.input, options.output ]:
        parser.print_help()
        sys.exit(1)

    pattern = options.pattern
    replacement = options.replacement
    column_offset = options.column_offset
    print( "Pattern: %s\nReplacement: %s" % ( repr( pattern ), repr( replacement ) ) )
    pattern = re.compile( pattern )
    provided_columns = get_provided_columns( options.columns, column_offset )
    if provided_columns:
        # Report the columns as the user numbered them, i.e. before the offset.
        column_str = ", ".join( str( x - column_offset ) for x in provided_columns )
    else:
        column_str = 'all'
    print( "With delimiter %s, on columns: %s" % ( repr( options.delimiter ), column_str ) )

    delimiter = options.delimiter
    if delimiter is None:
        join_char = ""
    else:
        join_char = delimiter

    # Text mode for both files so that lines, delimiter, pattern and
    # replacement are all the same string type on Python 3 as well.
    with open( options.input, 'r' ) as fin:
        with open( options.output, 'w' ) as fout:
            for i, line in enumerate( fin ):
                if i < options.skip:
                    # Skipped lines are not copied to the output.
                    continue
                line = line.rstrip( '\n\r' )
                if delimiter is None:
                    fields = [ line ]
                else:
                    fields = line.split( delimiter )
                field_count = len( fields )
                if provided_columns:
                    columns = provided_columns
                else:
                    columns = range( field_count )
                for j in columns:
                    if j < 0:
                        # A negative index would silently address a field
                        # counted from the end of the row; ignore it.
                        continue
                    if j >= field_count:
                        # Columns are sorted, so every later one is out of
                        # range for this row too.
                        break
                    fields[ j ] = pattern.sub( replacement, fields[ j ] )
                fout.write( "%s\n" % ( join_char.join( fields ) ) )


# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    __main__()