Mercurial > repos > galaxyp > regex_find_replace

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/regex.py	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,50 @@
+import sys
+import os
+import re
+import string
+import commands
+from optparse import OptionParser
+from tempfile import NamedTemporaryFile
+
+def main():
+  parser = OptionParser()
+  parser.add_option("--input", dest="input")
+  parser.add_option("--output", dest="output")
+  parser.add_option("--input_display_name", dest="input_display_name")
+  parser.add_option("--pattern", dest="patterns", action="append",
+                    help="regex pattern for replacement")
+  parser.add_option("--replacement", dest="replacements", action="append",
+                    help="replacement for regex match")
+  parser.add_option("--column", dest="column", default=None)
+  (options, args) = parser.parse_args()
+
+  mapped_chars = { '\'' :'__sq__', '\\' : '__backslash__' }
+
+  column = None
+  if options.column is not None:
+    column = int(options.column) - 1 # galaxy tabular is 1-based, python array are zero-based
+
+  with open(options.input, 'r') as input:
+    with open(options.output, 'w') as output:
+      while True:
+        line = input.readline()
+        if line == "":
+          break
+        for (pattern, replacement) in zip(options.patterns, options.replacements):
+          for key, value in mapped_chars.items():
+            pattern = pattern.replace(value, key)
+            replacement = replacement.replace(value, key)
+          replacement = replacement.replace("#{input_name}", options.input_display_name)
+          if column is None:
+            line = re.sub(pattern, replacement, line)
+          else:
+            cells = line.split("\t")
+            if cells and len(cells) > column:
+              cell = cells[column]
+              cell = re.sub(pattern, replacement, cell)
+              cells[column] = cell
+              line = "\t".join(cells)
+        output.write(line)
+
+if __name__ == "__main__":  # run the tool only when executed as a script, not when imported
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/regex.xml	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,136 @@
+<tool id="regex1" name="Regex Find And Replace" version="1.0.0">
+  <description></description>
+  <command interpreter="python">regex.py --input '$input' --output '$out_file1' --input_display_name '$input.display_name'
+    #for $check in $checks:
+    --pattern='$check.pattern' --replacement='$check.replacement'
+    #end for
+  </command>
+  <inputs>
+    <param format="txt" name="input" type="data" label="Select lines from"/>
+    <repeat name="checks" title="Check">
+      <param name="pattern" size="40" type="text" value="chr([0-9A-Za-z])+" label="Find Regex" help="here you can enter text or regular expression (for syntax check lower part of this frame)">
+        <sanitizer>
+          <valid>
+            <add preset="string.printable"/>
+            <remove value="&#92;" />
+            <remove value="&apos;" />
+          </valid>
+          <mapping initial="none">
+            <add source="&#92;" target="__backslash__" />
+            <add source="&apos;" target="__sq__"/>
+          </mapping>
+        </sanitizer>
+      </param>
+      <param name="replacement" size="40" type="text" value="newchr\1" label="Replacement">
+        <sanitizer>
+          <valid>
+            <add preset="string.printable"/>
+            <remove value="&#92;" />
+            <remove value="&apos;" />
+          </valid>
+          <mapping initial="none">
+            <add source="&#92;" target="__backslash__" />
+            <add source="&apos;" target="__sq__"/>
+          </mapping>
+        </sanitizer>
+      </param>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="find1.txt"/>
+      <param name="pattern" value="(T\w+)"/>
+      <param name="replacement" value="\1 \1" />
+      <output name="out_file1" file="replace1.txt"/>
+    </test>
+    <test>
+      <param name="input" value="find1.txt"/>
+      <param name="pattern" value="f"/>
+      <param name="replacement" value="'&quot;" />
+      <output name="out_file1" file="replace2.txt"/>
+    </test>
+    <test>
+      <param name="input" value="find1.txt"/>
+      <param name="checks_0|pattern" value="a test file"/>
+      <param name="checks_0|replacement" value="a file named #{input_name}" />
+      <param name="checks_1|pattern" value="see here"/>
+      <param name="checks_1|replacement" value="see #{input_name}" />
+      <param name="checks_2|pattern" value="see (find1).txt"/>
+      <param name="checks_2|replacement" value="see \1" />
+      <output name="out_file1" file="replace3.txt"/>
+    </test>
+  </tests>
+  <help>
+This tool goes line by line through the specified input file and
+replaces text which matches the specified regular expression patterns
+with its corresponding specified replacement.
+
+This tool uses Python regular expressions. More information about
+Python regular expressions can be found here:
+http://docs.python.org/library/re.html.
+
+To convert an Illumina FASTQ sequence id from the CASAVA 8 format::
+
+ @EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG
+ GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC
+ +EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC
+
+To the CASAVA 7 format::
+
+ @EAS139_FC706VJ:2:2104:15343:197393#0/1
+ GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC
+ +EAS139_FC706VJ:2:2104:15343:197393#0/1
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC
+
+Use Settings::
+
+ Find Regex: ^([@+][A-Z0-9]+):\d+:(\S+)\s(\d).*$
+ Replacement: \1_\2#0/\3
+
+Note that the parentheses **()** capture patterns in the text that can be used in the replacement text by using a backslash-number reference:  **\\1**
+
+The regex ``^([@+][A-Z0-9]+):\d+:(\S+)\s(\d).*$`` means::
+
+  ^  - start the match at the beginning of the line of text
+  (  - start a group (1), that is a string of matched text, that can be back-referenced in the replacement as \1
+  [@+]  - matches either a @ or + character
+  [A-Z0-9]+  - matches an uppercase letter or a digit, the plus sign means to match 1 or more such characters
+  )  - end a group (1), that is a string of matched text, that can be back-referenced in the replacement as \1
+  :\d+:   - matches a colon followed by one or more digits followed by a colon character
+  (\S+)  - matches one or more non-whitespace characters,  the enclosing parentheses make this a group (2) that can be back-referenced in the replacement text as \2
+  \s  - matches a whitespace character
+  (\d)  - matches a single digit character,  the enclosing parentheses make this a group (3) that can be back-referenced in the replacement text as \3
+  .*  - dot means match any character, asterisk means zero or more matches
+  $  - the regex must match to the end of the line of text
+
+In the replacement pattern, use the special token #{input_name} to insert the input dataset's display name.
+The name can be modified by a second find/replace check. Suppose you want to insert the sample id of your dataset, named **Sample ABC123**, into the dataset itself, which currently contains the lines::
+
+ Data 1
+ Data 2
+ Data 3
+
+You can use the following checks::
+
+ Find Regex: Data
+ Replacement: #{input_name} Data
+
+ Find Regex: Sample (\S+)
+ Replacement: \1
+
+The result will be::
+
+ ABC123 Data 1
+ ABC123 Data 2
+ ABC123 Data 3
+
+Galaxy aggressively escapes input supplied to tools, so if something
+is not working please let us know and we can look into whether this is
+the cause. Also if you would like help constructing regular
+expressions for your inputs, please let us know at help@msi.umn.edu.
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/regex_tabular.xml	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,139 @@
+<tool id="regexColumn1" name="Column Regex Find And Replace" version="1.0.0">
+  <description></description>
+  <command interpreter="python">regex.py --input '$input' --output '$out_file1' --column $field --input_display_name '$input.display_name'
+    #for $check in $checks:
+    --pattern='$check.pattern' --replacement='$check.replacement'
+    #end for
+  </command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Select cells from"/>
+    <param name="field" label="using column" type="data_column" data_ref="input" />
+    <repeat name="checks" title="Check">
+      <param name="pattern" size="40" type="text" value="chr([0-9A-Za-z])+" label="Find Regex" help="here you can enter text or regular expression (for syntax check lower part of this frame)">
+        <sanitizer>
+          <valid>
+            <add preset="string.printable"/>
+            <remove value="&#92;" />
+            <remove value="&apos;" />
+          </valid>
+          <mapping initial="none">
+            <add source="&#92;" target="__backslash__" />
+            <add source="&apos;" target="__sq__"/>
+          </mapping>
+        </sanitizer>
+      </param>
+      <param name="replacement" size="40" type="text" value="newchr\1" label="Replacement">
+        <sanitizer>
+          <valid>
+            <add preset="string.printable"/>
+            <remove value="&#92;" />
+            <remove value="&apos;" />
+          </valid>
+          <mapping initial="none">
+            <add source="&#92;" target="__backslash__" />
+            <add source="&apos;" target="__sq__"/>
+          </mapping>
+        </sanitizer>
+      </param>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="find_tabular_1.txt" ftype="tabular" />
+      <param name="field" value="2" />
+      <param name="pattern" value="moo"/>
+      <param name="replacement" value="cow" />
+      <output name="out_file1" file="replace_tabular_1.txt"/>
+    </test>
+    <test>
+      <param name="input" value="find_tabular_1.txt" ftype="tabular" />
+      <param name="field" value="1" />
+      <param name="pattern" value="moo"/>
+      <param name="replacement" value="cow" />
+      <output name="out_file1" file="replace_tabular_2.txt"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**This tool will attempt to reuse the metadata from your first input.** To change metadata assignments click on the "edit attributes" link of the history item generated by this tool.
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+This tool goes line by line through the specified input file and
+if the text in the selected column matches a specified regular expression pattern
+replaces the text with the corresponding specified replacement.
+
+This tool can be used to change between the chromosome naming conventions of UCSC and Ensembl.
+
+For example to remove the **chr** part of the reference sequence name in the first column of this GFF file::
+
+ ##gff-version 2
+ ##Date: Thu Mar 23 11:21:17 2006
+ ##bed2gff.pl $Rev: 601 $
+ ##Input file: ./database/files/61c6c604e0ef50b280e2fd9f1aa7da61.dat
+ chr1	bed2gff	CCDS1000.1_cds_0_0_chr1_148325916_f	148325916	148325975	.	+	.	score "0";
+ chr21	bed2gff	CCDS13614.1_cds_0_0_chr21_32707033_f	32707033	32707192	.	+	.	score "0";
+ chrX	bed2gff	CCDS14606.1_cds_0_0_chrX_122745048_f	122745048	122745924	.	+	.	score "0";
+
+Setting::
+
+ using column: c1
+ Find Regex: chr([0-9]+|X|Y|M[Tt]?)
+ Replacement: \1
+
+produces::
+
+ ##gff-version 2
+ ##Date: Thu Mar 23 11:21:17 2006
+ ##bed2gff.pl $Rev: 601 $
+ ##Input file: ./database/files/61c6c604e0ef50b280e2fd9f1aa7da61.dat
+ 1    bed2gff CCDS1000.1_cds_0_0_chr1_148325916_f     148325916       148325975       .       +       .       score "0";
+ 21   bed2gff CCDS13614.1_cds_0_0_chr21_32707033_f    32707033        32707192        .       +       .       score "0";
+ X    bed2gff CCDS14606.1_cds_0_0_chrX_122745048_f    122745048       122745924       .       +       .       score "0";
+
+
+This tool uses Python regular expressions with the **re.sub()** function.
+More information about Python regular expressions can be found here:
+http://docs.python.org/library/re.html.
+
+The regex **chr([0-9]+|X|Y|M[Tt]?)** means start with text **chr** followed by either: one or more digits, or the letter X, or the letter Y, or the letter M (optionally followed by a single letter T or t).
+Note that the parentheses **()** capture patterns in the text that can be used in the replacement text by using a backslash-number reference:  **\\1**
+
+
+In the replacement pattern, use the special token #{input_name} to insert the input dataset's display name.
+The name can be modified by a second find/replace check. Suppose you want to insert the sample id of your dataset, named **Sample ABC123**, into the dataset itself, which currently contains the lines::
+
+ Data 1
+ Data 2
+ Data 3
+
+You can use the following checks::
+
+ Find Regex: Data
+ Replacement: #{input_name} Data
+
+ Find Regex: Sample (\S+)
+ Replacement: \1
+
+The result will be::
+
+ ABC123 Data 1
+ ABC123 Data 2
+ ABC123 Data 3
+
+Galaxy aggressively escapes input supplied to tools, so if something
+is not working please let us know and we can look into whether this is
+the cause. Also if you would like help constructing regular
+expressions for your inputs, please let us know at help@msi.umn.edu.
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/find1.txt	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,3 @@
+This is a test file.
+
+There is nothing to see here.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/find_tabular_1.txt	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,2 @@
+this	test
+moo	amooa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/replace1.txt	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,3 @@
+This This is a test file.
+
+There There is nothing to see here.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/replace2.txt	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,3 @@
+This is a test '"ile.
+
+There is nothing to see here.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/replace3.txt	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,3 @@
+This is a file named find1.txt.
+
+There is nothing to see find1.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/replace_tabular_1.txt	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,2 @@
+this	test
+moo	acowa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/replace_tabular_2.txt	Wed Jan 18 17:45:20 2017 -0500
@@ -0,0 +1,2 @@
+this	test
+cow	amooa