Repository 'fastq_manipulation'
hg clone https://toolshed.g2.bx.psu.edu/repos/devteam/fastq_manipulation

Changeset 0:5d1e9e13e8db (2014-01-27)
Next changeset 1:bb07615a5b6a (2015-11-11)
Commit message:
Imported from capsule None
added:
fastq_manipulation.py
fastq_manipulation.xml
test-data/empty_file.dat
test-data/fastq_trimmer_out1.fastqsanger
test-data/misc_dna_as_sanger_rev_comp_1.fastqsanger
test-data/misc_dna_original_sanger.fastqsanger
test-data/misc_rna_as_sanger_rev_comp_1.fastqsanger
test-data/misc_rna_original_sanger.fastqsanger
test-data/sanger_full_range_as_rna.fastqsanger
test-data/sanger_full_range_original_sanger.fastqsanger
test-data/sanger_full_range_rev_comp.fastqsanger
test-data/sanger_full_range_rev_comp_1_seq.fastqsanger
tool_dependencies.xml
b
diff -r 000000000000 -r 5d1e9e13e8db fastq_manipulation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_manipulation.py Mon Jan 27 09:26:01 2014 -0500
[
@@ -0,0 +1,37 @@
+#Dan Blankenberg
+import sys, os, shutil
+import imp
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+
+def main():  # Run a generated manipulation script over every read of a FASTQ file, write the results and print a summary.
+    #Read command line arguments
+    input_filename = sys.argv[1]  # FASTQ file to read
+    script_filename = sys.argv[2]  # Python source that must define match_and_manipulate_read()
+    output_filename = sys.argv[3]  # FASTQ file to write
+    additional_files_path = sys.argv[4]  # directory created below; receives a copy of the script as debug.txt
+    input_type = sys.argv[5] or 'sanger'  # an empty argument falls back to 'sanger'; used as format= for reader and writer
+    
+    #Save script file for debugging/verification info later
+    os.mkdir( additional_files_path )  # os.mkdir raises OSError if the directory already exists
+    shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) )
+    
+    fastq_manipulator = imp.load_module( 'fastq_manipulator', open( script_filename ), script_filename, ( '', 'r', imp.PY_SOURCE ) )  # NOTE(review): the file object opened here is never explicitly closed
+    
+    out = fastqWriter( open( output_filename, 'wb' ), format = input_type )
+    
+    i = None  # stays None if the reader yields no reads; otherwise ends as the zero-based index of the last read
+    reads_manipulated = 0
+    for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+        new_read = fastq_manipulator.match_and_manipulate_read( fastq_read )
+        if new_read:  # a falsy result is not written to the output
+            out.write( new_read )
+        if new_read != fastq_read:  # NOTE(review): a falsy (unwritten) result presumably also counts as manipulated - confirm
+            reads_manipulated += 1
+    out.close()
+    if i is None:
+        print "Your file contains no valid FASTQ reads."
+    else:
+        print 'Manipulated %s of %s reads (%.2f%%).' % ( reads_manipulated, i + 1, float( reads_manipulated ) / float( i + 1 ) * 100.0 )  # i + 1 is the number of reads iterated
+
+if __name__ == "__main__":  # only run when executed as a script, not when imported
+    main()
b
diff -r 000000000000 -r 5d1e9e13e8db fastq_manipulation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_manipulation.xml Mon Jan 27 09:26:01 2014 -0500
[
b'@@ -0,0 +1,432 @@\n+<tool id="fastq_manipulation" name="Manipulate FASTQ" version="1.0.1">\n+  <options sanitize="False" /> <!-- This tool uses a file to rely all parameter information (actually a dynamically generated python module), we can safely not sanitize any parameters -->\n+  <requirements>\n+    <requirement type="package" version="1.0.0">galaxy_sequence_utils</requirement>\n+  </requirements>\n+  <description>reads on various attributes</description>\n+  <command interpreter="python">fastq_manipulation.py $input_file $fastq_manipulation_file $output_file $output_file.files_path \'${input_file.extension[len( \'fastq\' ):]}\'</command>\n+  <inputs>\n+    <!-- This tool is purposely over-engineered (e.g. Single option conditionals) to allow easy enhancement with workflow/rerun compatibility -->\n+    <page>\n+      <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File" help="Requires groomed data: if your data does not appear here try using the FASTQ groomer."/>\n+      <!-- Match Reads -->\n+      <repeat name="match_blocks" title="Match Reads">\n+        <conditional name="match_type">\n+          <param name="match_type_selector" type="select" label="Match Reads by">\n+            <option value="identifier">Name/Identifier</option>\n+            <option value="sequence">Sequence Content</option>\n+            <option value="quality">Quality Score Content</option>\n+          </param>\n+          <when value="identifier">\n+            <conditional name="match">\n+              <param name="match_selector" type="select" label="Identifier Match Type">\n+                <option value="regex">Regular Expression</option>\n+              </param>\n+              <when value="regex">\n+                <param type="text" name="match_by" label="Match by" value=".*" />\n+              </when>\n+            </conditional>\n+          </when>\n+          <when value="sequence">\n+            <conditional name="match">\n+          
    <param name="match_selector" type="select" label="Sequence Match Type">\n+                <option value="regex">Regular Expression</option>\n+              </param>\n+              <when value="regex">\n+                <param type="text" name="match_by" label="Match by" value=".*" />\n+              </when>\n+            </conditional>\n+          </when>\n+          <when value="quality">\n+            <conditional name="match">\n+              <param name="match_selector" type="select" label="Quality Match Type">\n+                <option value="regex">Regular Expression</option>\n+              </param>\n+              <when value="regex">\n+                <param type="text" name="match_by" label="Match by" value=".*" />\n+              </when>\n+            </conditional>\n+          </when>\n+        </conditional>\n+      </repeat>\n+      <!-- Manipulate Matched Reads -->\n+      <repeat name="manipulate_blocks" title="Manipulate Reads">\n+        <conditional name="manipulation_type">\n+          <param name="manipulation_type_selector" type="select" label="Manipulate Reads on">\n+            <option value="identifier">Name/Identifier</option>\n+            <option value="sequence">Sequence Content</option>\n+            <option value="quality">Quality Score Content</option>\n+            <option value="miscellaneous">Miscellaneous Actions</option>\n+          </param>\n+          <when value="identifier">\n+            <conditional name="manipulation">\n+              <param name="manipulation_selector" type="select" label="Identifier Manipulation Type">\n+                <option value="translate">String Translate</option>\n+              </param>\n+              <when value="translate">\n+                <param name="from" type="text" label="From" value="" />\n+                <param name="to" type="text" label="To" value="" />\n+              </when>\n+            </conditional>\n+          </when>\n+          <when value="sequence">\n+            
<conditional name="manipulation">\n+              <param name="manipulation_sele'..b'="fastqsanger" />\n+      <param name="match_type_selector" value="identifier" />\n+      <param name="match_selector" value="regex" />\n+      <param name="match_by" value="FAKE0001" />\n+      <param name="manipulation_type_selector" value="sequence" />\n+      <param name="manipulation_selector" value="rev_comp" />\n+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />\n+    </test>\n+    <!-- match all and DNA to RNA -->\n+    <test>\n+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />\n+      <param name="match_type_selector" value="identifier" />\n+      <param name="match_selector" value="regex" />\n+      <param name="match_by" value=".*" />\n+      <param name="manipulation_type_selector" value="sequence" />\n+      <param name="manipulation_selector" value="dna_to_rna" />\n+      <output name="output_file" file="sanger_full_range_as_rna.fastqsanger" />\n+    </test>\n+    <!-- match all and RNA to DNA -->\n+    <test>\n+      <param name="input_file" value="sanger_full_range_as_rna.fastqsanger" ftype="fastqsanger" />\n+      <param name="match_type_selector" value="identifier" />\n+      <param name="match_selector" value="regex" />\n+      <param name="match_by" value=".*" />\n+      <param name="manipulation_type_selector" value="sequence" />\n+      <param name="manipulation_selector" value="rna_to_dna" />\n+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />\n+    </test>\n+  </tests>\n+<help>\n+This tool allows you to build complex manipulations to be applied to each matching read in a FASTQ file. A read must match all matching directives in order for it to be manipulated; if a read does not match, it is output in a non-modified manner. 
All reads matching will have each of the specified manipulations performed upon them, in the order specified.\n+\n+Regular Expression Matches are made using re.search, see http://docs.python.org/library/re.html for more information.\n+  All matching is performed on a single line string, regardless if e.g. the sequence or quality score spans multiple lines in the original file.\n+\n+String translations are performed using string.translate, see http://docs.python.org/library/string.html#string.translate and http://docs.python.org/library/string.html#string.maketrans for more information.\n+\n+.. class:: warningmark\n+\n+Only color space reads can have adapter bases substituted.\n+\n+\n+-----\n+\n+**Example**\n+\n+Suppose you have a color space sanger formatted sequence (fastqcssanger) and you want to double-encode the color space into pseudo-nucleotide space (this is different from converting) to allow these reads to be used in tools which do not natively support it (using specially designed indexes). This tool can handle this manipulation; however, this is generally not recommended as results tend to be poorer than those produced from tools which are specially designed to handle color space data.\n+\n+Steps:\n+\n+1. Click **Add new Match Reads** and leave the matching options set to the default (Matching by sequence name/identifier using the regular expression ".\\*"; thereby matching all reads). \n+2. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "Change Adapter Base" and set **New Adapter** to "" (an empty text field). \n+3. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "String Translate" and set **From** to "0123." and **To** to "ACGTN".\n+4. Click Execute. 
The new history item will contain double-encoded pseudo-nucleotide space reads.\n+\n+------\n+\n+**Citation**\n+\n+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_\n+\n+\n+</help>\n+</tool>\n'
b
diff -r 000000000000 -r 5d1e9e13e8db test-data/fastq_trimmer_out1.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_trimmer_out1.fastqsanger Mon Jan 27 09:26:01 2014 -0500
b
@@ -0,0 +1,8 @@
+@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order)
+CGTA
++
+NOPQ
+@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order)
+ATGC
++
+QPON
b
diff -r 000000000000 -r 5d1e9e13e8db test-data/misc_dna_as_sanger_rev_comp_1.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/misc_dna_as_sanger_rev_comp_1.fastqsanger Mon Jan 27 09:26:01 2014 -0500
b
@@ -0,0 +1,16 @@
+@FAKE0007 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0008 Original version has mixed case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+cgCTatgAcgCTatgAcgCTatgAcgCTatgAcgCTatgAc
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0009 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+actgactgactgactgactgactgactgactgactgactga
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0010 Original version has mixed case ambiguous DNA and PHRED scores of 40, 30, 20, 10 (cycled)
+NHBVDMKSWRYGATCnhbvdmkswrygatc
++
+?I+5?I+5?I+5?I+5?I+5?I+5?I+5?I
b
diff -r 000000000000 -r 5d1e9e13e8db test-data/misc_dna_original_sanger.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/misc_dna_original_sanger.fastqsanger Mon Jan 27 09:26:01 2014 -0500
b
@@ -0,0 +1,16 @@
+@FAKE0007 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTA
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0008 Original version has mixed case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+gTcatAGcgTcatAGcgTcatAGcgTcatAGcgTcatAGcg
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0009 Original version has lower case unambiguous DNA with PHRED scores from 0 to 40 inclusive (in that order)
+tcagtcagtcagtcagtcagtcagtcagtcagtcagtcagt
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0010 Original version has mixed case ambiguous DNA and PHRED scores of 40, 30, 20, 10 (cycled)
+gatcrywsmkhbvdnGATCRYWSMKHBVDN
++
+I?5+I?5+I?5+I?5+I?5+I?5+I?5+I?
b
diff -r 000000000000 -r 5d1e9e13e8db test-data/misc_rna_as_sanger_rev_comp_1.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/misc_rna_as_sanger_rev_comp_1.fastqsanger Mon Jan 27 09:26:01 2014 -0500
b
@@ -0,0 +1,16 @@
+@FAKE0011 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+UACGUACGUACGUACGUACGUACGUACGUACGUACGUACGU
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0012 Original version has mixed case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+cgCUaugAcgCUaugAcgCUaugAcgCUaugAcgCUaugAc
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0013 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+acugacugacugacugacugacugacugacugacugacuga
++
+IHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0014 Original version has mixed case ambiguous RNA with PHRED scores from 35 to 40 inclusive (cycled)
+NHBVDMKSWRYGAUCnhbvdmkswrygauc
++
+IHGFEDIHGFEDIHGFEDIHGFEDIHGFED
b
diff -r 000000000000 -r 5d1e9e13e8db test-data/misc_rna_original_sanger.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/misc_rna_original_sanger.fastqsanger Mon Jan 27 09:26:01 2014 -0500
b
@@ -0,0 +1,16 @@
+@FAKE0011 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+ACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUA
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0012 Original version has mixed case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+gUcauAGcgUcauAGcgUcauAGcgUcauAGcgUcauAGcg
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0013 Original version has lower case unambiguous RNA with PHRED scores from 0 to 40 inclusive (in that order)
+ucagucagucagucagucagucagucagucagucagucagu
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@FAKE0014 Original version has mixed case ambiguous RNA with PHRED scores from 35 to 40 inclusive (cycled)
+gaucrywsmkhbvdnGAUCRYWSMKHBVDN
++
+DEFGHIDEFGHIDEFGHIDEFGHIDEFGHI
b
diff -r 000000000000 -r 5d1e9e13e8db test-data/sanger_full_range_as_rna.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sanger_full_range_as_rna.fastqsanger Mon Jan 27 09:26:01 2014 -0500
[
@@ -0,0 +1,8 @@
+@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order)
+ACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUACGUAC
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order)
+CAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCA
++
+~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
b
diff -r 000000000000 -r 5d1e9e13e8db test-data/sanger_full_range_original_sanger.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sanger_full_range_original_sanger.fastqsanger Mon Jan 27 09:26:01 2014 -0500
[
@@ -0,0 +1,8 @@
+@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order)
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order)
+CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
++
+~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
b
diff -r 000000000000 -r 5d1e9e13e8db test-data/sanger_full_range_rev_comp.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sanger_full_range_rev_comp.fastqsanger Mon Jan 27 09:26:01 2014 -0500
[
@@ -0,0 +1,8 @@
+@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order)
+GTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
++
+~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order)
+TGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATG
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
b
diff -r 000000000000 -r 5d1e9e13e8db test-data/sanger_full_range_rev_comp_1_seq.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sanger_full_range_rev_comp_1_seq.fastqsanger Mon Jan 27 09:26:01 2014 -0500
[
@@ -0,0 +1,8 @@
+@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order)
+GTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
++
+~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order)
+CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
++
+~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
b
diff -r 000000000000 -r 5d1e9e13e8db tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Mon Jan 27 09:26:01 2014 -0500
b
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="galaxy_sequence_utils" version="1.0.0">
+      <repository changeset_revision="0643676ad5f7" name="package_galaxy_utils_1_0" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>