changeset 0:f6e5bb5aa2f5 draft

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:34:22 -0400
parents
children 3a48f010da67
files rmapq_wrapper.py rmapq_wrapper.xml test-data/rmapq_wrapper_test1.bed test-data/rmapq_wrapper_test1.fasta test-data/rmapq_wrapper_test1.qual tool-data/faseq.loc.sample tool_dependencies.xml
diffstat 7 files changed, 264 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rmapq_wrapper.py	Mon May 19 12:34:22 2014 -0400
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= (2.4)
+
+def stop_err( msg ):
+    
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+    
+
+def __main__():
+    
+    # I/O
+    target_path = sys.argv[1]
+    infile = sys.argv[2]
+    scorefile = sys.argv[3]
+    high_score = sys.argv[4]            # -q
+    high_len = sys.argv[5]              # -M
+    read_len = sys.argv[6]              # -w
+    align_len = sys.argv[7]             # -h
+    mismatch = sys.argv[8]              # -m
+    output_file = sys.argv[9]
+    
+    try: 
+        float(high_score)
+    except:
+        stop_err('Invalid value for minimal quality score.')
+
+    try:
+        int(high_len)
+    except:
+        stop_err('Invalid value for minimal high quality bases.')
+            
+    # first guess the read length
+    guess_read_len = 0
+    seq = ''
+    for i, line in enumerate(open(infile)):
+        line = line.rstrip('\r\n')
+        if line.startswith('>'):
+            if seq:
+                guess_read_len = len(seq)
+                break
+        else:
+            seq += line
+            
+    try: 
+        test = int(read_len)
+        if test == 0:
+            read_len = str(guess_read_len)
+        else:
+            assert test >= 20 and test <= 64
+    except:
+        stop_err('Invalid value for read length. Must be between 20 and 64.')
+
+    
+    try:
+        int(align_len)    
+    except:
+        stop_err('Invalid value for minimal length of a hit.')
+    
+    try:
+        int(mismatch)
+    except:
+        stop_err('Invalid value for mismatch numbers in an alignment.')
+    
+    all_files = []
+    if os.path.isdir(target_path):
+        # check target genome
+        fa_files = os.listdir(target_path)
+            
+        for file in fa_files:
+            file = "%s/%s" % ( target_path, file )
+            file = os.path.normpath(file)
+            all_files.append(file)
+    else:
+        stop_err("No sequences for %s are available for search, please report this error." %(target_path))
+   
+    for detail_file_path in all_files:
+        output_tempfile = tempfile.NamedTemporaryFile().name
+        command = "rmapq -q %s -M %s -h %s -w %s -m %s -Q %s -c %s %s -o %s 2>&1" % ( high_score, high_len, align_len, read_len, mismatch, scorefile, detail_file_path, infile, output_tempfile )
+        #print command
+        try:
+            os.system( command )
+        except Exception, e:
+            stop_err( str( e ) )
+
+        try:
+            assert os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) ) == 0
+        except Exception, e:
+            stop_err( str( e ) )
+        
+        try:
+            os.remove( output_tempfile )
+        except:
+            pass
+
+            
+if __name__ == '__main__': __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rmapq_wrapper.xml	Mon May 19 12:34:22 2014 -0400
@@ -0,0 +1,94 @@
+<tool id="rmapq_wrapper" name="RMAPQ" version="1.0.0">
+    <description>for Solexa Short Reads Alignment with Quality Scores</description>
+    <requirements>
+      <requirement type="package" version="2.05">rmap</requirement>
+    </requirements>
+    <command interpreter="python">
+    #if $trim.choice=="No":
+        rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len 0 $align_len $mismatch $output1
+    #else:
+        rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len $trim.read_len $align_len $mismatch $output1
+    #end if
+    </command>
+    <inputs>
+        <param name="database" type="select" display="radio" label="Target database">
+			<options from_file="faseq.loc">
+			  <column name="name" index="0"/>
+			  <column name="value" index="0"/>
+			</options>
+        </param>
+        <param name="input_seq" type="data" format="fasta" label="Sequence file"/>
+        <param name="input_score" type="data" format="qualsolexa" label="Quality score file"/>
+        <param name="high_score" type="float" size="15" value="40" label="Minimum score for high-quality base (-q)"/>
+        <param name="high_len" type="integer" size="15" value="36" label="Minimal high-quality bases (-M)"/>
+        <param name="align_len" type="integer" size="15" value="11" label="Minimal length of a hit (-h)" help="seed"/>
+        <param name="mismatch" type="select" label="Number of mismatches allowed (-m)">
+            <option value="0">0</option>
+            <option value="1">1</option>
+            <option value="3">3</option>
+            <option value="5">5</option>
+        </param>
+        <conditional name="trim">
+            <param name="choice" type="select" label="To trim the reads">
+                <option value="No">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="No">
+            </when>
+            <when value="Yes">
+                <param name="read_len" type="integer" size="15" value="36" label="Read length (-w)" /> 
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output1" format="bed"/>
+    </outputs>
+    <!-- 
+    <tests>
+        <test>
+            <param name="database" value="/galaxy/data/faseq/test" />
+            <param name="input_seq" value="rmapq_wrapper_test1.fasta" ftype="fasta"/>
+            <param name="input_score" value="rmapq_wrapper_test1.qual" ftype="qualsolexa" />
+            <param name="high_score" value="40" />
+            <param name="high_len" value="36" />
+            <param name="read_len" value="36" />
+            <param name="align_len" value="36" />
+            <param name="mismatch" value="3" />
+            <output name="output1" file="rmapq_wrapper_test1.bed"/> 
+        </test>
+    </tests>
+    -->
+    <help>
+    
+.. class:: warningmark
+
+ RMAPQ was developed for **Solexa** reads. 
+
+.. class:: infomark
+
+**TIP**. The tool will guess the length of the reads, however, if you select to trim the reads, the *Maximal Length of the Reads* must be between 20 and 64. Reads with lengths longer than the specified value will be trimmed at the 3'end. 
+
+-----
+
+**What it does**
+
+This tool runs **rmapq** (for more information, please see the reference below), searching against a genome build with sequence qualities.   
+
+-----
+
+**Parameters**
+
+- *Minimal High-quality Bases* (**-M**): the minimal length of the high quality score bases
+- *Minimum Score for High-quality Base* (**-q**) : the minimal quality score 
+- *Minimal Length of a Hit* (**-h**) : the minimal length of an exact match or seed  
+- *Number of Mismatches Allowed* (**-m**) : the maximal number of mismatches allowed in an alignment
+- *Read Length* (**-w**) : maximal length of the reads; reads longer than the threshold will be truncated at 3' end.
+
+-----
+
+**Reference**
+
+ **RMAP** is developed by Dr. Andrew D Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rmapq_wrapper_test1.bed	Mon May 19 12:34:22 2014 -0400
@@ -0,0 +1,8 @@
+phix	360	396	seq1	1	-
+phix	4188	4224	seq2	1	+
+phix	4908	4944	seq4	0	-
+phix	2811	2847	seq5	2	+
+phix	3847	3883	seq6	0	-
+phix	91	127	seq7	0	+
+phix	2302	2338	seq8	2	+
+phix	2448	2484	seq9	0	+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rmapq_wrapper_test1.fasta	Mon May 19 12:34:22 2014 -0400
@@ -0,0 +1,20 @@
+>seq1
+GACTCATGATTTCTTACCTATTAGTGGTTGAACATC
+>seq2
+GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT
+>seq3
+GTTGTCGATAGAACTTCATGTGCCTGTAAAACAAGT
+>seq4
+ACCAACCAGAACGTGAAAAAGCGTCCTGCGTGTAGC
+>seq5
+GTTTATGTTGGTTTCATGGTTTTGTCTAACTTTATC
+>seq6
+GCTTTACCGTCTTTCCAGAAATTGTTCCAAGTATCG
+>seq7
+GCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGC
+>seq8
+GTTATAACGCCGAAGCGGTAAAAATTTTTATTTTTT
+>seq9
+GTTCTCACTTCTGTTACTCCAGCTTCTTCGGCACCT
+>seq10
+GTGGCCTGTTGATTCTAAAGGTTAGTTTCTTCACGC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rmapq_wrapper_test1.qual	Mon May 19 12:34:22 2014 -0400
@@ -0,0 +1,10 @@
+ -40  -40   40  -40	  40  -40  -40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40   40  -40  -40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	  40  -40  -40  -40	 -40   40  -40  -40	 -40   40  -40  -40	 -40  -40  -40   40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	  40  -40  -40  -40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40   40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40   40  -40  -40	  40  -40  -40  -40	 -40  -40  -15   15	 -40   40  -40  -40
+ -40  -40   40  -40	 -40  -40  -40   40	 -40  -40   40  -40	  40  -40  -40  -40	 -40  -40  -40   40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -40   40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40   40  -40	  40  -40  -40  -40	  -5    5  -40  -40	 -40  -40   40  -40	 -40  -40   40  -40	 -40   21  -40  -21	 -40   40  -40  -40	  40  -40  -40  -40	 -40  -40  -40   40	  40  -40  -40  -40	  12  -40  -40  -12	 -36  -40   36  -40	 -40  -40   40  -40	  -4    4  -40  -40	 -40  -40  -40   40	 -40  -40   14  -14	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -25   25
+ -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40   40  -40	  40  -40  -40  -40	 -40  -40  -40   40	  40  -40  -40  -40	 -40  -40   40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   34  -40  -34	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -25   25  -40	 -40  -40  -40   40	 -37  -40   37  -40	 -40    7  -40   -7	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -40   40	  40  -40  -40  -40	  38  -40  -40  -38	  40  -40  -40  -40	  40  -40  -40  -40	 -40   40  -40  -40	  40  -40  -40  -40	  11  -16  -13  -22	 -40  -40   40  -40	 -40  -40  -40   40
+  40  -40  -40  -40	 -40   40  -40  -40	 -40   40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -33   33  -40  -40	 -40   40  -40  -40	  40  -40  -40  -40	 -40  -40   40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40   40  -40  -40	 -40  -40   40  -40	 -40  -40  -25   25	 -40  -40   40  -40	  40  -40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40  -40   27  -27	  -5    5  -40  -40	 -40  -40   40  -40	 -40  -40  -40   40	 -40   40  -40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40   40  -40  -40	 -40  -37   37  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -25   25	  40  -40  -40  -40	 -40  -40   34  -34	 -40   40  -40  -40
+ -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40   -2    2	 -40  -40   35  -35	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40  -40   40	  40  -40  -40  -40	  40  -40  -40  -40	 -40   36  -40  -36	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	   5   -5  -40  -28	 -40  -16  -40   16	 -40   40  -40  -40
+ -40  -40   40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	  40  -40  -40  -40	 -40   40  -40  -40	 -40   40  -40  -40	 -40  -40   40  -40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	 -40   40  -40  -40	  40  -40  -40  -40	 -40  -40   40  -40	  40  -40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	 -40   40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40  -40   28  -28	 -40  -40  -40   40	  40  -40  -40  -40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40   40  -40
+ -40  -40   40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	  40  -40  -40  -40	 -40   40  -40  -40	 -40  -40   40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	  40  -40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40   40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40   40  -40	  40  -40  -40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40   27  -27	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40   14  -14	 -40   40  -40  -40
+ -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	  40  -40  -40  -40	 -40  -40  -40   40	  40  -40  -40  -40	  40  -40  -40  -40	 -40   40  -40  -40	 -40  -40   40  -40	 -40   40  -40  -40	 -40   40  -40  -40	 -40  -40   40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40  -40   40  -40	 -40   40  -40  -40	 -40  -40   40  -40	 -40  -40   40  -40	 -40  -40  -40   40	  40  -40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -36  -40  -40   36	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40
+ -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40  -40   40	 -40   40  -40  -40	  40  -40  -40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	  40  -40  -40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40   40  -40  -40	 -40   40  -40  -40	  40  -40  -40  -40	 -40  -40   40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	 -40  -40   40  -40	 -40  -40   22  -22	 -40   40  -40  -40	  40  -40  -40  -40	 -40   40  -40  -40	 -40   40  -40  -40	 -40  -40  -40   40
+ -40  -40   40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40   40  -40	 -40   40  -40  -40	 -40   40  -40  -40	 -40  -40  -40   40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40   40  -40	  40  -40  -40  -40	 -40  -40  -40   40	 -40  -40   -6    6	 -40   40  -40  -40	 -40  -40  -40   40	   3  -40  -40   -3	  40  -40  -40  -40	  40  -40  -40  -40	 -40  -40   40  -40	 -40  -40   40  -40	  -0  -40  -40    0	 -40  -40  -40   40	  40  -40  -40  -40	 -40  -40   40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40  -40  -40   40	 -20   20  -40  -40	 -40  -40  -40   40	 -40  -40  -40   40	 -40   40  -40  -40	   8  -40  -40   -8	 -40   40  -40  -40	 -40  -40   40  -40	  -4    4  -40  -40
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/faseq.loc.sample	Mon May 19 12:34:22 2014 -0400
@@ -0,0 +1,26 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use genome fasta sequence files.  The faseq.loc file has this format 
+#(white space characters are TAB characters):
+#
+# <GenomeBuild> <dir>
+#
+# In the dir, each file is fasta format and contains only one sequence. So,
+#for example, if you had hg18 fasta sequences stored in /depot/data2/galaxy/faseq/hg18,
+#then your faseq.loc entry would look like this:
+#
+#hg18	/depot/data2/galaxy/faseq/hg18
+#
+#and your /depot/data2/galaxy/faseq/hg18 directory would contain all of 
+#your fasta sequence files (e.g.):
+#
+#-rw-r--r--  1 wychung galaxy 138082251 2008-04-16 11:57 chr10.fa
+#-rw-r--r--  1 wychung galaxy    115564 2008-04-16 11:57 chr10_random.fa
+#-rw-r--r--  1 wychung galaxy 137141451 2008-04-16 11:58 chr11.fa
+#...etc...
+#Your faseq.loc file should include an entry per line for each set of fasta 
+#sequence files you have stored.  For example:
+#
+#hg18			/depot/data2/galaxy/faseq/hg18
+#mm9			/depot/data2/galaxy/faseq/mm9
+#Arabidopsis	/depot/data2/galaxy/faseq/Arabidopsis
+#...etc...
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon May 19 12:34:22 2014 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="rmap" version="2.05">
+    <repository changeset_revision="e0eed0cb34bf" name="package_rmap_2_05" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+  </package>
+</tool_dependency>