Mercurial > repos > devteam > rmapq
changeset 0:f6e5bb5aa2f5 draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 12:34:22 -0400 |
parents | |
children | 3a48f010da67 |
files | rmapq_wrapper.py rmapq_wrapper.xml test-data/rmapq_wrapper_test1.bed test-data/rmapq_wrapper_test1.fasta test-data/rmapq_wrapper_test1.qual tool-data/faseq.loc.sample tool_dependencies.xml |
diffstat | 7 files changed, 264 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rmapq_wrapper.py Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,100 @@ +#!/usr/bin/env python + +import os, sys, tempfile + +assert sys.version_info[:2] >= (2.4) + +def stop_err( msg ): + + sys.stderr.write( "%s\n" % msg ) + sys.exit() + + +def __main__(): + + # I/O + target_path = sys.argv[1] + infile = sys.argv[2] + scorefile = sys.argv[3] + high_score = sys.argv[4] # -q + high_len = sys.argv[5] # -M + read_len = sys.argv[6] # -w + align_len = sys.argv[7] # -h + mismatch = sys.argv[8] # -m + output_file = sys.argv[9] + + try: + float(high_score) + except: + stop_err('Invalid value for minimal quality score.') + + try: + int(high_len) + except: + stop_err('Invalid value for minimal high quality bases.') + + # first guess the read length + guess_read_len = 0 + seq = '' + for i, line in enumerate(open(infile)): + line = line.rstrip('\r\n') + if line.startswith('>'): + if seq: + guess_read_len = len(seq) + break + else: + seq += line + + try: + test = int(read_len) + if test == 0: + read_len = str(guess_read_len) + else: + assert test >= 20 and test <= 64 + except: + stop_err('Invalid value for read length. Must be between 20 and 64.') + + + try: + int(align_len) + except: + stop_err('Invalid value for minimal length of a hit.') + + try: + int(mismatch) + except: + stop_err('Invalid value for mismatch numbers in an alignment.') + + all_files = [] + if os.path.isdir(target_path): + # check target genome + fa_files = os.listdir(target_path) + + for file in fa_files: + file = "%s/%s" % ( target_path, file ) + file = os.path.normpath(file) + all_files.append(file) + else: + stop_err("No sequences for %s are available for search, please report this error." %(target_path)) + + for detail_file_path in all_files: + output_tempfile = tempfile.NamedTemporaryFile().name + command = "rmapq -q %s -M %s -h %s -w %s -m %s -Q %s -c %s %s -o %s 2>&1" % ( high_score, high_len, align_len, read_len, mismatch, scorefile, detail_file_path, infile, output_tempfile ) + #print command + try: + os.system( command ) + except Exception, e: + stop_err( str( e ) ) + + try: + assert os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) ) == 0 + except Exception, e: + stop_err( str( e ) ) + + try: + os.remove( output_tempfile ) + except: + pass + + +if __name__ == '__main__': __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rmapq_wrapper.xml Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,94 @@ +<tool id="rmapq_wrapper" name="RMAPQ" version="1.0.0"> + <description>for Solexa Short Reads Alignment with Quality Scores</description> + <requirements> + <requirement type="package" version="2.05">rmap</requirement> + </requirements> + <command interpreter="python"> + #if $trim.choice=="No": + rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len 0 $align_len $mismatch $output1 + #else: + rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len $trim.read_len $align_len $mismatch $output1 + #end if + </command> + <inputs> + <param name="database" type="select" display="radio" label="Target database"> + <options from_file="faseq.loc"> + <column name="name" index="0"/> + <column name="value" index="0"/> + </options> + </param> + <param name="input_seq" type="data" format="fasta" label="Sequence file"/> + <param name="input_score" type="data" format="qualsolexa" label="Quality score file"/> + <param name="high_score" type="float" size="15" value="40" label="Minimum score for high-quality base (-q)"/> + <param name="high_len" type="integer" size="15" value="36" label="Minimal high-quality bases (-M)"/> + <param name="align_len" type="integer" size="15" value="11" label="Minimal length of a hit (-h)" help="seed"/> + <param name="mismatch" type="select" label="Number of mismatches allowed (-m)"> + <option value="0">0</option> + <option value="1">1</option> + <option value="3">3</option> + <option value="5">5</option> + </param> + <conditional name="trim"> + <param name="choice" type="select" label="To trim the reads"> + <option value="No">No</option> + <option value="Yes">Yes</option> + </param> + <when value="No"> + </when> + <when value="Yes"> + <param name="read_len" type="integer" size="15" value="36" label="Read length (-w)" /> + </when> + </conditional> + </inputs> + <outputs> + <data name="output1" format="bed"/> + </outputs> + <!-- + <tests> + <test> + <param name="database" value="/galaxy/data/faseq/test" /> + <param name="input_seq" value="rmapq_wrapper_test1.fasta" ftype="fasta"/> + <param name="input_score" value="rmapq_wrapper_test1.qual" ftype="qualsolexa" /> + <param name="high_score" value="40" /> + <param name="high_len" value="36" /> + <param name="read_len" value="36" /> + <param name="align_len" value="36" /> + <param name="mismatch" value="3" /> + <output name="output1" file="rmapq_wrapper_test1.bed"/> + </test> + </tests> + --> + <help> + +.. class:: warningmark + + RMAPQ was developed for **Solexa** reads. + +.. class:: infomark + +**TIP**. The tool will guess the length of the reads, however, if you select to trim the reads, the *Maximal Length of the Reads* must be between 20 and 64. Reads with lengths longer than the specified value will be trimmed at the 3'end. + +----- + +**What it does** + +This tool runs **rmapq** (for more information, please see the reference below), searching against a genome build with sequence qualities. + +----- + +**Parameters** + +- *Minimal High-quality Bases* (**-M**): the minimal length of the high quality score bases +- *Minimum Score for High-quality Base* (**-q**) : the minimal quality score +- *Minimal Length of a Hit* (**-h**) : the minimal length of an exact match or seed +- *Number of Mismatches Allowed* (**-m**) : the maximal number of mismatches allowed in an alignment +- *Read Length* (**-w**) : maximal length of the reads; reads longer than the threshold will be truncated at 3' end. + +----- + +**Reference** + + **RMAP** is developed by Dr. Andrew D Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/ + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rmapq_wrapper_test1.bed Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,8 @@ +phix 360 396 seq1 1 - +phix 4188 4224 seq2 1 + +phix 4908 4944 seq4 0 - +phix 2811 2847 seq5 2 + +phix 3847 3883 seq6 0 - +phix 91 127 seq7 0 + +phix 2302 2338 seq8 2 + +phix 2448 2484 seq9 0 +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rmapq_wrapper_test1.fasta Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,20 @@ +>seq1 +GACTCATGATTTCTTACCTATTAGTGGTTGAACATC +>seq2 +GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT +>seq3 +GTTGTCGATAGAACTTCATGTGCCTGTAAAACAAGT +>seq4 +ACCAACCAGAACGTGAAAAAGCGTCCTGCGTGTAGC +>seq5 +GTTTATGTTGGTTTCATGGTTTTGTCTAACTTTATC +>seq6 +GCTTTACCGTCTTTCCAGAAATTGTTCCAAGTATCG +>seq7 +GCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGC +>seq8 +GTTATAACGCCGAAGCGGTAAAAATTTTTATTTTTT +>seq9 +GTTCTCACTTCTGTTACTCCAGCTTCTTCGGCACCT +>seq10 +GTGGCCTGTTGATTCTAAAGGTTAGTTTCTTCACGC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rmapq_wrapper_test1.qual Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,10 @@ + -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 -15 15 -40 40 -40 -40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 -5 5 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 21 -40 -21 -40 40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 12 -40 -40 -12 -36 -40 36 -40 -40 -40 40 -40 -4 4 -40 -40 -40 -40 -40 40 -40 -40 14 -14 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -25 25 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 34 -40 -34 40 -40 -40 -40 -40 -40 -40 40 -40 -25 25 -40 -40 -40 -40 40 -37 -40 37 -40 -40 7 -40 -7 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 40 -40 -40 -40 38 -40 -40 -38 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 11 -16 -13 -22 -40 -40 40 -40 -40 -40 -40 40 + 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -33 33 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -25 25 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 27 -27 -5 5 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 40 -40 -40 -40 -37 37 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -25 25 40 -40 -40 -40 -40 -40 34 -34 -40 40 -40 -40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -2 2 -40 -40 35 -35 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 40 -40 -40 -40 -40 36 -40 -36 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 5 -5 -40 -28 -40 -16 -40 16 -40 40 -40 -40 + -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 28 -28 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 + -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 27 -27 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 14 -14 -40 40 -40 -40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -36 -40 -40 36 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 22 -22 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -6 6 -40 40 -40 -40 -40 -40 -40 40 3 -40 -40 -3 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -0 -40 -40 0 -40 -40 -40 40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -20 20 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 8 -40 -40 -8 -40 40 -40 -40 -40 -40 40 -40 -4 4 -40 -40
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/faseq.loc.sample Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,26 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use genome fasta sequence files. The faseq.loc file has this format +#(white space characters are TAB characters): +# +# <GenomeBuild> <dir> +# +# In the dir, each file is fasta format and contains only one sequence. So, +#for example, if you had hg18 fasta sequences stored in /depot/data2/galaxy/faseq/hg18, +#then your faseq.loc entry would look like this: +# +#hg18 /depot/data2/galaxy/faseq/hg18 +# +#and your /depot/data2/galaxy/faseq/hg18 directory would contain all of +#your fasta sequence files (e.g.): +# +#-rw-r--r-- 1 wychung galaxy 138082251 2008-04-16 11:57 chr10.fa +#-rw-r--r-- 1 wychung galaxy 115564 2008-04-16 11:57 chr10_random.fa +#-rw-r--r-- 1 wychung galaxy 137141451 2008-04-16 11:58 chr11.fa +#...etc... +#Your faseq.loc file should include an entry per line for each set of fasta +#sequence files you have stored. For example: +# +#hg18 /depot/data2/galaxy/faseq/hg18 +#mm9 /depot/data2/galaxy/faseq/mm9 +#Arabidopsis /depot/data2/galaxy/faseq/Arabidopsis +#...etc...
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="rmap" version="2.05"> + <repository changeset_revision="e0eed0cb34bf" name="package_rmap_2_05" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>