# HG changeset patch
# User devteam
# Date 1400517262 14400
# Node ID f6e5bb5aa2f574f21d55e6f8a59e0cdfaa754e17
Imported from capsule None

diff -r 000000000000 -r f6e5bb5aa2f5 rmapq_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rmapq_wrapper.py Mon May 19 12:34:22 2014 -0400
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+"""
+Galaxy wrapper for rmapq (RMAP short-read mapping with quality scores).
+
+Validates the parameters, runs rmapq once for every FASTA file found in the
+target genome directory and appends each run's hits to one output file.
+
+usage: rmapq_wrapper.py target_dir reads.fasta reads.qual min_score
+       min_hq_bases read_len min_hit_len mismatches output
+"""
+
+import os, shutil, subprocess, sys, tempfile
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    """Write msg to stderr and exit with a non-zero status."""
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit( 1 )
+
+
+def check_number( value, convert, error_msg ):
+    """Return convert( value ); abort with error_msg if value is not numeric."""
+    try:
+        return convert( value )
+    except ValueError:
+        stop_err( error_msg )
+
+
+def guess_read_length( fasta_path ):
+    """Return the length of the first sequence in fasta_path (0 if there is none)."""
+    length = 0
+    handle = open( fasta_path )
+    try:
+        for line in handle:
+            line = line.rstrip( '\r\n' )
+            if line.startswith( '>' ):
+                # A header seen after some sequence data ends the first record.
+                if length:
+                    break
+            else:
+                length += len( line )
+    finally:
+        handle.close()
+    # Reaching the end of the file also ends the record (single-read input).
+    return length
+
+
+def append_file( source_path, dest_path ):
+    """Append the content of source_path to dest_path."""
+    source = open( source_path, 'rb' )
+    try:
+        dest = open( dest_path, 'ab' )
+        try:
+            shutil.copyfileobj( source, dest )
+        finally:
+            dest.close()
+    finally:
+        source.close()
+
+
+def __main__():
+    """Validate the arguments, run rmapq per genome file and collect the hits."""
+    if len( sys.argv ) < 10:
+        stop_err( 'Expected 9 arguments, got %d.' % ( len( sys.argv ) - 1 ) )
+
+    # I/O
+    target_path = sys.argv[1]
+    infile = sys.argv[2]
+    scorefile = sys.argv[3]
+    high_score = sys.argv[4]    # -q
+    high_len = sys.argv[5]      # -M
+    read_len = sys.argv[6]      # -w
+    align_len = sys.argv[7]     # -h
+    mismatch = sys.argv[8]      # -m
+    output_file = sys.argv[9]
+
+    check_number( high_score, float, 'Invalid value for minimal quality score.' )
+    check_number( high_len, int, 'Invalid value for minimal high quality bases.' )
+
+    # A read length of 0 means "no trimming": use the length of the first read.
+    read_len_msg = 'Invalid value for read length. Must be between 20 and 64.'
+    requested_len = check_number( read_len, int, read_len_msg )
+    if requested_len == 0:
+        try:
+            guessed_len = guess_read_length( infile )
+        except ( IOError, OSError ):
+            stop_err( 'Unable to read %s: %s' % ( infile, sys.exc_info()[1] ) )
+        if guessed_len == 0:
+            stop_err( 'Unable to guess the read length: no sequence found in %s.' % infile )
+        read_len = str( guessed_len )
+    elif requested_len < 20 or requested_len > 64:
+        stop_err( read_len_msg )
+
+    check_number( align_len, int, 'Invalid value for minimal length of a hit.' )
+    check_number( mismatch, int, 'Invalid value for mismatch numbers in an alignment.' )
+
+    # check target genome
+    all_files = []
+    if os.path.isdir( target_path ):
+        # sorted() keeps the order of the hits reproducible between runs
+        for name in sorted( os.listdir( target_path ) ):
+            all_files.append( os.path.normpath( os.path.join( target_path, name ) ) )
+    if not all_files:
+        stop_err( "No sequences for %s are available for search, please report this error." % ( target_path ) )
+
+    for detail_file_path in all_files:
+        # mkstemp() creates the file and leaves it in place until it is removed
+        # below; NamedTemporaryFile().name only yields the path of a file that
+        # is deleted again as soon as the object is garbage collected.
+        fd, output_tempfile = tempfile.mkstemp()
+        os.close( fd )
+        try:
+            # An argument list (no shell) keeps odd characters in paths harmless.
+            command = [ 'rmapq', '-q', high_score, '-M', high_len, '-h', align_len,
+                        '-w', read_len, '-m', mismatch, '-Q', scorefile,
+                        '-c', detail_file_path, infile, '-o', output_tempfile ]
+            sys.stdout.flush()
+            try:
+                # stderr is sent to stdout, as the former "2>&1" did
+                status = subprocess.call( command, stdout=sys.stdout, stderr=subprocess.STDOUT )
+            except OSError:
+                stop_err( 'Unable to run rmapq: %s' % sys.exc_info()[1] )
+            if status != 0:
+                stop_err( 'rmapq exited with status %s for %s.' % ( status, detail_file_path ) )
+            try:
+                append_file( output_tempfile, output_file )
+            except ( IOError, OSError ):
+                stop_err( 'Unable to write %s: %s' % ( output_file, sys.exc_info()[1] ) )
+        finally:
+            try:
+                os.remove( output_tempfile )
+            except OSError:
+                pass
+
+
+if __name__ == '__main__': __main__()
diff -r 000000000000 -r f6e5bb5aa2f5 rmapq_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rmapq_wrapper.xml Mon May 19 12:34:22 2014 -0400
@@ -0,0 +1,94 @@
+
+ for Solexa Short Reads Alignment with Quality Scores
+
+ rmap
+
+
+ #if $trim.choice=="No":
+ rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len 0 $align_len $mismatch $output1
+ #else:
+ rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len $trim.read_len $align_len $mismatch $output1
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: warningmark
+
+ RMAPQ was developed for **Solexa** reads.
+
+.. class:: infomark
+
+**TIP**. The tool will guess the length of the reads, however, if you select to trim the reads, the *Maximal Length of the Reads* must be between 20 and 64. Reads with lengths longer than the specified value will be trimmed at the 3'end.
+
+-----
+
+**What it does**
+
+This tool runs **rmapq** (for more information, please see the reference below), searching against a genome build with sequence qualities.
+ +----- + +**Parameters** + +- *Minimal High-quality Bases* (**-M**): the minimal length of the high quality score bases +- *Minimum Score for High-quality Base* (**-q**) : the minimal quality score +- *Minimal Length of a Hit* (**-h**) : the minimal length of an exact match or seed +- *Number of Mismatches Allowed* (**-m**) : the maximal number of mismatches allowed in an alignment +- *Read Length* (**-w**) : maximal length of the reads; reads longer than the threshold will be truncated at 3' end. + +----- + +**Reference** + + **RMAP** is developed by Dr. Andrew D Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/ + + + diff -r 000000000000 -r f6e5bb5aa2f5 test-data/rmapq_wrapper_test1.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rmapq_wrapper_test1.bed Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,8 @@ +phix 360 396 seq1 1 - +phix 4188 4224 seq2 1 + +phix 4908 4944 seq4 0 - +phix 2811 2847 seq5 2 + +phix 3847 3883 seq6 0 - +phix 91 127 seq7 0 + +phix 2302 2338 seq8 2 + +phix 2448 2484 seq9 0 + diff -r 000000000000 -r f6e5bb5aa2f5 test-data/rmapq_wrapper_test1.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rmapq_wrapper_test1.fasta Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,20 @@ +>seq1 +GACTCATGATTTCTTACCTATTAGTGGTTGAACATC +>seq2 +GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT +>seq3 +GTTGTCGATAGAACTTCATGTGCCTGTAAAACAAGT +>seq4 +ACCAACCAGAACGTGAAAAAGCGTCCTGCGTGTAGC +>seq5 +GTTTATGTTGGTTTCATGGTTTTGTCTAACTTTATC +>seq6 +GCTTTACCGTCTTTCCAGAAATTGTTCCAAGTATCG +>seq7 +GCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGC +>seq8 +GTTATAACGCCGAAGCGGTAAAAATTTTTATTTTTT +>seq9 +GTTCTCACTTCTGTTACTCCAGCTTCTTCGGCACCT +>seq10 +GTGGCCTGTTGATTCTAAAGGTTAGTTTCTTCACGC diff -r 000000000000 -r f6e5bb5aa2f5 test-data/rmapq_wrapper_test1.qual --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rmapq_wrapper_test1.qual Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,10 @@ + -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 
-40 40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 -15 15 -40 40 -40 -40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 -5 5 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 21 -40 -21 -40 40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 12 -40 -40 -12 -36 -40 36 -40 -40 -40 40 -40 -4 4 -40 -40 -40 -40 -40 40 -40 -40 14 -14 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -25 25 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 34 -40 -34 40 -40 -40 -40 -40 -40 -40 40 -40 -25 25 -40 -40 -40 -40 40 -37 -40 37 -40 -40 7 -40 -7 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 40 -40 -40 -40 38 -40 -40 -38 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 11 -16 -13 -22 -40 -40 40 -40 -40 -40 -40 40 + 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -33 33 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -25 25 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 27 -27 -5 5 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 40 -40 -40 
-40 -37 37 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -25 25 40 -40 -40 -40 -40 -40 34 -34 -40 40 -40 -40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -2 2 -40 -40 35 -35 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 40 -40 -40 -40 -40 36 -40 -36 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 5 -5 -40 -28 -40 -16 -40 16 -40 40 -40 -40 + -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 28 -28 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 + -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 27 -27 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 14 -14 -40 40 -40 -40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 -40 -40 40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 
-40 -40 -40 40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -36 -40 -40 36 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 40 -40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 22 -22 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 + -40 -40 40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 40 -40 40 -40 -40 -40 -40 -40 -40 40 -40 -40 -6 6 -40 40 -40 -40 -40 -40 -40 40 3 -40 -40 -3 40 -40 -40 -40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -0 -40 -40 0 -40 -40 -40 40 40 -40 -40 -40 -40 -40 40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 -40 -40 40 -20 20 -40 -40 -40 -40 -40 40 -40 -40 -40 40 -40 40 -40 -40 8 -40 -40 -8 -40 40 -40 -40 -40 -40 40 -40 -4 4 -40 -40 diff -r 000000000000 -r f6e5bb5aa2f5 tool-data/faseq.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/faseq.loc.sample Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,26 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use genome fasta sequence files. The faseq.loc file has this format +#(white space characters are TAB characters): +# +# +# +# In the dir, each file is fasta format and contains only one sequence. 
So, +#for example, if you had hg18 fasta sequences stored in /depot/data2/galaxy/faseq/hg18, +#then your faseq.loc entry would look like this: +# +#hg18 /depot/data2/galaxy/faseq/hg18 +# +#and your /depot/data2/galaxy/faseq/hg18 directory would contain all of +#your fasta sequence files (e.g.): +# +#-rw-r--r-- 1 wychung galaxy 138082251 2008-04-16 11:57 chr10.fa +#-rw-r--r-- 1 wychung galaxy 115564 2008-04-16 11:57 chr10_random.fa +#-rw-r--r-- 1 wychung galaxy 137141451 2008-04-16 11:58 chr11.fa +#...etc... +#Your faseq.loc file should include an entry per line for each set of fasta +#sequence files you have stored. For example: +# +#hg18 /depot/data2/galaxy/faseq/hg18 +#mm9 /depot/data2/galaxy/faseq/mm9 +#Arabidopsis /depot/data2/galaxy/faseq/Arabidopsis +#...etc... diff -r 000000000000 -r f6e5bb5aa2f5 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon May 19 12:34:22 2014 -0400 @@ -0,0 +1,6 @@ + + + + + +