# HG changeset patch # User nilshomer # Date 1307482987 14400 # Node ID 9d60d2fce247f83bcc2e89f9b63ad2ac7b974925 Migrated tool version 0.1.1 from old tool shed archive to new tool shed repository diff -r 000000000000 -r 9d60d2fce247 srma_ref.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srma_ref.loc Tue Jun 07 17:43:07 2011 -0400 @@ -0,0 +1,25 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of SRMA indexed sequences data files. You will need +#to create these data files and then create a srma_ref.loc file +#similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The srma_ref.loc +#file has this format (white space characters are TAB characters): +# +# +# +#So, for example, if you had hg18 indexed stored in +#/depot/data2/galaxy/srma/hg18/, +#then the srma_ref.loc entry would look like this: +# +#hg18 /depot/data2/galaxy/srma/hg18/hg18.fa +# +#and your /depot/data2/galaxy/srma/hg18/ directory +#would contain hg18.fa.*.brg and hg18.fa.*.bif files: +#hg18.fa +#hg18.fa.dict +#...etc... +# +#The dictionary file for each reference (ex. hg18.fa.dict) must be +#created with Picard (http://picard.sourceforge.net). +# +20079 /Users/nhomer/hg/galaxy-central/srma/DH10B.fa diff -r 000000000000 -r 9d60d2fce247 srma_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srma_wrapper.py Tue Jun 07 17:43:07 2011 -0400 @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +""" +Runs SRMA on a SAM/BAM file; +TODO: more documentation + +usage: srma_wrapper.py [options] + -r, --ref=r: The reference genome to use or index + -i, --input=i: The SAM/BAM input file + -o, --output=o: The SAM/BAM output file + -O, --offset=O: The alignment offset + -Q, --minMappingQuality=Q: The minimum mapping quality + -P, --minAlleleProbability=P: The minimum allele probability conditioned on coverage (for the binomial quantile). + -C, --minAlleleCoverage=C: The minimum haploid coverage for the consensus. Default value: 3. This option can be set + -R, --range=R: A range to examine + -c, --correctBases=c: Correct bases + -q, --useSequenceQualities=q: Use sequence qualities + -M, --maxHeapSize=M: The maximum number of nodes on the heap before re-alignment is ignored + -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history) + -p, --params=p: Parameter setting to use (pre_set or full) + -D, --dbkey=D: Dbkey for reference genome +""" + +import optparse, os, shutil, subprocess, sys, tempfile + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' ) + parser.add_option( '-i', '--input', dest='input', help='The SAM/BAM input file' ) + parser.add_option( '-o', '--output', dest='output', help='The SAM/BAM output file' ) + parser.add_option( '-O', '--offset', dest='offset', help='The alignment offset' ) + parser.add_option( '-Q', '--minMappingQuality', dest='minMappingQuality', help='The minimum mapping quality' ) + parser.add_option( '-P', '--minAlleleProbability', dest='minAlleleProbability', help='The minimum allele probability conditioned on coverage (for the binomial quantile).' ) + parser.add_option( '-C', '--minAlleleCoverage', dest='minAlleleCoverage', help='The minimum haploid coverage for the consensus' ) + parser.add_option( '-R', '--range', dest='range', help='A range to examine' ) + parser.add_option( '-c', '--correctBases', dest='correctBases', help='Correct bases ' ) + parser.add_option( '-q', '--useSequenceQualities', dest='useSequenceQualities', help='Use sequence qualities ' ) + parser.add_option( '-M', '--maxHeapSize', dest='maxHeapSize', help='The maximum number of nodes on the heap before re-alignment is ignored' ) + parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one from history (indexed or history)' ) + parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' ) + parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' ) + (options, args) = parser.parse_args() + + # make temp directory for bfast + tmp_dir = '%s/' % tempfile.mkdtemp() + + # assume indexing has already been done + if options.fileSource == 'history': + stop_err( 'Error: indexing not implemented' ) + + # set up aligning and generate aligning command options + if options.params == 'pre_set': + srma_cmds = '' + else: + if options.useSequenceQualities == 'true': + useSequenceQualities = 'true' + else: + useSequenceQualities = 'false' + ranges = 'null' + if options.range == 'None': + range = 'null' + else: + range = options.range + + srma_cmds = "OFFSET=%s MIN_MAPQ=%s MINIMUM_ALLELE_PROBABILITY=%s MINIMUM_ALLELE_COVERAGE=%s RANGES=%s RANGE=%s CORRECT_BASES=%s USE_SEQUENCE_QUALITIES=%s MAX_HEAP_SIZE=%s" % ( options.offset, options.minMappingQuality, options.minAlleleProbability, options.minAlleleCoverage, ranges, range, options.correctBases, options.useSequenceQualities, options.maxHeapSize ) + + # how do we call a JAR file? + cmd = 'java -jar /Users/nhomer/srma/build/jar/srma.jar I=%s O=%s R=%s %s' % ( options.input, options.output, options.ref, srma_cmds ) + + # perform alignments + buffsize = 1048576 + try: + # need to nest try-except in try-finally to handle 2.4 + try: + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error executing SRMA. ' + str( e ) + # check that there are results in the output file + if os.path.getsize( options.output ) > 0: + if "0" == options.space: + sys.stdout.write( 'BFAST run on Base Space data' ) + else: + sys.stdout.write( 'BFAST run on Color Space data' ) + else: + raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.' + except Exception, e: + stop_err( 'The alignment failed.\n' + str( e ) ) + finally: + # clean up temp dir + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + +if __name__=="__main__": __main__() diff -r 000000000000 -r 9d60d2fce247 srma_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srma_wrapper.xml Tue Jun 07 17:43:07 2011 -0400 @@ -0,0 +1,150 @@ + + + + srma_wrapper.py + + #if $refGenomeSource.refGenomeSource == "history": + --ref=$refGenomeSource.ownFile + #else: + --ref=$refGenomeSource.ref.value + #end if + --input=$input --output=$output + --params=$params.source_select --fileSource=$refGenomeSource.refGenomeSource + #if $params.source_select == "pre_set": + --offset="None", --minMappingQuality="None", --minAlleleProbability="None", --minAlleleCoverage="None", --range="None", --correctBases="None", --useSequenceQualities="None", --maxHeapSize="None", --fileSource="None", --params="None", --dbkey="None", + #else: + --offset=$params.offset, --minMappingQuality=$params.minMappingQuality, --minAlleleProbability=$params.minAlleleProbability, --minAlleleCoverage=$params.minAlleleCoverage, --range=$params.range, --correctBases=$params.correctBases, --useSequenceQualities=$params.useSequenceQualities, --maxHeapSize=$params.maxHeapSize, --fileSource=$params.fileSource, --params=$params.params, --dbkey=$params.dbkey, #end if + #if $refGenomeSource.refGenomeSource == "history": + --dbkey=$dbkey + #else: + --dbkey="None" + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +SRMA is a short read micro re-aligner for next-generation high throughput sequencing data. + +Sequence alignment algorithms examine each read independently. When indels occur towards the ends of reads, the alignment can lead to false SNPs as well as improperly placed indels. This tool aims to perform a re-alignment of each read to a graphical representation of all alignments within a local region to provide a better overall base-resolution consensus. + +Currently this tool works well with and has been tested on 30x diploid coverage genome sequencing data from Illumina and ABI SOLiD technology. This tool may not work well with 454 data, as indels are a significant error mode for 454 data. + +------ + +Please cite the website "http://srma.sourceforge.net" as well as: + +Homer N, and Nelson SF. SRMA: short read micro re-aligner. 2010. + +------ + +**Know what you are doing** + +.. class:: warningmark + +There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. + +.. __: http://srma.sourceforge.net/ + +------ + +**Input formats** + +SRMA accepts a BAM input file. + +------ + +**Outputs** + +The output is in BAM format, see http://samtools.sourceforge.net for more details. + +------- + +**SRMA settings** + +All of the options have a default value. You can change any of them. Most of the options in SRMA have been implemented here. + +------ + +**SRMA parameter list** + +This is an exhaustive list of SRMA options: + +For **SRMA**:: + + INPUT=File + I=File The input SAM or BAM file. Required. + + OUTPUT=File + O=File The output SAM or BAM file. Default value: null. + + REFERENCE=File + R=File The reference FASTA file. Required. + + OFFSET=Integer The alignment offset. Default value: 20. This option can be set to 'null' to clear the + default value. + + MIN_MAPQ=Integer The minimum mapping quality. Default value: 0. This option can be set to 'null' to clear + the default value. + + MINIMUM_ALLELE_PROBABILITY=Double + The minimum allele probability conditioned on coverage (for the binomial quantile). + Default value: 0.1. This option can be set to 'null' to clear the default value. + + MINIMUM_ALLELE_COVERAGE=Integer + The minimum haploid coverage for the consensus. Default value: 3. This option can be set + to 'null' to clear the default value. + + RANGE=String A range to examine. Default value: null. + + CORRECT_BASES=Boolean Correct bases. Default value: false. This option can be set to 'null' to clear the + default value. Possible values: {true, false} + + USE_SEQUENCE_QUALITIES=BooleanUse sequence qualities Default value: true. This option can be set to 'null' to clear the + default value. Possible values: {true, false} + + MAX_HEAP_SIZE=Integer The maximum number of nodes on the heap before re-alignment is ignored Default value: + 8192. This option can be set to 'null' to clear the default value. + + + + diff -r 000000000000 -r 9d60d2fce247 srma_wrapper_code.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srma_wrapper_code.py Tue Jun 07 17:43:07 2011 -0400 @@ -0,0 +1,11 @@ +import os + +def exec_before_job(app, inp_data, out_data, param_dict, tool): + try: + refFile = param_dict[ 'refGenomeSource' ][ 'index' ].value + except: + try: + refFile = param_dict[ 'refGenomeSource' ][ 'ownFile' ].dbkey + except: + refFile = '?' + dbkey = os.path.split( refFile )[1].split( '.' )[0]