Mercurial > repos > juanperin > bwa_wrapper
changeset 0:fb4844b6a98e default tip
Migrated tool version 1.0.3 from old tool shed archive to new tool shed repository
author | juanperin |
---|---|
date | Tue, 07 Jun 2011 17:28:32 -0400 |
parents | |
children | |
files | bwa_long/bwa_wrapper.py bwa_long/bwa_wrapper.xml |
diffstat | 2 files changed, 583 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa_long/bwa_wrapper.py Tue Jun 07 17:28:32 2011 -0400 @@ -0,0 +1,273 @@ +#!/usr/bin/env python + +""" +Runs BWA on single-end or paired-end data. +Produces a SAM file containing the mappings. +Works with BWA version 0.5.3-0.5.5. + +usage: bwa_wrapper.py [options] + -t, --threads=t: The number of threads to use + -r, --ref=r: The reference genome to use or index + -f, --fastq=f: The (forward) fastq file to use for the mapping + -F, --rfastq=F: The reverse fastq file to use for mapping if paired-end data + -u, --output=u: The file to save the output (SAM format) + -g, --genAlignType=g: The type of pairing (single or paired) + -p, --params=p: Parameter setting to use (pre_set or full) + -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history) + -n, --maxEditDist=n: Maximum edit distance if integer + -m, --fracMissingAligns=m: Fraction of missing alignments given 2% uniform base error rate if fraction + -o, --maxGapOpens=o: Maximum number of gap opens + -e, --maxGapExtens=e: Maximum number of gap extensions + -d, --disallowLongDel=d: Disallow a long deletion within specified bps + -i, --disallowIndel=i: Disallow indel within specified bps + -l, --seed=l: Take the first specified subsequences + -k, --maxEditDistSeed=k: Maximum edit distance to the seed + -M, --mismatchPenalty=M: Mismatch penalty + -O, --gapOpenPenalty=O: Gap open penalty + -E, --gapExtensPenalty=E: Gap extension penalty + -R, --suboptAlign=R: Proceed with suboptimal alignments even if the top hit is a repeat + -N, --noIterSearch=N: Disable iterative search + -T, --outputTopN=T: Output top specified hits + -S, --maxInsertSize=S: Maximum insert size for a read pair to be considered mapped good + -P, --maxOccurPairing=P: Maximum occurrences of a read for pairings + -D, --dbkey=D: Dbkey for reference genome + -H, --suppressHeader=h: Suppress header +""" + +import optparse, os, shutil, subprocess, sys, tempfile + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' ) + parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' ) + parser.add_option( '-f', '--fastq', dest='fastq', help='The (forward) fastq file to use for the mapping' ) + parser.add_option( '-F', '--rfastq', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' ) + parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' ) + parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' ) + parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' ) + parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' ) + parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' ) + parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' ) + parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' ) + parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' ) + parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' ) + parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' ) + parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' ) + parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' ) + parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' ) + parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' ) + parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' ) + parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', help='Proceed with suboptimal alignments even if the top hit is a repeat' ) + parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' ) + parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Output top specified hits' ) + parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' ) + parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' ) + parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' ) + parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' ) + (options, args) = parser.parse_args() + # make temp directory for placement of indices + tmp_index_dir = tempfile.mkdtemp() + tmp_dir = tempfile.mkdtemp() + # index if necessary + if options.fileSource == 'history': + ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir ) + ref_file_name = ref_file.name + ref_file.close() + os.symlink( options.ref, ref_file_name ) + # determine which indexing algorithm to use, based on size + try: + size = os.stat( options.ref ).st_size + if size <= 2**30: + indexingAlg = 'is' + else: + indexingAlg = 'bwtsw' + except: + indexingAlg = 'is' + indexing_cmds = '-a %s' % indexingAlg + cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name ) + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + # clean up temp dirs + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + stop_err( 'Error indexing reference sequence. ' + str( e ) ) + else: + ref_file_name = options.ref + # set up aligning and generate aligning command options + if options.params == 'pre_set': + aligning_cmds = '-t %s' % options.threads + gen_alignment_cmds = '' + else: + if options.maxEditDist != '0': + editDist = options.maxEditDist + else: + editDist = options.fracMissingAligns + if options.seed != '-1': + seed = '-l %s' % options.seed + else: + seed = '' + if options.suboptAlign == 'true': + suboptAlign = '-R' + else: + suboptAlign = '' + if options.noIterSearch == 'true': + noIterSearch = '-N' + else: + noIterSearch = '' + aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s' % \ + ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel, + options.disallowIndel, seed, options.maxEditDistSeed, options.threads, + options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty, + suboptAlign, noIterSearch ) + if options.genAlignType == 'single': + gen_alignment_cmds = '-n %s' % options.outputTopN + elif options.genAlignType == 'paired': + gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing ) + else: + gen_alignment_cmds = '-n %s' % options.outputTopN + # set up output files + tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir ) + tmp_align_out_name = tmp_align_out.name + tmp_align_out.close() + tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir ) + tmp_align_out2_name = tmp_align_out2.name + tmp_align_out2.close() + # prepare actual aligning and generate aligning commands + cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.fastq, tmp_align_out_name ) + cmd2b = '' + if options.genAlignType == 'paired': + cmd2b = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.rfastq, tmp_align_out2_name ) + cmd3 = 'bwa sampe %s %s %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, tmp_align_out2_name, options.fastq, options.rfastq, options.output ) + elif options.genAlignType == 'single': + cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, options.fastq, options.output ) + else: + cmd2 = 'sleep 1' + cmd2b = 'sleep 1' + cmd3 = 'bwa bwasw %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, options.fastq, options.output ) + # perform alignments + buffsize = 1048576 + try: + # need to nest try-except in try-finally to handle 2.4 + try: + # align + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error aligning sequence. ' + str( e ) + # and again if paired data + try: + if cmd2b: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error aligning second sequence. ' + str( e ) + # generate align + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error generating alignments. ' + str( e ) + # remove header if necessary + if options.suppressHeader == 'true': + tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir) + tmp_out_name = tmp_out.name + tmp_out.close() + try: + shutil.move( options.output, tmp_out_name ) + except Exception, e: + raise Exception, 'Error moving output file before removing headers. ' + str( e ) + fout = file( options.output, 'w' ) + for line in file( tmp_out.name, 'r' ): + if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ): + fout.write( line ) + fout.close() + # check that there are results in the output file + if os.path.getsize( options.output ) > 0: + sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType ) + else: + raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.' + except Exception, e: + stop_err( 'The alignment failed.\n' + str( e ) ) + finally: + # clean up temp dir + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + +if __name__=="__main__": __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa_long/bwa_wrapper.xml Tue Jun 07 17:28:32 2011 -0400 @@ -0,0 +1,310 @@ +<tool id="bwa_wrapper" name="Map with BWA" version="1.0.3"> + <description></description> + <command interpreter="python">bwa_wrapper.py +--threads="4" +#if $genomeSource.refGenomeSource == "history": +--ref=$genomeSource.ownFile +#else: +--ref=$genomeSource.indices.value +#end if +--fastq=$paired.input1 +#if $paired.sPaired == "paired": +--rfastq=$paired.input2 +#else: +--rfastq="None" +#end if +--output=$output --genAlignType=$paired.sPaired --params=$params.source_select --fileSource=$genomeSource.refGenomeSource +#if $params.source_select == "pre_set": +--maxEditDist="None" --fracMissingAligns="None" --maxGapOpens="None" --maxGapExtens="None" --disallowLongDel="None" --disallowIndel="None" --seed="None" --maxEditDistSeed="None" --mismatchPenalty="None" --gapOpenPenalty="None" --gapExtensPenalty="None" --suboptAlign="None" --noIterSearch="None" --outputTopN="None" --maxInsertSize="None" --maxOccurPairing="None" +#else: +--maxEditDist=$params.maxEditDist --fracMissingAligns=$params.fracMissingAligns --maxGapOpens=$params.maxGapOpens --maxGapExtens=$params.maxGapExtens --disallowLongDel=$params.disallowLongDel --disallowIndel=$params.disallowIndel --seed=$params.seed --maxEditDistSeed=$params.maxEditDistSeed --mismatchPenalty=$params.mismatchPenalty --gapOpenPenalty=$params.gapOpenPenalty --gapExtensPenalty=$params.gapExtensPenalty --suboptAlign=$params.suboptAlign --noIterSearch=$params.noIterSearch --outputTopN=$params.outputTopN --maxInsertSize=$params.maxInsertSize --maxOccurPairing=$params.maxOccurPairing +#end if +#if $genomeSource.refGenomeSource == "history": +--dbkey=$dbkey +#else: +--dbkey="None" +#end if +--suppressHeader=$suppressHeader + </command> + <inputs> + <conditional name="genomeSource"> + <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="indices" type="select" label="Select a reference genome"> + <options from_file="bwa_index.loc"> + <column name="value" index="1" /> + <column name="name" index="0" /> + </options> + </param> + </when> + <when value="history"> + <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" /> + </when> + </conditional> + <conditional name="paired"> + <param name="sPaired" type="select" label="Is this library mate-paired?"> + <option value="single">Single-end</option> + <option value="paired">Paired-end</option> + <option value="longread">Long-reads</option> + </param> + <when value="single"> + <param name="input1" type="data" format="fastqsanger" label="FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/> + </when> + <when value="paired"> + <param name="input1" type="data" format="fastqsanger" label="Forward FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/> + <param name="input2" type="data" format="fastqsanger" label="Reverse FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/> + </when> + <when value="longread"> + <param name="input1" type="data" format="fastqsanger" label="FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/> + </when> + </conditional> + <conditional name="params"> + <param name="source_select" type="select" label="BWA settings to use" help="For most mapping needs use Commonly Used settings. If you want full control use Full Parameter List"> + <option value="pre_set">Commonly Used</option> + <option value="full">Full Parameter List</option> + </param> + <when value="pre_set" /> + <when value="full"> + <param name="maxEditDist" type="integer" value="0" label="Maximum edit distance (-n)" help="Enter this value OR a fraction of missing alignments, not both" /> + <param name="fracMissingAligns" type="float" value="0.04" label="Fraction of missing alignments given 2% uniform base error rate (-n)" help="Enter this value OR maximum edit distance, not both" /> + <param name="maxGapOpens" type="integer" value="1" label="Maximum number of gap opens (-o)" /> + <param name="maxGapExtens" type="integer" value="-1" label="Maximum number of gap extensions (-e)" help="-1 for k-difference mode (disallowing long gaps)" /> + <param name="disallowLongDel" type="integer" value="16" label="Disallow long deletion within [value] bp towards the 3'-end (-d)" /> + <param name="disallowIndel" type="integer" value="5" label="Disallow insertion/deletion within [value] bp towards the end (-i)" /> + <param name="seed" type="integer" value="-1" label="Number of first subsequences to take as seed (-l)" help="Enter -1 for infinity" /> + <param name="maxEditDistSeed" type="integer" value="2" label="Maximum edit distance in the seed (-k)" /> + <param name="mismatchPenalty" type="integer" value="3" label="Mismatch penalty (-M)" help="BWA will not search for suboptimal hits with a score lower than [value]" /> + <param name="gapOpenPenalty" type="integer" value="11" label="Gap open penalty (-O)" /> + <param name="gapExtensPenalty" type="integer" value="4" label="Gap extension penalty (-E)" /> + <param name="suboptAlign" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Proceed with suboptimal alignments even if the top hit is a repeat" help="By default, BWA only searches for suboptimal alignments if the top hit is unique. Using this option has no effect on accuracy for single-end reads. It is mainly designed for improving the alignment accuracy of paired-end reads. However, the pairing procedure will be slowed down, especially for very short reads (~32bp) (-R)" /> + <param name="noIterSearch" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Disable iterative search" help="All hits with no more than maxDiff differences will be found. This mode is much slower than the default (-N)" /> + <param name="outputTopN" type="integer" value="-1" label="Output top [value] hits" help="For single-end reads only. Enter -1 to disable outputting multiple hits. NOTE: If you put in a positive value here, your output will NOT be in SAM format (-n)" /> + <param name="maxInsertSize" type="integer" value="500" label="Maximum insert size for a read pair to be considered as being mapped properly" help="For paired-end reads only. Only used when there are not enough good alignments to infer the distribution of insert sizes (-a)" /> + <param name="maxOccurPairing" type="integer" value="100000" label="Maximum occurrences of a read for pairing" help="For paired-end reads only. A read with more occurrences will be treated as a single-end read. Reducing this parameter helps faster pairing (-o)" /> + </when> + </conditional> + <param name="suppressHeader" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Suppress the header in the output SAM file" help="BWA produces SAM with several lines of header information" /> + </inputs> + <outputs> + <data format="sam" name="output" /> + </outputs> + <tests> + <test> + <!-- + BWA commands: + bwa aln -t 4 phiX test-data/bwa_wrapper_in1.fastq > bwa_wrapper_out1.sai + bwa samse phiX bwa_wrapper_out1.sai test-data/bwa_wrapper_in1.fastq >> bwa_wrapper_out1.sam + phiX.fasta is the prefix for the reference + remove the comment lines (beginning with '@') from the resulting sam file + note that 'phiX' should be 'PHIX174' to match what's in the indexed file + --> + <param name="refGenomeSource" value="indexed" /> + <param name="indices" value="phiX" /> + <param name="sPaired" value="single" /> + <param name="input1" value="bwa_wrapper_in1.fastq" ftype="fastqsanger" /> + <param name="source_select" value="pre_set" /> + <param name="suppressHeader" value="true" /> + <output name="output" file="bwa_wrapper_out1.sam" ftype="sam" sort="true" /> + </test> + <test> + <!-- + BWA commands: + cp test-data/phiX.fasta phiX.fasta + bwa index -a is phiX.fasta + bwa aln -n 0.04 -o 1 -e -1 -d 16 -i 5 -k 2 -t 4 -M 3 -O 11 -E 4 -R -N phiX.fasta test-data/bwa_wrapper_in1.fastq > bwa_wrapper_out2.sai + bwa samse phiX.fasta bwa_wrapper_out2.sai test-data/bwa_wrapper_in1.fastq >> bwa_wrapper_out2.sam + phiX.fasta is the prefix for the reference + remove the comment lines (beginning with '@') from the resulting sam file + --> + <param name="refGenomeSource" value="history" /> + <param name="ownFile" value="phiX.fasta" /> + <param name="sPaired" value="single" /> + <param name="input1" value="bwa_wrapper_in1.fastq" ftype="fastqsanger" /> + <param name="source_select" value="full" /> + <param name="maxEditDist" value="0" /> + <param name="fracMissingAligns" value="0.04" /> + <param name="maxGapOpens" value="1" /> + <param name="maxGapExtens" value="-1" /> + <param name="disallowLongDel" value="16" /> + <param name="disallowIndel" value="5" /> + <param name="seed" value="-1" /> + <param name="maxEditDistSeed" value="2" /> + <param name="mismatchPenalty" value="3" /> + <param name="gapOpenPenalty" value="11" /> + <param name="gapExtensPenalty" value="4" /> + <param name="suboptAlign" value="true" /> + <param name="noIterSearch" value="true" /> + <param name="outputTopN" value="-1" /> + <param name="maxInsertSize" value="500" /> + <param name="maxOccurPairing" value="100000" /> + <param name="suppressHeader" value="true" /> + <output name="output" file="bwa_wrapper_out2.sam" ftype="sam" sort="true" /> + </test> + <test> + <!-- + BWA commands: + bwa aln -n 0.04 -o 1 -e -1 -d 16 -i 5 -k 2 -t 4 -M 3 -O 11 -E 4 -R -N phiX.fasta test-data/bwa_wrapper_in2.fastq > bwa_wrapper_out3a.sai + bwa aln -n 0.04 -o 1 -e -1 -d 16 -i 5 -k 2 -t 4 -M 3 -O 11 -E 4 -R -N phiX.fasta test-data/bwa_wrapper_in3.fastq > bwa_wrapper_out3b.sai + bwa sampe -a 500 -o 100000 phiX.fasta bwa_wrapper_out3a.sai bwa_wrapper_out3b.sai test-data/bwa_wrapper_in2.fastq test-data/bwa_wrapper_in3.fastq >> bwa_wrapper_out3.sam + phiX.fasta is the prefix for the reference + remove the comment lines (beginning with '@') from the resulting sam file + note that 'phiX' should be 'PHIX174' to match what's in the indexed file + --> + <param name="refGenomeSource" value="indexed" /> + <param name="indices" value="phiX" /> + <param name="sPaired" value="paired" /> + <param name="input1" value="bwa_wrapper_in2.fastq" ftype="fastqsanger" /> + <param name="input2" value="bwa_wrapper_in3.fastq" ftype="fastqsanger" /> + <param name="source_select" value="full" /> + <param name="maxEditDist" value="0" /> + <param name="fracMissingAligns" value="0.04" /> + <param name="maxGapOpens" value="1" /> + <param name="maxGapExtens" value="-1" /> + <param name="disallowLongDel" value="16" /> + <param name="disallowIndel" value="5" /> + <param name="seed" value="-1" /> + <param name="maxEditDistSeed" value="2" /> + <param name="mismatchPenalty" value="3" /> + <param name="gapOpenPenalty" value="11" /> + <param name="gapExtensPenalty" value="4" /> + <param name="suboptAlign" value="true" /> + <param name="noIterSearch" value="true" /> + <param name="outputTopN" value="-1" /> + <param name="maxInsertSize" value="500" /> + <param name="maxOccurPairing" value="100000" /> + <param name="suppressHeader" value="true" /> + <output name="output" file="bwa_wrapper_out3.sam" ftype="sam" sort="true" /> + </test> + </tests> + <help> + +**What it does** + +BWA is a fast light-weighted tool that aligns relatively short sequences (queries) to a sequence database (large), such as the human reference genome. It is developed by Heng Li at the Sanger Insitute. Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-60. + +------ + +**Know what you are doing** + +.. class:: warningmark + +There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. + + .. __: http://bio-bwa.sourceforge.net/ + +------ + +**Input formats** + +BWA accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files. + +------ + +**Outputs** + +The output is in SAM format, and has the following columns:: + + Column Description + -------- -------------------------------------------------------- + 1 QNAME Query (pair) NAME + 2 FLAG bitwise FLAG + 3 RNAME Reference sequence NAME + 4 POS 1-based leftmost POSition/coordinate of clipped sequence + 5 MAPQ MAPping Quality (Phred-scaled) + 6 CIGAR extended CIGAR string + 7 MRNM Mate Reference sequence NaMe ('=' if same as RNAME) + 8 MPOS 1-based Mate POSition + 9 ISIZE Inferred insert SIZE + 10 SEQ query SEQuence on the same strand as the reference + 11 QUAL query QUALity (ASCII-33 gives the Phred base quality) + 12 OPT variable OPTional fields in the format TAG:VTYPE:VALU + +The flags are as follows:: + + Flag Description + ------ ------------------------------------- + 0x0001 the read is paired in sequencing + 0x0002 the read is mapped in a proper pair + 0x0004 the query sequence itself is unmapped + 0x0008 the mate is unmapped + 0x0010 strand of the query (1 for reverse) + 0x0020 strand of the mate + 0x0040 the read is the first read in a pair + 0x0080 the read is the second read in a pair + 0x0100 the alignment is not primary + +It looks like this (scroll sideways to see the entire example):: + + QNAME FLAG RNAME POS MAPQ CIAGR MRNM MPOS ISIZE SEQ QUAL OPT + HWI-EAS91_1_30788AAXX:1:1:1761:343 4 * 0 0 * * 0 0 AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh + HWI-EAS91_1_30788AAXX:1:1:1578:331 4 * 0 0 * * 0 0 GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh + +------- + +**BWA settings** + +All of the options have a default value. You can change any of them. All of the options in BWA have been implemented here. + +------ + +**BWA parameter list** + +This is an exhaustive list of BWA options: + +For **aln**:: + + -n NUM Maximum edit distance if the value is INT, or the fraction of missing + alignments given 2% uniform base error rate if FLOAT. In the latter + case, the maximum edit distance is automatically chosen for different + read lengths. [0.04] + -o INT Maximum number of gap opens [1] + -e INT Maximum number of gap extensions, -1 for k-difference mode + (disallowing long gaps) [-1] + -d INT Disallow a long deletion within INT bp towards the 3'-end [16] + -i INT Disallow an indel within INT bp towards the ends [5] + -l INT Take the first INT subsequence as seed. If INT is larger than the + query sequence, seeding will be disabled. For long reads, this option + is typically ranged from 25 to 35 for '-k 2'. [inf] + -k INT Maximum edit distance in the seed [2] + -t INT Number of threads (multi-threading mode) [1] + -M INT Mismatch penalty. BWA will not search for suboptimal hits with a score + lower than (bestScore-misMsc). [3] + -O INT Gap open penalty [11] + -E INT Gap extension penalty [4] + -c Reverse query but not complement it, which is required for alignment + in the color space. + -R Proceed with suboptimal alignments even if the top hit is a repeat. By + default, BWA only searches for suboptimal alignments if the top hit is + unique. Using this option has no effect on accuracy for single-end + reads. It is mainly designed for improving the alignment accuracy of + paired-end reads. However, the pairing procedure will be slowed down, + especially for very short reads (~32bp). + -N Disable iterative search. All hits with no more than maxDiff + differences will be found. This mode is much slower than the default. + +For **samse**:: + + -n INT Output up to INT top hits. Value -1 to disable outputting multiple + hits. NOTE: Entering a value other than -1 will result in output that + is not in SAM format, and therefore not usable further down the + pipeline. Check the BWA documentation for details on the format of + the output. [-1] + +For **sampe**:: + + -a INT Maximum insert size for a read pair to be considered as being mapped + properly. Since version 0.4.5, this option is only used when there + are not enough good alignment to infer the distribution of insert + sizes. [500] + -o INT Maximum occurrences of a read for pairing. A read with more + occurrences will be treated as a single-end read. Reducing this + parameter helps faster pairing. [100000] + + </help> + <code file="bwa_wrapper_code.py" /> +</tool> + +