Mercurial > repos > iuc > hisat2
view hisat2.xml @ 15:d5fe9aead222 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hisat2 commit 48012d1879395ecf1b4e6cd962f325c372164a33
author | iuc |
---|---|
date | Tue, 26 Sep 2017 17:01:28 -0400 |
parents | 526b91fbde60 |
children | 21e8ea8e1adb |
line wrap: on
line source
<tool id="hisat2" name="HISAT2" version="2.1.0" profile="17.01">
    <description>A fast and sensitive alignment program</description>
    <macros>
        <import>hisat2_macros.xml</import>
    </macros>
    <requirements>
        <requirement type="package" version="2.1.0">hisat2</requirement>
        <requirement type="package" version="1.4">samtools</requirement>
    </requirements>
    <stdio>
        <regex level="fatal" match="hisat2-align exited with value 1" source="both" />
        <regex level="fatal" match="hisat2: not found" source="both" />
        <exit_code range="1:" />
    </stdio>
    <version_command>hisat2 --version</version_command>
    <!--
        Cheetah command template. Steps, in order:
          1. build or locate the HISAT2 index,
          2. optionally extract known splice sites from a GTF,
          3. symlink the reads under names whose extension tells HISAT2 the format,
          4. run hisat2 with the selected options and pipe the SAM stream into samtools sort,
          5. rename the per-mate files written for paired aligned/unaligned read outputs.
        Every Cheetah directive must stay on its own line: a double-hash comment swallows
        the remainder of the line it starts on.
    -->
    <command><![CDATA[
        ## Prepare HISAT2 index
        #if $reference_genome.source == "history":
            ln -s '$reference_genome.history_item' genome.fa &&
            hisat2-build -p \${GALAXY_SLOTS:-1} genome.fa genome &&
            #set index_path = 'genome'
        #else:
            #set index_path = $reference_genome.index.fields.path
        #end if

        ## If using known splice sites
        #if str($adv.spliced_options.spliced_options_selector) == "advanced" and str($adv.spliced_options.known_splice_gtf) != 'None':
            ln -s '${adv.spliced_options.known_splice_gtf}' splice_sites.gtf &&
            hisat2_extract_splice_sites.py splice_sites.gtf > splice_sites.txt &&
        #end if

        ## Link in the input files, so HISAT2 can tell their type
        ## 'compressed' records the compression of the inputs (it selects the
        ## matching --un/--al variant below); 'reads_are_fastq' is decided by
        ## the first/forward read only.
        #set compressed = "False"
        #set reads_are_fastq = True

        #if str($library.type) == 'paired':
            #if $library.input_1.is_of_type("fastq.gz", "fastqsanger.gz"):
                #set read1 = "input_f.fastq.gz"
                #set compressed = "GZ"
            #else if $library.input_1.is_of_type("fastq.bz2", "fastqsanger.bz2"):
                #set read1 = "input_f.fastq.bz2"
                #set compressed = "BZ2"
            #else if $library.input_1.is_of_type('fasta'):
                #set reads_are_fastq = False
                #set read1 = "input_f.fasta"
            #else:
                #set read1 = "input_f.fastq"
            #end if
            ln -f -s '${library.input_1}' ${read1} &&

            #if $library.input_2.is_of_type("fastq.gz", "fastqsanger.gz"):
                #set read2 = "input_r.fastq.gz"
                #set compressed = "GZ"
            #else if $library.input_2.is_of_type("fastq.bz2", "fastqsanger.bz2"):
                #set read2 = "input_r.fastq.bz2"
                #set compressed = "BZ2"
            #else if $library.input_2.is_of_type('fasta'):
                #set read2 = "input_r.fasta"
            #else:
                #set read2 = "input_r.fastq"
            #end if
            ln -f -s '${library.input_2}' ${read2} &&

        #else if str($library.type) == 'paired_collection':
            #if $library.input_1.forward.is_of_type("fastq.gz", "fastqsanger.gz"):
                #set read1 = "input_f.fastq.gz"
                #set compressed = "GZ"
            #else if $library.input_1.forward.is_of_type("fastq.bz2", "fastqsanger.bz2"):
                #set read1 = "input_f.fastq.bz2"
                #set compressed = "BZ2"
            #else if $library.input_1.forward.is_of_type('fasta'):
                #set reads_are_fastq = False
                #set read1 = "input_f.fasta"
            #else:
                #set read1 = "input_f.fastq"
            #end if
            ## -f added for consistency with the other library types
            ln -f -s '${library.input_1.forward}' ${read1} &&

            #if $library.input_1.reverse.is_of_type("fastq.gz", "fastqsanger.gz"):
                #set read2 = "input_r.fastq.gz"
                #set compressed = "GZ"
            #else if $library.input_1.reverse.is_of_type("fastq.bz2", "fastqsanger.bz2"):
                #set read2 = "input_r.fastq.bz2"
                #set compressed = "BZ2"
            #else if $library.input_1.reverse.is_of_type("fasta"):
                #set read2 = "input_r.fasta"
            #else:
                #set read2 = "input_r.fastq"
            #end if
            ln -f -s '${library.input_1.reverse}' ${read2} &&

        #else:
            #if $library.input_1.is_of_type("fastq.gz", "fastqsanger.gz"):
                #set read1 = "input_f.fastq.gz"
                #set compressed = "GZ"
            #else if $library.input_1.is_of_type("fastq.bz2", "fastqsanger.bz2"):
                #set read1 = "input_f.fastq.bz2"
                #set compressed = "BZ2"
            #else if $library.input_1.is_of_type('fasta'):
                #set reads_are_fastq = False
                #set read1 = "input_f.fasta"
            #else:
                #set read1 = "input_f.fastq"
            #end if
            ln -f -s '${library.input_1}' ${read1} &&
        #end if

        ## Run HISAT2
        hisat2

        ## number threads
        -p \${GALAXY_SLOTS:-1}

        ## ref genome index path
        -x '${index_path}'

        ## input reads are fasta?
        #if not $reads_are_fastq:
            -f
        #end if

        ## Input reads
        #if str( $library.type ) == "single":
            -U '${read1}'
            #if str($adv.output_options.output_options_selector) == "advanced":
                #if str( $adv.output_options.unaligned_file ) == "true":
                    #if $compressed == "GZ":
                        --un-gz '$output_unaligned_reads_l'
                    #else if $compressed == "BZ2":
                        --un-bz2 '$output_unaligned_reads_l'
                    #else:
                        --un '$output_unaligned_reads_l'
                    #end if
                #end if
                #if str( $adv.output_options.aligned_file ) == "true":
                    #if $compressed == "GZ":
                        --al-gz '$output_aligned_reads_l'
                    #else if $compressed == "BZ2":
                        --al-bz2 '$output_aligned_reads_l'
                    #else:
                        --al '$output_aligned_reads_l'
                    #end if
                #end if
            #end if
        #else:
            -1 '${read1}'
            -2 '${read2}'
            #if str($adv.output_options.output_options_selector) == "advanced":
                #if str( $adv.output_options.unaligned_file ) == "true":
                    #if $compressed == "GZ":
                        --un-conc-gz '${output_unaligned_reads_l}'
                    #else if $compressed == "BZ2":
                        --un-conc-bz2 '${output_unaligned_reads_l}'
                    #else:
                        --un-conc '${output_unaligned_reads_l}'
                    #end if
                #end if
                #if str( $adv.output_options.aligned_file ) == "true":
                    #if $compressed == "GZ":
                        --al-conc-gz '${output_aligned_reads_l}'
                    #else if $compressed == "BZ2":
                        --al-conc-bz2 '${output_aligned_reads_l}'
                    #else:
                        --al-conc '${output_aligned_reads_l}'
                    #end if
                #end if
            #end if
            #if str($library.paired_options.paired_options_selector) == "advanced":
                ${library.paired_options.fr_rf_ff}
                ${library.paired_options.no_mixed}
                ${library.paired_options.no_discordant}
            #end if
        #end if

        ## Specify strandedness of reads
        #if str($library.rna_strandness):
            --rna-strandness ${library.rna_strandness}
        #end if

        ## Input options
        #if str($adv.input_options.input_options_selector) == "advanced":
            #if int( $adv.input_options.skip ) > 0:
                --skip ${adv.input_options.skip}
            #end if
            #if int( $adv.input_options.qupto ) > 0:
                --qupto ${adv.input_options.qupto}
            #end if
            --trim5 '${adv.input_options.trim5}'
            --trim3 '${adv.input_options.trim3}'
            ${adv.input_options.qv_encoding}
            ${adv.input_options.solexa_quals}
            ${adv.input_options.int_quals}
        #end if

        ## Alignment options
        #if str($adv.alignment_options.alignment_options_selector) == "advanced":
            --n-ceil ${adv.alignment_options.function_type},${adv.alignment_options.constant_term},${adv.alignment_options.coefficient}
            ${adv.alignment_options.ignore_quals}
            ${adv.alignment_options.skip_forward}
            ${adv.alignment_options.skip_reverse}
        #end if

        ## Scoring options
        ## All scoring parameters live in the 'adv' section, so every
        ## reference needs the 'adv.' prefix (it was missing on the first
        ## value of --sp and of --score-min).
        #if str($adv.scoring_options.scoring_options_selector) == "advanced":
            --mp ${adv.scoring_options.max_mismatch},${adv.scoring_options.min_mismatch}
            ${adv.scoring_options.no_softclip}
            --np ${adv.scoring_options.ambiguous_penalty}
            --rdg ${adv.scoring_options.read_open_penalty},${adv.scoring_options.read_extend_penalty}
            --rfg ${adv.scoring_options.ref_open_penalty},${adv.scoring_options.ref_extend_penalty}
            --sp ${adv.scoring_options.soft_clip_penalty_max},${adv.scoring_options.soft_clip_penalty_min}
            --score-min ${adv.scoring_options.function_type},${adv.scoring_options.constant_term},${adv.scoring_options.coefficient}
        #end if

        ## Spliced alignment options
        #if str($adv.spliced_options.spliced_options_selector) == "advanced":
            --pen-cansplice ${adv.spliced_options.canonical_penalty}
            --pen-noncansplice ${adv.spliced_options.noncanonical_penalty}
            --pen-canintronlen ${adv.spliced_options.function_type},${adv.spliced_options.constant_term},${adv.spliced_options.coefficient}
            --pen-noncanintronlen ${adv.spliced_options.nc_function_type},${adv.spliced_options.nc_constant_term},${adv.spliced_options.nc_coefficient}
            #if str($adv.spliced_options.known_splice_gtf) != 'None':
                --known-splicesite-infile splice_sites.txt
            #end if
            ${adv.spliced_options.no_spliced_alignment_options.no_spliced_alignment}
            ## -I/-X only exist as parameters when spliced alignment is disabled
            #if $adv.spliced_options.no_spliced_alignment_options.no_spliced_alignment == '--no-spliced-alignment':
                -I ${adv.spliced_options.no_spliced_alignment_options.minins}
                -X ${adv.spliced_options.no_spliced_alignment_options.maxins}
            #end if
            --min-intronlen ${adv.spliced_options.min_intron}
            --max-intronlen ${adv.spliced_options.max_intron}
            ${adv.spliced_options.tma}
            #if str($adv.spliced_options.notmplen):
                ${adv.spliced_options.notmplen}
            #end if
        #end if

        ## Reporting options
        ## -k and --max-seeds are optional integers: only emit the flag when
        ## a value was supplied, otherwise the flag would be left without an
        ## argument and swallow whatever follows it.
        #if str($adv.reporting_options.reporting_options_selector) == "advanced":
            #if str($adv.reporting_options.max_primary):
                -k ${adv.reporting_options.max_primary}
            #end if
            #if str($adv.reporting_options.max_seeds):
                --max-seeds ${adv.reporting_options.max_seeds}
            #end if
            $adv.reporting_options.secondary
        #end if

        ## Other options
        #if str( $adv.other_options.other_options_selector ) == "advanced":
            ${adv.other_options.non_deterministic}
            --seed '${adv.other_options.seed}'
        #end if

        ## Output Summary
        #if str($sum.new_summary) == "true":
            --new-summary
        #end if
        #if str($sum.summary_file) == "true":
            --summary-file summary.txt
        #end if

        ## Convert SAM output to sorted BAM
        | samtools sort - -@ \${GALAXY_SLOTS:-1} -l 6 -o '${output_alignments}'

        ## Rename any output fastq files
        ## The *-conc options write <root>.1<ext> and <root>.2<ext>; move
        ## them onto the left/right output datasets.
        #if $output_unaligned_reads_l and $output_unaligned_reads_r:
            #from os.path import splitext
            #set _unaligned_root, _unaligned_ext = splitext( str( $output_unaligned_reads_l ) )
            && mv '${ _unaligned_root }.1${_unaligned_ext}' '$output_unaligned_reads_l'
            && mv '${ _unaligned_root }.2${_unaligned_ext}' '$output_unaligned_reads_r'
        #end if
        #if $output_aligned_reads_l and $output_aligned_reads_r:
            #from os.path import splitext
            #set _aligned_root, _aligned_ext = splitext( str( $output_aligned_reads_l ) )
            && mv '${ _aligned_root }.1${_aligned_ext}' '$output_aligned_reads_l'
            && mv '${ _aligned_root }.2${_aligned_ext}' '$output_aligned_reads_r'
        #end if
    ]]></command>
    <!-- Define inputs -->
    <inputs>
        <!-- Reference genome -->
        <conditional name="reference_genome">
            <param name="source" type="select" label="Source for the reference genome" help="Built-in references were created using default options">
                <option value="indexed" selected="true">Use a built-in genome</option>
                <option value="history">Use a genome from history</option>
            </param>
            <when value="indexed">
                <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not
listed, contact the Galaxy team">
                    <options from_data_table="hisat2_indexes">
                        <filter type="sort_by" column="2" />
                        <validator type="no_options" message="No genomes are available for the selected input dataset" />
                    </options>
                </param>
            </when>
            <when value="history">
                <param name="history_item" type="data" format="fasta" label="Select the reference genome" />
            </when>
        </conditional>
        <!-- Reads: one branch per library layout. Literal double quotes inside
             help attributes are written as &quot; so the attribute stays well-formed. -->
        <conditional name="library">
            <param name="type" type="select" label="Single-end or paired-end reads?">
                <option value="single">Single-end</option>
                <option value="paired">Paired-end</option>
                <option value="paired_collection">Paired-end Collection</option>
            </param>
            <when value="single">
                <param name="input_1" format="fastqsanger,fastqsanger.gz,fastqsanger.bz2,fasta" type="data" label="FASTA/Q file" help="Must be of datatype &quot;fastqsanger&quot; or &quot;fasta&quot;" />
                <param name="rna_strandness" argument="--rna-strandness" type="select" label="Specify strand information" help="'F' means a read corresponds to a transcript. 'R' means a read corresponds to the reverse complemented counterpart of a transcript. With this option being used, every read alignment will have an XS attribute tag: '+' means a read belongs to a transcript on '+' strand of genome. '-' means a read belongs to a transcript on '-' strand of genome.">
                    <option value="">Unstranded</option>
                    <option value="F">Forward (F)</option>
                    <option value="R">Reverse (R)</option>
                </param>
            </when>
            <when value="paired">
                <param name="input_1" format="fastqsanger,fastqsanger.gz,fastqsanger.bz2,fasta" type="data" label="FASTA/Q file #1" help="Must be of datatype &quot;fastqsanger&quot; or &quot;fasta&quot;" />
                <param name="input_2" format="fastqsanger,fastqsanger.gz,fastqsanger.bz2,fasta" type="data" label="FASTA/Q file #2" help="Must be of datatype &quot;fastqsanger&quot; or &quot;fasta&quot;" />
                <expand macro="paired_end_options" />
            </when>
            <when value="paired_collection">
                <param name="input_1" format="fastqsanger,fastqsanger.gz,fastqsanger.bz2,fasta" type="data_collection" collection_type="paired" label="Paired Collection" help="Must be of datatype &quot;fastqsanger&quot; or &quot;fasta&quot;" />
                <expand macro="paired_end_options" />
            </when>
        </conditional>
        <!-- Summary Options -->
        <section name="sum" title="Summary Options" expanded="False">
            <param name="new_summary" argument="--new-summary" type="boolean" checked="False" label="Output alignment summary in a more machine-friendly style." help="Select this option for compatibility with MultiQC" />
            <param name="summary_file" argument="--summary-file" type="boolean" checked="False" label="Print alignment summary to a file." help="Output alignment summary to a file in addition to stderr."
/>
        </section>
        <!-- Advanced Options: each conditional below toggles between HISAT2
             defaults and an explicit set of command-line options. -->
        <section name="adv" title="Advanced Options" expanded="False">
            <!-- Read input handling: skipping, limiting, trimming, quality encoding -->
            <conditional name="input_options">
                <param name="input_options_selector" type="select" label="Input options">
                    <option value="defaults">Use default values</option>
                    <option value="advanced">Specify input options</option>
                </param>
                <when value="defaults" />
                <when value="advanced">
                    <param name="skip" argument="-s" type="integer" min="0" value="0" label="Skip the first N reads or pairs in the input" help="default: 0" />
                    <param name="qupto" argument="-u" type="integer" min="0" value="0" label="Align the first N reads or read pairs from the input (after the first N reads or pairs have been skipped), then stop" help="default: no limit" />
                    <param name="trim5" argument="-5" type="integer" min="0" value="0" label="Trim 5' end" help="Trim N bases from 5' (left) end of each read before alignment, default: 0" />
                    <param name="trim3" argument="-3" type="integer" min="0" value="0" label="Trim 3' end" help="Trim N bases from 3' (right) end of each read before alignment, default: 0"/>
                    <param name="qv_encoding" type="select" display="radio" label="Select quality score encoding" help="See help below for more details">
                        <option value="--phred33" selected="True">Input qualities are ASCII chars equal to the Phred quality plus 33. This is also called the "Phred+33" encoding, which is used by the very latest Illumina pipelines (--phred33)</option>
                        <option value="--phred64">Input qualities are ASCII chars equal to the Phred quality plus 64. This is also called the "Phred+64" encoding (--phred64)</option>
                    </param>
                    <param name="solexa_quals" argument="--solexa-quals" type="boolean" truevalue="--solexa-quals" falsevalue="" checked="False" label="Convert input qualities from Solexa (which can be negative) to Phred (which can't). This scheme was used in older Illumina GA Pipeline versions (prior to 1.3)" help="--solexa-quals; default: False"/>
                    <param name="int_quals" argument="--int-quals" type="boolean" truevalue="--int-quals" falsevalue="" checked="False" label="Are quality values provided as space separated integers?" help="Quality values are represented in the read input file as space-separated ASCII integers, e.g., 40 40 30 40..., rather than ASCII characters, e.g., II?I.... Integers are treated as being on the Phred quality scale unless --solexa-quals is also specified [default: False]"/>
                </when>
            </conditional>
            <!-- Alignment behaviour: ambiguous-character ceiling and strand restrictions -->
            <conditional name="alignment_options">
                <param name="alignment_options_selector" type="select" label="Alignment options">
                    <option value="defaults">Use default values</option>
                    <option value="advanced">Specify alignment options</option>
                </param>
                <when value="defaults" />
                <when value="advanced">
                    <!-- The three params below are combined into one function
                         specification of the form F,B,A by the command template. -->
                    <param name="function_type" argument="--n-ceil" type="select" display="radio" label="Function governing the maximum number of ambiguous characters (usually Ns and/or .s) allowed in a read as a function of read length" help="Reads exceeding this ceiling are filtered out">
                        <option value="C">Constant [f(x) = B]</option>
                        <option value="L" selected="true">Linear [f(x) = B + A * x]</option>
                        <option value="S">Square root [f(x) = B + A * sqrt(x)]</option>
                        <option value="G">Natural logarithm [f(x) = B + A * log(x)]</option>
                    </param>
                    <param name="constant_term" type="float" value="0" label="Constant term (B)" help="Constant term for the above function" />
                    <param name="coefficient" type="float" value="0.15" label="Coefficient (A)" help="Coefficient for the above function" />
                    <param argument="--ignore-quals" name="ignore_quals" type="boolean" truevalue="--ignore-quals" falsevalue="" label="Ignore quality values" help="When calculating a mismatch penalty, always consider the quality value at the mismatched position to be the highest possible, regardless of the actual value. I.e.
input is treated as though all quality values are high. This is also the default behavior when the input doesn't specify quality values" />
                    <param argument="--nofw" name="skip_forward" type="boolean" truevalue="--nofw" falsevalue="" label="Skip forward strand of reference" help="If --nofw is specified, HISAT2 will not attempt to align unpaired reads to the forward (Watson) reference strand. In paired-end mode, --nofw and --norc pertain to the fragments; i.e. specifying --nofw causes HISAT2 to explore only those paired-end configurations corresponding to fragments from the reverse-complement (Crick) strand" />
                    <param argument="--norc" name="skip_reverse" type="boolean" truevalue="--norc" falsevalue="" label="Skip reverse strand of reference" help="If --norc is specified, HISAT2 will not attempt to align unpaired reads against the reverse-complement (Crick) reference strand. In paired-end mode, --nofw and --norc pertain to the fragments; i.e. specifying --norc causes HISAT2 to explore only those paired-end configurations corresponding to fragments from the forward-complement (Watson) strand" />
                </when>
            </conditional>
            <!-- Scoring: penalties and the minimum-score function -->
            <conditional name="scoring_options">
                <param name="scoring_options_selector" type="select" label="Scoring options">
                    <option value="defaults">Use default values</option>
                    <option value="advanced">Specify scoring options</option>
                </param>
                <when value="defaults" />
                <when value="advanced">
                    <!-- The function selector is declared before its constant term and
                         coefficient so that their "above function" help text points at it. -->
                    <param name="function_type" argument="--score-min" type="select" display="radio" label="Function governing the minimum alignment score needed for an alignment to be considered &quot;valid&quot; (i.e. good enough to report)" help="This is a function of read length">
                        <option value="C">Constant [f(x) = B]</option>
                        <option value="L" selected="true">Linear [f(x) = B + A * x]</option>
                        <option value="S">Square root [f(x) = B + A * sqrt(x)]</option>
                        <option value="G">Natural logarithm [f(x) = B + A * log(x)]</option>
                    </param>
                    <param name="constant_term" type="float" value="0" label="Constant term (B)" help="Constant term for the above function" />
                    <param name="coefficient" type="float" value="-0.2" label="Coefficient (A)" help="Coefficient for the above function" />
                    <param argument="--mp" name="max_mismatch" type="integer" value="6" min="0" label="Maximum mismatch penalty" help="Sets the maximum mismatch penalty. A number less than or equal to MX and greater than or equal to MN is subtracted from the alignment score for each position where a read character aligns to a reference character, the characters do not match, and neither is an N. If --ignore-quals is specified, the number subtracted equals MX. Otherwise, the number subtracted is MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) ) where Q is the Phred quality value" />
                    <param argument="--mp" name="min_mismatch" type="integer" value="2" min="0" label="Minimum mismatch penalty" help="Sets the minimum mismatch penalty. A number less than or equal to MX and greater than or equal to MN is subtracted from the alignment score for each position where a read character aligns to a reference character, the characters do not match, and neither is an N. If --ignore-quals is specified, the number subtracted equals MX. Otherwise, the number subtracted is MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) ) where Q is the Phred quality value" />
                    <param argument="--sp" name="soft_clip_penalty_max" type="integer" value="2" min="0" label="Maximum soft-clipping penalty" help="Sets the maximum (MX) penalty for soft-clipping per base. A number less than or equal to MX and greater than or equal to MN is subtracted from the alignment score for each position. The number subtracted is MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) ) where Q is the Phred quality value" />
                    <param argument="--sp" name="soft_clip_penalty_min" type="integer" value="1" min="0" label="Minimum soft-clipping penalty" help="Sets the minimum (MN) penalty for soft-clipping per base. A number less than or equal to MX and greater than or equal to MN is subtracted from the alignment score for each position. The number subtracted is MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) ) where Q is the Phred quality value" />
                    <!-- Inverted boolean: checking "Allow soft-clipping" emits nothing,
                         unchecking it emits the no-softclip flag. -->
                    <param argument="--no-softclip" name="no_softclip" type="boolean" truevalue="" falsevalue="--no-softclip" label="Allow soft-clipping" />
                    <param argument="--np" name="ambiguous_penalty" type="integer" value="1" min="0" label="Ambiguous read penalty" help="Sets penalty for positions where the read, reference, or both, contain an ambiguous character such as N" />
                    <param argument="--rdg" name="read_open_penalty" type="integer" value="5" min="0" label="Read gap open penalty" help="A read gap of length N gets a penalty of [open_penalty] + N * [extend_penalty]" />
                    <param argument="--rdg" name="read_extend_penalty" type="integer" value="3" min="0" label="Read gap extend penalty" help="A read gap of length N gets a penalty of [open_penalty] + N * [extend_penalty]" />
                    <param argument="--rfg" name="ref_open_penalty" type="integer" value="5" min="0" label="Reference gap open penalty" help="A reference gap of length N gets a penalty of [open_penalty] + N * [extend_penalty]" />
                    <param argument="--rfg" name="ref_extend_penalty" type="integer" value="3" min="0" label="Reference gap extend penalty" help="A reference gap of length N gets a penalty of [open_penalty] + N * [extend_penalty]" />
                </when>
            </conditional>
            <!-- Spliced alignment: splice-site penalties and intron-length functions -->
            <conditional name="spliced_options">
                <param name="spliced_options_selector" type="select" label="Spliced alignment options">
                    <option value="defaults">Use default values</option>
                    <option value="advanced">Specify spliced alignment options</option>
                </param>
                <when value="defaults" />
                <when value="advanced">
                    <param name="canonical_penalty" argument="--pen-cansplice" type="integer" value="0" min="0" label="Penalty for canonical splice sites" />
                    <param name="noncanonical_penalty" argument="--pen-noncansplice" type="integer" value="12" min="0" label="Penalty for non-canonical splice sites" />
                    <param name="function_type" argument="--pen-canintronlen" type="select" display="radio" label="Penalty function for long introns with canonical splice sites" help="Alignments with shorter introns are preferred to those with longer ones">
                        <option value="C">Constant [f(x) = B]</option>
                        <option value="L">Linear [f(x) = B + A * x]</option>
                        <option value="S">Square root [f(x) = B + A * sqrt(x)]</option>
                        <option value="G" selected="true">Natural logarithm [f(x) = B + A * log(x)]</option>
                    </param>
                    <param name="constant_term" type="float" value="-8" label="Constant term (B)" help="Constant term for the above function" />
                    <param name="coefficient" type="float" value="1" label="Coefficient (A)" help="Coefficient for the above function" />
                    <param name="nc_function_type" argument="--pen-noncanintronlen" type="select" display="radio" label="Penalty function for long introns with non-canonical splice sites" help="Alignments with shorter introns are preferred to those with longer ones">
                        <option value="C">Constant [f(x) = B]</option>
                        <option value="L">Linear [f(x) = B +
A * x]</option>
                        <option value="S">Square root [f(x) = B + A * sqrt(x)]</option>
                        <option value="G" selected="true">Natural logarithm [f(x) = B + A * log(x)]</option>
                    </param>
                    <param name="nc_constant_term" type="float" value="-8" label="Constant term (B)" help="Constant term for the above function" />
                    <param name="nc_coefficient" type="float" value="1" label="Coefficient (A)" help="Coefficient for the above function" />
                    <param name="min_intron" argument="--min-intronlen" type="integer" value="20" min="0" label="Minimum intron length" />
                    <param name="max_intron" argument="--max-intronlen" type="integer" value="500000" min="0" label="Maximum intron length" />
                    <!-- Fragment-length limits are only offered when spliced alignment is disabled.
                         "False" is marked selected explicitly: without it the first option would
                         become the default and opening the advanced spliced options would
                         silently switch spliced alignment off. -->
                    <conditional name="no_spliced_alignment_options">
                        <param name="no_spliced_alignment" argument="--no-spliced-alignment" type="select" label="Disable spliced alignment">
                            <option value="--no-spliced-alignment">True</option>
                            <option value="" selected="true">False</option>
                        </param>
                        <when value="--no-spliced-alignment">
                            <param name="minins" argument="-I" type="integer" value="0" min="0" label="Minimum fragment length for valid paired-end alignments" help="E.g. if -I 60 is specified and a paired-end alignment consists of two 20-bp alignments in the appropriate orientation with a 20-bp gap between them, that alignment is considered valid (as long as -X is also satisfied). A 19-bp gap would not be valid in that case. If trimming options -3 or -5 are also used, the -I constraint is applied with respect to the untrimmed mates. The larger the difference between -I and -X, the slower HISAT2 will run. This is because larger differences between -I and -X require that HISAT2 scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), HISAT2 is very efficient" />
                            <param name="maxins" argument="-X" type="integer" value="500" min="0" label="Maximum fragment length for valid paired-end alignments" help="E.g. if -X 100 is specified and a paired-end alignment consists of two 20-bp alignments in the proper orientation with a 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). A 61-bp gap would not be valid in that case. If trimming options -3 or -5 are also used, the -X constraint is applied with respect to the untrimmed mates, not the trimmed mates. The larger the difference between -I and -X, the slower HISAT2 will run. This is because larger differences between -I and -X require that HISAT2 scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), HISAT2 is very efficient" />
                        </when>
                        <when value="" />
                    </conditional>
                    <param name="known_splice_gtf" type="data" format="gtf" optional="true" label="GTF file with known splice sites" />
                    <param name="tma" type="select" display="radio" label="Transcriptome assembly reporting">
                        <option value="">Use default reporting.</option>
                        <option value="--tmo">Report only those alignments within known transcripts.</option>
                        <option value="--dta">Report alignments tailored for transcript assemblers including StringTie.</option>
                        <option value="--dta-cufflinks">Report alignments tailored specifically for Cufflinks.</option>
                    </param>
                    <param name="notmplen" argument="--no-templatelen-adjustment" type="boolean" truevalue="--no-templatelen-adjustment" falsevalue="" label="Disable automatic template length adjustment for RNA-seq reads" help="Default: false" />
                </when>
            </conditional>
            <!-- Reporting: how many alignments and seeds are considered per read -->
            <conditional name="reporting_options">
                <param name="reporting_options_selector" type="select" label="Reporting options">
                    <option value="defaults">Use default values</option>
                    <option value="advanced">Specify reporting options</option>
                </param>
                <when value="defaults" />
                <when value="advanced">
                    <param name="max_primary" argument="-k" type="integer" min="0" optional="true" label="Primary alignments" help="Search for at most K distinct, primary alignments for each
read. Primary alignments mean alignments whose alignment score is equal or higher than any other alignments. The search terminates when it can't find more distinct valid alignments, or when it finds K, whichever happens first. The alignment score for a paired-end alignment equals the sum of the alignment scores of the individual mates. Each reported read or pair alignment beyond the first has the SAM 'secondary' bit (which equals 256) set in its FLAGS field. For reads that have more than K distinct, valid alignments, HISAT2 does not guarantee that the K alignments reported are the best possible in terms of alignment score. HISAT2 is not designed with large values for -k in mind, so when aligning reads to long repetitive genomes, a large K can be very, very slow. Default: 5 (HFM) or 10 (HGFM)" />
                    <param
                        name="max_seeds"
                        argument="--max-seeds"
                        type="integer"
                        min="0"
                        optional="true"
                        label="Maximum number of seeds that will be extended"
                        help="HISAT2, like other aligners, uses seed-and-extend approaches. HISAT2 tries to extend seeds to full-length alignments. HISAT2 extends up to these many seeds and skips the rest of the seeds. Large values for --max-seeds may improve alignment sensitivity, but HISAT2 is not designed with large values for --max-seeds in mind, and when aligning reads to long repetitive genomes, a large --max-seeds can be very, very slow. Default: 5 (HFM) or 10 (HGFM)" />
                    <param
                        name="secondary"
                        argument="--secondary"
                        type="boolean"
                        truevalue="--secondary"
                        falsevalue=""
                        label="Report secondary alignments" />
                </when>
            </conditional>

            <!-- Extra read outputs. Both switches carry the literal values
                 "true"/"false", which the command template compares as strings. -->
            <conditional name="output_options">
                <param name="output_options_selector" type="select" label="Output options">
                    <option value="defaults">Use default values</option>
                    <option value="advanced">Specify output options</option>
                </param>
                <when value="defaults" />
                <when value="advanced">
                    <param
                        name="unaligned_file"
                        argument="--un/--un-conc"
                        type="boolean"
                        truevalue="true"
                        falsevalue="false"
                        checked="False"
                        label="Write unaligned reads (in fastq format) to separate file(s)"
                        help="This triggers --un parameter for single reads and --un-conc for paired reads" />
                    <param
                        name="aligned_file"
                        argument="--al/--al-conc"
                        type="boolean"
                        truevalue="true"
                        falsevalue="false"
                        checked="False"
                        label="Write aligned reads (in fastq format) to separate file(s)"
                        help="This triggers --al parameter for single reads and --al-conc for paired reads" />
                </when>
            </conditional>

            <!-- Pseudo-random number generator settings -->
            <conditional name="other_options">
                <param name="other_options_selector" type="select" label="Other options">
                    <option value="defaults">Use default values</option>
                    <option value="advanced">Specify other options</option>
                </param>
                <when value="defaults" />
                <when value="advanced">
                    <param
                        name="seed"
                        argument="--seed"
                        type="integer"
                        value="0"
                        min="0"
                        label="Use this number as the seed for pseudo-random number generator"
                        help="Default=0" />
                    <param
                        name="non_deterministic"
                        argument="--non-deterministic"
                        type="boolean"
                        truevalue="--non-deterministic"
                        falsevalue=""
                        label="Re-initialize the pseudo-random generator for each read using the current time"
                        help="see Help below for explanation of this option; default: False"/>
                </when>
            </conditional>
        </section>
    </inputs>

    <!-- Define outputs -->
    <outputs>
        <!-- BAM -->
        <data
            name="output_alignments"
            format="bam"
            label="${tool.name} on ${on_string}: aligned reads (BAM) " />
        <!-- Unaligned fastq (L) -->
<data name="output_unaligned_reads_l" format="fastqsanger" label="${tool.name} on ${on_string}: unaligned reads (L)">
            <!-- Filters are Python expressions over the tool parameters. The (L) outputs
                 exist whenever the matching switch is on; the (R) outputs additionally
                 require a paired library. The (L) filters previously ended in an
                 unmatched closing parenthesis, which is a syntax error. -->
            <filter>adv['output_options']['output_options_selector'] == "advanced" and adv['output_options']['unaligned_file'] is True</filter>
        </data>
        <!-- Aligned fastq (L) -->
        <data name="output_aligned_reads_l" format="fastqsanger" label="${tool.name} on ${on_string}: aligned reads (L)">
            <filter>adv['output_options']['output_options_selector'] == "advanced" and adv['output_options']['aligned_file'] is True</filter>
        </data>
        <!-- Unaligned fastq (R) -->
        <data name="output_unaligned_reads_r" format="fastqsanger" label="${tool.name} on ${on_string}: unaligned reads (R)">
            <filter>(library['type'] == 'paired' or library['type'] == 'paired_collection') and (adv['output_options']['output_options_selector'] == "advanced" and adv['output_options']['unaligned_file'] is True)</filter>
        </data>
        <!-- Aligned fastq (R) -->
        <data name="output_aligned_reads_r" format="fastqsanger" label="${tool.name} on ${on_string}: aligned reads (R)">
            <filter>(library['type'] == 'paired' or library['type'] == 'paired_collection') and (adv['output_options']['output_options_selector'] == "advanced" and adv['output_options']['aligned_file'] is True)</filter>
        </data>
        <!-- Alignment summary file: collected from summary.txt in the job working directory -->
        <data name="summary_file" format="txt" from_work_dir="summary.txt" label="${tool.name} on ${on_string}: Mapping summary">
            <filter>sum['summary_file'] is True</filter>
        </data>
    </outputs>
    <!-- Define tests -->
    <tests>
        <test><!-- Ensure bam output works -->
            <param name="type" value="paired" />
            <param name="source" value="history" />
            <param name="history_item" ftype="fasta" value="phiX.fa" />
            <param name="input_1" ftype="fastqsanger" value="hisat_input_1_forward.fastq" />
            <param name="input_2" ftype="fastqsanger" value="hisat_input_1_reverse.fastq" />
            <output name="output_alignments" file="hisat_output_1.bam" ftype="bam" lines_diff="2" />
        </test>
        <test><!-- Ensure built-in reference works -->
<param name="type" value="paired" />
            <param name="source" value="indexed" />
            <param name="input_1" ftype="fastqsanger" dbkey="phiX" value="hisat_input_1_forward.fastq" />
            <param name="input_2" ftype="fastqsanger" dbkey="phiX" value="hisat_input_1_reverse.fastq" />
            <output name="output_alignments" file="hisat_output_1.bam" ftype="bam" lines_diff="2" />
        </test>
        <test>
            <!-- Ensure trimming works -->
            <param name="type" value="paired" />
            <param name="source" value="history" />
            <param name="history_item" ftype="fasta" value="phiX.fa" />
            <param name="input_options_selector" value="advanced" />
            <param name="trim3" value="15" />
            <param name="trim5" value="15" />
            <param name="input_1" ftype="fastqsanger" value="hisat_input_2_forward.fastq" />
            <param name="input_2" ftype="fastqsanger" value="hisat_input_2_reverse.fastq" />
            <output name="output_alignments" file="hisat_output_2.bam" ftype="bam" lines_diff="2" />
        </test>
        <test>
            <!-- Ensure paired options works -->
            <param name="type" value="paired" />
            <param name="source" value="history" />
            <param name="history_item" ftype="fasta" value="phiX.fa" />
            <param name="input_options_selector" value="advanced" />
            <param name="trim3" value="15" />
            <param name="trim5" value="15" />
            <param name="input_1" ftype="fastqsanger" value="hisat_input_2_forward.fastq" />
            <param name="input_2" ftype="fastqsanger" value="hisat_input_2_reverse.fastq" />
            <param name="paired_options_selector" value="advanced" />
            <param name="no_mixed" value="True" />
            <param name="no_discordant" value="True" />
            <output name="output_alignments" file="hisat_output_3.bam" ftype="bam" lines_diff="2" />
        </test>
        <test>
            <!-- Ensure unaligned output works (single-end FASTA reads).
                 The reference is declared once through history_item. -->
            <param name="type" value="single" />
            <param name="source" value="history" />
            <param name="history_item" ftype="fasta" value="phiX.fa" />
            <param name="output_options_selector" value="advanced" />
            <param name="unaligned_file" value="true" />
            <param name="aligned_file" value="true" />
            <param name="input_1" ftype="fasta" value="test_unaligned_reads.fasta" />
            <output name="output_unaligned_reads_l" file="test_unaligned_reads.fasta" />
        </test>
        <test>
            <!-- Ensure paired unaligned output works -->
            <param name="type" value="paired" />
            <param name="source" value="history" />
            <param name="output_options_selector" value="advanced" />
            <param name="unaligned_file" value="true" />
            <param name="aligned_file" value="true" />
            <param name="history_item" ftype="fasta" value="phiX.fa" />
            <param name="input_1" ftype="fasta" value="test_unaligned_reads.fasta" />
            <param name="input_2" ftype="fasta" value="test_unaligned_reads.fasta" />
            <output name="output_unaligned_reads_l" file="test_unaligned_reads.fasta" />
            <output name="output_unaligned_reads_r" file="test_unaligned_reads.fasta" />
        </test>
        <test>
            <!-- Ensure fastqsanger.gz works -->
            <param name="type" value="paired" />
            <param name="source" value="history" />
            <param name="history_item" ftype="fasta" value="phiX.fa" />
            <param name="input_options_selector" value="advanced" />
            <param name="trim3" value="15" />
            <param name="trim5" value="15" />
            <param name="input_1" ftype="fastqsanger.gz" value="hisat_input_2_forward.fastq.gz" />
            <param name="input_2" ftype="fastqsanger.gz" value="hisat_input_2_reverse.fastq.gz" />
            <param name="paired_options_selector" value="advanced" />
            <param name="no_mixed" value="True" />
            <param name="no_discordant" value="True" />
            <output name="output_alignments" file="hisat_output_3.bam" ftype="bam" lines_diff="2" />
        </test>
        <test>
            <!-- Ensure fastqsanger.bz2 works -->
            <param name="type" value="paired" />
            <param name="source" value="history" />
            <param name="history_item" ftype="fasta" value="phiX.fa" />
            <param name="input_options_selector" value="advanced" />
            <param name="trim3" value="15" />
            <param name="trim5" value="15" />
            <param name="input_1" ftype="fastqsanger.bz2" value="hisat_input_2_forward.fastq.bz2" />
            <param name="input_2" ftype="fastqsanger.bz2" value="hisat_input_2_reverse.fastq.bz2" />
            <param name="paired_options_selector" value="advanced" />
            <param name="no_mixed" value="True" />
            <param name="no_discordant" value="True" />
            <output name="output_alignments" file="hisat_output_3.bam" ftype="bam" lines_diff="2" />
        </test>
        <test>
            <!-- Ensure paired strandness works -->
            <param name="type" value="paired" />
            <param name="source" value="history" />
            <param name="history_item" ftype="fasta" value="phiX.fa" />
            <param name="input_1" ftype="fastqsanger" value="hisat_input_1_forward.fastq" />
            <param name="input_2" ftype="fastqsanger" value="hisat_input_1_reverse.fastq" />
            <param name="rna_strandness" value="FR" />
            <output name="output_alignments" file="hisat_output_4.bam" ftype="bam" lines_diff="2" />
        </test>
        <test>
            <!-- Ensure summary file output works -->
            <param name="type" value="single" />
            <param name="source" value="history" />
            <param name="history_item" ftype="fasta" value="phiX.fa" />
            <param name="input_1" ftype="fastqsanger" value="hisat_input_1_forward.fastq" />
            <param name="rna_strandness" value="R" />
            <param name="new_summary" value="true" />
            <param name="summary_file" value="true" />
            <output name="summary_file" file="hisat_output.summary" ftype="txt" />
        </test>
    </tests>
    <help><![CDATA[ Introduction ============ What is HISAT? -------------- `HISAT <http://ccb.jhu.edu/software/hisat>`__ is a fast and sensitive spliced alignment program. As part of HISAT, we have developed a new indexing scheme based on the Burrows-Wheeler transform (`BWT <http://en.wikipedia.org/wiki/Burrows-Wheeler_transform>`__) and the `FM index <http://en.wikipedia.org/wiki/FM-index>`__, called hierarchical indexing, that employs two types of indexes: (1) one global FM index representing the whole genome, and (2) many separate local FM indexes for small regions collectively covering the genome. 
Our hierarchical index for the human genome (about 3 billion bp) includes ~48,000 local FM indexes, each representing a genomic region of ~64,000bp. As the basis for non-gapped alignment, the FM index is extremely fast with a low memory footprint, as demonstrated by `Bowtie <http://bowtie-bio.sf.net>`__. In addition, HISAT provides several alignment strategies specifically designed for mapping different types of RNA-seq reads. All these together, HISAT enables extremely fast and sensitive alignment of reads, in particular those spanning two exons or more. As a result, HISAT is much faster (>50 times) than `TopHat2 <http://ccb.jhu.edu/software/tophat>`__ with better alignment quality. Although it uses a large number of indexes, the memory requirement of HISAT is still modest, approximately 4.3 GB for human. HISAT uses the `Bowtie2 <http://bowtie-bio.sf.net/bowtie2>`__ implementation to handle most of the operations on the FM index. In addition to spliced alignment, HISAT handles reads involving indels and supports a paired-end alignment mode. Multiple processors can be used simultaneously to achieve greater alignment speed. HISAT outputs alignments in `SAM <http://samtools.sourceforge.net/SAM1.pdf>`__ format, enabling interoperation with a large number of other tools (e.g. `SAMtools <http://samtools.sourceforge.net>`__, `GATK <http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit>`__) that use SAM. HISAT is distributed under the `GPLv3 license <http://www.gnu.org/licenses/gpl-3.0.html>`__, and it runs on the command line under Linux, Mac OS X and Windows. Running HISAT ============= Reporting --------- The reporting mode governs how many alignments HISAT looks for, and how to report them. In general, when we say that a read has an alignment, we mean that it has a `valid alignment <#valid-alignments-meet-or-exceed-the-minimum-score-threshold>`__. 
When we say that a read has multiple alignments, we mean that it has multiple alignments that are valid and distinct from one another. Distinct alignments map a read to different places ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Two alignments for the same individual read are "distinct" if they map the same read to different places. Specifically, we say that two alignments are distinct if there are no alignment positions where a particular read offset is aligned opposite a particular reference offset in both alignments with the same orientation. E.g. if the first alignment is in the forward orientation and aligns the read character at read offset 10 to the reference character at chromosome 3, offset 3,445,245, and the second alignment is also in the forward orientation and also aligns the read character at read offset 10 to the reference character at chromosome 3, offset 3,445,245, they are not distinct alignments. Two alignments for the same pair are distinct if either the mate 1s in the two paired-end alignments are distinct or the mate 2s in the two alignments are distinct or both. Default mode: search for one or more alignments, report each ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HISAT searches for up to N distinct, primary alignments for each read, where N equals the integer specified with the ``-k`` parameter. Primary alignments mean alignments whose alignment score is equal to or higher than any other alignments. It is possible that multiple distinct alignments have the same score. That is, if ``-k 2`` is specified, HISAT will search for at most 2 distinct alignments. The alignment score for a paired-end alignment equals the sum of the alignment scores of the individual mates. Each reported read or pair alignment beyond the first has the SAM 'secondary' bit (which equals 256) set in its FLAGS field. See the `SAM specification <http://samtools.sourceforge.net/SAM1.pdf>`__ for details. 
HISAT does not "find" alignments in any specific order, so for reads that have more than N distinct, valid alignments, HISAT does not guarantee that the N alignments reported are the best possible in terms of alignment score. Still, this mode can be effective and fast in situations where the user cares more about whether a read aligns (or aligns a certain number of times) than where exactly it originated. Alignment summary ----------------- When HISAT finishes running, it prints messages summarizing what happened. These messages are printed to the "standard error" ("stderr") filehandle and can be optionally printed to a file. Choose `--new-summary` under **Summary Options** for compatibility with `MultiQC <http://multiqc.info/docs/#hisat2>`_. For datasets consisting of unpaired reads, the summary might look like this: :: 20000 reads; of these: 20000 (100.00%) were unpaired; of these: 1247 (6.24%) aligned 0 times 18739 (93.69%) aligned exactly 1 time 14 (0.07%) aligned >1 times 93.77% overall alignment rate For datasets consisting of pairs, the summary might look like this: :: 10000 reads; of these: 10000 (100.00%) were paired; of these: 650 (6.50%) aligned concordantly 0 times 8823 (88.23%) aligned concordantly exactly 1 time 527 (5.27%) aligned concordantly >1 times ---- 650 pairs aligned concordantly 0 times; of these: 34 (5.23%) aligned discordantly 1 time ---- 616 pairs aligned 0 times concordantly or discordantly; of these: 1232 mates make up the pairs; of these: 660 (53.57%) aligned 0 times 571 (46.35%) aligned exactly 1 time 1 (0.08%) aligned >1 times 96.70% overall alignment rate The indentation indicates how subtotals relate to totals. .. class:: infomark **HISAT2 options** Galaxy wrapper for HISAT2 implements most, but not all, options available through the command line. Supported options are described below. ----- **Inputs** HISAT2 accepts files in FASTQ or FASTA format (single-end or paired-end). 
Note that if your reads are from a stranded library, you need to choose the appropriate setting under **Specify strand information** above. For single-end reads, use F or R. 'F' means a read corresponds to a transcript. 'R' means a read corresponds to the reverse complemented counterpart of a transcript. For paired-end reads, use either FR or RF. With this option being used, every read alignment will have an XS attribute tag: '+' means a read belongs to a transcript on '+' strand of genome. '-' means a read belongs to a transcript on '-' strand of genome. (TopHat has a similar option, --library-type option, where fr-firststrand corresponds to R and RF; fr-secondstrand corresponds to F and FR.) ------ **Input options**:: -s/--skip <int> Skip (i.e. do not align) the first `<int>` reads or pairs in the input. -u/--qupto <int> Align the first `<int>` reads or read pairs from the input (after the `-s`/`--skip` reads or pairs have been skipped), then stop. Default: no limit. -5/--trim5 <int> Trim `<int>` bases from 5' (left) end of each read before alignment (default: 0). -3/--trim3 <int> Trim `<int>` bases from 3' (right) end of each read before alignment (default: 0). --phred33 Input qualities are ASCII chars equal to the Phred quality plus 33. This is also called the "Phred+33" encoding, which is used by the very latest Illumina pipelines. --phred64 Input qualities are ASCII chars equal to the Phred quality plus 64. This is also called the "Phred+64" encoding. --solexa-quals Convert input qualities from Solexa (which can be negative) to Phred (which can't). This scheme was used in older Illumina GA Pipeline versions (prior to 1.3). Default: off. --int-quals Quality values are represented in the read input file as space-separated ASCII integers, e.g., `40 40 30 40`..., rather than ASCII characters, e.g., `II?I`.... Integers are treated as being on the Phred quality scale unless `--solexa-quals` is also specified. Default: off. 
------ **Alignment options**:: --n-ceil <func> Sets a function governing the maximum number of ambiguous characters (usually `N`s and/or `.`s) allowed in a read as a function of read length. For instance, specifying `L,0,0.15` sets the N-ceiling function `f` to `f(x) = 0 + 0.15 * x`, where x is the read length. Reads exceeding this ceiling are filtered out. Default: `L,0,0.15`. --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched position to be the highest possible, regardless of the actual value. I.e. input is treated as though all quality values are high. This is also the default behavior when the input doesn't specify quality values (e.g. in `-f`, `-r`, or `-c` modes). --nofw/--norc If `--nofw` is specified, `hisat2` will not attempt to align unpaired reads to the forward (Watson) reference strand. If `--norc` is specified, `hisat2` will not attempt to align unpaired reads against the reverse-complement (Crick) reference strand. In paired-end mode, `--nofw` and `--norc` pertain to the fragments; i.e. specifying `--nofw` causes `hisat2` to explore only those paired-end configurations corresponding to fragments from the reverse-complement (Crick) strand. Default: both strands enabled. ----- **Scoring options**:: --mp MX,MN Sets the maximum (`MX`) and minimum (`MN`) mismatch penalties, both integers. A number less than or equal to `MX` and greater than or equal to `MN` is subtracted from the alignment score for each position where a read character aligns to a reference character, the characters do not match, and neither is an `N`. If `--ignore-quals` is specified, the number subtracted equals `MX`. Otherwise, the number subtracted is `MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) )` where Q is the Phred quality value. Default: `MX` = 6, `MN` = 2. --sp MX,MN Sets the maximum (`MX`) and minimum (`MN`) penalties for soft-clipping per base, both integers. 
A number less than or equal to `MX` and greater than or equal to `MN` is subtracted from the alignment score for each position. The number subtracted is `MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) )` where Q is the Phred quality value. Default: `MX` = 2, `MN` = 1. --no-softclip Disallow soft-clipping. --np <int> Sets penalty for positions where the read, reference, or both, contain an ambiguous character such as `N`. Default: 1. --rdg <int1>,<int2> Sets the read gap open (`<int1>`) and extend (`<int2>`) penalties. A read gap of length N gets a penalty of `<int1>` + N * `<int2>`. Default: 5, 3. --rfg <int1>,<int2> Sets the reference gap open (`<int1>`) and extend (`<int2>`) penalties. A reference gap of length N gets a penalty of `<int1>` + N * `<int2>`. Default: 5, 3. --score-min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying `L,0,-0.6` sets the minimum-score function `f` to `f(x) = 0 + -0.6 * x`, where `x` is the read length. The default is `L,0,-0.2`. ----- **Spliced alignment options**:: --pen-cansplice <int> Sets the penalty for each pair of canonical splice sites (e.g. GT/AG). Default: 0. --pen-noncansplice <int> Sets the penalty for each pair of non-canonical splice sites (e.g. non-GT/AG). Default: 12. --pen-canintronlen <func> Sets the penalty for long introns with canonical splice sites so that alignments with shorter introns are preferred to those with longer ones. Default: G,-8,1 --pen-noncanintronlen <func> Sets the penalty for long introns with noncanonical splice sites so that alignments with shorter introns are preferred to those with longer ones. Default: G,-8,1 --min-intronlen <int> Sets minimum intron length. Default: 20 --max-intronlen <int> Sets maximum intron length. Default: 500000 --no-spliced-alignment Disable spliced alignment. 
-I/--minins <int> The minimum fragment length for valid paired-end alignments. This option is valid only with `--no-spliced-alignment`. E.g. if `-I 60` is specified and a paired-end alignment consists of two 20-bp alignments in the appropriate orientation with a 20-bp gap between them, that alignment is considered valid (as long as `-X` is also satisfied). A 19-bp gap would not be valid in that case. If trimming options `-3` or `-5` are also used, the `-I` constraint is applied with respect to the untrimmed mates. The larger the difference between `-I` and `-X`, the slower HISAT2 will run. This is because larger differences between `-I` and `-X` require that HISAT2 scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), HISAT2 is very efficient. Default: 0 (essentially imposing no minimum) -X/--maxins <int> The maximum fragment length for valid paired-end alignments. This option is valid only with `--no-spliced-alignment`. E.g. if `-X 100` is specified and a paired-end alignment consists of two 20-bp alignments in the proper orientation with a 60-bp gap between them, that alignment is considered valid (as long as `-I` is also satisfied). A 61-bp gap would not be valid in that case. If trimming options `-3` or `-5` are also used, the -X constraint is applied with respect to the untrimmed mates, not the trimmed mates. The larger the difference between `-I` and `-X`, the slower HISAT2 will run. This is because larger differences between `-I` and `-X` require that HISAT2 scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), HISAT2 is very efficient. Default: 500. --known-splicesite-infile <path> With this mode, you can provide a list of known splice sites, which HISAT2 makes use of to align reads with small anchors. 
You can create such a list using python hisat2_extract_splice_sites.py genes.gtf > splicesites.txt, where hisat2_extract_splice_sites.py is included in the HISAT2 package, genes.gtf is a gene annotation file, and splicesites.txt is a list of splice sites with which you provide HISAT2 in this mode. Note that it is better to use indexes built using annotated transcripts (such as genome_tran or genome_snp_tran), which works better than using this option. It has no effect to provide splice sites that are already included in the indexes. --tmo/--transcriptome-mapping-only Report only those alignments within known transcripts. --dta/--downstream-transcriptome-assembly Report alignments tailored for transcript assemblers including StringTie. With this option, HISAT2 requires longer anchor lengths for de novo discovery of splice sites. This leads to fewer alignments with short-anchors, which helps transcript assemblers improve significantly in computation and memory usage. --dta-cufflinks Report alignments tailored specifically for Cufflinks. In addition to what HISAT2 does with the above option (--dta), with this option HISAT2 looks for novel splice sites with three signals (GT/AG, GC/AG, AT/AC), but all user-provided splice sites are used irrespective of their signals. HISAT2 produces an optional field, XS:A:[+-], for every spliced alignment. --no-templatelen-adjustment Disables template length adjustment for RNA-seq reads. ----- **Reporting options**:: -k <int> It searches for at most `<int>` distinct, primary alignments for each read. Primary alignments mean alignments whose alignment score is equal or higher than any other alignments. The search terminates when it can't find more distinct valid alignments, or when it finds `<int>`, whichever happens first. The alignment score for a paired-end alignment equals the sum of the alignment scores of the individual mates. 
Each reported read or pair alignment beyond the first has the SAM 'secondary' bit (which equals 256) set in its FLAGS field. For reads that have more than `<int>` distinct, valid alignments, hisat2 does not guarantee that the `<int>` alignments reported are the best possible in terms of alignment score. Default: 5 (HFM) or 10 (HGFM) Note: HISAT2 is not designed with large values for `-k` in mind, and when aligning reads to long, repetitive genomes large `-k` can be very, very slow. --max-seeds HISAT2, like other aligners, uses seed-and-extend approaches. HISAT2 tries to extend seeds to full-length alignments. In HISAT2, `--max-seeds` is used to control the maximum number of seeds that will be extended. HISAT2 extends up to these many seeds and skips the rest of the seeds. Large values for `--max-seeds` may improve alignment sensitivity, but HISAT2 is not designed with large values for `--max-seeds` in mind, and when aligning reads to long, repetitive genomes large `--max-seeds` can be very, very slow. The default value is the maximum of 5 and the value that comes with `-k`. --secondary Report secondary alignments. ----- **Paired-end options**:: --fr/--rf/--ff The upstream/downstream mate orientations for a valid paired-end alignment against the forward reference strand. E.g., if `--fr` is specified and there is a candidate paired-end alignment where mate 1 appears upstream of the reverse complement of mate 2 and the fragment length constraints (`-I` and `-X`) are met, that alignment is valid. Also, if mate 2 appears upstream of the reverse complement of mate 1 and all other constraints are met, that too is valid. `--rf` likewise requires that an upstream mate1 be reverse-complemented and a downstream mate2 be forward-oriented. `--ff` requires both an upstream mate 1 and a downstream mate 2 to be forward-oriented. Default: `--fr` (appropriate for Illumina's Paired-end Sequencing Assay). 
--no-mixed By default, when `hisat2` cannot find a concordant or discordant alignment for a pair, it then tries to find alignments for the individual mates. This option disables that behavior. --no-discordant By default, `hisat2` looks for discordant alignments if it cannot find any concordant alignments. A discordant alignment is an alignment where both mates align uniquely, but that does not satisfy the paired-end constraints (`--fr`/`--rf`/`--ff`, `-I`, `-X`). This option disables that behavior. **Output options**:: --un/--un-gz/--un-bz2 Write unpaired reads that fail to align to file at `<path>`. These reads correspond to the SAM records with the FLAGS `0x4` bit set and neither the `0x40` nor `0x80` bits set. If `--un-gz` is specified, output will be gzip compressed. If `--un-bz2` is specified, output will be bzip2 compressed. Reads written in this way will appear exactly as they did in the input file, without any modification (same sequence, same name, same quality string, same quality encoding). Reads will not necessarily appear in the same order as they did in the input. --al/--al-gz/--al-bz2 Write unpaired reads that align at least once to file at `<path>`. These reads correspond to the SAM records with the FLAGS `0x4`, `0x40`, and `0x80` bits unset. If `--al-gz` is specified, output will be gzip compressed. If `--al-bz2` is specified, output will be bzip2 compressed. Reads written in this way will appear exactly as they did in the input file, without any modification (same sequence, same name, same quality string, same quality encoding). Reads will not necessarily appear in the same order as they did in the input. --un-conc/--un-conc-gz/--un-conc-bz2 Write paired-end reads that fail to align concordantly to file(s) at `<path>`. These reads correspond to the SAM records with the FLAGS `0x4` bit set and either the `0x40` or `0x80` bit set (depending on whether it's mate #1 or #2). 
.1 and .2 strings are added to the filename to distinguish which file contains mate #1 and mate #2. If a percent symbol, %, is used in <path>, the percent symbol is replaced with 1 or 2 to make the per-mate filenames. Otherwise, .1 or .2 are added before the final dot in <path> to make the per-mate filenames. Reads written in this way will appear exactly as they did in the input files, without any modification (same sequence, same name, same quality string, same quality encoding). Reads will not necessarily appear in the same order as they did in the inputs. --al-conc/--al-conc-gz/--al-conc-bz2 Write paired-end reads that align concordantly at least once to file(s) at `<path>`. These reads correspond to the SAM records with the FLAGS `0x4` bit unset and either the `0x40` or `0x80` bit set (depending on whether it's mate #1 or #2). .1 and .2 strings are added to the filename to distinguish which file contains mate #1 and mate #2. If a percent symbol, %, is used in <path>, the percent symbol is replaced with 1 or 2 to make the per-mate filenames. Otherwise, .1 or .2 are added before the final dot in `<path>` to make the per-mate filenames. Reads written in this way will appear exactly as they did in the input files, without any modification (same sequence, same name, same quality string, same quality encoding). Reads will not necessarily appear in the same order as they did in the inputs. **Other options**:: --seed <int> Use `<int>` as the seed for pseudo-random number generator. Default: 0. --non-deterministic Normally, HISAT2 re-initializes its pseudo-random generator for each read. It seeds the generator with a number derived from (a) the read name, (b) the nucleotide sequence, (c) the quality sequence, (d) the value of the `--seed` option. This means that if two reads are identical (same name, same nucleotides, same qualities) HISAT2 will find and report the same alignment(s) for both, even if there was ambiguity. 
When `--non-deterministic` is specified, HISAT2 re-initializes its pseudo-random generator for each read using the current time. This means that HISAT2 will not necessarily report the same alignment for two identical reads. This is counter-intuitive for some users, but might be more appropriate in situations where the input consists of many identical reads. ]]></help> <!-- Publication to cite for this tool, referenced by its DOI. --> <citations> <citation type="doi">10.1038/nmeth.3317</citation> </citations> </tool>