Mercurial > repos > devteam > gffread
diff gffread.xml @ 8:154d00cbbf2d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gffread commit f40643d8b80299ebb84faebe92579321ac459746"
author | iuc |
---|---|
date | Sat, 25 Sep 2021 15:38:31 +0000 |
parents | 4dea02886337 |
children | 3e436657dcd0 |
line wrap: on
line diff
--- a/gffread.xml Mon Nov 11 18:27:46 2019 -0500 +++ b/gffread.xml Sat Sep 25 15:38:31 2021 +0000 @@ -1,13 +1,21 @@ -<tool id="gffread" name="gffread" version="@VERSION@.0"> +<tool id="gffread" name="gffread" version="@GALAXY_TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05"> <description>Filters and/or converts GFF3/GTF2 records</description> + <xrefs> + <xref type="bio.tools">gffread</xref> + </xrefs> <macros> - <token name="@VERSION@">0.11.6</token> + <!-- the version of this tool must not be lowered since in the past 2.x was used + lets use small increments and hope that gffread catches up one day --> + <token name="@GALAXY_TOOL_VERSION@">2.2.1.3</token> + <token name="@TOOL_VERSION@">0.12.7</token> + <token name="@VERSION_SUFFIX@">0</token> <xml name="fasta_output_select"> <param name="fa_outputs" type="select" display="checkboxes" multiple="true" label="Select fasta outputs"> - <option value="-w exons.fa">fasta file with spliced exons for each GFF transcript (-w exons.fa)</option> - <option value="-x cds.fa">fasta file with spliced CDS for each GFF transcript (-x cds.fa)</option> - <option value="-y pep.fa">protein fasta file with the translation of CDS for each record (-y pep.fa)</option> + <option value="-w exons.fa">fasta file with spliced exons for each GFF transcript (-w)</option> + <option value="-x cds.fa">fasta file with spliced CDS for each GFF transcript (-x)</option> + <option value="-y pep.fa">protein fasta file with the translation of CDS for each record (-y)</option> <option value="-W">for each fasta: record the exon coordinates projected onto the spliced sequence (-W)</option> + <option value="-S">for protein fasta: use '*' instead of '.' as stop codon translation (-S)</option> </param> </xml> <xml name="ref_filtering_select"> @@ -22,14 +30,14 @@ </param> </xml> <xml name="trackname"> - <param name="tname" type="text" value="" optional="true" label="Trackname to use in the second column of each GFF output line" help="(-t track_name}"> + <param argument="-t" name="tname" type="text" value="" optional="true" label="Trackname to use in the second column of each GFF output line" help=""> <validator type="regex">\w+</validator> </param> </xml> <xml name="merge_opts"> <option value="-K">also collapse shorter, fully contained transcripts with fewer introns than the container (-K)</option> <option value="-Q">remove the containment restriction: multi-exon transcripts will be collapsed if just their introns match, while single-exon transcripts can partially overlap 80% (-Q)</option> - <option value="-d dupinfo">output collapsing info (-d dupinfo)</option> + <option value="-d dupinfo">output collapsing info (-d)</option> </xml> <xml name="cluster_opts"> <option value="--force-exons"> make sure that the lowest level GFF features are printed as 'exon' features (--force-exons)</option> @@ -48,14 +56,19 @@ </xml> </macros> <requirements> - <requirement type="package" version="@VERSION@">gffread</requirement> + <requirement type="package" version="@TOOL_VERSION@">gffread</requirement> </requirements> + <version_command>gffread --version</version_command> <command detect_errors="aggressive"> <![CDATA[ #if $reference_genome.source == 'history': ln -s '$reference_genome.genome_fasta' genomeref.fa && #end if + gffread '$input' + #if $input.ext.startswith("bed") + --in-bed + #end if #if $reference_genome.source == 'cached': -g '${reference_genome.fasta_indexes.fields.path}' #if $reference_genome.ref_filtering and str($reference_genome.ref_filtering) != '': @@ -104,22 +117,68 @@ #end if #end if #if $gffs.gff_fmt != 'none': - #if $gffs.tname: + #if $gffs.gff_fmt != 'bed' and $gffs.tname: -t '$gffs.tname' #end if #if $gffs.gff_fmt == 'gff': + ## TODO bug 'gft' -> 'gtf' #if $input.datatype.file_ext == 'gft': $gffs.ensembl #end if - $gffs.output_cmd - #elif $gffs.gff_fmt == 'gtf': - $gffs.output_cmd + #end if + #if $gffs.gff_fmt == 'gtf' + -T + #elif $gffs.gff_fmt == 'bed' + --bed #end if + -o output.$gffs.gff_fmt #end if + +## Missing options +## +## --ids +## --nids +## -l +## --jmatch +## --nc +## --ignore-locus +## -A -s (see above) +## --sort-alpha : chromosomes (reference sequences) are sorted alphabetically +## --sort-by : sort the reference sequences by the order in which their +## names are given in the <refseq.lst> file +## Misc +## --keep-exon-attrs : for -F option, do not attempt to reduce redundant +## --attrs +## --keep-genes : in transcript-only mode (default), also preserve gene records +## --keep-comments: for GFF3 input/output, try to preserve comments +## -B (see above) +## -P +## --add-hasCDS : add a "hasCDS" attribute with value "true" for transcripts +## that have CDS features +## --adj-stop stop codon adjustment: enables -P and performs automatic +## adjustment of the CDS stop coordinate if premature or downstream + +## --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS +## features (see --tlf option below); automatic if the input +## filename ends with .tlf) +## --stream: fast processing of input GFF/BED transcripts as they are received +## ((no sorting, exons must be grouped by transcript in the input data) + +## Clustering + +## -Y + +## Output + +## --gene2exon +## --t-adopt +## -j +## --w-add +## --w-nocds ]]> </command> <inputs> - <param name="input" type="data" format="gff3,gtf" label="Input GFF3 or GTF feature file"/> + <param name="input" type="data" format="bed,gff3,gtf" label="Input BED, GFF3 or GTF feature file"/> <!-- filtering --> <param name="filtering" type="select" display="checkboxes" multiple="true" label="filters"> <option value="-U">discard single-exon transcripts (-U)</option> @@ -135,9 +194,9 @@ </param> <when value="none"/> <when value="filter"> - <param name="range" type="text" value="" label="Only show transcripts overlapping coordinate range"> + <param argument="-r" name="range" type="text" value="" label="Only show transcripts overlapping coordinate range"> <help><![CDATA[ - (-r [['strand']'chr':]'start'..'end') <br> + [['strand']'chr':]'start'..'end' <br> examples: <br> 1000..500000 <br> chr1:1000..500000 <br> @@ -147,14 +206,14 @@ </help> <validator type="regex">(([+-])?(\w+:))?\d+\.\.\d+</validator> </param> - <param name="discard_partial" type="boolean" truevalue="-R" falsevalue="" checked="false" - label="Discard all transcripts that are not fully contained within the given range" help="(-R)"/> + <param argument="-R" name="discard_partial" type="boolean" truevalue="-R" falsevalue="" checked="false" + label="Discard all transcripts that are not fully contained within the given range" help=""/> </when> </conditional> - <param name="maxintron" type="integer" value="" optional="true" min="0" label="Filter out transcipts with large introns" - help="If set, discard transcripts having an intron larger (-i max_intron)"/> - <param name="chr_replace" type="data" format="tabular" optional="true" label="Replace reference sequence names" > - <help><![CDATA[(-m chr_replace) <br> + <param argument="-i" name="maxintron" type="integer" value="" optional="true" min="0" label="Filter out transcipts with large introns" + help="If set, discard transcripts having an intron larger"/> + <param argument="-m" name="chr_replace" type="data" format="tabular" optional="true" label="Replace reference sequence names" > + <help><![CDATA[ chr_replace is a reference sequence replacement table consisting of 2 columns: "original_ref_ID" "new_ref_ID"<br> It is useful for switching between Ensembl and UCSC naming conventions <br> NOTE: GFF records on reference sequences that are not found among the "original_ref_ID" entries in this file will be filtered out @@ -174,10 +233,10 @@ <!-- merging --> <conditional name="merging"> - <param name="merge_sel" type="select" label="Transcript merging" help="(-M/--merge or --cluster-only)"> + <param name="merge_sel" type="select" label="Transcript merging" help=""> <option value="none">none</option> - <option value="merge">merge: cluster the input transcripts into loci, collapsing matching transcripts</option> - <option value="cluster">cluster-only: merge but without collapsing matching transcripts</option> + <option value="merge">merge: cluster the input transcripts into loci, collapsing matching transcripts (--merge)</option> + <option value="cluster">cluster-only: merge but without collapsing matching transcripts (--cluster-only)</option> </param> <when value="none"/> <when value="merge"> @@ -192,7 +251,7 @@ <!-- reference sequence file --> <!-- Error: -g option is required for options -w, -x, -y, -V, -N, -M --> <conditional name="reference_genome"> - <param name="source" type="select" label="Reference Genome" help="(-g genome.fasta) NOTE: Required for fasta outputs"> + <param name="source" type="select" label="Reference Genome" help="NOTE: Required for fasta outputs"> <option value="none">none</option> <option value="cached"></option> <option value="history">From your history</option> @@ -200,14 +259,14 @@ <when value="none"> </when> <when value="cached"> - <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <param argument="-g" name="fasta_indexes" type="select" label="Source FASTA Sequence"> <options from_data_table="all_fasta"/> </param> <expand macro="ref_filtering_select" /> <expand macro="fasta_output_select" /> </when> <when value="history"> - <param name="genome_fasta" type="data" format="fasta" label="Genome Reference Fasta"/> + <param argument="-g" name="genome_fasta" type="data" format="fasta" label="Genome Reference Fasta"/> <expand macro="ref_filtering_select" /> <expand macro="fasta_output_select" /> </when> @@ -219,35 +278,39 @@ <option value="none">none</option> <option value="gff">GFF</option> <option value="gtf">GTF</option> + <option value="bed">BED</option> </param> <when value="none"> </when> <when value="gff"> - <param name="output_cmd" type="hidden" value="-o output.gff3"/> - <param name="ensembl" type="boolean" truevalue="-L" falsevalue="" checked="false" label="Ensembl GTF to GFF3 conversion" help="(-L)"/> + <param argument="-L" name="ensembl" type="boolean" truevalue="-L" falsevalue="" checked="false" label="Ensembl GTF to GFF3 conversion" help=""/> <expand macro="trackname" /> </when> <when value="gtf"> - <param name="output_cmd" type="hidden" value="-T -o output.gtf"/> <expand macro="trackname" /> </when> + <when value="bed"> + </when> </conditional> - <param name="full_gff_attribute_preservation" type="boolean" truevalue="-F" falsevalue="" checked="false" - label="full GFF attribute preservation (all attributes are shown)" help="(-F)"/> - <param name="decode_url" type="boolean" truevalue="-D" falsevalue="" checked="false" - label="decode url encoded characters within attributes" help="(-D)"/> - <param name="expose" type="boolean" truevalue="-E" falsevalue="" checked="false" - label="warn about duplicate transcript IDs and other potential problems with the given GFF/GTF records" help="(-E)"/> + <param argument="-F" name="full_gff_attribute_preservation" type="boolean" truevalue="-F" falsevalue="" checked="false" + label="full GFF attribute preservation (all attributes are shown)" help=""/> + <param argument="-D" name="decode_url" type="boolean" truevalue="-D" falsevalue="" checked="false" + label="decode url encoded characters within attributes" help=""/> + <param argument="-E" name="expose" type="boolean" truevalue="-E" falsevalue="" checked="false" + label="warn about duplicate transcript IDs and other potential problems with the given GFF/GTF records" help=""/> </inputs> <outputs> - <data name="output_gff" format="gff3" metadata_source="input" label="${tool.name} on ${on_string}: gff3" from_work_dir="output.gff3"> + <data name="output_gff" format="gff3" metadata_source="input" label="${tool.name} on ${on_string}: gff3" from_work_dir="output.gff"> <filter>gffs['gff_fmt'] == 'gff'</filter> </data> <data name="output_gtf" format="gtf" metadata_source="input" label="${tool.name} on ${on_string}: gtf" from_work_dir="output.gtf"> <filter>gffs['gff_fmt'] == 'gtf'</filter> </data> + <data name="output_bed" format="bed" metadata_source="input" label="${tool.name} on ${on_string}: bed" from_work_dir="output.bed"> + <filter>gffs['gff_fmt'] == 'bed'</filter> + </data> <data name="output_exons" format="fasta" label="${tool.name} on ${on_string}: exons.fa" from_work_dir="exons.fa"> <filter>'fa_outputs' in reference_genome and str(reference_genome['fa_outputs']).find('exons.fa') > 0 </filter> </data> @@ -262,28 +325,48 @@ </data> </outputs> <tests> - <test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="gff_fmt" value="gff"/> - <output name="output_gff" file="Homo_sapiens.GRCh37_19.71.gff3" ftype="gff3" lines_diff="2" /> + <output name="output_gff" file="Homo_sapiens.GRCh37_19.71.gff3" ftype="gff3" lines_diff="4" /> </test> - <test> + <test expect_num_outputs="1"> + <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> + <param name="gff_fmt" value="gff"/> + <output name="output_gff" file="Homo_sapiens.GRCh37_19.71.gff3" ftype="gff3" lines_diff="4" /> + </test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="ecoli-k12.gff3"/> <param name="gff_fmt" value="gff"/> <param name="full_gff_attribute_preservation" value="-F"/> - <output name="output_gff" file="ecoli-k12.processed.gff3" ftype="gff3" lines_diff="2" /> + <output name="output_gff" file="ecoli-k12.processed.gff3" ftype="gff3" lines_diff="4" /> </test> - <test> - <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> - <param name="filtering" value="--no-pseudo"/> - <param name="gff_fmt" value="gtf"/> - <output name="output_gtf"> + <!-- bed output --> + <test expect_num_outputs="1"> + <param name="input" ftype="gff3" value="Homo_sapiens.GRCh37_19.71.gff3"/> + <param name="gff_fmt" value="bed"/> + <output name="output_bed" ftype="bed"> <assert_contents> - <not_has_text text="pseudo" /> + <has_n_lines n="42"/> + <has_n_columns n="13"/> </assert_contents> </output> </test> - <test> + <!-- bed input and test tname --> + <test expect_num_outputs="1"> + <param name="input" ftype="bed" value="Homo_sapiens.GRCh37_19.71.bed"/> + <param name="gff_fmt" value="gff"/> + <param name="tname" value="track name"/> + <output name="output_bed" ftype="gff3"> + <assert_contents> + <has_n_lines n="388"/> + <!-- this will work with https://github.com/galaxyproject/galaxy/pull/12528 --> + <!-- <has_n_columns n="9" comment="#"/> --> + <has_text text="track name"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="region_filter" value="filter"/> <param name="range" value="19:496500..504965"/> @@ -295,7 +378,7 @@ </assert_contents> </output> </test> - <test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="region_filter" value="filter"/> <param name="range" value="19:496500..504965"/> @@ -308,7 +391,7 @@ </assert_contents> </output> </test> - <test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="filtering" value="-C"/> <param name="region_filter" value="filter"/> @@ -321,7 +404,7 @@ </assert_contents> </output> </test> - <test> + <test expect_num_outputs="4"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="source" value="history"/> <param name="genome_fasta" ftype="fasta" value="Homo_sapiens.GRCh37.71.dna.chromosome.19.fa"/> @@ -354,7 +437,18 @@ </assert_contents> </output> </test> - + <test expect_num_outputs="1"> + <param name="input" ftype="gtf" value="stop_codons.gtf"/> + <param name="source" value="history"/> + <param name="genome_fasta" ftype="fasta" value="Homo_sapiens.GRCh37.71.dna.chromosome.19.fa"/> + <param name="fa_outputs" value="-y pep.fa,-S"/> + <output name="output_pep"> + <assert_contents> + <has_text text="ENST00000269812" /> + <has_text text="PLRGLHPRV*LQTPLERCPCWPPAGGTGGCPHCLLHLRLLQSPTPTALSEGGGAGTEAQPVTDVDPGRG*" /> + </assert_contents> + </output> + </test> </tests> <help> <![CDATA[ @@ -364,30 +458,32 @@ .. _stringtie: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread - -gffread v0.11.4. Usage: :: +gffread v0.12.7. Usage: :: - gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] - [-o <outfile>] [-t <trackname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]] + gffread [-g <genomic_seqs_fasta> | <dir>] [-s <seq_info.fsize>] + [-o <outfile>] [-t <trackname>] [-r [<strand>]<chr>:<start>-<end> [-R]] + [--jmatch <chr>:<start>-<end>] [--no-pseudo] [-CTVNJMKQAFPGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>] - [-i <maxintron>] [--bed] [--table <attrlist>] [--sort-by <refseq_list.txt>] - + [-j ][--ids <IDs.lst> | --nids <IDs.lst>] [--attrs <attr-list>] [-i <maxintron>] + [--stream] [--bed | --gtf | --tlf] [--table <attrlist>] [--sort-by <ref.lst>] + [<input_gff>] + Filter, convert or cluster GFF/GTF/BED records, extract the sequence of transcripts (exon or CDS) and more. By default (i.e. without -O) only transcripts are processed, discarding any other non-transcript features. Default output is a simplified GFF3 with only the basic attributes. - <input_gff> is a GFF file, use '-' for stdin - Options: - + --ids discard records/transcripts if their IDs are not listed in <IDs.lst> + --nids discard records/transcripts if their IDs are listed in <IDs.lst> -i discard transcripts having an intron larger than <maxintron> -l discard transcripts shorter than <minlen> bases -r only show transcripts overlapping coordinate range <start>..<end> (on chromosome/contig <chr>, strand <strand> if provided) -R for -r option, discard all transcripts that are not fully contained within the given range + --jmatch only output transcripts matching the given junction -U discard single-exon transcripts -C coding only: discard mRNAs that have no CDS features --nc non-coding only: discard mRNAs that have CDS features @@ -398,18 +494,18 @@ for each of the mapped sequences: <seq-name> <seq-length> <seq-description> (useful for -A option with mRNA/EST/protein mappings) - - Sorting: (by default, chromosomes are kept in the order they were found) + Sorting: (by default, chromosomes are kept in the order they were found) --sort-alpha : chromosomes (reference sequences) are sorted alphabetically --sort-by : sort the reference sequences by the order in which their names are given in the <refseq.lst> file - Misc options: - -F preserve all GFF attributes (for non-exon features) + -F keep all GFF attributes (for non-exon features) --keep-exon-attrs : for -F option, do not attempt to reduce redundant exon/CDS attributes -G do not keep exon attributes, move them to the transcript feature (for GFF3 output) + --attrs <attr-list> only output the GTF/GFF attributes listed in <attr-list> + which is a comma delimited list of attribute names to --keep-genes : in transcript-only mode (default), also preserve gene records --keep-comments: for GFF3 input/output, try to preserve comments -O process other non-transcript GFF records (by default non-transcript @@ -437,10 +533,11 @@ --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS features (see --tlf option below); automatic if the input filename ends with .tlf) - + --stream: fast processing of input GFF/BED transcripts as they are received + ((no sorting, exons must be grouped by transcript in the input data) Clustering: -M/--merge : cluster the input transcripts into loci, discarding - "duplicated" transcripts (those with the same exact introns + "redundant" transcripts (those with the same exact introns and fully contained or equal boundaries) -d <dupinfo> : for -M option, write duplication info to file <dupinfo> --cluster-only: same as -M/--merge but without discarding any of the @@ -452,7 +549,6 @@ multi-exon transcripts, and >=80% overlap for single-exon transcripts -Y for -M option, enforce -Q but also discard overlapping single-exon transcripts, even on the opposite strand (can be combined with -K) - Output options: --force-exons: make sure that the lowest level GFF features are considered "exon" features @@ -465,25 +561,26 @@ -g full path to a multi-fasta file with the genomic sequences for all input mappings, OR a directory with single-fasta files (one per genomic sequence, with file names matching sequence names) - -w write a fasta file with spliced exons for each GFF transcript + -j output the junctions and the corresponding transcripts + -w write a fasta file with spliced exons for each transcript + --w-add <N> for the -w option, extract additional <N> bases + both upstream and downstream of the transcript boundaries + --w-nocds for -w, disable the output of CDS info in the FASTA file -x write a fasta file with spliced CDS for each GFF transcript -y write a protein fasta file with the translation of CDS for each record - -W for -w and -x options, write in the FASTA defline the exon + -W for -w, -x and -y options, write in the FASTA defline all the exon coordinates projected onto the spliced sequence; - for -y option, write transcript attributes in the FASTA defline -S for -y option, use '*' instead of '.' as stop codon translation - -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m) + -L Ensembl GTF to GFF3 conversion, adds version to IDs -m <chr_replace> is a name mapping table for converting reference sequence names, having this 2-column format: <original_ref_ID> <new_ref_ID> - WARNING: all GFF records on reference sequences whose original IDs - are not found in the 1st column of this table will be discarded! -t use <trackname> in the 2nd column of each GFF/GTF output line - -o write the records into <outfile> instead of stdout + -o write the output records into <outfile> instead of stdout -T main output will be GTF instead of GFF3 --bed output records in BED format instead of default GFF3 --tlf output "transcript line format" which is like GFF - but exons, CDS features and related data are stored as GFF + but with exons and CDS related features stored as GFF attributes in the transcript feature line, like this: exoncount=N;exons=<exons>;CDSphase=<N>;CDS=<CDScoords> <exons> is a comma-delimited list of exon_start-exon_end coordinates; @@ -491,9 +588,14 @@ --table output a simple tab delimited format instead of GFF, with columns having the values of GFF attributes given in <attrlist>; special pseudo-attributes (prefixed by @) are recognized: - @chr, @start, @end, @strand, @numexons, @exons, @cds, @covlen, @cdslen + @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, + @cds, @covlen, @cdslen + If any of -w/-y/-x FASTA output files are enabled, the same fields + (excluding @id) are appended to the definition line of corresponding + FASTA records -v,-E expose (warn about) duplicate transcript IDs and other potential problems with the given GFF/GTF records + ]]> </help> <citations>