comparison annotateMyIDs.xml @ 0:442e2d79b05c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/annotatemyids commit 46271ad3015ee41a825860084b2ab9d8081ecab8
author iuc
date Wed, 03 Jan 2018 15:25:26 -0500
parents
children 2e1b256f732f
comparison
equal deleted inserted replaced
-1:000000000000 0:442e2d79b05c
1 <tool id="annotateMyIDs" name="annotateMyIDs" version="3.5.0.0">
2 <description>annotate a generic set of identifiers</description>
3 <requirements> <!-- One Bioconductor org.*.eg.db annotation package per organism offered in the "organism" select (Hs, Mm, Dm, Dr) -->
4 <requirement type="package" version="3.5.0">bioconductor-org.hs.eg.db</requirement>
5 <requirement type="package" version="3.5.0">bioconductor-org.mm.eg.db</requirement>
6 <requirement type="package" version="3.5.0">bioconductor-org.dm.eg.db</requirement>
7 <requirement type="package" version="3.5.0">bioconductor-org.dr.eg.db</requirement>
8 </requirements>
9 <version_command><![CDATA[
10 echo $(R --version | grep version | grep -v GNU)", org.Hs.eg.db version" $(R --vanilla --slave -e "library(org.Hs.eg.db); cat(sessionInfo()\$otherPkgs\$org.Hs.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Dr.eg.db version" $(R --vanilla --slave -e "library(org.Dr.eg.db); cat(sessionInfo()\$otherPkgs\$org.Dr.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Dm.eg.db version" $(R --vanilla --slave -e "library(org.Dm.eg.db); cat(sessionInfo()\$otherPkgs\$org.Dm.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Mm.eg.db version" $(R --vanilla --slave -e "library(org.Mm.eg.db); cat(sessionInfo()\$otherPkgs\$org.Mm.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
11 ]]></version_command> <!-- Reports the R version followed by the version of each org.*.eg.db package listed in the requirements; only packages the tool declares and loads are queried -->
12 <command detect_errors="exit_code"><![CDATA[
13 #if $rscriptOpt:
14 cp '${annotatemyids_script}' '${out_rscript}' &&
15 #end if
16 Rscript '${annotatemyids_script}'
17 ]]>
18 </command> <!-- Runs the generated annotatemyids_script configfile with Rscript; when rscriptOpt is set, the script is first copied to the out_rscript output. Failure is detected from the exit code. -->
19 <configfiles>
20 <configfile name="annotatemyids_script"><![CDATA[
21 options( show.error.messages=FALSE, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, FALSE ) } )
22 # On any R error: print the message to stderr and quit with status 1, which the command picks up via detect_errors="exit_code".
23 # we need that to not crash galaxy with an UTF8 error on German LC settings.
24 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
25
26 id_type <- "${id_type}"
27 organism <- "${organism}"
28 output_cols <- "${output_cols}"
29 file_has_header <- ${file_has_header}
30 # Take the IDs from the first column. The input is tab-delimited, so split on tabs only and turn off quote and comment parsing; otherwise spaces, apostrophes or hash signs in any column can break parsing.
31 ids <- as.character(read.table('$id_file', header=file_has_header, sep="\t", quote="", comment.char="")[,1])
32 # Attach the annotation database matching the selected organism.
33 if(organism == "Hs"){
34 suppressPackageStartupMessages(library(org.Hs.eg.db))
35 db <- org.Hs.eg.db
36 } else if (organism == "Mm"){
37 suppressPackageStartupMessages(library(org.Mm.eg.db))
38 db <- org.Mm.eg.db
39 } else if (organism == "Dm"){
40 suppressPackageStartupMessages(library(org.Dm.eg.db))
41 db <- org.Dm.eg.db
42 } else if (organism == "Dr"){
43 suppressPackageStartupMessages(library(org.Dr.eg.db))
44 db <- org.Dr.eg.db
45 } else {
46 stop(paste("Organism type not supported", organism)) # abort here rather than continuing with an undefined db
47 }
48 # Look up the requested columns for every ID and write the result as a tab-separated table with a header row.
49 cols <- unlist(strsplit(output_cols, ","))
50 result <- select(db, keys=ids, keytype=id_type, columns=cols)
51 write.table(result, file='$out_tab', sep="\t", row.names=FALSE, quote=FALSE)
52
53 ]]></configfile>
54 </configfiles>
55 <inputs> <!-- Form parameters: id_file, file_has_header, organism, id_type and output_cols are substituted into the R configfile; rscriptOpt is used by the command and the out_rscript filter -->
56 <param name="id_file" type="data" format="tabular" label="File with IDs" help="A tabular file with the first column containing one of the supported types of identifier, see Help below." />
57 <param name="file_has_header" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="File has header?" help="If this option is set to Yes, the tool will assume that the input file has a column header in the first row and the identifiers commence on the second line. Default: No" />
58 <param name="organism" type="select" label="Organism" help="Select the organism the identifiers are from">
59 <option value="Hs" selected="true">Human</option>
60 <option value="Mm">Mouse</option>
61 <option value="Dm">Fruit fly</option>
62 <option value="Dr">Zebrafish</option>
63 </param>
64 <param name="id_type" type="select" label="ID Type" help="Select the type of IDs in your input file">
65 <option value="ENSEMBL" selected="true">Ensembl Gene</option>
66 <option value="ENSEMBLPROT">Ensembl Protein</option>
67 <option value="ENSEMBLTRANS">Ensembl Transcript</option>
68 <option value="ENTREZID">Entrez</option>
69 <option value="FLYBASE">FlyBase</option>
70 <option value="GO">GO</option>
71 <option value="PATH">KEGG</option>
72 <option value="MGI">MGI</option>
73 <option value="REFSEQ">RefSeq</option>
74 <option value="SYMBOL">Gene Symbol</option>
75 <option value="ZFIN">Zfin</option>
76 </param>
77 <param name="output_cols" type="select" multiple="True" optional="false" display="checkboxes" label="Output columns" help="Choose the columns you want in the output table. Note that selecting some columns such as GO or KEGG could make the table very large as some genes may be associated with many terms. Default: ENSEMBL, ENTREZID, SYMBOL, GENENAME">
78 <option value="ALIAS">ALIAS</option>
79 <option value="DESCRIPTION">DESCRIPTION</option>
80 <option value="ENSEMBL" selected="True">ENSEMBL</option>
81 <option value="ENTREZID" selected="True">ENTREZID</option>
82 <option value="EVIDENCE">EVIDENCE</option>
83 <option value="SYMBOL" selected="True">SYMBOL</option>
84 <option value="GENENAME" selected="True">GENENAME</option>
85 <option value="REFSEQ">REFSEQ</option>
86 <option value="GO">GO</option>
87 <option value="ONTOLOGY">ONTOLOGY</option>
88 <option value="PATH">KEGG</option>
89 </param> <!-- optional="false": at least one output column must be selected, since the R script passes the selection straight to select() -->
90 <param name="rscriptOpt" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output Rscript?" help="If this option is set to Yes, the Rscript used to annotate the IDs will be provided as a text file in the output. Default: No" />
91 </inputs>
92 <outputs> <!-- out_tab is written by the R script through its dataset path; out_rscript is only created when rscriptOpt is set. NOTE(review): the from_work_dir globs look unused because both files are written via their dataset paths; confirm before removing -->
93 <data name="out_tab" format="tabular" from_work_dir="*.tab" label="${tool.name} on ${on_string}: Annotated IDs" />
94 <data name="out_rscript" format="txt" from_work_dir="*.txt" label="${tool.name} on ${on_string}: Rscript">
95 <filter>rscriptOpt is True</filter>
96 </data>
97 </outputs>
98 <tests>
99 <!-- Ensure output table works with gene symbols as input and the default output columns -->
100 <test expect_num_outputs="1">
101 <param name="id_file" value="genelist.txt" ftype="tabular"/>
102 <param name="id_type" value="SYMBOL"/>
103 <param name="organism" value="Hs"/>
104 <output name="out_tab" file="out.tab" />
105 </test>
106 <!-- Ensure Ensembl IDs input and Rscript output work -->
107 <test expect_num_outputs="2">
108 <param name="id_file" value="ensembl_ids.tab" ftype="tabular"/>
109 <param name="id_type" value="ENSEMBL"/>
110 <param name="organism" value="Hs"/>
111 <param name="rscriptOpt" value="True" />
112 <output name="out_tab" file="out_ensembl.tab" />
113 <output name="out_rscript" file="out_rscript.txt" compare="sim_size" />
114 </test>
115 <!-- Ensure GO output columns (GO, ONTOLOGY, EVIDENCE) work; the KEGG (PATH) column is not requested by this test -->
116 <test expect_num_outputs="1">
117 <param name="id_file" value="ensembl_ids.tab" ftype="tabular"/>
118 <param name="id_type" value="ENSEMBL"/>
119 <param name="organism" value="Hs"/>
120 <param name="output_cols" value="ENSEMBL,GO,ONTOLOGY,EVIDENCE" />
121 <output name="out_tab" file="out_gokegg.tab" compare="contains" />
122 </test>
123 </tests>
124 <help><![CDATA[
125
126 .. class:: infomark
127
128 **What it does**
129
130 This tool can get annotation for a generic set of IDs, using the Bioconductor_ annotation data packages. Supported organisms are human, mouse, fruit fly and zebrafish. The org.db packages that are used here are primarily based on mapping using Entrez Gene identifiers. More information on the annotation packages can be found at the Bioconductor website, for example, information on the human annotation package (org.Hs.eg.db) can be found here_.
131
132 Examples of what this tool can be used for are:
133
134 * adding gene names to IDs
135 * mapping between IDs e.g. Entrez, Ensembl, Symbols
136 * adding GO and KEGG identifiers
137
138 .. _Bioconductor: https://www.bioconductor.org/
139 .. _here: http://bioconductor.org/packages/release/data/annotation/manuals/org.Hs.eg.db/man/org.Hs.eg.db.pdf
140
141 -----
142
143 **Inputs**
144
145 A tab-delimited file with identifiers in the first column. If the file contains a header row, set the **File has header?** option in the tool form above to Yes.
146
147 Example:
148
149 =============== =======================
150 **GeneID** *Additional Columns...*
151 --------------- -----------------------
152 ENSG00000091831
153 ENSG00000082175
154 ENSG00000141736
155 ENSG00000012048
156 ENSG00000139618
157 ENSG00000129514
158 ENSG00000171862
159 ENSG00000141510
160 =============== =======================
161
162 ID types supported for input are:
163
164 * **ENSEMBL**: Ensembl gene IDs
165 * **ENSEMBLPROT**: Ensembl protein IDs
166 * **ENSEMBLTRANS**: Ensembl transcript IDs
167 * **ENTREZID**: Entrez gene Identifiers
168 * **FLYBASE**: FlyBase accession numbers
169 * **GO**: GO Identifiers
170 * **MGI**: Jackson Laboratory MGI gene accession numbers
171 * **PATH**: KEGG Pathway Identifiers
172 * **REFSEQ**: Refseq Identifiers
173 * **SYMBOL**: The official gene symbol
174 * **ZFIN**: Zfin accession numbers
175
176 .. class:: warningmark
177
178 This tool uses the ``select`` function from the Bioconductor AnnotationDbi_ package. Note that if you request columns that have multiple matches for your IDs, ``select`` will return *one row in the output for each possible match*. This has the effect that if you request multiple columns and some of them have a many-to-one relationship to the IDs, things will continue to multiply accordingly. So it's not a good idea to request a large number of columns unless you know that what you are asking for should have a one-to-one relationship with the initial set of IDs. In general, if you need to retrieve a column like **GO** or **KEGG**, that has a many-to-one relationship to the original IDs, it is most useful to extract that separately.
179
180 .. _AnnotationDBi: https://www.bioconductor.org/packages/devel/bioc/manuals/AnnotationDbi/man/AnnotationDbi.pdf
181
182 -----
183
184 **Outputs**
185
186 If the input IDs are Ensembl, the default output will be similar to below, containing four columns. Other columns, such as GO and KEGG terms, can be selected above to be added as additional columns.
187
188 Example:
189
190 =============== ============ ========== =================================
191 **ENSEMBL** **ENTREZID** **SYMBOL** **GENENAME**
192 --------------- ------------ ---------- ---------------------------------
193 ENSG00000091831 2099 ESR1 estrogen receptor 1
194 ENSG00000082175 5241 PGR progesterone receptor
195 ENSG00000141736 2064 ERBB2 erb-b2 receptor tyrosine kinase 2
196 ENSG00000012048 672 BRCA1 breast cancer 1
197 ENSG00000139618 675 BRCA2 breast cancer 2
198 ENSG00000129514 3169 FOXA1 forkhead box A1
199 ENSG00000171862 5728 PTEN phosphatase and tensin homolog
200 ENSG00000141510 7157 TP53 tumor protein p53
201 =============== ============ ========== =================================
202
203 Columns available for output include many of the ID columns already described under Inputs above and also:
204
205 * **ALIAS**: Commonly used gene symbols
206 * **DESCRIPTION**: The description of the associated gene
207 * **EVIDENCE**: Evidence codes for GO associations with a gene of interest
208 * **GENENAME**: The full gene name
209 * **ONTOLOGY**: For GO Identifiers, which Gene Ontology (BP, CC, or MF)
210
211 ]]></help>
212 <citations> <!-- NOTE(review): the BibTeX entry below uses placeholder values (key and eprint "None", empty url); fill in the upstream project URL if one exists -->
213 <citation type="bibtex">
214 @unpublished{None,
215 author = {Mark Dunning},
216 title = {annotateMyIDs},
217 year = {2017},
218 eprint = {None},
219 url = {}
220 }</citation>
221 </citations>
222 </tool>