Mercurial > repos > iuc > annotatemyids

--- a/annotateMyIDs.xml	Wed Jan 03 16:01:27 2018 -0500
+++ b/annotateMyIDs.xml	Sun Mar 11 05:17:23 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="annotatemyids" name="annotateMyIDs" version="3.5.0.0">
+<tool id="annotatemyids" name="annotateMyIDs" version="3.5.0.1">
     <description>annotate a generic set of identifiers</description>
     <requirements>
         <requirement type="package" version="3.5.0">bioconductor-org.hs.eg.db</requirement>
@@ -7,7 +7,7 @@
         <requirement type="package" version="3.5.0">bioconductor-org.dr.eg.db</requirement>
     </requirements>
     <version_command><![CDATA[
-echo $(R --version | grep version | grep -v GNU)", org.Hs.eg.db version" $(R --vanilla --slave -e "library(org.Hs.eg.db); cat(sessionInfo()\$otherPkgs\$org.Hs.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Dr.eg.db version" $(R --vanilla --slave -e "library(org.Dr.eg.db); cat(sessionInfo()\$otherPkgs\$org.Dr.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Dm.eg.db version" $(R --vanilla --slave -e "library(org.Dm.eg.db); cat(sessionInfo()\$otherPkgs\$org.Dm.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Mm.eg.db version" $(R --vanilla --slave -e "library(org.Mm.eg.db); cat(sessionInfo()\$otherPkgs\$org.Mm.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", optparse version" $(R --vanilla --slave -e "library(optparse); cat(sessionInfo()\$otherPkgs\$optparse\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
+echo $(R --version | grep version | grep -v GNU)", org.Hs.eg.db version" $(R --vanilla --slave -e "library(org.Hs.eg.db); cat(sessionInfo()\$otherPkgs\$org.Hs.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Dr.eg.db version" $(R --vanilla --slave -e "library(org.Dr.eg.db); cat(sessionInfo()\$otherPkgs\$org.Dr.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Dm.eg.db version" $(R --vanilla --slave -e "library(org.Dm.eg.db); cat(sessionInfo()\$otherPkgs\$org.Dm.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Mm.eg.db version" $(R --vanilla --slave -e "library(org.Mm.eg.db); cat(sessionInfo()\$otherPkgs\$org.Mm.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
     ]]></version_command>
     <command detect_errors="exit_code"><![CDATA[
         #if $rscriptOpt:
@@ -27,6 +27,7 @@
 organism <- "${organism}"
 output_cols <- "${output_cols}"
 file_has_header <- ${file_has_header}
+remove_dups <- ${remove_dups}

 ids <- as.character(read.table('$id_file', header=file_has_header)[,1])

@@ -48,6 +49,11 @@

 cols <- unlist(strsplit(output_cols, ","))
 result <- select(db, keys=ids, keytype=id_type, columns=cols)
+
+if (remove_dups) {  # keep only the first row returned for each input ID
+    result <- result[!duplicated(result[["${id_type}"]]), , drop=FALSE]  # drop=FALSE: a one-column result stays a data frame
+}
+
 write.table(result, file='$out_tab', sep="\t", row.names=FALSE, quote=FALSE)

     ]]></configfile>
@@ -87,6 +93,7 @@
             <option value="ONTOLOGY">ONTOLOGY</option>
             <option value="PATH">KEGG</option>
         </param>
+        <param name="remove_dups" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Remove duplicates?" help="If this option is set to Yes, only the first occurrence of each input Gene ID will be kept. Default: No" /> <!-- truevalue/falsevalue are R logical literals: the chosen value is inserted unquoted into the R script as remove_dups -->
         <param name="rscriptOpt" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output Rscript?" help="If this option is set to Yes, the Rscript used to annotate the IDs will be provided as a text file in the output. Default: No" />
     </inputs>
     <outputs>
@@ -118,7 +125,16 @@
             <param name="id_type" value="ENSEMBL"/>
             <param name="organism" value="Hs"/>
             <param name="output_cols" value="ENSEMBL,GO,ONTOLOGY,EVIDENCE" />
-            <output name="out_tab" file="out_gokegg.tab" compare="contains" />
+            <output name="out_tab" file="out_gokegg.tab" />
+        </test>
+        <!-- Ensure duplicate Gene ID removal works: same id_type, organism and output_cols as the test above, but with remove_dups enabled; the expected file holds one row per Gene ID -->
+        <test expect_num_outputs="1">
+            <param name="id_file" value="ensembl_ids.tab" ftype="tabular"/>
+            <param name="id_type" value="ENSEMBL"/>
+            <param name="organism" value="Hs"/>
+            <param name="output_cols" value="ENSEMBL,GO,ONTOLOGY,EVIDENCE" />
+            <param name="remove_dups" value="True" />
+            <output name="out_tab" file="out_gokegg_dupsrem.tab" />
         </test>
     </tests>
     <help><![CDATA[
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_gokegg_dupsrem.tab	Sun Mar 11 05:17:23 2018 -0400
@@ -0,0 +1,9 @@
+ENSEMBL	GO	ONTOLOGY	EVIDENCE
+ENSG00000091831	GO:0000122	BP	IMP
+ENSG00000082175	GO:0000978	MF	IDA
+ENSG00000141736	GO:0000165	BP	TAS
+ENSG00000012048	GO:0000151	CC	NAS
+ENSG00000139618	GO:0000722	BP	IEA
+ENSG00000129514	GO:0000122	BP	IEA
+ENSG00000171862	GO:0000079	BP	TAS
+ENSG00000141510	GO:0000122	BP	IBA