changeset 6:a371252a2cf6 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 0c5222345ace5054df44da29cab278f4a02e2b41
author galaxyp
date Thu, 06 Jul 2023 21:15:39 +0000
parents 265c35540faa
children 4ddc8da62671
files macros.xml test-data/Helicobacter_protein_accessions.tsv uniprotxml_downloader.py uniprotxml_downloader.xml
diffstat 4 files changed, 80 insertions(+), 43 deletions(-) [+]
line wrap: on
line diff
--- a/macros.xml	Fri Nov 04 15:08:37 2022 +0000
+++ b/macros.xml	Thu Jul 06 21:15:39 2023 +0000
@@ -3,6 +3,7 @@
         <param name="field" type="select" label="Field">
             <option value="taxonomy_name">Taxonomy Name</option>
             <option value="taxonomy_id">Taxonomy ID</option>
+            <option value="accession">Accession</option>
         </param>
     </xml>
 </macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Helicobacter_protein_accessions.tsv	Thu Jul 06 21:15:39 2023 +0000
@@ -0,0 +1,2 @@
+E1Q2I0
+E1Q3C4
\ No newline at end of file
--- a/uniprotxml_downloader.py	Fri Nov 04 15:08:37 2022 +0000
+++ b/uniprotxml_downloader.py	Thu Jul 06 21:15:39 2023 +0000
@@ -47,16 +47,16 @@
 def __main__():
     # Parse Command Line
     parser = optparse.OptionParser()
-    parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs')
-    parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs')
-    parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
+    parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search search_ids')
+    parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids')
+    parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot')
     parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
     parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format')
-    parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id'], default='taxonomy_name', help='query field')
+    parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field')
     parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
     parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
     (options, args) = parser.parse_args()
-    taxids = set(options.taxon)
+    search_ids = set(options.search_id)
     if options.input:
         with open(options.input, 'r') as inputFile:
             for linenum, line in enumerate(inputFile):
@@ -64,19 +64,19 @@
                     continue
                 fields = line.rstrip('\r\n').split('\t')
                 if len(fields) > abs(options.column):
-                    taxid = fields[options.column].strip()
-                    if taxid:
-                        taxids.add(taxid)
-    taxon_queries = [f'{options.field}:"{taxid}"' for taxid in taxids]
-    taxon_query = ' OR '.join(taxon_queries)
+                    search_id = fields[options.column].strip()
+                    if search_id:
+                        search_ids.add(search_id)
+    search_queries = [f'{options.field}:"{search_id}"' for search_id in search_ids]
+    search_query = ' OR '.join(search_queries)
     if options.output:
         dest_path = options.output
     else:
-        dest_path = "uniprot_%s.xml" % '_'.join(taxids)
+        dest_path = "uniprot_%s.xml" % '_'.join(search_ids)
     reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
     try:
         url = 'https://rest.uniprot.org/uniprotkb/stream'
-        query = "%s%s" % (taxon_query, reviewed)
+        query = "%s%s" % (search_query, reviewed)
         params = {'query': query, 'format': options.format}
         if options.debug:
             print("%s ? %s" % (url, params), file=sys.stderr)
@@ -112,7 +112,7 @@
                     else:
                         print("failed: Not a uniprot xml file", file=sys.stderr)
                         exit(1)
-        print("NCBI Taxon ID:%s" % taxids, file=sys.stdout)
+        print("Search IDs:%s" % search_ids, file=sys.stdout)
         if 'X-UniProt-Release' in response.headers:
             print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout)
         if 'X-Total-Results' in response.headers:
--- a/uniprotxml_downloader.xml	Fri Nov 04 15:08:37 2022 +0000
+++ b/uniprotxml_downloader.xml	Thu Jul 06 21:15:39 2023 +0000
@@ -1,4 +1,4 @@
-<tool id="uniprotxml_downloader" name="UniProt" version="2.3.0" profile="21.01">
+<tool id="uniprotxml_downloader" name="UniProt" version="2.4.0" profile="21.01">
     <description>download proteome as XML or fasta</description>
     <macros>
         <import>macros.xml</import>
@@ -12,32 +12,32 @@
     <command>
 <![CDATA[
 python '$__tool_directory__/uniprotxml_downloader.py'
-#if $taxid.input_choice == 'common':
-    --taxon $taxid.organism
+#if $input_method.input_choice == 'common':
+    --search-id $input_method.organism
     --field taxonomy_id
-    #if $taxid.reviewed:
-        --reviewed=$taxid.reviewed
+    #if $input_method.reviewed:
+        --reviewed=$input_method.reviewed
     #end if
-#elif $taxid.input_choice == 'taxids':
-    --field $taxid.field
-    #for $id in $taxid.taxons.split(','):
-        -t '$id'
+#elif $input_method.input_choice == 'enter_ids':
+    --field $input_method.field
+    #for $id in $input_method.ids.split(','):
+        --search-id '$id'
     #end for
-#elif $taxid.input_choice == 'history':
-    --field $taxid.field
-    --input='${taxid.taxon_file}'
-    --column=#echo int(str($taxid.column)) - 1#
+#elif $input_method.input_choice == 'history':
+    --field $input_method.field
+    --input='${input_method.id_file}'
+    --column=#echo int(str($input_method.column)) - 1#
 #end if
 --format $format
 --output '${proteome}'
 ]]>
     </command>
     <inputs>
-        <conditional name="taxid">
+        <conditional name="input_method">
             <param name="input_choice" type="select" label="Select">
                 <option value="common">A Common Organism</option>
-                <option value="taxids">A manually entered list of Taxon IDs or names</option>
-                <option value="history">A history dataset with a column containing Taxon IDs or names</option>
+                <option value="enter_ids">A manually entered list of Uniprot IDs</option>
+                <option value="history">A history dataset with a column containing Uniprot IDs</option>
             </param>
             <when value="common">
                 <param name="organism" type="select" label="Common Organisms"
@@ -59,16 +59,16 @@
                     <option value="no">UniProtKB/TrEMBL (unreviewed only)</option>
                 </param>
             </when>
-            <when value="taxids">
-                <param name="taxons" type="text" label="NCBI Taxon IDs or names" 
-                       help="Enter one or more Organsim IDs (separated by commas) from http://www.uniprot.org/proteomes/">
+            <when value="enter_ids">
+                <param name="ids" type="text" label="Search ID values"
+                       help="Enter one or more IDs (separated by commas) from http://www.uniprot.org/proteomes/">
                     <validator type="regex" message="OrganismID[,OrganismID]">^\w+( \w+)*(,\w+( \w+)*)*$</validator>
                 </param>
                 <expand macro="query_field"/>
             </when>
             <when value="history">
-                <param name="taxon_file" type="data" format="tabular,txt" label="Dataset (tab separated) with Taxon ID/Name column"/>
-                <param name="column" type="data_column" data_ref="taxon_file" label="Column with Taxon ID/name"/>
+                <param name="id_file" type="data" format="tabular,txt" label="Dataset (tab separated) with ID column"/>
+                <param name="column" type="data_column" data_ref="id_file" label="Column with ID"/>
                 <expand macro="query_field"/>
             </when>
         </conditional>
@@ -86,8 +86,8 @@
     </outputs>
     <tests>
         <test>
-            <param name="input_choice" value="taxids"/>
-            <param name="taxons" value="1566990"/>
+            <param name="input_choice" value="enter_ids"/>
+            <param name="ids" value="1566990"/>
             <param name="format" value="xml"/>
             <output name="proteome">
                 <assert_contents>
@@ -96,8 +96,8 @@
             </output>
         </test>
         <test>
-            <param name="input_choice" value="taxids"/>
-            <param name="taxons" value="765963,512562"/>
+            <param name="input_choice" value="enter_ids"/>
+            <param name="ids" value="765963,512562"/>
             <param name="field" value="taxonomy_id"/>
             <param name="format" value="fasta"/>
             <output name="proteome">
@@ -108,8 +108,8 @@
             </output>
         </test>
         <test>
-            <param name="input_choice" value="taxids"/>
-            <param name="taxons" value="Shi470,PeCan4"/>
+            <param name="input_choice" value="enter_ids"/>
+            <param name="ids" value="Shi470,PeCan4"/>
             <param name="field" value="taxonomy_name"/>
             <param name="format" value="fasta"/>
             <output name="proteome">
@@ -120,8 +120,20 @@
             </output>
         </test>
         <test>
+            <param name="input_choice" value="enter_ids"/>
+            <param name="ids" value="E1Q2I0,E1Q3C4"/>
+            <param name="field" value="accession"/>
+            <param name="format" value="fasta"/>
+            <output name="proteome">
+                <assert_contents>
+                    <has_text text="E1Q2I0" />
+                    <has_text text="E1Q3C4" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
             <param name="input_choice" value="history"/>
-            <param name="taxon_file" value="Helicobacter_strains.tsv" ftype="tabular"/>
+            <param name="id_file" value="Helicobacter_strains.tsv" ftype="tabular"/>
             <param name="column" value="1"/>
             <param name="field" value="taxonomy_name"/>
             <param name="format" value="fasta"/>
@@ -134,7 +146,7 @@
         </test>
         <test>
             <param name="input_choice" value="history"/>
-            <param name="taxon_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/>
+            <param name="id_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/>
             <param name="column" value="2"/>
             <param name="field" value="taxonomy_id"/>
             <param name="format" value="fasta"/>
@@ -145,6 +157,19 @@
                 </assert_contents>
             </output>
         </test>
+        <test>
+            <param name="input_choice" value="history"/>
+            <param name="id_file" value="Helicobacter_protein_accessions.tsv" ftype="tabular"/>
+            <param name="column" value="1"/>
+            <param name="field" value="accession"/>
+            <param name="format" value="fasta"/>
+            <output name="proteome">
+                <assert_contents>
+                    <has_text text="E1Q2I0" />
+                    <has_text text="E1Q3C4" />
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help>
 <![CDATA[
@@ -160,7 +185,11 @@
 
 Example taxon: http://www.uniprot.org/taxonomy/512562
 
-Taxon IDs or names can be entered as text or read from a column in a tabular dataset from your history.
+Example protein: https://www.uniprot.org/uniprotkb/E1Q2I0/entry
+
+Description of query fields: https://www.uniprot.org/help/query-fields
+
+IDs can be entered as text or read from a column in a tabular dataset from your history.
 
 Example IDs and names related to the Bacteria Helicobacter pylori (strain Shi470) ::
 
@@ -171,6 +200,11 @@
  - Helicobacter
  - Helicobacteraceae
 
+ Example protein accession numbers from Helicobacter pylori:
+
+ - E1Q2I0
+ - E1Q3C4
+
 
 UniProtKB help: http://www.uniprot.org/help/uniprotkb