Mercurial > repos > galaxyp > uniprotxml_downloader
changeset 6:a371252a2cf6 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 0c5222345ace5054df44da29cab278f4a02e2b41
author | galaxyp |
---|---|
date | Thu, 06 Jul 2023 21:15:39 +0000 |
parents | 265c35540faa |
children | 4ddc8da62671 |
files | macros.xml test-data/Helicobacter_protein_accessions.tsv uniprotxml_downloader.py uniprotxml_downloader.xml |
diffstat | 4 files changed, 80 insertions(+), 43 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Fri Nov 04 15:08:37 2022 +0000 +++ b/macros.xml Thu Jul 06 21:15:39 2023 +0000 @@ -3,6 +3,7 @@ <param name="field" type="select" label="Field"> <option value="taxonomy_name">Taxonomy Name</option> <option value="taxonomy_id">Taxonomy ID</option> + <option value="accession">Accession</option> </param> </xml> </macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Helicobacter_protein_accessions.tsv Thu Jul 06 21:15:39 2023 +0000 @@ -0,0 +1,2 @@ +E1Q2I0 +E1Q3C4 \ No newline at end of file
--- a/uniprotxml_downloader.py Fri Nov 04 15:08:37 2022 +0000 +++ b/uniprotxml_downloader.py Thu Jul 06 21:15:39 2023 +0000 @@ -47,16 +47,16 @@ def __main__(): # Parse Command Line parser = optparse.OptionParser() - parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') - parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs') - parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') + parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search search_ids') + parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids') + parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') - parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id'], default='taxonomy_name', help='query field') + parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') (options, args) = parser.parse_args() - taxids = set(options.taxon) + search_ids = set(options.search_id) if options.input: with open(options.input, 'r') as inputFile: for linenum, line in enumerate(inputFile): @@ -64,19 +64,19 @@ continue fields = line.rstrip('\r\n').split('\t') if len(fields) > abs(options.column): - taxid = fields[options.column].strip() - if taxid: - taxids.add(taxid) - taxon_queries = [f'{options.field}:"{taxid}"' for taxid in taxids] - taxon_query = ' OR '.join(taxon_queries) + search_id = fields[options.column].strip() + if search_id: + search_ids.add(search_id) + search_queries = [f'{options.field}:"{search_id}"' for search_id in search_ids] + search_query = ' OR '.join(search_queries) if options.output: dest_path = options.output else: - dest_path = "uniprot_%s.xml" % '_'.join(taxids) + dest_path = "uniprot_%s.xml" % '_'.join(search_ids) reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' try: url = 'https://rest.uniprot.org/uniprotkb/stream' - query = "%s%s" % (taxon_query, reviewed) + query = "%s%s" % (search_query, reviewed) params = {'query': query, 'format': options.format} if options.debug: print("%s ? %s" % (url, params), file=sys.stderr) @@ -112,7 +112,7 @@ else: print("failed: Not a uniprot xml file", file=sys.stderr) exit(1) - print("NCBI Taxon ID:%s" % taxids, file=sys.stdout) + print("Search IDs:%s" % search_ids, file=sys.stdout) if 'X-UniProt-Release' in response.headers: print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) if 'X-Total-Results' in response.headers:
--- a/uniprotxml_downloader.xml Fri Nov 04 15:08:37 2022 +0000 +++ b/uniprotxml_downloader.xml Thu Jul 06 21:15:39 2023 +0000 @@ -1,4 +1,4 @@ -<tool id="uniprotxml_downloader" name="UniProt" version="2.3.0" profile="21.01"> +<tool id="uniprotxml_downloader" name="UniProt" version="2.4.0" profile="21.01"> <description>download proteome as XML or fasta</description> <macros> <import>macros.xml</import> @@ -12,32 +12,32 @@ <command> <![CDATA[ python '$__tool_directory__/uniprotxml_downloader.py' -#if $taxid.input_choice == 'common': - --taxon $taxid.organism +#if $input_method.input_choice == 'common': + --search-id $input_method.organism --field taxonomy_id - #if $taxid.reviewed: - --reviewed=$taxid.reviewed + #if $input_method.reviewed: + --reviewed=$input_method.reviewed #end if -#elif $taxid.input_choice == 'taxids': - --field $taxid.field - #for $id in $taxid.taxons.split(','): - -t '$id' +#elif $input_method.input_choice == 'enter_ids': + --field $input_method.field + #for $id in $input_method.ids.split(','): + --search-id '$id' #end for -#elif $taxid.input_choice == 'history': - --field $taxid.field - --input='${taxid.taxon_file}' - --column=#echo int(str($taxid.column)) - 1# +#elif $input_method.input_choice == 'history': + --field $input_method.field + --input='${input_method.id_file}' + --column=#echo int(str($input_method.column)) - 1# #end if --format $format --output '${proteome}' ]]> </command> <inputs> - <conditional name="taxid"> + <conditional name="input_method"> <param name="input_choice" type="select" label="Select"> <option value="common">A Common Organism</option> - <option value="taxids">A manually entered list of Taxon IDs or names</option> - <option value="history">A history dataset with a column containing Taxon IDs or names</option> + <option value="enter_ids">A manually entered list of Uniprot IDs</option> + <option value="history">A history dataset with a column containing Uniprot IDs</option> </param> <when value="common"> <param name="organism" type="select" label="Common Organisms" @@ -59,16 +59,16 @@ <option value="no">UniProtKB/TrEMBL (unreviewed only)</option> </param> </when> - <when value="taxids"> - <param name="taxons" type="text" label="NCBI Taxon IDs or names" - help="Enter one or more Organsim IDs (separated by commas) from http://www.uniprot.org/proteomes/"> + <when value="enter_ids"> + <param name="ids" type="text" label="Search ID values" + help="Enter one or more IDs (separated by commas) from http://www.uniprot.org/proteomes/"> <validator type="regex" message="OrganismID[,OrganismID]">^\w+( \w+)*(,\w+( \w+)*)*$</validator> </param> <expand macro="query_field"/> </when> <when value="history"> - <param name="taxon_file" type="data" format="tabular,txt" label="Dataset (tab separated) with Taxon ID/Name column"/> - <param name="column" type="data_column" data_ref="taxon_file" label="Column with Taxon ID/name"/> + <param name="id_file" type="data" format="tabular,txt" label="Dataset (tab separated) with ID column"/> + <param name="column" type="data_column" data_ref="id_file" label="Column with ID"/> <expand macro="query_field"/> </when> </conditional> @@ -86,8 +86,8 @@ </outputs> <tests> <test> - <param name="input_choice" value="taxids"/> - <param name="taxons" value="1566990"/> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="1566990"/> <param name="format" value="xml"/> <output name="proteome"> <assert_contents> @@ -96,8 +96,8 @@ </output> </test> <test> - <param name="input_choice" value="taxids"/> - <param name="taxons" value="765963,512562"/> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="765963,512562"/> <param name="field" value="taxonomy_id"/> <param name="format" value="fasta"/> <output name="proteome"> @@ -108,8 +108,8 @@ </output> </test> <test> - <param name="input_choice" value="taxids"/> - <param name="taxons" value="Shi470,PeCan4"/> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="Shi470,PeCan4"/> <param name="field" value="taxonomy_name"/> <param name="format" value="fasta"/> <output name="proteome"> @@ -120,8 +120,20 @@ </output> </test> <test> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="E1Q2I0,E1Q3C4"/> + <param name="field" value="accession"/> + <param name="format" value="fasta"/> + <output name="proteome"> + <assert_contents> + <has_text text="E1Q2I0" /> + <has_text text="E1Q3C4" /> + </assert_contents> + </output> + </test> + <test> <param name="input_choice" value="history"/> - <param name="taxon_file" value="Helicobacter_strains.tsv" ftype="tabular"/> + <param name="id_file" value="Helicobacter_strains.tsv" ftype="tabular"/> <param name="column" value="1"/> <param name="field" value="taxonomy_name"/> <param name="format" value="fasta"/> @@ -134,7 +146,7 @@ </test> <test> <param name="input_choice" value="history"/> - <param name="taxon_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/> + <param name="id_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/> <param name="column" value="2"/> <param name="field" value="taxonomy_id"/> <param name="format" value="fasta"/> @@ -145,6 +157,19 @@ </assert_contents> </output> </test> + <test> + <param name="input_choice" value="history"/> + <param name="id_file" value="Helicobacter_protein_accessions.tsv" ftype="tabular"/> + <param name="column" value="1"/> + <param name="field" value="accession"/> + <param name="format" value="fasta"/> + <output name="proteome"> + <assert_contents> + <has_text text="E1Q2I0" /> + <has_text text="E1Q3C4" /> + </assert_contents> + </output> + </test> </tests> <help> <![CDATA[ @@ -160,7 +185,11 @@ Example taxon: http://www.uniprot.org/taxonomy/512562 -Taxon IDs or names can be entered as text or read from a column in a tabular dataset from your history. +Example protein: https://www.uniprot.org/uniprotkb/E1Q2I0/entry + +Description of query fields: https://www.uniprot.org/help/query-fields + +IDs can be entered as text or read from a column in a tabular dataset from your history. Example IDs and names releated to the Bacteria Helicobacter pylori (strain Shi470) :: @@ -171,6 +200,11 @@ - Helicobacter - Helicobacteraceae + Example protein accession numbers from Helicobacter pylori: + + - E1Q2I0 + - E1Q3C4 + UniProtKB help: http://www.uniprot.org/help/uniprotkb