# HG changeset patch
# User galaxyp
# Date 1481927585 18000
# Node ID e1abc9a35c64b143e761c2c10b70b9b72f2e60eb
# Parent  fc8c4bd28681555164e27a5d5e9b217ec468d77a
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 15c2d28359584bcee25cdb456cff50892fff7347

diff -r fc8c4bd28681 -r e1abc9a35c64 test-data/Helicobacter_strains.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Helicobacter_strains.tsv	Fri Dec 16 17:33:05 2016 -0500
@@ -0,0 +1,2 @@
+PeCan4
+Shi470
diff -r fc8c4bd28681 -r e1abc9a35c64 uniprotxml_downloader.py
--- a/uniprotxml_downloader.py	Wed Dec 07 16:44:14 2016 -0500
+++ b/uniprotxml_downloader.py	Fri Dec 16 17:33:05 2016 -0500
@@ -15,19 +15,32 @@
 import re
 import optparse
 import urllib
+import urllib2
 
 
 def __main__():
     # Parse Command Line
     parser = optparse.OptionParser()
+    parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs')
+    parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs' )
     parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
-    parser.add_option('-r', '--reviewed', dest='reviewed', help='file path for th downloaed uniprot xml')
+    parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
+    parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml',help='output format')
     parser.add_option('-o', '--output', dest='output', help='file path for th downloaed uniprot xml')
     parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info')
     parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
     (options, args) = parser.parse_args()
-
-    taxids = options.taxon if options.taxon else ['9606']
+    taxids = set(options.taxon)
+    if options.input:
+        with open(options.input,'r') as inputFile:
+            for linenum,line in enumerate(inputFile):
+                if line.startswith('#'):
+                    continue
+                fields = line.rstrip('\r\n').split('\t')
+                if len(fields) > abs(options.column):
+                    taxid = fields[options.column].strip()
+                    if taxid:
+                        taxids.add(taxid)
     taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids]
     taxon_query = ' OR '.join(taxon_queries)
     if options.output:
@@ -35,26 +48,31 @@
     else:
         dest_path = "uniprot_%s.xml" % '_'.join(taxids)
     reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
-    url = 'http://www.uniprot.org/uniprot/?query=%s%s&force=yes&format=xml' % (taxon_query, reviewed)
-    if options.debug:
-        print >> sys.stderr, url
     try:
-        (fname, msg) = urllib.urlretrieve(url, dest_path)
+        def reporthook(n1,n2,n3):
+            pass
+        url = 'http://www.uniprot.org/uniprot/'
+        query = "%s%s" % (taxon_query, reviewed)
+        params = {'query' : query, 'force' : 'yes' , 'format' : options.format}
+        if options.debug:
+            print >> sys.stderr, "%s ? %s" % (url,params)
+        data = urllib.urlencode(params)
+        (fname, msg) = urllib.urlretrieve(url, dest_path,reporthook,data)
         headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]}
         if 'Content-Length' in headers and headers['Content-Length'] == 0:
             print >> sys.stderr, url
             print >> sys.stderr, msg
             exit(1)
-        elif True:
-            pass
-        else:
+        if options.format == 'xml':
             with open(dest_path, 'r') as contents:
                 while True:
                     line = contents.readline()
                     if options.debug:
                         print >> sys.stderr, line
-                    if line is None or not line.startswith('<?xml '):
                         print >> sys.stderr, "failed: Not a uniprot xml file"
                         exit(1)
-
     if options.verbose:
         print >> sys.stdout, "NCBI Taxon ID:%s" % taxids
         if 'X-UniProt-Release' in headers:
diff -r fc8c4bd28681 -r e1abc9a35c64 uniprotxml_downloader.xml
--- a/uniprotxml_downloader.xml	Wed Dec 07 16:44:14 2016 -0500
+++ b/uniprotxml_downloader.xml	Fri Dec 16 17:33:05 2016 -0500
@@ -1,33 +1,37 @@
[uniprotxml_downloader.xml hunk: XML element markup not recoverable; visible changes: tool description "proteome" becomes "download proteome as XML or fasta", the python requirement is retained, and the taxon-ID validator regex changes from ^\d+(,\d+)*$ to ^\w+( \w+)*(,\w+( \w+)*)*$; the remaining added lines cannot be reconstructed because their tags were stripped]
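
The functional change in uniprotxml_downloader.py is the download step: instead of requesting a hard-coded GET URL ending in &force=yes&format=xml, the script now urlencodes the query, force, and format parameters and passes them as POST data to http://www.uniprot.org/uniprot/ through urllib.urlretrieve, and taxon IDs can be read from a column of a tabular file (-i/-c) in addition to repeated -t options. Below is a minimal standalone sketch of that download step in the same Python 2 idiom as the patched script; the taxon ID, the reviewed filter, and the output path are illustrative values, not part of the patch.

import urllib

# Build the UniProt query string the same way the patched script does.
taxids = ['9606']                                  # illustrative; '9606' was the old script's default taxon
taxon_query = ' OR '.join('taxonomy:"%s"' % t for t in taxids)
query = taxon_query + ' reviewed:yes'              # mirrors the ' reviewed:%s' clause added for -r; illustrative here
params = {'query': query, 'force': 'yes', 'format': 'xml'}
data = urllib.urlencode(params)                    # urlencoded body; supplying it makes urlretrieve issue a POST
dest_path = 'uniprot_9606.xml'                     # illustrative output path
(fname, msg) = urllib.urlretrieve('http://www.uniprot.org/uniprot/', dest_path, None, data)
print "saved %s" % fname                           # msg holds the response headers the script inspects

Moving the parameters into the request body also keeps the request well-formed when many taxon IDs read from the tabular file are ORed into one long query, although the patch itself does not state that rationale.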