comparison uniprotxml_downloader.py @ 4:12692567c7f9 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 62afd9de6db50f4314e49d9f24881b6d3778a0a5"
author galaxyp
date Tue, 01 Jun 2021 11:54:47 +0000
parents 1a5690a5eedc
children 265c35540faa
comparison
equal deleted inserted replaced
3:1a5690a5eedc 4:12692567c7f9
9 # 9 #
10 # James E Johnson 10 # James E Johnson
11 # 11 #
12 #------------------------------------------------------------------------------ 12 #------------------------------------------------------------------------------
13 """ 13 """
14 import optparse
15 import re
14 import sys 16 import sys
15 import re 17 from urllib import parse
16 import optparse 18
17 import urllib 19 import requests
18 import urllib2 20 from requests.adapters import HTTPAdapter
21 from requests.packages.urllib3.util.retry import Retry
22
# Timeout used for any request that does not specify one itself.
DEFAULT_TIMEOUT = 5  # seconds

# Retry policy given to the HTTP adapter: up to 5 retries, for the listed
# response status codes, on the listed HTTP methods (POST included, since the
# download itself is issued as a POST).
retry_strategy = Retry(
    allowed_methods=["HEAD", "GET", "OPTIONS", "POST"],
    status_forcelist=[429, 500, 502, 503, 504],
    backoff_factor=2,
    total=5,
)
30
31
class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that supplies a timeout to requests sent without one.

    The adapter accepts an optional ``timeout`` keyword at construction time;
    when it is absent, ``DEFAULT_TIMEOUT`` is used.
    """

    def __init__(self, *args, **kwargs):
        # Take our own keyword out of kwargs so it is not forwarded to the
        # parent constructor.
        self.timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        """Send *request*, filling in the adapter timeout when none is set."""
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)
19 45
20 46
def _is_uniprot_xml(path, debug=False):
    """Return True if the first non-declaration line of *path* is a uniprot tag.

    Lines starting with ``<?`` are skipped. The first other line must start
    with ``<uniprot`` or ``<ns:uniprot`` (any namespace prefix). A file that
    ends before such a line is found is reported as not being uniprot XML.

    When *debug* is true, every line read is echoed to stderr.
    """
    # pattern match <root or <ns:root for any ns string
    pattern = re.compile(r'^<(\w*:)?uniprot')
    with open(path, 'r') as contents:
        for line in contents:
            if debug:
                print(line, file=sys.stderr)
            if line.startswith('<?'):
                continue
            return pattern.match(line) is not None
    # Reached end of file without seeing anything but '<?' lines.
    return False


def __main__():
    """Download UniProt entries for a set of NCBI taxon IDs.

    Taxon IDs come from repeated ``-t`` options and/or one column of a
    tab-separated input file (``-i``/``-c``; lines starting with ``#`` are
    skipped). The IDs are combined into a single query that is POSTed to the
    UniProt URL, and the response body is written to the output path.
    For the ``xml`` format the saved file is checked for a uniprot root
    element. Summary information is printed to stdout.

    Exits with status 1 when the XML check fails, and exits with the
    exception text when any other error (including an HTTP error status)
    occurs.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs')
    parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs')
    parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
    parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
    parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format')
    parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
    (options, args) = parser.parse_args()

    # Collect taxon IDs from the command line and, optionally, the input file.
    taxids = set(options.taxon)
    if options.input:
        with open(options.input, 'r') as inputFile:
            for line in inputFile:
                if line.startswith('#'):
                    continue
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) > abs(options.column):
                    taxid = fields[options.column].strip()
                    if taxid:
                        taxids.add(taxid)

    taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids]
    taxon_query = ' OR '.join(taxon_queries)
    if options.output:
        dest_path = options.output
    else:
        dest_path = "uniprot_%s.xml" % '_'.join(taxids)
    reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
    try:
        url = 'https://www.uniprot.org/uniprot/'
        query = "%s%s" % (taxon_query, reviewed)
        params = {'query': query, 'force': 'yes', 'format': options.format}
        if options.debug:
            print("%s ? %s" % (url, params), file=sys.stderr)
        data = parse.urlencode(params)
        print(f"Retrieving: {url+data}")
        adapter = TimeoutHTTPAdapter(max_retries=retry_strategy)
        # The context manager closes the session even if the request raises.
        with requests.Session() as http:
            http.mount("https://", adapter)
            response = http.post(url, data=params)
        # Fail on 4xx/5xx instead of saving an error page as the download.
        response.raise_for_status()
        with open(dest_path, 'w') as fh:
            fh.write(response.text)
        if options.format == 'xml':
            if not _is_uniprot_xml(dest_path, options.debug):
                print("failed: Not a uniprot xml file", file=sys.stderr)
                sys.exit(1)
        print("NCBI Taxon ID:%s" % taxids, file=sys.stdout)
        if 'X-UniProt-Release' in response.headers:
            print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout)
        if 'X-Total-Results' in response.headers:
            print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout)
    except Exception as e:
        # SystemExit from the XML check is not an Exception subclass, so it
        # passes through here with its own exit status.
        sys.exit("%s" % e)


if __name__ == "__main__":
    __main__()