comparison uniprotxml_downloader.py @ 0:0bd2688166a5 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
author galaxyp
date Tue, 08 Mar 2016 12:03:49 -0500
parents
children e1abc9a35c64
comparison
equal deleted inserted replaced
-1:000000000000 0:0bd2688166a5
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2016, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14 import sys
15 import re
16 import optparse
17 import urllib
18
19
def _build_url(taxids, reviewed):
    """Return the UniProt query URL selecting the given taxa.

    taxids   -- list of NCBI taxon ID strings; they are OR-ed together.
    reviewed -- value for the ``reviewed:`` filter, or a falsy value for
                no filter.
    """
    try:  # Python 3
        from urllib.parse import quote
    except ImportError:  # Python 2
        from urllib import quote
    taxon_query = ' OR '.join('taxonomy:"%s"' % taxid for taxid in taxids)
    if reviewed:
        if len(taxids) > 1:
            # Group the OR-ed taxa so the reviewed filter applies to all of
            # them instead of depending on the server's operator precedence.
            taxon_query = '(%s)' % taxon_query
        query = '%s reviewed:%s' % (taxon_query, reviewed)
    else:
        query = taxon_query
    # The query contains spaces and double quotes, which are not valid in a
    # URL, so percent-encode it explicitly.
    # NOTE(review): this is the legacy UniProt query endpoint -- confirm it
    # is still served before relying on it.
    return ('http://www.uniprot.org/uniprot/?query=%s&force=yes&format=xml'
            % quote(query, safe=':'))


def _is_uniprot_xml(path, debug=False):
    """Return True if the first XML element in the file is a uniprot root.

    Blank lines and ``<?...?>`` declarations/processing instructions are
    skipped; the first remaining line must start with ``<uniprot`` or
    ``<ns:uniprot``.  An empty file is not a uniprot xml file.  When debug
    is true every inspected line is echoed to stderr.
    """
    # pattern match <root or <ns:root for any ns string
    root_pattern = re.compile(r'^<(\w*:)?uniprot')
    with open(path, 'r') as contents:
        for line in contents:
            if debug:
                sys.stderr.write("%s\n" % line)
            if not line.strip() or line.startswith('<?'):
                continue
            return root_pattern.match(line) is not None
    return False


def __main__():
    """Download UniProt XML for one or more NCBI taxon IDs.

    Command line options:
      -t/--taxon     NCBI taxon ID, repeatable (default: 9606)
      -r/--reviewed  restrict the query with ``reviewed:<value>``
      -o/--output    destination path (default: uniprot_<taxids>.xml)
      -v/--verbose   print the taxa, release headers and URL to stdout
      -d/--debug     echo the URL and inspected lines to stderr

    Exits with status 1 when the download fails, the response is empty,
    or the downloaded file does not look like UniProt XML.
    """
    try:  # Python 3
        from urllib.request import urlretrieve
    except ImportError:  # Python 2
        from urllib import urlretrieve

    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[],
                      help='NCBI taxon ID to download')
    parser.add_option('-r', '--reviewed', dest='reviewed',
                      help='Only download entries with this reviewed status (yes or no)')
    parser.add_option('-o', '--output', dest='output',
                      help='file path for the downloaded uniprot xml')
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False,
                      help='Print UniProt Info')
    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False,
                      help='Turn on wrapper debugging to stderr')
    (options, _) = parser.parse_args()

    taxids = options.taxon if options.taxon else ['9606']
    if options.output:
        dest_path = options.output
    else:
        dest_path = "uniprot_%s.xml" % '_'.join(taxids)
    url = _build_url(taxids, options.reviewed)
    if options.debug:
        sys.stderr.write("%s\n" % url)
    try:
        (_fname, msg) = urlretrieve(url, dest_path)
        # Look headers up through the message object: the lookup is
        # case-insensitive and does not break on folded header lines.
        content_length = msg.get('Content-Length')
        # Header values are strings, so compare against '0', not 0.
        if content_length is not None and content_length.strip() == '0':
            sys.stderr.write("%s\n" % url)
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)
        if not _is_uniprot_xml(dest_path, options.debug):
            sys.stderr.write("failed: Not a uniprot xml file\n")
            sys.exit(1)

        if options.verbose:
            sys.stdout.write("NCBI Taxon ID:%s\n" % taxids)
            release = msg.get('X-UniProt-Release')
            if release is not None:
                sys.stdout.write("UniProt-Release:%s\n" % release.strip())
            total = msg.get('X-Total-Results')
            if total is not None:
                sys.stdout.write("Entries:%s\n" % total.strip())
            sys.stdout.write("%s\n" % url)
    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the deliberate exits above are not swallowed here.
        sys.stderr.write("failed: %s\n" % e)
        sys.exit(1)
75
76
# Entry point: run the downloader only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    __main__()