annotate uniprotxml_downloader.py @ 0:0bd2688166a5 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
author galaxyp
date Tue, 08 Mar 2016 12:03:49 -0500
parents
children e1abc9a35c64
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
1 #!/usr/bin/env python
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
2 """
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
3 #
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
4 #------------------------------------------------------------------------------
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
5 # University of Minnesota
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
6 # Copyright 2016, Regents of the University of Minnesota
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
7 #------------------------------------------------------------------------------
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
8 # Author:
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
9 #
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
10 # James E Johnson
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
11 #
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
12 #------------------------------------------------------------------------------
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
13 """
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
14 import sys
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
15 import re
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
16 import optparse
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
17 import urllib
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
18
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
19
0bd2688166a5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff changeset
def _build_query_url(taxids, reviewed=None):
    """Return the UniProt query URL for the given taxon IDs.

    taxids   -- list of NCBI taxon ID strings; combined with OR.
    reviewed -- optional value for the ``reviewed:`` query filter; when empty
                or None no filter is added.
    """
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    taxon_query = ' OR '.join('taxonomy:"%s"' % taxid for taxid in taxids)
    if reviewed:
        # Group the OR terms so the reviewed filter applies to every taxon,
        # not just to the last one in the list.
        if len(taxids) > 1:
            taxon_query = '(%s)' % taxon_query
        query = '%s reviewed:%s' % (taxon_query, reviewed)
    else:
        query = taxon_query
    # The query contains spaces, quotes and colons, so it must be
    # percent-encoded before being placed in the URL.
    # NOTE(review): endpoint kept exactly as in the original script; confirm
    # that it is still served before relying on it.
    return ('http://www.uniprot.org/uniprot/?query=%s&force=yes&format=xml'
            % quote(query))


def _is_uniprot_xml(path, debug=False):
    """Return True if the first non-prolog line of *path* opens a uniprot element.

    Lines starting with ``<?`` (XML declaration / processing instructions)
    are skipped.  When *debug* is true every inspected line is echoed to
    stderr.
    """
    import io

    with io.open(path, 'r', encoding='utf-8', errors='replace') as contents:
        for line in contents:
            if debug:
                sys.stderr.write(line)
            if not line.startswith('<?'):
                break
        else:
            # Empty file, or nothing but prolog lines: there is no root element.
            line = ''
    # pattern match <uniprot or <ns:uniprot for any ns string
    return re.match(r'^<(\w*:)?uniprot', line) is not None


def __main__():
    """Download UniProt XML for one or more NCBI taxon IDs.

    Command-line options are read from ``sys.argv``.  The download is written
    to the ``--output`` path, or to ``uniprot_<taxids>.xml`` when no output
    path is given.  The process exits with status 1 when the download fails,
    when the server reports an empty body, or when the downloaded file does
    not look like UniProt XML.
    """
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[],
                      help='NCBI taxon ID to download')
    parser.add_option('-r', '--reviewed', dest='reviewed',
                      help='value for the UniProt "reviewed:" query filter')
    parser.add_option('-o', '--output', dest='output',
                      help='file path for the downloaded uniprot xml')
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False,
                      help='Print UniProt Info')
    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False,
                      help='Turn on wrapper debugging to stderr')
    options, _ = parser.parse_args()

    # Default to taxon 9606 when no taxon is given.
    taxids = options.taxon if options.taxon else ['9606']
    if options.output:
        dest_path = options.output
    else:
        dest_path = "uniprot_%s.xml" % '_'.join(taxids)
    url = _build_query_url(taxids, options.reviewed)
    if options.debug:
        sys.stderr.write("%s\n" % url)

    try:
        (fname, msg) = urlretrieve(url, dest_path)
        # Header values are strings; msg.get() looks names up
        # case-insensitively and tolerates missing headers.
        content_length = msg.get('Content-Length', '')
        if content_length is not None and content_length.strip() == '0':
            sys.stderr.write("%s\n" % url)
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)
        if not _is_uniprot_xml(dest_path, options.debug):
            sys.stderr.write("failed: Not a uniprot xml file\n")
            sys.exit(1)
    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the explicit exits above pass through untouched.
        sys.stderr.write("failed: %s\n" % e)
        sys.exit(1)

    if options.verbose:
        sys.stdout.write("NCBI Taxon ID:%s\n" % taxids)
        release = msg.get('X-UniProt-Release')
        if release is not None:
            sys.stdout.write("UniProt-Release:%s\n" % release.strip())
        total = msg.get('X-Total-Results')
        if total is not None:
            sys.stdout.write("Entries:%s\n" % total.strip())
        sys.stdout.write("%s\n" % url)


if __name__ == "__main__":
    __main__()