Mercurial > repos > galaxyp > uniprotxml_downloader
comparison uniprotxml_downloader.py @ 2:e1abc9a35c64 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 15c2d28359584bcee25cdb456cff50892fff7347
author | galaxyp |
---|---|
date | Fri, 16 Dec 2016 17:33:05 -0500 |
parents | 0bd2688166a5 |
children | 1a5690a5eedc |
Comparison legend: equal, deleted, inserted, replaced
1:fc8c4bd28681 | 2:e1abc9a35c64 |
---|---|
13 """ | 13 """ |
14 import sys | 14 import sys |
15 import re | 15 import re |
16 import optparse | 16 import optparse |
17 import urllib | 17 import urllib |
18 import urllib2 | |
18 | 19 |
19 | 20 |
20 def __main__(): | 21 def __main__(): |
21 # Parse Command Line | 22 # Parse Command Line |
22 parser = optparse.OptionParser() | 23 parser = optparse.OptionParser() |
24 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') | |
25 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs' ) | |
23 parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') | 26 parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') |
24 parser.add_option('-r', '--reviewed', dest='reviewed', help='file path for th downloaed uniprot xml') | 27 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') |
28 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml',help='output format') | |
25 parser.add_option('-o', '--output', dest='output', help='file path for th downloaed uniprot xml') | 29 parser.add_option('-o', '--output', dest='output', help='file path for th downloaed uniprot xml') |
26 parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info') | 30 parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info') |
27 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') | 31 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') |
28 (options, args) = parser.parse_args() | 32 (options, args) = parser.parse_args() |
29 | 33 taxids = set(options.taxon) |
30 taxids = options.taxon if options.taxon else ['9606'] | 34 if options.input: |
35 with open(options.input,'r') as inputFile: | |
36 for linenum,line in enumerate(inputFile): | |
37 if line.startswith('#'): | |
38 continue | |
39 fields = line.rstrip('\r\n').split('\t') | |
40 if len(fields) > abs(options.column): | |
41 taxid = fields[options.column].strip() | |
42 if taxid: | |
43 taxids.add(taxid) | |
31 taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids] | 44 taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids] |
32 taxon_query = ' OR '.join(taxon_queries) | 45 taxon_query = ' OR '.join(taxon_queries) |
33 if options.output: | 46 if options.output: |
34 dest_path = options.output | 47 dest_path = options.output |
35 else: | 48 else: |
36 dest_path = "uniprot_%s.xml" % '_'.join(taxids) | 49 dest_path = "uniprot_%s.xml" % '_'.join(taxids) |
37 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' | 50 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' |
38 url = 'http://www.uniprot.org/uniprot/?query=%s%s&force=yes&format=xml' % (taxon_query, reviewed) | |
39 if options.debug: | |
40 print >> sys.stderr, url | |
41 try: | 51 try: |
42 (fname, msg) = urllib.urlretrieve(url, dest_path) | 52 def reporthook(n1,n2,n3): |
53 pass | |
54 url = 'http://www.uniprot.org/uniprot/' | |
55 query = "%s%s" % (taxon_query, reviewed) | |
56 params = {'query' : query, 'force' : 'yes' , 'format' : options.format} | |
57 if options.debug: | |
58 print >> sys.stderr, "%s ? %s" % (url,params) | |
59 data = urllib.urlencode(params) | |
60 (fname, msg) = urllib.urlretrieve(url, dest_path,reporthook,data) | |
43 headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]} | 61 headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]} |
44 if 'Content-Length' in headers and headers['Content-Length'] == 0: | 62 if 'Content-Length' in headers and headers['Content-Length'] == 0: |
45 print >> sys.stderr, url | 63 print >> sys.stderr, url |
46 print >> sys.stderr, msg | 64 print >> sys.stderr, msg |
47 exit(1) | 65 exit(1) |
48 elif True: | 66 if options.format == 'xml': |
49 pass | |
50 else: | |
51 with open(dest_path, 'r') as contents: | 67 with open(dest_path, 'r') as contents: |
52 while True: | 68 while True: |
53 line = contents.readline() | 69 line = contents.readline() |
54 if options.debug: | 70 if options.debug: |
55 print >> sys.stderr, line | 71 print >> sys.stderr, line |
56 if line is None or not line.startswith('<?'): | 72 if line is None: |
57 break | 73 break |
74 if line.startswith('<?'): | |
75 continue | |
58 # pattern match <root or <ns:root for any ns string | 76 # pattern match <root or <ns:root for any ns string |
59 pattern = '^<(\w*:)?uniprot' | 77 pattern = '^<(\w*:)?uniprot' |
60 if re.match(pattern, line): | 78 if re.match(pattern, line): |
61 break | 79 break |
62 else: | 80 else: |
63 print >> sys.stderr, "failed: Not a uniprot xml file" | 81 print >> sys.stderr, "failed: Not a uniprot xml file" |
64 exit(1) | 82 exit(1) |
65 | |
66 if options.verbose: | 83 if options.verbose: |
67 print >> sys.stdout, "NCBI Taxon ID:%s" % taxids | 84 print >> sys.stdout, "NCBI Taxon ID:%s" % taxids |
68 if 'X-UniProt-Release' in headers: | 85 if 'X-UniProt-Release' in headers: |
69 print >> sys.stdout, "UniProt-Release:%s" % headers['X-UniProt-Release'] | 86 print >> sys.stdout, "UniProt-Release:%s" % headers['X-UniProt-Release'] |
70 if 'X-Total-Results' in headers: | 87 if 'X-Total-Results' in headers: |