comparison uniprotxml_downloader.py @ 2:e1abc9a35c64 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 15c2d28359584bcee25cdb456cff50892fff7347
author galaxyp
date Fri, 16 Dec 2016 17:33:05 -0500
parents 0bd2688166a5
children 1a5690a5eedc
comparison
equal deleted inserted replaced
1:fc8c4bd28681 2:e1abc9a35c64
13 """ 13 """
14 import sys 14 import sys
15 import re 15 import re
16 import optparse 16 import optparse
17 import urllib 17 import urllib
18 import urllib2
18 19
19 20
20 def __main__(): 21 def __main__():
21 # Parse Command Line 22 # Parse Command Line
22 parser = optparse.OptionParser() 23 parser = optparse.OptionParser()
24 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs')
25 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs' )
23 parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') 26 parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
24 parser.add_option('-r', '--reviewed', dest='reviewed', help='file path for th downloaed uniprot xml') 27 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
28 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml',help='output format')
25 parser.add_option('-o', '--output', dest='output', help='file path for th downloaed uniprot xml') 29 parser.add_option('-o', '--output', dest='output', help='file path for th downloaed uniprot xml')
26 parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info') 30 parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info')
27 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') 31 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
28 (options, args) = parser.parse_args() 32 (options, args) = parser.parse_args()
29 33 taxids = set(options.taxon)
30 taxids = options.taxon if options.taxon else ['9606'] 34 if options.input:
35 with open(options.input,'r') as inputFile:
36 for linenum,line in enumerate(inputFile):
37 if line.startswith('#'):
38 continue
39 fields = line.rstrip('\r\n').split('\t')
40 if len(fields) > abs(options.column):
41 taxid = fields[options.column].strip()
42 if taxid:
43 taxids.add(taxid)
31 taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids] 44 taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids]
32 taxon_query = ' OR '.join(taxon_queries) 45 taxon_query = ' OR '.join(taxon_queries)
33 if options.output: 46 if options.output:
34 dest_path = options.output 47 dest_path = options.output
35 else: 48 else:
36 dest_path = "uniprot_%s.xml" % '_'.join(taxids) 49 dest_path = "uniprot_%s.xml" % '_'.join(taxids)
37 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' 50 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
38 url = 'http://www.uniprot.org/uniprot/?query=%s%s&force=yes&format=xml' % (taxon_query, reviewed)
39 if options.debug:
40 print >> sys.stderr, url
41 try: 51 try:
42 (fname, msg) = urllib.urlretrieve(url, dest_path) 52 def reporthook(n1,n2,n3):
53 pass
54 url = 'http://www.uniprot.org/uniprot/'
55 query = "%s%s" % (taxon_query, reviewed)
56 params = {'query' : query, 'force' : 'yes' , 'format' : options.format}
57 if options.debug:
58 print >> sys.stderr, "%s ? %s" % (url,params)
59 data = urllib.urlencode(params)
60 (fname, msg) = urllib.urlretrieve(url, dest_path,reporthook,data)
43 headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]} 61 headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]}
44 if 'Content-Length' in headers and headers['Content-Length'] == 0: 62 if 'Content-Length' in headers and headers['Content-Length'] == 0:
45 print >> sys.stderr, url 63 print >> sys.stderr, url
46 print >> sys.stderr, msg 64 print >> sys.stderr, msg
47 exit(1) 65 exit(1)
48 elif True: 66 if options.format == 'xml':
49 pass
50 else:
51 with open(dest_path, 'r') as contents: 67 with open(dest_path, 'r') as contents:
52 while True: 68 while True:
53 line = contents.readline() 69 line = contents.readline()
54 if options.debug: 70 if options.debug:
55 print >> sys.stderr, line 71 print >> sys.stderr, line
56 if line is None or not line.startswith('<?'): 72 if line is None:
57 break 73 break
74 if line.startswith('<?'):
75 continue
58 # pattern match <root or <ns:root for any ns string 76 # pattern match <root or <ns:root for any ns string
59 pattern = '^<(\w*:)?uniprot' 77 pattern = '^<(\w*:)?uniprot'
60 if re.match(pattern, line): 78 if re.match(pattern, line):
61 break 79 break
62 else: 80 else:
63 print >> sys.stderr, "failed: Not a uniprot xml file" 81 print >> sys.stderr, "failed: Not a uniprot xml file"
64 exit(1) 82 exit(1)
65
66 if options.verbose: 83 if options.verbose:
67 print >> sys.stdout, "NCBI Taxon ID:%s" % taxids 84 print >> sys.stdout, "NCBI Taxon ID:%s" % taxids
68 if 'X-UniProt-Release' in headers: 85 if 'X-UniProt-Release' in headers:
69 print >> sys.stdout, "UniProt-Release:%s" % headers['X-UniProt-Release'] 86 print >> sys.stdout, "UniProt-Release:%s" % headers['X-UniProt-Release']
70 if 'X-Total-Results' in headers: 87 if 'X-Total-Results' in headers: