comparison retrieve_fasta_from_NCBI.py @ 4:64f45c5e94a0 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ca132a4b5d5d7175e6e8bd62cc518397d14dad17
author drosofff
date Mon, 15 May 2017 03:10:11 -0400
parents a9d8f69d59fb
children c6de5c7b4ae3
comparison
equal deleted inserted replaced
3:a9d8f69d59fb 4:64f45c5e94a0
19 retmax of efetch is 1/10 of declared value from NCBI 19 retmax of efetch is 1/10 of declared value from NCBI
20 20
21 queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request) 21 queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request)
22 22
23 23
24 python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
25 python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs
26 """ 24 """
27 import sys 25 import sys
28 import logging 26 import logging
29 import optparse 27 import optparse
30 import time 28 import time
35 33
36 34
37 class Eutils: 35 class Eutils:
38 36
39 def __init__(self, options, logger): 37 def __init__(self, options, logger):
38 """
39 Initialize retrieval parameters
40 """
40 self.logger = logger 41 self.logger = logger
41 self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" 42 self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
42 self.query_string = options.query_string 43 self.query_string = options.query_string
43 self.dbname = options.dbname 44 self.dbname = options.dbname
44 if options.outname: 45 if options.outname:
45 self.outname = options.outname 46 self.outname = options.outname
46 else: 47 else:
47 self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta' 48 self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
48 self.ids = [] 49 self.ids = []
49 self.retmax_esearch = 100000 50 self.retmax_esearch = 100000
50 self.retmax_efetch = 1000 51 self.retmax_efetch = 500
51 self.count = 0 52 self.count = 0
52 self.webenv = "" 53 self.webenv = ""
53 self.query_key = "" 54 self.query_key = ""
54 55
55 def retrieve(self): 56 def retrieve(self):
56 """ """ 57 """
58 Retrieve the fasta sequences corresponding to the query
59 """
57 self.get_count_value() 60 self.get_count_value()
58 self.get_uids_list() 61
59 self.get_sequences() 62 # If no UIDs are found exit script
63 if self.count > 0:
64 self.get_uids_list()
65 self.get_sequences()
66 else:
67 self.logger.info("No UIDs were found. Exiting script.")
60 68
61 def get_count_value(self): 69 def get_count_value(self):
62 """ 70 """
63 just to retrieve Count (number of UIDs) 71 just to retrieve Count (number of UIDs)
64 Total number of UIDs from the retrieved set to be shown in the XML 72 Total number of UIDs from the retrieved set to be shown in the XML
75 self.logger.debug("Query response:") 83 self.logger.debug("Query response:")
76 for line in querylog: 84 for line in querylog:
77 self.logger.debug(line.rstrip()) 85 self.logger.debug(line.rstrip())
78 if '</Count>' in line: 86 if '</Count>' in line:
79 self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')]) 87 self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')])
80 self.logger.info("Founded %d UIDs" % self.count) 88 self.logger.info("Found %d UIDs" % self.count)
81 89
82 def get_uids_list(self): 90 def get_uids_list(self):
83 """ 91 """
84 Increasing retmax allows more of the retrieved UIDs to be included in the XML output, 92 Increasing retmax allows more of the retrieved UIDs to be included in the XML output,
85 up to a maximum of 100,000 records. 93 up to a maximum of 100,000 records.
111 data = urllib.urlencode(values) 119 data = urllib.urlencode(values)
112 self.logger.debug("data: %s" % str(data)) 120 self.logger.debug("data: %s" % str(data))
113 req = urllib2.Request(url, data) 121 req = urllib2.Request(url, data)
114 response = urllib2.urlopen(req) 122 response = urllib2.urlopen(req)
115 querylog = response.readlines() 123 querylog = response.readlines()
124 response.close()
116 time.sleep(1) 125 time.sleep(1)
117 return querylog 126 return querylog
118 127
119 def epost(self, db, ids): 128 def epost(self, db, ids):
120 url = self.base + "epost.fcgi" 129 url = self.base + "epost.fcgi"
121 self.logger.debug("url_epost: %s" % url) 130 self.logger.debug("url_epost: %s" % url)
122 values = {'db': db, 131 values = {'db': db,
123 'id': ids} 132 'id': ids}
124 data = urllib.urlencode(values) 133 data = urllib.urlencode(values)
125 req = urllib2.Request(url, data) 134 req = urllib2.Request(url, data)
126 #self.logger.debug("data: %s" % str(data))
127 req = urllib2.Request(url, data)
128 serverResponse = False 135 serverResponse = False
136 nb_trials = 0
129 while not serverResponse: 137 while not serverResponse:
138 nb_trials += 1
130 try: 139 try:
140 self.logger.debug("Try number %s for opening and readin URL %s" % ( nb_trials, url+data ))
131 response = urllib2.urlopen(req) 141 response = urllib2.urlopen(req)
142 querylog = response.readlines()
143 response.close()
132 serverResponse = True 144 serverResponse = True
133 except: # catch *all* exceptions 145 except urllib2.HTTPError as e:
134 e = sys.exc_info()[0] 146 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
135 self.logger.info( "Catched Error: %s" % e ) 147 self.logger.info("Retrying in 1 sec")
136 self.logger.info( "Retrying in 10 sec") 148 serverResponse = False
137 time.sleep(10) 149 time.sleep(1)
138 querylog = response.readlines() 150 except urllib2.URLError as e:
151 self.logger.info("urlopen error: Failed to reach a server")
152 self.logger.info("Reason :%s" % ( e.reason ) )
153 self.logger.info("Retrying in 1 sec")
154 serverResponse = False
155 time.sleep(1)
156 except httplib.IncompleteRead as e:
157 self.logger.info("IncompleteRead error: %s" % ( e.partial ) )
158 self.logger.info("Retrying in 1 sec")
159 serverResponse = False
160 time.sleep(1)
139 self.logger.debug("query response:") 161 self.logger.debug("query response:")
140 for line in querylog: 162 for line in querylog:
141 self.logger.debug(line.rstrip()) 163 self.logger.debug(line.rstrip())
142 if '</QueryKey>' in line: 164 if '</QueryKey>' in line:
143 self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')]) 165 self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')])
157 'rettype': "fasta", 179 'rettype': "fasta",
158 'retmode': "text"} 180 'retmode': "text"}
159 data = urllib.urlencode(values) 181 data = urllib.urlencode(values)
160 req = urllib2.Request(url, data) 182 req = urllib2.Request(url, data)
161 self.logger.debug("data: %s" % str(data)) 183 self.logger.debug("data: %s" % str(data))
162 req = urllib2.Request(url, data)
163 serverTransaction = False 184 serverTransaction = False
164 counter = 0 185 counter = 0
186 response_code = 0
165 while not serverTransaction: 187 while not serverTransaction:
166 counter += 1 188 counter += 1
167 self.logger.info("Server Transaction Trial: %s" % ( counter ) ) 189 self.logger.info("Server Transaction Trial: %s" % ( counter ) )
168 try: 190 try:
169 response = urllib2.urlopen(req) 191 response = urllib2.urlopen(req)
192 response_code = response.getcode()
170 fasta = response.read() 193 fasta = response.read()
171 if ("Resource temporarily unavailable" in fasta) or (not fasta.startswith(">") ): 194 response.close()
195 if ( (response_code != 200) or ("Resource temporarily unavailable" in fasta)
196 or ("Error" in fasta) or (not fasta.startswith(">") ) ):
172 serverTransaction = False 197 serverTransaction = False
173 else: 198 else:
174 serverTransaction = True 199 serverTransaction = True
175 except urllib2.HTTPError as e: 200 except urllib2.HTTPError as e:
176 serverTransaction = False 201 serverTransaction = False
177 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) ) 202 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
203 except urllib2.URLError as e:
204 serverTransaction = False
205 self.logger.info("urlopen error: Failed to reach a server")
206 self.logger.info("Reason :%s" % ( e.reason ) )
178 except httplib.IncompleteRead as e: 207 except httplib.IncompleteRead as e:
179 serverTransaction = False 208 serverTransaction = False
180 self.logger.info("IncompleteRead error: %s" % ( e.partial ) ) 209 self.logger.info("IncompleteRead error: %s" % ( e.partial ) )
181 fasta = self.sanitiser(self.dbname, fasta) # 210 fasta = self.sanitiser(self.dbname, fasta)
182 time.sleep(1) 211 time.sleep(0.1)
183 return fasta 212 return fasta
184 213
185 def sanitiser(self, db, fastaseq): 214 def sanitiser(self, db, fastaseq):
186 if db not in "nuccore protein" : return fastaseq 215 if db not in "nuccore protein" : return fastaseq
187 regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}") 216 regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
235 self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1)) 264 self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1))
236 with open(self.outname, 'w') as out: 265 with open(self.outname, 'w') as out:
237 for start in range(0, count, batch_size): 266 for start in range(0, count, batch_size):
238 end = min(count, start+batch_size) 267 end = min(count, start+batch_size)
239 batch = uids_list[start:end] 268 batch = uids_list[start:end]
240 self.epost(self.dbname, ",".join(batch)) 269 if self.epost(self.dbname, ",".join(batch)) != -1:
241 mfasta = '' 270 mfasta = ''
242 while not mfasta: 271 while not mfasta:
243 self.logger.info("retrieving batch %d" % ((start / batch_size) + 1)) 272 self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
244 mfasta = self.efetch(self.dbname, self.query_key, self.webenv) 273 mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
245 out.write(mfasta + '\n') 274 out.write(mfasta + '\n')
246 275
247 276
248 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' 277 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
249 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S' 278 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
250 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] 279 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']