comparison retrieve_fasta_from_NCBI.py @ 0:0bdc5a73c8d1 draft

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author drosofff
date Sun, 21 Jun 2015 14:29:45 -0400
parents
children 79cb7620843d
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
From an NCBI query (for instance a taxonomy ID), retrieves all the matching
nucleotide or protein sequences and returns them as a multi-FASTA file.

Entrez Database   UID common name   E-utility Database Name
Nucleotide        GI number         nuccore
Protein           GI number         protein

Retrieval strategy:

esearch to get the total number of UIDs (count)
esearch to get the UIDs in batches
loop until the end of the UIDs list:
    epost to put a batch of UIDs on the history server
    efetch to retrieve the info from the previous post

The retmax used for efetch is 1/10 of the maximum value declared by NCBI
(1,000 instead of 10,000).

Queries are delayed by 1 sec, to satisfy the NCBI guidelines (more
conservative than what they request).

python retrieve_fasta_from_NCBI.py -i 1638 -o test.out -d protein
python retrieve_fasta_from_NCBI.py -i 327045 -o test.out -d nuccore  # 556468 UIDs
"""
import sys
import logging
import optparse
import time
import urllib
import urllib2
import httplib
import re


class Eutils:

    def __init__(self, options, logger):
        self.logger = logger
        self.base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.query_string = options.query_string
        self.dbname = options.dbname
        if options.outname:
            self.outname = options.outname
        else:
            self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
        self.ids = []
        self.retmax_esearch = 100000
        self.retmax_efetch = 1000
        self.count = 0
        self.webenv = ""
        self.query_key = ""

    def retrieve(self):
        """Run the full esearch -> epost -> efetch pipeline"""
        self.get_count_value()
        self.get_uids_list()
        self.get_sequences()

    def get_count_value(self):
        """
        Just to retrieve Count (the number of UIDs).
        "Total number of UIDs from the retrieved set to be shown in the XML
        output (default=20). By default, ESearch only includes the first 20
        UIDs retrieved in the XML output. If usehistory is set to 'y',
        the remainder of the retrieved set will be stored on the History server"

        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        """
        self.logger.info("retrieving data from %s" % self.base)
        self.logger.info("for Query: %s and database: %s" %
                         (self.query_string, self.dbname))
        querylog = self.esearch(self.dbname, self.query_string, '', '', "count")
        self.logger.debug("Query response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</Count>' in line:
                self.count = int(line[line.find('<Count>')+len('<Count>'):line.find('</Count>')])
        self.logger.info("Found %d UIDs" % self.count)

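    # get_count_value() and get_uids_list() scan the raw ESearch XML line by
    # line rather than using an XML parser. For reference, the response shape
    # (values illustrative) is:
    #
    #   <eSearchResult>
    #     <Count>556468</Count>
    #     <RetMax>20</RetMax>
    #     <RetStart>0</RetStart>
    #     <IdList>
    #       <Id>568815361</Id>
    #       ...
    #     </IdList>
    #   </eSearchResult>
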
    def get_uids_list(self):
        """
        "Increasing retmax allows more of the retrieved UIDs to be included
        in the XML output, up to a maximum of 100,000 records."
        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        """
        retmax = self.retmax_esearch
        if (self.count > retmax):
            num_batches = (self.count / retmax) + 1
        else:
            num_batches = 1
        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
        self.logger.info("Number of batches for esearch action: %d " % num_batches)
        for n in range(num_batches):
            querylog = self.esearch(self.dbname, self.query_string, n*retmax, retmax, '')
            for line in querylog:
                if '<Id>' in line and '</Id>' in line:
                    uid = line[line.find('<Id>')+len('<Id>'):line.find('</Id>')]
                    self.ids.append(uid)
        self.logger.info("Retrieved %d UIDs" % len(self.ids))

    def esearch(self, db, term, retstart, retmax, rettype):
        url = self.base + "esearch.fcgi"
        self.logger.debug("url: %s" % url)
        values = {'db': db,
                  'term': term,
                  'rettype': rettype,
                  'retstart': retstart,
                  'retmax': retmax}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
        time.sleep(1)
        return querylog

    def epost(self, db, ids):
        url = self.base + "epost.fcgi"
        self.logger.debug("url_epost: %s" % url)
        values = {'db': db,
                  'id': ids}
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        serverResponse = False
        while not serverResponse:
            try:
                response = urllib2.urlopen(req)
                serverResponse = True
            except Exception:  # catch all exceptions and retry
                e = sys.exc_info()[0]
                self.logger.info("Caught error: %s" % e)
                self.logger.info("Retrying in 10 sec")
                time.sleep(10)
        querylog = response.readlines()
        self.logger.debug("query response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</QueryKey>' in line:
                self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')])
            if '</WebEnv>' in line:
                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):line.find('</WebEnv>')])
        self.logger.debug("*** epost action ***")
        self.logger.debug("query_key: %s" % self.query_key)
        self.logger.debug("webenv: %s" % self.webenv)
        time.sleep(1)

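    # For reference, an EPost response looks like this (values illustrative);
    # the QueryKey/WebEnv pair extracted above is what lets EFetch find the
    # posted UID set on the history server:
    #
    #   <ePostResult>
    #     <QueryKey>1</QueryKey>
    #     <WebEnv>NCID_1_...</WebEnv>
    #   </ePostResult>
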
    def efetch(self, db, query_key, webenv):
        url = self.base + "efetch.fcgi"
        self.logger.debug("url_efetch: %s" % url)
        values = {'db': db,
                  'query_key': query_key,
                  'webenv': webenv,
                  'rettype': "fasta",
                  'retmode': "text"}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        serverTransaction = False
        counter = 0
        while not serverTransaction:
            counter += 1
            self.logger.info("Server Transaction Trial: %s" % counter)
            try:
                response = urllib2.urlopen(req)
                fasta = response.read()
                if "Resource temporarily unavailable" in fasta:
                    serverTransaction = False
                else:
                    serverTransaction = True
            except urllib2.HTTPError as e:
                serverTransaction = False
                self.logger.info("urlopen error: %s, %s" % (e.code, e.read()))
            except httplib.IncompleteRead as e:
                serverTransaction = False
                self.logger.info("IncompleteRead error: %s" % e.partial)
        if self.dbname != "pubmed":
            assert fasta.startswith(">"), fasta
        fasta = self.sanitiser(self.dbname, fasta)
        time.sleep(1)
        return fasta

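    # sanitiser() below relies on the EFetch fasta/text convention that
    # records are separated by blank lines and, as assumed by the parsing
    # here, that each sequence sits on a single line (headers and sequences
    # illustrative):
    #
    #   >gi|123456|gb|XX123456.1| some description
    #   ATGCATGCATGC
    #
    #   >gi|123457|gb|XX123457.1| another description
    #   ATGCATGCATGC
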
    def sanitiser(self, db, fastaseq):
        if db not in ("nuccore", "protein"):
            return fastaseq
        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
        sane_seqlist = []
        seqlist = fastaseq.split("\n\n")
        for seq in seqlist[:-1]:
            fastalines = seq.split("\n")
            if len(fastalines) < 2:
                self.logger.info("Empty sequence for %s" % ("|".join(fastalines[0].split("|")[:4])))
                self.logger.info("%s download is skipped" % ("|".join(fastalines[0].split("|")[:4])))
                continue
            if db == "nuccore":
                badnuc = 0
                for nucleotide in fastalines[1]:
                    if nucleotide not in "ATGC":
                        badnuc += 1
                if float(badnuc) / len(fastalines[1]) > 0.4:
                    self.logger.info("%s ambiguous nucleotides in %s or download interrupted at this offset | %s" %
                                     (float(badnuc) / len(fastalines[1]),
                                      "|".join(fastalines[0].split("|")[:4]),
                                      fastalines[1]))
                    self.logger.info("%s download is skipped" % ("|".join(fastalines[0].split("|")[:4])))
                    continue
                fastalines[0] = fastalines[0].replace(" ", "_")[:100]  # remove spaces and trim the header to 100 chars
                cleanseq = "\n".join(fastalines)
                sane_seqlist.append(cleanseq)
            elif db == "protein":
                fastalines[0] = fastalines[0][0:100]
                fastalines[0] = fastalines[0].replace(" ", "_")
                fastalines[0] = fastalines[0].replace("[", "_")
                fastalines[0] = fastalines[0].replace("]", "_")
                fastalines[0] = fastalines[0].replace("=", "_")
                fastalines[0] = fastalines[0].rstrip("_")  # because blast makeblastdb doesn't like trailing underscores
                fastalines[0] = regex.sub("_", fastalines[0])
                cleanseq = "\n".join(fastalines)
                sane_seqlist.append(cleanseq)
        self.logger.info("clean sequences appended: %d" % len(sane_seqlist))
        return "\n".join(sane_seqlist)

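    # Example of the protein header cleaning performed above (hypothetical
    # record): a header such as
    #   >gi|755|ref|YP_1.1| polyprotein [Some virus]
    # becomes
    #   >gi|755|ref|YP_1.1|_polyprotein__Some_virus
    # spaces, brackets and '=' are replaced by '_', any trailing '_' is
    # stripped, and runs of 49+ residue-like characters in the header are
    # collapsed to '_'.
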
    def get_sequences(self):
        """
        "Total number of records from the input set to be retrieved, up to a
        maximum of 10,000. Optionally, for a large set the value of retstart
        can be iterated while holding retmax constant, thereby downloading
        the entire set in batches of size retmax."

        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
        """
        batch_size = self.retmax_efetch
        count = self.count
        uids_list = self.ids
        self.logger.info("Batch size for efetch action: %d" % batch_size)
        self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1))
        with open(self.outname, 'w') as out:
            for start in range(0, count, batch_size):
                end = min(count, start + batch_size)
                batch = uids_list[start:end]
                self.epost(self.dbname, ",".join(batch))
                mfasta = ''
                while not mfasta:
                    self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
                    mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
                out.write(mfasta + '\n')

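
# A minimal programmatic usage sketch (hypothetical; 'Options' is a stand-in
# for the optparse values parsed in __main__ below, not part of this script):
#
#   class Options(object):
#       query_string = "txid1638[Organism]"
#       dbname = "nuccore"
#       outname = "output.fasta"
#
#   logging.basicConfig(level=logging.INFO)
#   E = Eutils(Options(), logging.getLogger('data_from_NCBI'))
#   E.retrieve()
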
LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def __main__():
    """ main function """
    parser = optparse.OptionParser(description='Retrieve data from NCBI')
    parser.add_option('-i', dest='query_string', help='NCBI Query String')
    parser.add_option('-o', dest='outname', help='output file name')
    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
                      help='logging level (default: INFO)')
    parser.add_option('-d', dest='dbname', help='database type')
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error('Wrong number of arguments')
    if not options.query_string or not options.dbname:
        parser.error('Both a query string (-i) and a database (-d) are required')

    log_level = getattr(logging, options.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if options.logfile:
        kwargs['filename'] = options.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('data_from_NCBI')

    E = Eutils(options, logger)
    E.retrieve()


if __name__ == "__main__":
    __main__()