comparison fetch_fasta_from_NCBI.py @ 4:c667d0ee39f5 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ca3070e85c370b914ffa0562afe12b363e05aea4
author artbio
date Wed, 29 Nov 2017 17:38:52 -0500
parents 8be88084f89c
children 706fe8139955
comparing 3:8be88084f89c with 4:c667d0ee39f5

@@ -53,27 +53,42 @@
         self.retmax_esearch = 100000
         self.retmax_efetch = 500
         self.count = 0
         self.webenv = ""
         self.query_key = ""
+        if options.get_uids:
+            self.get_uids = True
+        else:
+            self.get_uids = False
+        if options.iuds_file:
+            with open(options.iuds_file, 'r') as f:
+                self.ids.extend(f.readline().split(' '))
 
     def dry_run(self):
         self.get_count_value()
 
     def retrieve(self):
         """
         Retrieve the fasta sequences corresponding to the query
         """
-        self.get_count_value()
+        if len(self.ids) == 0:
+            self.get_count_value()
+        else:
+            self.count = len(self.ids)
         # If no UIDs are found exit script
         if self.count > 0:
-            self.get_uids_list()
-            try:
-                self.get_sequences()
-            except QueryException as e:
-                self.logger.error("Exiting script.")
-                raise e
+            if len(self.ids) == 0:
+                self.get_uids_list()
+            if not self.get_uids:
+                try:
+                    self.get_sequences()
+                except QueryException as e:
+                    self.logger.error("Exiting script.")
+                    raise e
+            else:
+                with open(self.outname, 'w') as f:
+                    f.write('\t'.join(self.ids)+'\n')
         else:
             self.logger.error("No UIDs were found. Exiting script.")
             raise Exception("")
 
     def get_count_value(self):
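
Note on the new UID-list loader above: it calls f.readline() once, so only the
first line of the file is read, and the UIDs on that line must be separated by
single spaces (consecutive spaces would inject empty IDs into self.ids). A more
tolerant loader (a sketch only, not part of this commit) would split on any
whitespace across the whole file:

    # hypothetical alternative to the readline()-based loader above;
    # str.split() with no argument splits on runs of any whitespace
    # and drops empty fields
    with open(options.iuds_file, 'r') as f:
        self.ids.extend(f.read().split())
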
@@ -138,65 +153,62 @@
         querylog = response.readlines()
         response.close()
         time.sleep(1)
         return querylog
 
-    def epost(self, db, ids):
-        url = self.base + "epost.fcgi"
-        self.logger.debug("url_epost: %s" % url)
-        values = {'db': db,
-                  'id': ids}
-        data = urllib.urlencode(values)
-        req = urllib2.Request(url, data)
-        serverResponse = False
-        nb_trials = 0
-        while not serverResponse:
-            nb_trials += 1
-            try:
-                self.logger.debug("Try number %s for opening and readin URL %s"
-                                  % (nb_trials, url+data))
-                response = urllib2.urlopen(req)
-                querylog = response.readlines()
-                response.close()
-                serverResponse = True
-            except urllib2.HTTPError as e:
-                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
-                self.logger.info("Retrying in 1 sec")
-                serverResponse = False
-                time.sleep(1)
-            except urllib2.URLError as e:
-                self.logger.info("urlopen error: Failed to reach a server")
-                self.logger.info("Reason :%s" % (e.reason))
-                self.logger.info("Retrying in 1 sec")
-                serverResponse = False
-                time.sleep(1)
-            except httplib.IncompleteRead as e:
-                self.logger.info("IncompleteRead error: %s" % (e.partial))
-                self.logger.info("Retrying in 1 sec")
-                serverResponse = False
-                time.sleep(1)
-        self.logger.debug("query response:")
-        for line in querylog:
-            self.logger.debug(line.rstrip())
-            if '</QueryKey>' in line:
-                self.query_key = str(line[line.find('<QueryKey>') +
-                                     len('<QueryKey>'):line.find('</QueryKey>')
-                                     ])
-            if '</WebEnv>' in line:
-                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):
-                                  line.find('</WebEnv>')])
-        self.logger.debug("*** epost action ***")
-        self.logger.debug("query_key: %s" % self.query_key)
-        self.logger.debug("webenv: %s" % self.webenv)
-        time.sleep(1)
-
-    def efetch(self, db, query_key, webenv):
+    def sanitiser(self, db, fastaseq):
+        if(db not in "nuccore protein"):
+            return fastaseq
+        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
+        sane_seqlist = []
+        seqlist = fastaseq.split("\n\n")
+        for seq in seqlist[:-1]:
+            fastalines = seq.split("\n")
+            if len(fastalines) < 2:
+                self.logger.info("Empty sequence for %s" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                self.logger.info("%s download is skipped" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                continue
+            if db == "nuccore":
+                badnuc = 0
+                for nucleotide in fastalines[1]:
+                    if nucleotide not in "ATGC":
+                        badnuc += 1
+                if float(badnuc)/len(fastalines[1]) > 0.4:
+                    self.logger.info("%s ambiguous nucleotides in %s\
+                        or download interrupted at this offset\
+                        | %s" % (float(badnuc)/len(fastalines[1]),
+                                 "|".join(fastalines[0].split("|")
+                                          [:4]),
+                                 fastalines[1]))
+                    self.logger.info("%s download is skipped" %
+                                     (fastalines[0].split("|")[:4]))
+                    continue
+                """ remove spaces and trim the header to 100 chars """
+                fastalines[0] = fastalines[0].replace(" ", "_")[:100]
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+            elif db == "protein":
+                fastalines[0] = fastalines[0][0:100]
+                fastalines[0] = fastalines[0].replace(" ", "_")
+                fastalines[0] = fastalines[0].replace("[", "_")
+                fastalines[0] = fastalines[0].replace("]", "_")
+                fastalines[0] = fastalines[0].replace("=", "_")
+                """ because blast makedb doesn't like it """
+                fastalines[0] = fastalines[0].rstrip("_")
+                fastalines[0] = re.sub(regex, "_", fastalines[0])
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
+        return "\n".join(sane_seqlist)
+
+    def efetch(self, db, uid_list):
         url = self.base + "efetch.fcgi"
         self.logger.debug("url_efetch: %s" % url)
         values = {'db': db,
-                  'query_key': query_key,
-                  'webenv': webenv,
+                  'id': uid_list,
                   'rettype': "fasta",
                   'retmode': "text"}
         data = urllib.urlencode(values)
         req = urllib2.Request(url, data)
         self.logger.debug("data: %s" % str(data))
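
Note on this hunk: epost and its WebEnv/query_key bookkeeping are dropped;
efetch now receives the UID batch directly and POSTs it as the 'id' parameter.
The sanitiser method itself appears unchanged, only relocated (its former
position is deleted in the next hunk). A self-contained sketch of its two
nuccore rules, for reference only:

    # header cleaning: spaces -> underscores, header truncated to 100 chars
    header = ">gi|12345|toy record with spaces"
    print(header.replace(" ", "_")[:100])   # >gi|12345|toy_record_with_spaces

    # skip rule: more than 40% non-ATGC characters in the first sequence
    # line marks the record as ambiguous or truncated
    seq = "ATGCNNNNNNATGC"
    badnuc = sum(1 for n in seq if n not in "ATGC")
    print(float(badnuc) / len(seq) > 0.4)    # True -> record is skipped
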
@@ -249,84 +261,27 @@
                              this batch. Aborting."})
         fasta = self.sanitiser(self.dbname, fasta)
         time.sleep(0.1)
         return fasta
 
-    def sanitiser(self, db, fastaseq):
-        if(db not in "nuccore protein"):
-            return fastaseq
-        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
-        sane_seqlist = []
-        seqlist = fastaseq.split("\n\n")
-        for seq in seqlist[:-1]:
-            fastalines = seq.split("\n")
-            if len(fastalines) < 2:
-                self.logger.info("Empty sequence for %s" %
-                                 ("|".join(fastalines[0].split("|")[:4])))
-                self.logger.info("%s download is skipped" %
-                                 ("|".join(fastalines[0].split("|")[:4])))
-                continue
-            if db == "nuccore":
-                badnuc = 0
-                for nucleotide in fastalines[1]:
-                    if nucleotide not in "ATGC":
-                        badnuc += 1
-                if float(badnuc)/len(fastalines[1]) > 0.4:
-                    self.logger.info("%s ambiguous nucleotides in %s\
-                        or download interrupted at this offset\
-                        | %s" % (float(badnuc)/len(fastalines[1]),
-                                 "|".join(fastalines[0].split("|")
-                                          [:4]),
-                                 fastalines[1]))
-                    self.logger.info("%s download is skipped" %
-                                     (fastalines[0].split("|")[:4]))
-                    continue
-                """ remove spaces and trim the header to 100 chars """
-                fastalines[0] = fastalines[0].replace(" ", "_")[:100]
-                cleanseq = "\n".join(fastalines)
-                sane_seqlist.append(cleanseq)
-            elif db == "protein":
-                fastalines[0] = fastalines[0][0:100]
-                fastalines[0] = fastalines[0].replace(" ", "_")
-                fastalines[0] = fastalines[0].replace("[", "_")
-                fastalines[0] = fastalines[0].replace("]", "_")
-                fastalines[0] = fastalines[0].replace("=", "_")
-                """ because blast makedb doesn't like it """
-                fastalines[0] = fastalines[0].rstrip("_")
-                fastalines[0] = re.sub(regex, "_", fastalines[0])
-                cleanseq = "\n".join(fastalines)
-                sane_seqlist.append(cleanseq)
-        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
-        return "\n".join(sane_seqlist)
-
     def get_sequences(self):
-        """
-        Total number of records from the input set to be retrieved,
-        up to a maximum of 10,000. Optionally, for a large set the value of
-        retstart can be iterated while holding retmax constant, thereby
-        downloading the entire set in batches of size retmax.
-        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
-        """
-        batch_size = self.retmax_efetch
+        batch_size = 200
         count = self.count
         uids_list = self.ids
         self.logger.info("Batch size for efetch action: %d" % batch_size)
         self.logger.info("Number of batches for efetch action: %d" %
                          ((count / batch_size) + 1))
         with open(self.outname, 'w') as out:
             for start in range(0, count, batch_size):
                 end = min(count, start+batch_size)
                 batch = uids_list[start:end]
-                if self.epost(self.dbname, ",".join(batch)) != -1:
-                    mfasta = ''
-                    while not mfasta:
-                        self.logger.info("retrieving batch %d" %
-                                         ((start / batch_size) + 1))
-                        try:
-                            mfasta = self.efetch(self.dbname, self.query_key,
-                                                 self.webenv)
-                            out.write(mfasta + '\n')
-                        except QueryException as e:
-                            self.logger.error("%s" % e.message)
-                            raise e
+                self.logger.info("retrieving batch %d" %
+                                 ((start / batch_size) + 1))
+                try:
+                    mfasta = self.efetch(self.dbname, ','.join(batch))
+                    out.write(mfasta + '\n')
+                except QueryException as e:
+                    self.logger.error("%s" % e.message)
+                    raise e
+        urllib.urlcleanup()
 
 
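
Note on the new batching: batch_size is now hard-coded to 200 rather than taken
from self.retmax_efetch, and the retry loop around efetch in get_sequences is
gone. The logged batch count uses (count / batch_size) + 1, which, under the
Python 2 integer division this urllib2-era code relies on, over-reports by one
whenever count is an exact multiple of batch_size. A quick check (sketch only):

    # Python 2 semantics; range() drives the actual number of batches
    count, batch_size = 400, 200
    print((count / batch_size) + 1)           # 3, but only 2 batches run
    print(list(range(0, count, batch_size)))  # [0, 200] -> two batches
    print(-(-count // batch_size))            # 2, a ceiling-division alternative
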
@@ -333,24 +288,31 @@
 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
 
 
 def command_parse():
     parser = argparse.ArgumentParser(description='Retrieve data from NCBI')
-    parser.add_argument('-i', dest='query_string', help='NCBI Query String',
-                        required=True)
+    parser.add_argument('-i', dest='query_string', help='NCBI Query String')
+    parser.add_argument('--UID_list', dest='iuds_file',
+                        help='file containing a list of iuds to be fetched')
     parser.add_argument('-o', dest='outname', help='output file name')
     parser.add_argument('-d', dest='dbname', help='database type')
     parser.add_argument('--count', '-c', dest='count_ids',
                         action='store_true', default=False,
                         help='dry run ouputing only the number of sequences\
                         found')
+    parser.add_argument('--get_uids', '-u', dest='get_uids', default=False,
+                        action='store_true', help='prints to the output a list\
+                        of UIDs')
     parser.add_argument('-l', '--logfile', help='log file (default=stderr)')
     parser.add_argument('--loglevel', choices=LOG_LEVELS, default='INFO',
                         help='logging level (default: INFO)')
     args = parser.parse_args()
+    if args.query_string is not None and args.iuds_file is not None:
+        parser.error('Please choose either fetching the -i query or the -u\
+                     list.')
     return args
 
 
 def __main__():
     """ main function """
@@ -367,15 +329,15 @@
     E = Eutils(args, logger)
     if args.count_ids:
         try:
             E.dry_run()
         except Exception:
-            sys.exit(1)
+            sys.exit(-1)
     else:
         try:
             E.retrieve()
         except Exception:
-            sys.exit(1)
+            sys.exit(-1)
 
 
 if __name__ == "__main__":
     __main__()
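
Note on the exit codes: sys.exit(1) becomes sys.exit(-1) in both branches. On
POSIX systems the process status is truncated to the range 0-255, so callers
will observe 255 rather than -1. A quick demonstration (sketch only):

    # the shell sees -1 as 255
    import subprocess, sys
    rc = subprocess.call([sys.executable, '-c', 'import sys; sys.exit(-1)'])
    print(rc)   # 255 on POSIX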