Mercurial > repos > vimalkumarvelayudhan > viga
diff VIGA.py @ 0:231e4c669675 draft
Initial commit - v0.10.3 git commit deeded0
author | vimalkumarvelayudhan |
---|---|
date | Tue, 27 Feb 2018 14:16:54 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VIGA.py Tue Feb 27 14:16:54 2018 -0500 @@ -0,0 +1,997 @@ +#!/usr/bin/env python + +# -*- coding: utf-8 -*- + +# VIGA - De-novo VIral Genome Annotator +# +# Copyright (C) 2017 - Enrique Gonzalez-Tortuero +# Vimalkumar Velayudhan +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# Importing python libraries +from __future__ import print_function +import argparse +import csv +import fileinput +import fractions +import glob +import numpy +import os +import os.path +import re +import sys +import shutil +import subprocess +from BCBio import GFF +from Bio import SeqIO +from Bio import SeqFeature +from Bio.Alphabet import IUPAC +from Bio.Seq import Seq +from Bio.SeqFeature import FeatureLocation +from Bio.SeqRecord import SeqRecord +from Bio.SeqUtils.ProtParam import ProteinAnalysis +from collections import OrderedDict, defaultdict +from itertools import product +from scipy import signal +from time import strftime + +# Preparing functions +def batch_iterator(iterator, batch_size): + entry = True + while entry: + batch = [] + while len(batch) < batch_size: + try: + entry = iterator.next() + except StopIteration: + entry = None + if entry is None: + break + batch.append(entry) + if batch: + yield batch + +def check_peaks(peaks, length): + # Checking if origin/terminus peaks are too close or too far apart. 
In that case, they are probably wrong + closest, farthest = int(length * float(0.45)), int(length * float(0.55)) + pairs = [] + for pair in list(product(*peaks)): + ### added this to make sure gets origin and ter right + tr, pk = sorted(list(pair), key = lambda x: x[1], reverse = False) # trough and peak + a = (tr[0] - pk[0]) % length + b = (pk[0] - tr[0]) % length + pt = abs(tr[1] - pk[1]) # distance between values + if (a <= farthest and a >= closest) or (b <=farthest and b >= closest): + pairs.append([pt, tr, pk]) + if len(pairs) == 0: + return [False, False] + pt, tr, pk = sorted(pairs, reverse = True)[0] + return [tr[0], pk[0]] + +def cmd_exists(cmd): + return subprocess.call("type " + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 + +def GCskew(name, length, seq, window, slide): + replacements = {'G':1, 'C':-1, 'A':0, 'T':0, 'N':0} + gmc = [] # G - C + for base in seq: + try: + gmc.append(replacements[base]) + except: + gmc.append(0) + # convert to G + C + gpc = [abs(i) for i in gmc] # G + C + # calculate sliding windows for (G - C) and (G + C) + weights = numpy.ones(window)/window + gmc = [[i, c] for i, c in enumerate(signal.fftconvolve(gmc, weights, 'same').tolist())] + gpc = [[i, c] for i, c in enumerate(signal.fftconvolve(gpc, weights, 'same').tolist())] + # calculate gc skew and cummulative gc skew sum + skew = [[], []] # x and y for gc skew + c_skew = [[], []] # x and y for gc skew cummulative sums + cs = 0 # cummulative sum + # select windows to use based on slide + for i, m in gmc[0::slide]: + p = gpc[i][1] + if p == 0: + gcs = 0 + else: + gcs = m/p + cs += gcs + skew[0].append(i) + c_skew[0].append(i) + skew[1].append(gcs) + c_skew[1].append(cs) + ori, ter = find_ori_ter(c_skew, length) + return ori, ter, skew, c_skew + +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + +def find_ori_ter(c_skew, length): + # Find origin and terminus of replication based on cumulative GC skew min and max peaks + c_skew_min 
= signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.less, order = 1)[0].tolist() + c_skew_max = signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.greater, order = 1)[0].tolist() + # return False if no peaks were detected + if len(c_skew_min) == 0 or len(c_skew_min) == 0: + return [False, False] + else: + c_skew_min = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_min] + c_skew_max = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_max] + ori, ter = check_peaks([c_skew_min, c_skew_max], length) + return ori, ter + +def stringSplitByNumbers(x): + r = re.compile('(\d+)') + l = r.split(x) + return [int(y) if y.isdigit() else y for y in l] + +# Defining the program version +version = "0.10.3" + +# Processing the parameters +parser = argparse.ArgumentParser(description='VIGA is an automatic de novo VIral Genome Annotator.') +basic_group = parser.add_argument_group('Basic options for VIGA [REQUIRED]') + +basic_group.add_argument("--input", dest="inputfile", type=str, required=True, help='Input file as a FASTA file', metavar="FASTAFILE") +basic_group.add_argument("--rfamdb", dest="rfamdatabase", type=str, required=True, help='RFAM Database that will be used for the ribosomal RNA prediction. RFAMDB should be in the format "/full/path/to/rfamdb/Rfam.cm" and must be compressed accordingly (see INFERNAL manual) before running the script.', metavar="RFAMDB") +basic_group.add_argument("--modifiers", dest="modifiers", type=str, required=True, help='Input file as a plain text file with the modifiers per every FASTA header according to SeqIn (https://www.ncbi.nlm.nih.gov/Sequin/modifiers.html). All modifiers must be written in a single line and are separated by a single space character. No space should be placed besides the = sign. For example: [organism=Serratia marcescens subsp. marcescens] [sub-species=marcescens] [strain=AH0650_Sm1] [topology=linear] [moltype=DNA] [tech=wgs] [gcode=11] [country=Australia] [isolation-source=sputum]. 
This line will be copied and printed along with the record name as the definition line of every contig sequence.', metavar="TEXTFILE") + +advanced_group = parser.add_argument_group('Advanced options for VIGA [OPTIONAL]') +advanced_group.add_argument("--readlength", dest="read_length", type=int, default=101, help='Read length for the circularity prediction (default: 101 bp)', metavar="INT") +advanced_group.add_argument("--windowsize", dest="window", type=int, default=100, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 100 bp)', metavar="INT") +advanced_group.add_argument("--slidingsize", dest="slide", type=int, default=10, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 10 bp)', metavar="INT") +advanced_group.add_argument("--out", dest="rootoutput", type=str, help='Name of the outputs files (without extension)', metavar="OUTPUTNAME") +advanced_group.add_argument("--locus", dest="locus", type=str, default='LOC', help='Name of the sequences. If the input is a multiFASTA file, please put a general name as the program will add the number of the contig at the end of the name (Default: %(default)s)', metavar="STRING") +advanced_group.add_argument("--gff", dest="gffprint", action='store_true', default=False, help='Printing the output as GFF3 file (Default: False)') +advanced_group.add_argument("--threads", dest="ncpus", default=1, help='Number of threads/cpus (Default: %(default)s cpu)', metavar="INT") +advanced_group.add_argument("--nohmmer", dest="nohmmer", action='store_true', default=False, help='Running only BLAST to predict protein function. (Default: False)') +advanced_group.add_argument("--noblast", dest="noblast", action='store_true', default=False, help='Running DIAMOND instead of BLAST to predict protein function according to homology. This will be less sensitive but faster than BLAST. 
(Default: False)') +advanced_group.add_argument("--blastdb", dest="blastdatabase", type=str, help='BLAST Database that will be used for the protein function prediction. The database must be an amino acid one, not nucleotidic. This argument is mandatory if --noblast option is disabled', metavar="BLASTDB") +advanced_group.add_argument("--diamonddb", dest="diamonddatabase", type=str, help='DIAMOND Database that will be used for the protein function prediction. The database must be created from a amino acid FASTA file as indicated in https://github.com/bbuchfink/diamond. This argument is mandatory when --noblast option is enabled', metavar="DIAMONDDB") +advanced_group.add_argument("--blastevalue", dest="blastevalue", default=0.00001, help='BLAST/DIAMOND e-value threshold (Default: 0.00001)', metavar="FLOAT") +advanced_group.add_argument("--hmmdb", dest="hmmdatabase", type=str, help='PHMMER Database that will be used for the protein function prediction according to Hidden Markov Models. In this case, HMMDB must be in FASTA format (e.g. UniProt: "', metavar="HMMDB") +advanced_group.add_argument("--hmmerevalue", dest="hmmerevalue", default=0.001, help='PHMMER e-value threshold (Default: 0.001)', metavar="FLOAT") + +type_choices = {'BCT': 'Prokaryotic chromosome', 'CON': 'Contig', 'PHG': 'Phages', 'VRL': 'Eukaryotic/Archaea virus'} +type_help = ('GenBank Division: One of the following codes - {0}. 
(Default: %(default)s)'.format(', '.join('{0} ({1})'.format(k, v) for k, v in type_choices.items()))) +advanced_group.add_argument("--typedata", dest="typedata", type=str, default='CON', help=type_help, metavar="BCT|CON|VRL|PHG") + +gcode_choices = {'1': 'Standard genetic code [Eukaryotic]', '2': 'Vertebrate mitochondrial code', '3': 'Yeast mitochondrial code', '4': 'Mycoplasma/Spiroplasma and Protozoan/mold/coelenterate mitochondrial code', '5': 'Invertebrate mitochondrial code', '6': 'Ciliate, dasycladacean and hexamita nuclear code', '9': 'Echinoderm/flatworm mitochondrial code', '10': 'Euplotid nuclear code', '11': 'Bacteria/Archaea/Phages/Plant plastid', '12': 'Alternative yeast nuclear code', '13': 'Ascidian mitochondrial code', '14': 'Alternative flatworm mitochondrial code', '16': 'Chlorophycean mitochondrial code', '21': 'Trematode mitochondrial code', '22': 'Scedenesmus obliquus mitochondrial code', '23': 'Thraustochytrium mitochondrial code', '24': 'Pterobranquia mitochondrial code', '25': 'Gracilibacteria & Candidate division SR1', '26': 'Pachysolen tannophilus nuclear code', '27': 'Karyorelict nuclear code', '28': 'Condylostoma nuclear code', '29': 'Mesodinium nuclear code', '30': 'Peritrich nuclear code', '31': 'Blastocrithidia nuclear code'} +gcode_help = ('Number of GenBank translation table. At this moment, the available options are {0}. 
(Default: %(default)s)'.format('{}'.format(', '.join('{0} ({1})'.format(k, v) for k, v in gcode_choices.items())))) +advanced_group.add_argument("--gcode", dest="gcode", type=str, default='11', help=gcode_help, metavar="NUMBER") + +advanced_group.add_argument('--mincontigsize', dest="mincontigsize", type=int, default = 200, help = 'Minimum contig length to be considered in the final files (Default: 200 bp)', metavar="INT") +advanced_group.add_argument("--idthr", dest="idthreshold", default=50.00, help='ID threshold (Default: 50.0)', metavar="FLOAT") +advanced_group.add_argument("--coverthr", dest="covthreshold", default=50.00, help='Coverage threshold (Default: 50.0)', metavar="FLOAT") +advanced_group.add_argument("--diffid", dest="diffid", default=5.00, help='Max allowed difference between the ID percentages of BLAST and HMMER. (Default = 5.00; Not recommended to change such value)', metavar="FLOAT (>0.01)") +advanced_group.add_argument("--minrepeat", dest="minrepeat", type=int, default=16, help="Minimum repeat length for CRISPR detection (Default: 16)", metavar="INT") +advanced_group.add_argument("--maxrepeat", dest="maxrepeat", type=int, default=64, help="Maximum repeat length for CRISPR detection (Default: 64)") +advanced_group.add_argument("--minspacer", dest="minspacer", type=int, default=8, help="Minimum spacer length for CRISPR detection (Default: 8)") +advanced_group.add_argument("--maxspacer", dest="maxspacer", type=int, default=64, help="Maximum spacer length for CRISPR detection (Default: 64)") +advanced_group.add_argument("--blastexh", dest="blastexh", action='store_true', default=False, help='Use of exhaustive BLAST to predict the proteins by homology according to Fozo et al. 
(2010) Nucleic Acids Res (Default=FALSE)') + +args = parser.parse_args() + +root_output = args.rootoutput +if not root_output: + root_output = '{}_annotated'.format(os.path.splitext(args.inputfile)[0]) + +if args.noblast == False and args.blastdatabase == None: + sys.exit('You MUST specify BLAST database using the parameter --blastdb if you are not using --noblast option') + +if args.noblast == True and args.diamonddatabase == None: + sys.exit('You MUST specify DIAMOND database using the parameter --diamonddb if you are using --noblast option') + +if args.nohmmer == False and args.hmmdatabase == None: + sys.exit('You MUST specify HMMER database using the parameter --hmmdb if you are not using --nohmmer option') + +# Printing the header of the program +eprint("This is VIGA %s" % str(version)) +eprint("Written by Enrique Gonzalez Tortuero & Vimalkumar Velayudhan") +eprint("Homepage is https://github.com/EGTortuero/virannot") +eprint("Local time: ", strftime("%a, %d %b %Y %H:%M:%S")) +eprint("\n\n") + +# checking the presence of the programs in the system + +if not cmd_exists("lastz")==True: + sys.exit("You must install LASTZ before using this script") +elif not cmd_exists("cmscan")==True: + sys.exit("You must install INFERNAL before using this script") +elif not cmd_exists("prodigal")==True: + sys.exit("You must install prodigal before using this script") +elif not cmd_exists("parallel")==True: + sys.exit("You must install GNU Parallel before using this script") +elif not cmd_exists("blastp")==True: + sys.exit("You must install BLAST before using this script") +elif not cmd_exists("diamond")==True: + sys.exit("You must install DIAMOND before using this script") +elif not cmd_exists("phmmer")==True: + sys.exit("You must install HMMER 3 before using this script") +elif not cmd_exists("aragorn")==True: + sys.exit("You must install ARAGORN before using this script") +elif not cmd_exists("pilercr")==True: + sys.exit("You must install PILERCR before using this script") 
+elif not cmd_exists("trf")==True: + sys.exit("You must install Tandem Repeats Finder before using this script") +elif not cmd_exists("irf")==True: + sys.exit("You must install Inverted Repeats Finder before using this script") + +eprint("Data type is {0} and GenBank translation table no is {1}\n".format(args.typedata, args.gcode)) + +# Correcting the original file (long headers problem + multiple FASTA files) +record_iter = SeqIO.parse(open(args.inputfile, "rU"),"fasta") +counter = 1 +newnamessequences = {} +for i, batch in enumerate(batch_iterator(record_iter, 1)): + seq_index = "CONTIG_%i" % (i + 1) + filename = "%s.temp.fna" % seq_index + newfilename = "%s.fna" % seq_index + with open(filename, "w") as handle: + count = SeqIO.write(batch, filename, "fasta") + + with open(filename, "rU") as original, open(newfilename, "w") as corrected: + sequences = SeqIO.parse(original, "fasta", IUPAC.ambiguous_dna) + for record in sequences: + original_name = record.id + record.id = "%s_%i" % (args.locus, counter) + record.description = record.description + counter += 1 + newnamessequences[record.id] = original_name + eprint("WARNING: The name of the sequence %s was corrected as %s" % (original_name, record.id)) + SeqIO.write(record, corrected, "fasta") + + with open("logfile.txt", "w") as logfile: + logfile.write("#Original\tNew\n") + for oldname in sorted(newnamessequences, key = stringSplitByNumbers): + logfile.write("%s\t%s\n" % (oldname, newnamessequences[oldname])) + os.remove(filename) + +for newfile in sorted(glob.glob("CONTIG_*.fna")): + + # Predicting the shape of the contig (code based on Alex Crits-Christoph's find_circular.py script [https://github.com/alexcritschristoph/VICA/blob/master/find_circular.py]) + eprint("Predicting the shape of the contig using LASTZ") + genomeshape = {} + with open(newfile, "rU") as targetfasta: + Sequence = SeqIO.parse(newfile, "fasta") + for record in Sequence: + seq_beginning = str(record.seq[0:args.read_length]) + seq_ending = 
str(record.seq[len(record.seq)-args.read_length:len(record.seq)]) + combined_seqs = SeqRecord(Seq(seq_beginning + seq_ending, IUPAC.ambiguous_dna), id = record.description) + SeqIO.write(combined_seqs, "temporal_circular.fasta", "fasta") + outputlastz = subprocess.check_output(["lastz", "temporal_circular.fasta", "--self", "--notrivial", "--nomirror", "--format=general-:start1,end1,start2,end2,score,strand1,strand2,identity,length1"]) + resultslastz = outputlastz.split("\n") + for resultlastz in resultslastz: + if resultlastz != '': + start1 = resultlastz.split()[0] + end1 = resultlastz.split()[1] + start2 = resultlastz.split()[2] + end2 = resultlastz.split()[3] + strand1 = resultlastz.split()[5] + strand2 = resultlastz.split()[6] + identity = resultlastz.split()[7] + length = int(resultlastz.split()[9]) + if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9: + genomeshape['genomeshape'] = "circular" + try: + if genomeshape['identity'] >= float(fractions.Fraction(identity)): + genomeshape['identity'] = float(fractions.Fraction(identity)) + genomeshape['length'] = length + except KeyError: + genomeshape['identity'] = float(fractions.Fraction(identity)) + genomeshape['length'] = length + else: + continue + if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9: + genomeshape['genomeshape'] = "circular" + try: + if genomeshape['identity'] >= float(fractions.Fraction(identity)): + genomeshape['identity'] = float(fractions.Fraction(identity)) + genomeshape['length'] = length + except KeyError: + genomeshape['identity'] = float(fractions.Fraction(identity)) + genomeshape['length'] = length + try: + if 
genomeshape['genomeshape'] == "": + genomeshape['genomeshape'] = "linear" + except KeyError: + genomeshape['genomeshape'] = "linear" + else: + genomeshape['genomeshape'] = "circular" + with open("temp.fasta", "w") as correctedcircular: + Corrseq = str(record.seq[int(genomeshape['length'])//2:-int(genomeshape['length'])//2]) + Newseq = SeqRecord(Seq(Corrseq, IUPAC.ambiguous_dna), id = record.description) + SeqIO.write(Newseq, correctedcircular, "fasta") + os.rename("temp.fasta", "temp2.fasta") + eprint("LASTZ predicted that %s is %s\n" % (newfile, genomeshape['genomeshape'])) + os.remove("temporal_circular.fasta") + + # Calculate the cumulative GCskew in circular contigs to determine the origin of replication (Based on iRep -- Brown CT, Olm MR, Thomas BC, Banfield JF (2016) Measurement of bacterial replication rates in microbial communities. Nature Biotechnology 34: 1256-63.) + if genomeshape['genomeshape'] == "circular": + for record in SeqIO.parse("temp2.fasta", "fasta"): + length_contig = len(record.seq) + #if length < min_len: + # print('%s: Too Short' % (name), file=sys.stderr) + # continue + oric, term, gcskew, cgcskew = GCskew(record.id, length_contig, record.seq, args.window, args.slide) + valoric = oric + if oric == False: + oric, term = 'n/a', 'n/a' + else: + oric, term = '{:,}'.format(oric), '{:,}'.format(term) + eprint('%s -> Origin: %s Terminus: %s' % (record.id, oric, term)) + #print('\t'.join(['# Name', 'Position', 'GC Skew', 'Cumulative GC Skew'])) + #for i, pos in enumerate(gcskew[0]): + # out = [record.id, pos, gcskew[1][i], cgcskew[1][i]] + # print('\t'.join([str(i) for i in out])) + if valoric != False: + firstpartseq = str(record.seq[valoric:-1]) + secondpartseq = str(record.seq[0:(valoric-1)]) + combinedcorrectedseq = SeqRecord(Seq(firstpartseq + secondpartseq, IUPAC.ambiguous_dna), id = record.description) + SeqIO.write(combinedcorrectedseq, newfile, "fasta") + else: + eprint("As the program was unable to predict the origin of replication, %s 
was considered as is without correcting!" % record.id) + os.rename("temp2.fasta", newfile) + if os.path.isfile("temp2.fasta"): + os.remove("temp2.fasta") + + # Predicting genes using PRODIGAL + eprint("\nRunning Prodigal to predict the genes in %s" % newfile) + for record in SeqIO.parse(newfile, "fasta"): + length_contig = len(record.seq) + if (length_contig >= 100000): + if genomeshape['genomeshape'] == 'linear': + subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-c", "-q"]) + else: + subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-q"]) + else: + if genomeshape['genomeshape'] == 'linear': + subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-c", "-q"]) + else: + subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-q"]) + num_seqs = len(list(SeqIO.parse("pretemp.faa", "fasta"))) + eprint("PRODIGAL was able to predict %i genes in %s\n" % (num_seqs, newfile)) + + with open("pretemp.faa", "rU") as originalfaa, open("temp.faa", "w") as correctedfaa: + sequences = SeqIO.parse(originalfaa, "fasta") + for record in sequences: + record.seq = record.seq.rstrip("*") + SeqIO.write(record, correctedfaa, "fasta") + + faa_file = "%s.faa" % newfile # TO DEBUG + shutil.copyfile("temp.faa", faa_file) # TO DEBUG + os.remove("pretemp.faa") + + # Predicting the function of the proteins based on homology using BLAST + equivalence = {} + record_iter = SeqIO.parse(open("temp.faa"),"fasta") + for i, batch in enumerate(batch_iterator(record_iter, 1)): + seq_index = "SEQ_%i" % (i + 1) + filename = "%s.faa" % seq_index + with open(filename, "w") as handle: + count = SeqIO.write(batch, handle, "fasta") + equivalence[seq_index] = batch[0].id + + if args.blastexh==True: + eprint("Running BLAST to predict the genes according to homology inference in %s using 
exhaustive mode (see Fozo et al. (2010) Nucleic Acids Res for details)" % newfile) + subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', '-word_size', '2', '-gapopen', '8', '-gapextend', '2', '-matrix', '"PAM70"', '-comp_based_stats', '"0"', "-num_threads", str(args.ncpus)]) + eprint("Done. BLAST was done to predict the genes by homology\n") + blast_log = "%s.blast.log" % newfile # TO DEBUG + shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG + elif args.noblast==True: + eprint("Running DIAMOND to predict the genes according to homology inference in %s using default parameters" % newfile) + with open("temp.faa", "r") as tempfile: + first_line = tempfile.readline() + if first_line.startswith(">"): + subprocess.call(['diamond', 'blastp', '-q', 'temp.faa', '-d', args.diamonddatabase, '-e', str(args.blastevalue), '-f', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qlen', 'slen', 'qstart', 'qend', 'evalue', 'bitscore', 'stitle', '-o', 'temp_blast.csv', '-k', '10', "-p", str(args.ncpus), '--quiet']) + else: + open("temp_blast.csv", 'a').close() + blast_log = "%s.blast.log" % newfile # TO DEBUG + shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG + eprint("Done. DIAMOND was done to predict the genes by homology\n") + else: + eprint("Running BLAST to predict the genes according to homology inference in %s using default parameters" % newfile) + subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', "-num_threads", str(args.ncpus)]) + blast_log = "%s.blast.log" % newfile + shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG + eprint("Done. 
BLAST was done to predict the genes by homology\n") # TO DEBUG + + # Parsing the results from BLAST + with open("temp_blast.csv", "rU") as blastresults: + hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))') + reader = csv.DictReader(blastresults, delimiter='\t', fieldnames=['qseqid','sseqid','pident','length','qlen','slen','qstart','qend','evalue','bitscore','stitle']) + information_proteins_blast = {} + for row in reader: + perc_cover = round(100.00*(float(row['length'])/float(row['qlen'])),2) + perc_id = float(row['pident']) + infoprot_blast = {} + infoprot_blast['sseqid'] = row['sseqid'] + infoprot_blast['pident'] = perc_id + infoprot_blast['pcover'] = perc_cover + infoprot_blast['evalue'] = row['evalue'] + infoprot_blast['descr'] = row['stitle'] + try: + data = information_proteins_blast[row['qseqid']] + except KeyError: + if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_cover) >= float(args.covthreshold) and float(row['evalue']) <= float(args.blastevalue): + information_proteins_blast[row['qseqid']] = infoprot_blast + else: + continue + else: + if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_id) >= float(infoprot_blast['pident']) and float(perc_cover) >= float(args.covthreshold) and float(perc_cover) >= float(infoprot_blast['pcover']) and float(row['evalue']) <= float(args.blastevalue): + information_proteins_blast[row['qseqid']] = infoprot_blast + + ## Predicting the function of the proteins based on HMM predictions using phmmer + if args.nohmmer == False: + with open("commands.sh", "w") as commands: + for singleprot in sorted(glob.glob("SEQ_*.faa")): + hhmtable = "%s.tbl" % singleprot + eprint("Creating file to run parallel PHMMER") + eprint("Adding %s to run PHMMER." 
% singleprot) + line2write = ' '.join(["phmmer", "--cpu", "1", "--domtblout", hhmtable, "-E", str(args.hmmerevalue), "-o", "/dev/null", singleprot, args.hmmdatabase, '\n']) + commands.write(line2write) + + eprint("Running parallel PHMMER") + subprocess.call(['parallel', '-j', str(args.ncpus)], stdin=open('commands.sh')) + eprint("Done. PHMMER was done to predict the function of the genes according to Hidden Markov Models\n") + os.remove("commands.sh") + + # Parsing the results from HMMER + information_proteins_hmmer = {} + hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))') + for singletbl in sorted(glob.glob("*.faa.tbl")): + rootname = singletbl.replace(".faa.tbl", "") + with open(singletbl) as tblfile: + infoprot_hmmer = {} + for line in tblfile: + if line.startswith("#"): + continue + else: + try: + pat = re.compile("^(\S+)\s+\S\s+\d+\s+(\S+)\s+\S\s+(\d+)\s+((?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?)\s+\S+\s+\S+\s+\S+\s+\S+\s+(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s+\S+\s+\S+\s+\S+\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\S+\s+\S+\s+(\S+)\s+(.*)") + matchname, lociname, length, evaluehh, start, end, pident, description = re.match(pat, line).groups() + except AttributeError: + continue + else: + length = float(length) + pident = 100.00*float(pident) + protarea = 100.00*(((float(end)-1.00) - (float(start)-1.00))/length) + try: + data2 = infoprot_hmmer['lociname'] + except KeyError: + if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00: + infoprot_hmmer['lociname'] = lociname + infoprot_hmmer['name'] = matchname + infoprot_hmmer['evalue'] = float(evaluehh) + infoprot_hmmer['pcover'] = float(protarea) + infoprot_hmmer['pident'] = float(pident) + infoprot_hmmer['descr'] = description + else: + try: + if not re.search(hypotheticalpat, description) and 
float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']: + infoprot_hmmer['lociname'] = lociname + infoprot_hmmer['name'] = matchname + infoprot_hmmer['evalue'] = float(evaluehh) + infoprot_hmmer['pcover'] = float(protarea) + infoprot_hmmer['pident'] = float(pident) + infoprot_hmmer['descr'] = description + except KeyError: + continue + else: + if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']: + infoprot_hmmer['lociname'] = lociname + infoprot_hmmer['name'] = matchname + infoprot_hmmer['evalue'] = float(evaluehh) + infoprot_hmmer['pcover'] = float(protarea) + infoprot_hmmer['pident'] = float(pident) + infoprot_hmmer['descr'] = description + information_proteins_hmmer[rootname] = infoprot_hmmer + + #Storing protein information in memory + with open("temp.faa", "rU") as protsfile: + tempprotsdict = {} + for protseq in SeqIO.parse(protsfile, "fasta"): + tempindprot = {} + dataprot = protseq.description.split(' # ') + modseq = str(protseq.seq).replace("X","") # Removing all ambiguous amino acids to avoid problems with ProteinAnalysis module + analysed_seq = ProteinAnalysis(modseq) + tempindprot['length'] = len(protseq.seq) + tempindprot['isoelectricpoint'] = analysed_seq.isoelectric_point() + tempindprot['molweightkda'] = analysed_seq.molecular_weight()/1000.00 + tempindprot['instability'] = analysed_seq.instability_index() + tempindprot['protein_id'] = dataprot[0] + tempindprot['strand'] = int(dataprot[3]) + tempindprot['begin'] = int(dataprot[1])-1 + tempindprot['end'] = int(dataprot[2]) + tempprotsdict[dataprot[0]] = tempindprot + + # Creation of table + debugtable = "%s.csv" % newfile + with open(debugtable, "w") as tablefile: + if args.nohmmer == False: + print("\t".join(["Identifier", "Start", 
"Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST", "ID_HMMER", "Descr_HMMER", "evalue_HMMER", "%ID_HMMER", "%Cover_HMMER"]), file=tablefile) + keylist = information_proteins_hmmer.keys() + keylist.sort() + for keyB in keylist: + keyB = keyB.replace(".faa.tbl", "") + try: + print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), str(information_proteins_blast[equivalence[keyB]]['pcover']), information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile) + except KeyError: + try: + print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), 
str(information_proteins_blast[equivalence[keyB]]['pcover']), "None", "None", "NA", "NA", "NA"]), file=tablefile) + except KeyError: + try: + print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA", information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile) + except KeyError: + print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA", "None", "None", "NA", "NA", "NA"]), file=tablefile) + else: + print("\t".join(["Identifier", "Start", "Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST"]), file=tablefile) + keylist = equivalence.values() + keylist.sort() + for keyB in keylist: + try: + print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), information_proteins_blast[keyB]['sseqid'], information_proteins_blast[keyB]['descr'], 
str(information_proteins_blast[keyB]['evalue']), str(information_proteins_blast[keyB]['pident']), str(information_proteins_blast[keyB]['pcover'])]), file=tablefile) + except KeyError: + print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), "None", "None", "NA", "NA", "NA"]), file=tablefile) + + # Algorithm of decisions (which one: BLAST/HMMER?) + multipleprots = {} + Hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))') + if args.nohmmer == False: + keylist = information_proteins_hmmer.keys() + keylist.sort() + for keyB in keylist: + keyB = keyB.replace(".faa.tbl", "") + singleprot = {} + singleprot['name'] = equivalence[keyB] + if (equivalence[keyB] in information_proteins_blast) and (keyB in information_proteins_hmmer): + if re.search(Hypotheticalpat, information_proteins_blast[equivalence[keyB]]['descr']): + try: + if re.search(Hypotheticalpat, information_proteins_hmmer[keyB]['descr']): + singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] + else: + singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] + except KeyError: + singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] + else: + try: + if (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])): + singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] + elif (float(information_proteins_blast[equivalence[keyB]]['pident'])<float(information_proteins_hmmer[keyB]['pident'])) and 
(float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])): + singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] + elif (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])): + if (float(information_proteins_blast[equivalence[keyB]]['pident'])-float(information_proteins_hmmer[keyB]['pident']) >= args.diffid): + singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] + else: + singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] + else: + if (float(information_proteins_hmmer[keyB]['pident'])-float(information_proteins_blast[equivalence[keyB]]['pident']) >= args.diffid): + singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] + else: + singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] + except KeyError: + try: + if (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])): + singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] + else: + singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] + except KeyError: + singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] + elif equivalence[keyB] in information_proteins_blast: + singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] + elif keyB in information_proteins_hmmer: + try: + singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] + except KeyError: + singleprot['descr'] = 'Hypothetical protein' + else: + singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] + multipleprots[keyB] = singleprot + else: + keylist = equivalence.values() + keylist.sort() + for keyB in keylist: + singleprot = {} + singleprot['name'] = keyB + try: + if 
information_proteins_blast[keyB]['descr'] == None: + singleprot['descr'] = 'Hypothetical protein' + elif re.search(Hypotheticalpat, information_proteins_blast[keyB]['descr']): + singleprot['descr'] = 'Conserved hypothetical protein' + else: + singleprot['descr'] = information_proteins_blast[keyB]['descr'] + except KeyError: + singleprot['descr'] = 'Hypothetical protein' + multipleprots[keyB] = singleprot + + #Storing protein information in memory + with open("temp.faa", "rU") as protsfile: + protsdict = {} + for protseq in SeqIO.parse(protsfile, "fasta"): + indprot = {} + dataprot = protseq.description.split(' # ') + indprot['translation'] = protseq.seq + indprot['protein_id'] = dataprot[0] + indprot['strand'] = int(dataprot[3]) + indprot['begin'] = int(dataprot[1])-1 + indprot['end'] = int(dataprot[2]) + for keyOmega in sorted(multipleprots): + if multipleprots[keyOmega]['name'] == dataprot[0]: + indprot['product'] = multipleprots[keyOmega]['descr'] + protsdict[dataprot[0]] = indprot + + # Predicting the rRNA sequences + with open(newfile, "rU") as targetfasta, open("/dev/null", "w") as apocalypse: + eprint("Running INFERNAL+RFAM to predict rRNA-like sequences in %s" % newfile) + subprocess.call(["cmscan", "--rfam", "--cut_ga", "--nohmmonly", "--tblout", "rrnafile.csv", "--cpu", args.ncpus, args.rfamdatabase, newfile], stdout=apocalypse) + + #Storing rRNA information in memory + with open("rrnafile.csv", "rU") as rrnafile: + namedict = {"SSU_rRNA_archaea": "16s_rRNA", "SSU_rRNA_bacteria": "16s_rRNA", "SSU_rRNA_eukarya": "18s_rRNA", "SSU_rRNA_microsporidia": "16s_rRNA", "LSU_rRNA_archaea": "23s_rRNA", "LSU_rRNA_bacteria": "23s_rRNA", "LSU_rRNA_eukarya": "28s_rRNA", "5S_rRNA": "5s_rRNA"} + rRNAdict = defaultdict(list) + for line in rrnafile: + if not line.startswith("#"): + InfoLINE = re.sub("\s+", ",", line) + line_splitted = InfoLINE.split(",") + item_type = line_splitted[0] + if item_type.startswith(('LSU', 'SSU', '5S')): + strand = 0 + if line_splitted[9] == 
"+": + strand = 1 + else: + strand = -1 + rRNAdict[item_type].append({'score': float(line_splitted[14]), 'product': namedict[line_splitted[0]], 'begin': int(line_splitted[7]), 'end': int(line_splitted[8]), 'strand': strand}) + + subunits = {'LSU': {'max_score': 0 }, 'SSU': {'max_score': 0 }, '5S': {'max_score': 0 }} + for type_rRNA, rRNA_data in rRNAdict.items(): + max_score = max([item['score'] for item in rRNAdict[type_rRNA]]) + for item in ('LSU', 'SSU'): + if type_rRNA.startswith(item): + if max_score > subunits[item]['max_score']: + subunits[item]['listdata'] = rRNA_data + subunits[item]['max_score'] = max_score + if type_rRNA.startswith('5S'): + subunits['5S']['listdata'] = rRNA_data + subunits['5S']['max_score'] = max_score + + for rRNA in subunits: + i = 0 + try: + lengthlist = len(subunits[rRNA]['listdata']) + except KeyError: + continue + else: + while i < lengthlist: + eprint("%s harbours a %s from %i to %i" % (newfile, subunits[rRNA]['listdata'][i]['product'], int(subunits[rRNA]['listdata'][i]['begin']), int(subunits[rRNA]['listdata'][i]['end']))) + i += 1 + + # Predicting the tRNA sequences + eprint("Running ARAGORN to predict tRNA-like sequences in %s" % newfile) + genetictable = "-gc%s" % str(args.gcode) + with open("trnafile.fasta", "w") as trnafile: + if genomeshape['genomeshape'] == "circular": + subprocess.call(["aragorn", "-c", "-fon", genetictable, newfile], stdout=trnafile) + else: + subprocess.call(["aragorn", "-l", "-fon", genetictable, newfile], stdout=trnafile) + num_tRNA = len(list(SeqIO.parse("trnafile.fasta", "fasta"))) + eprint("ARAGORN was able to predict %i tRNAs in %s\n" % (num_tRNA, newfile)) + + #Storing tRNA and tmRNA information in memory + with open("trnafile.fasta", "rU") as trnafile: + tRNAdict = {} + tmRNAdict = {} + for tRNAseq in SeqIO.parse(trnafile, "fasta"): + indtRNA = {} + indtmRNA = {} + tRNA_information = tRNAseq.description.split(" ") + tRNApat = re.compile("^tRNA-") + if tRNA_information[1] == "tmRNA": + if 
str(tRNA_information[2]) == "(Permuted)": + indtmRNA['product'] = "tmRNA" + tmRNA_coords = str(tRNA_information[3]) + Beginningrevcomppat = re.compile("^c") + if re.match(Beginningrevcomppat, tmRNA_coords): + indtmRNA['strand'] = -1 + tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",") + else: + indtmRNA['strand'] = 1 + tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",") + indtmRNA['begin'] = int(tmRNA_coords[0]) + indtmRNA['end'] = int(tmRNA_coords[1]) + tmRNAdict[tRNAseq.id] = indtmRNA + else: + indtmRNA['product'] = "tmRNA" + tmRNA_coords = str(tRNA_information[2]) + Beginningrevcomppat = re.compile("^c") + if re.match(Beginningrevcomppat, tmRNA_coords): + indtmRNA['strand'] = -1 + tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",") + else: + indtmRNA['strand'] = 1 + tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",") + indtmRNA['begin'] = int(tmRNA_coords[0]) + indtmRNA['end'] = int(tmRNA_coords[1]) + tmRNAdict[tRNAseq.id] = indtmRNA + elif re.match(tRNApat, tRNA_information[1]): + indtRNA['product'] = re.sub("\(\w{3}\)", "", tRNA_information[1]) + tRNA_coords = tRNA_information[2] + Beginningrevcomppat = re.compile("^c") + if re.match(Beginningrevcomppat, tRNA_coords): + indtRNA['strand'] = -1 + tRNA_coords = tRNA_coords.replace("c[","").replace("]","").split(",") + else: + indtRNA['strand'] = 1 + tRNA_coords = tRNA_coords.replace("[","").replace("]","").split(",") + indtRNA['begin'] = int(tRNA_coords[0]) + indtRNA['end'] = int(tRNA_coords[1]) + tRNAdict[tRNAseq.id] = indtRNA + + #Predicting CRISPR repeats and others + eprint("Running PILERCR to predict CRISPR repeats in %s" % newfile) + subprocess.call(["pilercr", "-in", newfile, "-out", "crisprfile.txt", "-noinfo", "-minrepeat", str(args.minrepeat), "-maxrepeat", str(args.maxrepeat), "-minspacer", str(args.minspacer), "-maxspacer", str(args.maxspacer)]) + eprint("Predicting repeats in the sequences using TRF and IRF") + with 
open("/dev/null", "w") as stderr: + subprocess.call(["trf", newfile, "2", "7", "7", "80", "10", "50", "500", "-h"], stderr=stderr) + os.rename("%s.2.7.7.80.10.50.500.dat" % newfile, "trf_temp.dat") + with open("/dev/null", "w") as stderr: + subprocess.call(["irf", newfile, "2", "3", "5", "80", "10", "40", "500000", "10000", "-d", "-h"], stderr=stderr) + os.rename("%s.2.3.5.80.10.40.500000.10000.dat" % newfile, "irf_temp.dat") + + # Storing CRISPR repeats information + information_CRISPR = {} + with open("crisprfile.txt", "rU") as crisprfile: + for line in crisprfile: + if "SUMMARY BY POSITION" in line: + for line in crisprfile: + information_crispr_repeat = {} + try: + patC = re.compile('^\s+(\d+)\s+.{16}\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d?\s+(\w+)') + key, start, length, seq = re.match(patC, line).groups() + except AttributeError: + continue + else: + information_crispr_repeat['start'] = start + information_crispr_repeat['end'] = int(start) + int(length) + information_crispr_repeat['repeatseq'] = seq + information_crispr_repeat['repeatend'] = int(start) + len(seq) + information_CRISPR[key] = information_crispr_repeat + + # Storing tandem repeats information + information_TRF = {} + count = 1 + with open("trf_temp.dat", "rU") as trfile: + for line in trfile: + information_tandem_repeat = {} + try: + patT = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\.\d+\s') + start, end = re.match(patT, line).groups() + except AttributeError: + continue + else: + information_tandem_repeat['start'] = start + information_tandem_repeat['end'] = end + information_TRF[count] = information_tandem_repeat + count += 1 + + # Storing inverted repeats information + information_IRF = {} + count = 1 + with open("irf_temp.dat", "rU") as irfile: + for line in irfile: + information_inverted_repeat = {} + try: + patI = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\s\d+') + start, end = re.match(patI, line).groups() + except AttributeError: + continue + else: + information_inverted_repeat['start'] = start + 
information_inverted_repeat['end'] = end + information_IRF[count] = information_inverted_repeat + count += 1 + + # Creating a new Genbank and GFF file + eprint("Creating the output files") + newtempgbk = "%s.temp.gbk" % newfile + with open(newfile, "rU") as basefile, open(newtempgbk, "w"): + for record in SeqIO.parse(basefile, "fasta", IUPAC.ambiguous_dna): + whole_sequence = SeqRecord(record.seq) + whole_sequence.id = str(record.id) + whole_sequence.annotations['data_file_division'] = args.typedata.upper() + whole_sequence.annotations['date'] = strftime("%d-%b-%Y").upper() + for protein in sorted(protsdict, key = stringSplitByNumbers): + putative_start = int(protsdict[protein]['begin']) + start_pos = SeqFeature.ExactPosition(putative_start) + end_pos = SeqFeature.ExactPosition(protsdict[protein]['end']) + feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=protsdict[protein]['strand']) + qualifiersgene = OrderedDict([('locus_tag', protsdict[protein]['protein_id'])]) + new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = protsdict[protein]['strand'], qualifiers = qualifiersgene) + whole_sequence.features.append(new_data_gene) + qualifiers = [('locus_tag', protsdict[protein]['protein_id']), ('product', protsdict[protein]['product']), ('protein_id', protsdict[protein]['protein_id']), ('translation', protsdict[protein]['translation'])] + feature_qualifiers = OrderedDict(qualifiers) + new_data_cds = SeqFeature.SeqFeature(feature_location, type = "CDS", strand = protsdict[protein]['strand'], qualifiers = feature_qualifiers) + whole_sequence.features.append(new_data_cds) + for rRNA in sorted(subunits, key = stringSplitByNumbers): + i = 0 + try: + lengthlist = len(subunits[rRNA]['listdata']) + except KeyError: + continue + else: + while i < lengthlist: + putative_start = int(subunits[rRNA]['listdata'][i]['begin']) + start_pos = SeqFeature.ExactPosition(putative_start) + end_pos = 
SeqFeature.ExactPosition(subunits[rRNA]['listdata'][i]['end']) + feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=subunits[rRNA]['listdata'][i]['strand']) + new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = subunits[rRNA]['listdata'][i]['strand']) + whole_sequence.features.append(new_data_gene) + qualifiers = [('product', subunits[rRNA]['listdata'][i]['product'])] + feature_qualifiers = OrderedDict(qualifiers) + new_data_rRNA = SeqFeature.SeqFeature(feature_location, type = "rRNA", strand = subunits[rRNA]['listdata'][i]['strand'], qualifiers = feature_qualifiers) + whole_sequence.features.append(new_data_rRNA) + i += 1 + for tRNA in sorted(tRNAdict, key = stringSplitByNumbers): + putative_start = int(tRNAdict[tRNA]['begin']) + start_pos = SeqFeature.ExactPosition(putative_start) + end_pos = SeqFeature.ExactPosition(tRNAdict[tRNA]['end']) + feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tRNAdict[tRNA]['strand']) + new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tRNAdict[tRNA]['strand']) + whole_sequence.features.append(new_data_gene) + qualifiers = [('product', tRNAdict[tRNA]['product'])] + feature_qualifiers = OrderedDict(qualifiers) + new_data_tRNA = SeqFeature.SeqFeature(feature_location, type = "tRNA", strand = tRNAdict[tRNA]['strand'], qualifiers = feature_qualifiers) + whole_sequence.features.append(new_data_tRNA) + for tmRNA in sorted(tmRNAdict, key = stringSplitByNumbers): + putative_start = int(tmRNAdict[tmRNA]['begin']) + start_pos = SeqFeature.ExactPosition(putative_start) + end_pos = SeqFeature.ExactPosition(tmRNAdict[tmRNA]['end']) + feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tmRNAdict[tmRNA]['strand']) + new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tmRNAdict[tmRNA]['strand']) + whole_sequence.features.append(new_data_gene) + qualifiers = [('product', 
tmRNAdict[tmRNA]['product'])] + feature_qualifiers = OrderedDict(qualifiers) + new_data_tmRNA = SeqFeature.SeqFeature(feature_location, type = "tmRNA", strand = tmRNAdict[tmRNA]['strand'], qualifiers = feature_qualifiers) + whole_sequence.features.append(new_data_tmRNA) + for CRISPR in sorted(information_CRISPR, key = stringSplitByNumbers): + putative_start = int(information_CRISPR[CRISPR]['start']) + start_pos = SeqFeature.ExactPosition(putative_start) + end_pos = SeqFeature.ExactPosition(information_CRISPR[CRISPR]['end']) + feature_location = SeqFeature.FeatureLocation(start_pos, end_pos) + qualifiers = [('rpt_family', 'CRISPR'), ('rpt_type', 'direct'), ('rpt_unit_range', "%i..%i" % (int(information_CRISPR[CRISPR]['start']), int(information_CRISPR[CRISPR]['repeatend']))), ('rpt_unit_seq', information_CRISPR[CRISPR]['repeatseq'])] + feature_qualifiers = OrderedDict(qualifiers) + new_data_CRISPRrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers) + whole_sequence.features.append(new_data_CRISPRrepeat) + for tandem in sorted(information_TRF): + putative_start = int(information_TRF[tandem]['start']) + start_pos = SeqFeature.ExactPosition(putative_start) + end_pos = SeqFeature.ExactPosition(information_TRF[tandem]['end']) + feature_location = SeqFeature.FeatureLocation(start_pos, end_pos) + qualifiers = [('rpt_type', 'direct')] + feature_qualifiers = OrderedDict(qualifiers) + new_data_tandemrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers) + whole_sequence.features.append(new_data_tandemrepeat) + for inverted in sorted(information_IRF): + putative_start = int(information_IRF[inverted]['start']) + start_pos = SeqFeature.ExactPosition(putative_start) + end_pos = SeqFeature.ExactPosition(information_IRF[inverted]['end']) + feature_location = SeqFeature.FeatureLocation(start_pos, end_pos) + qualifiers = [('rpt_type', 'inverted')] + feature_qualifiers = 
OrderedDict(qualifiers) + new_data_invertedrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers) + whole_sequence.features.append(new_data_invertedrepeat) + SeqIO.write(whole_sequence, newtempgbk, "genbank") + + newgbk = "%s.gbk" % newfile + with open(newtempgbk, "rU") as gbktempfile, open(newgbk, "w") as gbkrealfile: + newpat = re.compile("D|RNA\s+(CON|PHG|VRL|BCT)") + for line in gbktempfile: + if line.startswith("LOCUS ") and re.search(newpat, line): + if genomeshape['genomeshape'] == "linear": + newline = re.sub("bp DNA\s+", "bp DNA linear ", line) + else: + newline = re.sub("bp DNA\s+", "bp DNA circular ", line) + gbkrealfile.write(newline) + else: + gbkrealfile.write(line) + + for f in glob.glob("*.temp.gbk"): + os.remove(f) + + if args.gffprint==True: + newgff = "%s.gff" % newfile + with open(newgff, "w") as outgff, open(newgbk, "rU") as ingbk: + GFF.write(SeqIO.parse(ingbk, "genbank"), outgff) + + # Removing intermediate files + os.remove(newfile) + os.remove("temp.faa") + os.remove("temp_blast.csv") + os.remove("crisprfile.txt") + os.remove("trnafile.fasta") + os.remove("rrnafile.csv") + os.remove("trf_temp.dat") + os.remove("irf_temp.dat") + for f in glob.glob("SEQ*"): + os.remove(f) + +# Joining all GENBANK files into one +listgbk = sorted(glob.glob('CONTIG_*.gbk')) +gbkoutputfile = "%s.gbk" % root_output +with open(gbkoutputfile, 'w') as finalgbk: + for fname in listgbk: + with open(fname) as infile: + for line in infile: + finalgbk.write(line) + +for tempgbk in glob.glob("CONTIG_*.gbk"): + os.remove(tempgbk) + +# Joining all GFF files into one +if args.gffprint==True: + listgff = sorted(glob.glob('CONTIG_*.gff')) + gffoutputfile = "%s.gff" % root_output + with open(gffoutputfile, 'w') as finalgff: + for fname in listgff: + with open(fname) as infile: + for line in infile: + finalgff.write(line) + for tempgff in glob.glob("CONTIG_*.gff"): + os.remove(tempgff) + +# Joining all TABLE files into one 
listcsv = sorted(glob.glob('CONTIG_*.csv'))
tbloutputfile = "%s.csv" % root_output
# Concatenate every per-contig protein table into a single CSV named after the run.
with open(tbloutputfile, 'w') as finaltable:
    for fname in listcsv:
        with open(fname) as infile:
            for line in infile:
                finaltable.write(line)

# The per-contig tables are redundant once merged.
for temptbl in glob.glob("CONTIG_*.csv"):
    os.remove(temptbl)

# Preparing sequences for GenBank submission (Original code from Wan Yu's gbk2tbl.py
# script [https://github.com/wanyuac/BINF_toolkit/blob/master/gbk2tbl.py])
# Only these qualifiers are legal in the NCBI 5-column feature-table (.tbl) format.
allowed_qualifiers = ['locus_tag', 'gene', 'product', 'pseudo', 'protein_id', 'gene_desc', 'old_locus_tag', 'note', 'inference', 'organism', 'mol_type', 'strain', 'sub_species', 'isolation-source', 'country']
newfastafile = "%s.fasta" % root_output
newtablefile = "%s.tbl" % root_output
# NOTE: "rU" open mode was deprecated and removed in Python 3.11; plain "r" is
# equivalent for this read.
with open(args.modifiers, "r") as modifiers, open(gbkoutputfile, "r") as genbank_fh, open(newfastafile, "w") as fasta_fh, open(newtablefile, "w") as feature_fh:
    # The modifiers file supplies the source-modifier string appended to every
    # FASTA description line; only its first line is used.
    info = modifiers.readline()
    wholelist = list(SeqIO.parse(genbank_fh, 'genbank'))
    for record in wholelist:
        # Contigs at or below the minimum size are excluded from submission.
        if len(record) <= args.mincontigsize:
            eprint("WARNING: Skipping small contig %s" % record.id)
            continue
        record.description = "%s %s" % (record.id, info)
        SeqIO.write([record], fasta_fh, 'fasta')
        print('>Feature %s' % (record.name), file=feature_fh)
        for feature in record.features:
            # The .tbl format encodes strand by coordinate order: start..end for
            # the forward strand, end..start for the reverse strand (1-based).
            if feature.strand == 1:
                print('%d\t%d\t%s' % (feature.location.nofuzzy_start + 1, feature.location.nofuzzy_end, feature.type), file=feature_fh)
            else:
                print('%d\t%d\t%s' % (feature.location.nofuzzy_end, feature.location.nofuzzy_start + 1, feature.type), file=feature_fh)
            # Compat fix: dict.iteritems() is Python-2-only; items() behaves
            # identically here and also works under Python 3.
            # NOTE(review): this qualifier loop is nested per feature, as in the
            # upstream gbk2tbl.py it derives from (the .tbl format requires each
            # feature's qualifiers to follow its coordinate line) — the source
            # diff had lost the indentation, so confirm against upstream.
            for (key, values) in feature.qualifiers.items():
                if key not in allowed_qualifiers:
                    continue
                for v in values:
                    print('\t\t\t%s\t%s' % (key, v), file=feature_fh)

# Final statement
eprint("Genome annotation done!")
eprint("The GenBank file is %s" % gbkoutputfile)
if args.gffprint == True:
    eprint("The GFF3 file is %s" % gffoutputfile)
eprint("The table file for GenBank submission is %s" % tbloutputfile)
eprint("The FASTA file for GenBank submission is %s" % newfastafile)
eprint("The table file with all protein information is %s" % newtablefile)