Mercurial > repos > vimalkumarvelayudhan > viga

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# VIGA - De-novo VIral Genome Annotator
#
# Copyright (C) 2017 - Enrique Gonzalez-Tortuero
#                      Vimalkumar Velayudhan
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Importing python libraries
from __future__ import print_function
import argparse
import csv
import fileinput
import fractions
import glob
import numpy
import os
import os.path
import re
import sys
import shutil
import subprocess
from BCBio import GFF
from Bio import SeqIO
from Bio import SeqFeature
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqFeature import FeatureLocation
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from collections import OrderedDict, defaultdict
from itertools import product
from scipy import signal
from time import strftime

# Preparing functions
def batch_iterator(iterator, batch_size):
	entry = True
	while entry:
		batch = []
		while len(batch) < batch_size:
			try:
				entry = iterator.next()
			except StopIteration:
				entry = None
			if entry is None:
				break
			batch.append(entry)
		if batch:
			yield batch

def check_peaks(peaks, length):
	# Checking if origin/terminus peaks are too close or too far apart. In that case, they are probably wrong
    closest, farthest = int(length * float(0.45)), int(length * float(0.55))
    pairs = []
    for pair in list(product(*peaks)):
        ### added this to make sure gets origin and ter right
        tr, pk = sorted(list(pair), key = lambda x: x[1], reverse = False) # trough and peak
        a = (tr[0] - pk[0]) % length
        b = (pk[0] - tr[0]) % length
        pt = abs(tr[1] - pk[1]) # distance between values
        if (a <= farthest and a >= closest) or (b <=farthest and b >= closest):
            pairs.append([pt, tr, pk])
    if len(pairs) == 0:
        return [False, False]
    pt, tr, pk = sorted(pairs, reverse = True)[0]
    return [tr[0], pk[0]]

def cmd_exists(cmd):
	return subprocess.call("type " + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0

def GCskew(name, length, seq, window, slide):
	replacements = {'G':1, 'C':-1, 'A':0, 'T':0, 'N':0}
	gmc = [] # G - C
	for base in seq:
		try:
			gmc.append(replacements[base])
		except:
			gmc.append(0)
	# convert to G + C
	gpc = [abs(i) for i in gmc] # G + C
	# calculate sliding windows for (G - C) and (G + C)
	weights = numpy.ones(window)/window
	gmc = [[i, c] for i, c in enumerate(signal.fftconvolve(gmc, weights, 'same').tolist())]
	gpc = [[i, c] for i, c in enumerate(signal.fftconvolve(gpc, weights, 'same').tolist())]
	# calculate gc skew and cummulative gc skew sum
	skew = [[], []] # x and y for gc skew
	c_skew = [[], []] # x and y for gc skew cummulative sums
	cs = 0 # cummulative sum
	# select windows to use based on slide
	for i, m in gmc[0::slide]:
		p = gpc[i][1]
		if p == 0:
			gcs = 0
		else:
			gcs = m/p
		cs += gcs
		skew[0].append(i)
		c_skew[0].append(i)
		skew[1].append(gcs)
		c_skew[1].append(cs)
	ori, ter = find_ori_ter(c_skew, length)
	return ori, ter, skew, c_skew

def eprint(*args, **kwargs):
	print(*args, file=sys.stderr, **kwargs)

def find_ori_ter(c_skew, length):
    # Find origin and terminus of replication based on cumulative GC skew min and max peaks
    c_skew_min = signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.less, order = 1)[0].tolist()
    c_skew_max = signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.greater, order = 1)[0].tolist()
    # return False if no peaks were detected
    if len(c_skew_min) == 0 or len(c_skew_min) == 0:
        return [False, False]
    else:
        c_skew_min = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_min]
        c_skew_max = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_max]
        ori, ter = check_peaks([c_skew_min, c_skew_max], length)
    return ori, ter

def stringSplitByNumbers(x):
	r = re.compile('(\d+)')
	l = r.split(x)
	return [int(y) if y.isdigit() else y for y in l]

# Defining the program version
version = "0.10.3"

# Processing the parameters
parser = argparse.ArgumentParser(description='VIGA is an automatic de novo VIral Genome Annotator.')
basic_group = parser.add_argument_group('Basic options for VIGA [REQUIRED]')

basic_group.add_argument("--input", dest="inputfile", type=str, required=True, help='Input file as a FASTA file', metavar="FASTAFILE")
basic_group.add_argument("--rfamdb", dest="rfamdatabase", type=str, required=True, help='RFAM Database that will be used for the ribosomal RNA prediction. RFAMDB should be in the format "/full/path/to/rfamdb/Rfam.cm" and must be compressed accordingly (see INFERNAL manual) before running the script.', metavar="RFAMDB")
basic_group.add_argument("--modifiers", dest="modifiers", type=str, required=True, help='Input file as a plain text file with the modifiers per every FASTA header according to SeqIn (https://www.ncbi.nlm.nih.gov/Sequin/modifiers.html). All modifiers must be written in a single line and are separated by a single space character. No space should be placed besides the = sign. For example: [organism=Serratia marcescens subsp. marcescens] [sub-species=marcescens] [strain=AH0650_Sm1] [topology=linear] [moltype=DNA] [tech=wgs] [gcode=11] [country=Australia] [isolation-source=sputum]. This line will be copied and printed along with the record name as the definition line of every contig sequence.', metavar="TEXTFILE")

advanced_group = parser.add_argument_group('Advanced options for VIGA [OPTIONAL]')
advanced_group.add_argument("--readlength", dest="read_length", type=int, default=101, help='Read length for the circularity prediction (default: 101 bp)', metavar="INT")
advanced_group.add_argument("--windowsize", dest="window", type=int, default=100, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 100 bp)', metavar="INT")
advanced_group.add_argument("--slidingsize", dest="slide", type=int, default=10, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 10 bp)', metavar="INT")
advanced_group.add_argument("--out", dest="rootoutput", type=str, help='Name of the outputs files (without extension)', metavar="OUTPUTNAME")
advanced_group.add_argument("--locus", dest="locus", type=str, default='LOC', help='Name of the sequences. If the input is a multiFASTA file, please put a general name as the program will add the number of the contig at the end of the name (Default: %(default)s)', metavar="STRING")
advanced_group.add_argument("--gff", dest="gffprint", action='store_true', default=False, help='Printing the output as GFF3 file (Default: False)')
advanced_group.add_argument("--threads", dest="ncpus", default=1, help='Number of threads/cpus (Default: %(default)s cpu)', metavar="INT")
advanced_group.add_argument("--nohmmer", dest="nohmmer", action='store_true', default=False, help='Running only BLAST to predict protein function. (Default: False)')
advanced_group.add_argument("--noblast", dest="noblast", action='store_true', default=False, help='Running DIAMOND instead of BLAST to predict protein function according to homology. This will be less sensitive but faster than BLAST. (Default: False)')
advanced_group.add_argument("--blastdb", dest="blastdatabase", type=str, help='BLAST Database that will be used for the protein function prediction. The database must be an amino acid one, not nucleotidic. This argument is mandatory if --noblast option is disabled', metavar="BLASTDB")
advanced_group.add_argument("--diamonddb", dest="diamonddatabase", type=str, help='DIAMOND Database that will be used for the protein function prediction. The database must be created from a amino acid FASTA file as indicated in https://github.com/bbuchfink/diamond. This argument is mandatory when --noblast option is enabled', metavar="DIAMONDDB")
advanced_group.add_argument("--blastevalue", dest="blastevalue", default=0.00001, help='BLAST/DIAMOND e-value threshold (Default: 0.00001)', metavar="FLOAT")
advanced_group.add_argument("--hmmdb", dest="hmmdatabase", type=str, help='PHMMER Database that will be used for the protein function prediction according to Hidden Markov Models. In this case, HMMDB must be in FASTA format (e.g. UniProt: "', metavar="HMMDB")
advanced_group.add_argument("--hmmerevalue", dest="hmmerevalue", default=0.001, help='PHMMER e-value threshold (Default: 0.001)', metavar="FLOAT")

type_choices = {'BCT': 'Prokaryotic chromosome', 'CON': 'Contig', 'PHG': 'Phages', 'VRL': 'Eukaryotic/Archaea virus'}
type_help = ('GenBank Division: One of the following codes - {0}. (Default: %(default)s)'.format(', '.join('{0} ({1})'.format(k, v) for k, v in type_choices.items())))
advanced_group.add_argument("--typedata", dest="typedata", type=str, default='CON', help=type_help, metavar="BCT|CON|VRL|PHG")

gcode_choices = {'1': 'Standard genetic code [Eukaryotic]', '2': 'Vertebrate mitochondrial code', '3': 'Yeast mitochondrial code', '4': 'Mycoplasma/Spiroplasma and Protozoan/mold/coelenterate mitochondrial code', '5': 'Invertebrate mitochondrial code', '6': 'Ciliate, dasycladacean and hexamita nuclear code', '9': 'Echinoderm/flatworm mitochondrial code', '10': 'Euplotid nuclear code', '11': 'Bacteria/Archaea/Phages/Plant plastid', '12': 'Alternative yeast nuclear code', '13': 'Ascidian mitochondrial code', '14': 'Alternative flatworm mitochondrial code', '16': 'Chlorophycean mitochondrial code', '21': 'Trematode mitochondrial code', '22': 'Scedenesmus obliquus mitochondrial code', '23': 'Thraustochytrium mitochondrial code', '24': 'Pterobranquia mitochondrial code', '25': 'Gracilibacteria & Candidate division SR1', '26': 'Pachysolen tannophilus nuclear code', '27': 'Karyorelict nuclear code', '28': 'Condylostoma nuclear code', '29': 'Mesodinium nuclear code', '30': 'Peritrich nuclear code', '31': 'Blastocrithidia nuclear code'}
gcode_help = ('Number of GenBank translation table. At this moment, the available options are {0}. (Default: %(default)s)'.format('{}'.format(', '.join('{0} ({1})'.format(k, v) for k, v in gcode_choices.items()))))
advanced_group.add_argument("--gcode", dest="gcode", type=str, default='11', help=gcode_help, metavar="NUMBER")

advanced_group.add_argument('--mincontigsize', dest="mincontigsize", type=int, default = 200, help = 'Minimum contig length to be considered in the final files (Default: 200 bp)', metavar="INT")
advanced_group.add_argument("--idthr", dest="idthreshold", default=50.00, help='ID threshold (Default: 50.0)', metavar="FLOAT")
advanced_group.add_argument("--coverthr", dest="covthreshold", default=50.00, help='Coverage threshold (Default: 50.0)', metavar="FLOAT")
advanced_group.add_argument("--diffid", dest="diffid", default=5.00, help='Max allowed difference between the ID percentages of BLAST and HMMER. (Default = 5.00; Not recommended to change such value)', metavar="FLOAT (>0.01)")
advanced_group.add_argument("--minrepeat", dest="minrepeat", type=int, default=16, help="Minimum repeat length for CRISPR detection (Default: 16)", metavar="INT")
advanced_group.add_argument("--maxrepeat", dest="maxrepeat", type=int, default=64, help="Maximum repeat length for CRISPR detection (Default: 64)")
advanced_group.add_argument("--minspacer", dest="minspacer", type=int, default=8, help="Minimum spacer length for CRISPR detection (Default: 8)")
advanced_group.add_argument("--maxspacer", dest="maxspacer", type=int, default=64, help="Maximum spacer length for CRISPR detection (Default: 64)")
advanced_group.add_argument("--blastexh", dest="blastexh", action='store_true', default=False, help='Use of exhaustive BLAST to predict the proteins by homology according to Fozo et al. (2010) Nucleic Acids Res (Default=FALSE)')

args = parser.parse_args()

root_output = args.rootoutput
if not root_output:
	root_output = '{}_annotated'.format(os.path.splitext(args.inputfile)[0])

if args.noblast == False and args.blastdatabase == None:
    sys.exit('You MUST specify BLAST database using the parameter --blastdb if you are not using --noblast option')

if args.noblast == True and args.diamonddatabase == None:
    sys.exit('You MUST specify DIAMOND database using the parameter --diamonddb if you are using --noblast option')

if args.nohmmer == False and args.hmmdatabase == None:
		sys.exit('You MUST specify HMMER database using the parameter --hmmdb if you are not using --nohmmer option')

# Printing the header of the program
eprint("This is VIGA %s" % str(version))
eprint("Written by Enrique Gonzalez Tortuero & Vimalkumar Velayudhan")
eprint("Homepage is https://github.com/EGTortuero/virannot")
eprint("Local time: ", strftime("%a, %d %b %Y %H:%M:%S"))
eprint("\n\n")

# checking the presence of the programs in the system

if not cmd_exists("lastz")==True:
	sys.exit("You must install LASTZ before using this script")
elif not cmd_exists("cmscan")==True:
	sys.exit("You must install INFERNAL before using this script")
elif not cmd_exists("prodigal")==True:
	sys.exit("You must install prodigal before using this script")
elif not cmd_exists("parallel")==True:
	sys.exit("You must install GNU Parallel before using this script")
elif not cmd_exists("blastp")==True:
	sys.exit("You must install BLAST before using this script")
elif not cmd_exists("diamond")==True:
	sys.exit("You must install DIAMOND before using this script")
elif not cmd_exists("phmmer")==True:
	sys.exit("You must install HMMER 3 before using this script")
elif not cmd_exists("aragorn")==True:
	sys.exit("You must install ARAGORN before using this script")
elif not cmd_exists("pilercr")==True:
	sys.exit("You must install PILERCR before using this script")
elif not cmd_exists("trf")==True:
	sys.exit("You must install Tandem Repeats Finder before using this script")
elif not cmd_exists("irf")==True:
	sys.exit("You must install Inverted Repeats Finder before using this script")

eprint("Data type is {0} and GenBank translation table no is {1}\n".format(args.typedata, args.gcode))

# Correcting the original file (long headers problem + multiple FASTA files)
record_iter = SeqIO.parse(open(args.inputfile, "rU"),"fasta")
counter = 1
newnamessequences = {}
for i, batch in enumerate(batch_iterator(record_iter, 1)):
	seq_index = "CONTIG_%i" % (i + 1)
	filename = "%s.temp.fna" % seq_index
	newfilename = "%s.fna" % seq_index
	with open(filename, "w") as handle:
		count = SeqIO.write(batch, filename, "fasta")

	with open(filename, "rU") as original, open(newfilename, "w") as corrected:
		sequences = SeqIO.parse(original, "fasta", IUPAC.ambiguous_dna)
		for record in sequences:
			original_name = record.id
			record.id = "%s_%i" % (args.locus, counter)
			record.description = record.description
			counter += 1
			newnamessequences[record.id] = original_name
			eprint("WARNING: The name of the sequence %s was corrected as %s" % (original_name, record.id))
		SeqIO.write(record, corrected, "fasta")

	with open("logfile.txt", "w") as logfile:
		logfile.write("#Original\tNew\n")
		for oldname in sorted(newnamessequences, key = stringSplitByNumbers):
			logfile.write("%s\t%s\n" % (oldname, newnamessequences[oldname]))
	os.remove(filename)

for newfile in sorted(glob.glob("CONTIG_*.fna")):

	# Predicting the shape of the contig (code based on Alex Crits-Christoph's find_circular.py script [https://github.com/alexcritschristoph/VICA/blob/master/find_circular.py])
	eprint("Predicting the shape of the contig using LASTZ")
	genomeshape = {}
	with open(newfile, "rU") as targetfasta:
		Sequence = SeqIO.parse(newfile, "fasta")
		for record in Sequence:
			seq_beginning = str(record.seq[0:args.read_length])
			seq_ending = str(record.seq[len(record.seq)-args.read_length:len(record.seq)])
			combined_seqs = SeqRecord(Seq(seq_beginning + seq_ending, IUPAC.ambiguous_dna), id = record.description)
			SeqIO.write(combined_seqs, "temporal_circular.fasta", "fasta")
			outputlastz = subprocess.check_output(["lastz", "temporal_circular.fasta", "--self", "--notrivial", "--nomirror", "--format=general-:start1,end1,start2,end2,score,strand1,strand2,identity,length1"])
			resultslastz = outputlastz.split("\n")
			for resultlastz in resultslastz:
				if resultlastz != '':
					start1 = resultlastz.split()[0]
					end1 = resultlastz.split()[1]
					start2 = resultlastz.split()[2]
					end2 = resultlastz.split()[3]
					strand1 = resultlastz.split()[5]
					strand2 = resultlastz.split()[6]
					identity = resultlastz.split()[7]
					length = int(resultlastz.split()[9])
					if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9:
						genomeshape['genomeshape'] = "circular"
						try:
							if genomeshape['identity'] >= float(fractions.Fraction(identity)):
								genomeshape['identity'] = float(fractions.Fraction(identity))
								genomeshape['length'] = length
						except KeyError:
							genomeshape['identity'] = float(fractions.Fraction(identity))
							genomeshape['length'] = length
					else:
						continue
					if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9:
						genomeshape['genomeshape'] = "circular"
						try:
							if genomeshape['identity'] >= float(fractions.Fraction(identity)):
								genomeshape['identity'] = float(fractions.Fraction(identity))
								genomeshape['length'] = length
						except KeyError:
							genomeshape['identity'] = float(fractions.Fraction(identity))
							genomeshape['length'] = length
			try:
				if genomeshape['genomeshape'] == "":
						genomeshape['genomeshape'] = "linear"
			except KeyError:
				genomeshape['genomeshape'] = "linear"
			else:
				genomeshape['genomeshape'] = "circular"
				with open("temp.fasta", "w") as correctedcircular:
					Corrseq = str(record.seq[int(genomeshape['length'])//2:-int(genomeshape['length'])//2])
					Newseq = SeqRecord(Seq(Corrseq, IUPAC.ambiguous_dna), id = record.description)
					SeqIO.write(Newseq, correctedcircular, "fasta")
				os.rename("temp.fasta", "temp2.fasta")
		eprint("LASTZ predicted that %s is %s\n" % (newfile, genomeshape['genomeshape']))
		os.remove("temporal_circular.fasta")

	# Calculate the cumulative GCskew in circular contigs to determine the origin of replication (Based on iRep -- Brown CT, Olm MR, Thomas BC, Banfield JF (2016) Measurement of bacterial replication rates in microbial communities. Nature Biotechnology 34: 1256-63.)
	if genomeshape['genomeshape'] == "circular":
		for record in SeqIO.parse("temp2.fasta", "fasta"):
			length_contig = len(record.seq)
			#if length < min_len:
			#    print('%s: Too Short' % (name), file=sys.stderr)
			#    continue
			oric, term, gcskew, cgcskew = GCskew(record.id, length_contig, record.seq, args.window, args.slide)
			valoric = oric
			if oric == False:
				oric, term = 'n/a', 'n/a'
			else:
				oric, term = '{:,}'.format(oric), '{:,}'.format(term)
			eprint('%s -> Origin: %s Terminus: %s' % (record.id, oric, term))
			#print('\t'.join(['# Name', 'Position', 'GC Skew', 'Cumulative GC Skew']))
			#for i, pos in enumerate(gcskew[0]):
			#	out = [record.id, pos, gcskew[1][i], cgcskew[1][i]]
			#	print('\t'.join([str(i) for i in out]))
			if valoric != False:
				firstpartseq = str(record.seq[valoric:-1])
				secondpartseq = str(record.seq[0:(valoric-1)])
				combinedcorrectedseq = SeqRecord(Seq(firstpartseq + secondpartseq, IUPAC.ambiguous_dna), id = record.description)
				SeqIO.write(combinedcorrectedseq, newfile, "fasta")
			else:
				eprint("As the program was unable to predict the origin of replication, %s was considered as is without correcting!" % record.id)
				os.rename("temp2.fasta", newfile)
		if os.path.isfile("temp2.fasta"):
			os.remove("temp2.fasta")

	# Predicting genes using PRODIGAL
	eprint("\nRunning Prodigal to predict the genes in %s" % newfile)
	for record in SeqIO.parse(newfile, "fasta"):
		length_contig = len(record.seq)
		if (length_contig >= 100000):
			if genomeshape['genomeshape'] == 'linear':
				subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-c", "-q"])
			else:
				subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-q"])
		else:
			if genomeshape['genomeshape'] == 'linear':
				subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-c", "-q"])
			else:
				subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-q"])
	num_seqs = len(list(SeqIO.parse("pretemp.faa", "fasta")))
	eprint("PRODIGAL was able to predict %i genes in %s\n" % (num_seqs, newfile))

	with open("pretemp.faa", "rU") as originalfaa, open("temp.faa", "w") as correctedfaa:
		sequences = SeqIO.parse(originalfaa, "fasta")
		for record in sequences:
			record.seq = record.seq.rstrip("*")
			SeqIO.write(record, correctedfaa, "fasta")

	faa_file = "%s.faa" % newfile # TO DEBUG
	shutil.copyfile("temp.faa", faa_file) # TO DEBUG
	os.remove("pretemp.faa")

	# Predicting the function of the proteins based on homology using BLAST
	equivalence = {}
	record_iter = SeqIO.parse(open("temp.faa"),"fasta")
	for i, batch in enumerate(batch_iterator(record_iter, 1)):
		seq_index = "SEQ_%i" % (i + 1)
		filename = "%s.faa" % seq_index
		with open(filename, "w") as handle:
			count = SeqIO.write(batch, handle, "fasta")
			equivalence[seq_index] = batch[0].id

	if args.blastexh==True:
		eprint("Running BLAST to predict the genes according to homology inference in %s using exhaustive mode (see Fozo et al. (2010) Nucleic Acids Res for details)" % newfile)
		subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', '-word_size', '2', '-gapopen', '8', '-gapextend', '2', '-matrix', '"PAM70"', '-comp_based_stats', '"0"', "-num_threads", str(args.ncpus)])
		eprint("Done. BLAST was done to predict the genes by homology\n")
		blast_log = "%s.blast.log" % newfile # TO DEBUG
		shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG
	elif args.noblast==True:
		eprint("Running DIAMOND to predict the genes according to homology inference in %s using default parameters" % newfile)
		with open("temp.faa", "r") as tempfile:
			first_line = tempfile.readline()
			if first_line.startswith(">"):
				subprocess.call(['diamond', 'blastp', '-q', 'temp.faa', '-d', args.diamonddatabase, '-e', str(args.blastevalue), '-f', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qlen', 'slen', 'qstart', 'qend', 'evalue', 'bitscore', 'stitle', '-o', 'temp_blast.csv', '-k', '10', "-p", str(args.ncpus), '--quiet'])
			else:
				open("temp_blast.csv", 'a').close()
		blast_log = "%s.blast.log" % newfile # TO DEBUG
		shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG
		eprint("Done. DIAMOND was done to predict the genes by homology\n")
	else:
		eprint("Running BLAST to predict the genes according to homology inference in %s using default parameters" % newfile)
		subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', "-num_threads", str(args.ncpus)])
		blast_log = "%s.blast.log" % newfile
		shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG
		eprint("Done. BLAST was done to predict the genes by homology\n") # TO DEBUG

	# Parsing the results from BLAST
	with open("temp_blast.csv", "rU") as blastresults:
		hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))')
		reader = csv.DictReader(blastresults, delimiter='\t', fieldnames=['qseqid','sseqid','pident','length','qlen','slen','qstart','qend','evalue','bitscore','stitle'])
		information_proteins_blast = {}
		for row in reader:
			perc_cover = round(100.00*(float(row['length'])/float(row['qlen'])),2)
			perc_id = float(row['pident'])
			infoprot_blast = {}
			infoprot_blast['sseqid'] = row['sseqid']
			infoprot_blast['pident'] = perc_id
			infoprot_blast['pcover'] = perc_cover
			infoprot_blast['evalue'] = row['evalue']
			infoprot_blast['descr'] = row['stitle']
			try:
				data = information_proteins_blast[row['qseqid']]
			except KeyError:
				if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_cover) >= float(args.covthreshold) and float(row['evalue']) <= float(args.blastevalue):
						information_proteins_blast[row['qseqid']] = infoprot_blast
				else:
					continue
			else:
				if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_id) >= float(infoprot_blast['pident']) and float(perc_cover) >= float(args.covthreshold) and float(perc_cover) >= float(infoprot_blast['pcover']) and float(row['evalue']) <= float(args.blastevalue):
						information_proteins_blast[row['qseqid']] = infoprot_blast

	## Predicting the function of the proteins based on HMM predictions using phmmer
	if args.nohmmer == False:
		with open("commands.sh", "w") as commands:
			for singleprot in sorted(glob.glob("SEQ_*.faa")):
				hhmtable = "%s.tbl" % singleprot
				eprint("Creating file to run parallel PHMMER")
				eprint("Adding %s to run PHMMER." % singleprot)
				line2write = ' '.join(["phmmer", "--cpu", "1", "--domtblout", hhmtable, "-E", str(args.hmmerevalue), "-o", "/dev/null", singleprot, args.hmmdatabase, '\n'])
				commands.write(line2write)

		eprint("Running parallel PHMMER")
		subprocess.call(['parallel', '-j', str(args.ncpus)], stdin=open('commands.sh'))
		eprint("Done. PHMMER was done to predict the function of the genes according to Hidden Markov Models\n")
		os.remove("commands.sh")

		# Parsing the results from HMMER
		information_proteins_hmmer = {}
		hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))')
		for singletbl in sorted(glob.glob("*.faa.tbl")):
			rootname = singletbl.replace(".faa.tbl", "")
			with open(singletbl) as tblfile:
				infoprot_hmmer = {}
				for line in tblfile:
					if line.startswith("#"):
						continue
					else:
						try:
							pat = re.compile("^(\S+)\s+\S\s+\d+\s+(\S+)\s+\S\s+(\d+)\s+((?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?)\s+\S+\s+\S+\s+\S+\s+\S+\s+(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s+\S+\s+\S+\s+\S+\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\S+\s+\S+\s+(\S+)\s+(.*)")
							matchname, lociname, length, evaluehh, start, end, pident, description = re.match(pat, line).groups()
						except AttributeError:
							continue
						else:
							length = float(length)
							pident = 100.00*float(pident)
							protarea = 100.00*(((float(end)-1.00) - (float(start)-1.00))/length)
							try:
								data2 = infoprot_hmmer['lociname']
							except KeyError:
								if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00:
									infoprot_hmmer['lociname'] = lociname
									infoprot_hmmer['name'] = matchname
									infoprot_hmmer['evalue'] = float(evaluehh)
									infoprot_hmmer['pcover'] = float(protarea)
									infoprot_hmmer['pident'] = float(pident)
									infoprot_hmmer['descr'] = description
								else:
									try:
										if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']:
											infoprot_hmmer['lociname'] = lociname
											infoprot_hmmer['name'] = matchname
											infoprot_hmmer['evalue'] = float(evaluehh)
											infoprot_hmmer['pcover'] = float(protarea)
											infoprot_hmmer['pident'] = float(pident)
											infoprot_hmmer['descr'] = description
									except KeyError:
											continue
							else:
								if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']:
									infoprot_hmmer['lociname'] = lociname
									infoprot_hmmer['name'] = matchname
									infoprot_hmmer['evalue'] = float(evaluehh)
									infoprot_hmmer['pcover'] = float(protarea)
									infoprot_hmmer['pident'] = float(pident)
									infoprot_hmmer['descr'] = description
				information_proteins_hmmer[rootname] = infoprot_hmmer

	#Storing protein information in memory
	with open("temp.faa", "rU") as protsfile:
		tempprotsdict = {}
		for protseq in SeqIO.parse(protsfile, "fasta"):
			tempindprot = {}
			dataprot = protseq.description.split(' # ')
			modseq = str(protseq.seq).replace("X","") # Removing all ambiguous amino acids to avoid problems with ProteinAnalysis module
			analysed_seq = ProteinAnalysis(modseq)
			tempindprot['length'] = len(protseq.seq)
			tempindprot['isoelectricpoint'] = analysed_seq.isoelectric_point()
			tempindprot['molweightkda'] = analysed_seq.molecular_weight()/1000.00
			tempindprot['instability'] = analysed_seq.instability_index()
			tempindprot['protein_id'] = dataprot[0]
			tempindprot['strand'] = int(dataprot[3])
			tempindprot['begin'] = int(dataprot[1])-1
			tempindprot['end'] = int(dataprot[2])
			tempprotsdict[dataprot[0]] = tempindprot

	# Creation of table
	debugtable = "%s.csv" % newfile
	with open(debugtable, "w") as tablefile:
		if args.nohmmer == False:
			print("\t".join(["Identifier", "Start", "Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST", "ID_HMMER", "Descr_HMMER", "evalue_HMMER", "%ID_HMMER", "%Cover_HMMER"]), file=tablefile)
			keylist = information_proteins_hmmer.keys()
			keylist.sort()
			for keyB in keylist:
				keyB = keyB.replace(".faa.tbl", "")
				try:
					print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), str(information_proteins_blast[equivalence[keyB]]['pcover']), information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile)
				except KeyError:
					try:
						print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), str(information_proteins_blast[equivalence[keyB]]['pcover']), "None", "None", "NA", "NA", "NA"]), file=tablefile)
					except KeyError:
						try:
							print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA", information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile)
						except KeyError:
							print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA",  "None", "None", "NA", "NA", "NA"]), file=tablefile)
		else:
			print("\t".join(["Identifier", "Start", "Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST"]), file=tablefile)
			keylist = equivalence.values()
			keylist.sort()
			for keyB in keylist:
				try:
					print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), information_proteins_blast[keyB]['sseqid'], information_proteins_blast[keyB]['descr'], str(information_proteins_blast[keyB]['evalue']), str(information_proteins_blast[keyB]['pident']), str(information_proteins_blast[keyB]['pcover'])]), file=tablefile)
				except KeyError:
					print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), "None", "None", "NA", "NA", "NA"]), file=tablefile)

	# Algorithm of decisions (which one: BLAST/HMMER?)
	multipleprots = {}
	Hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))')
	if args.nohmmer == False:
		keylist = information_proteins_hmmer.keys()
		keylist.sort()
		for keyB in keylist:
			keyB = keyB.replace(".faa.tbl", "")
			singleprot = {}
			singleprot['name'] = equivalence[keyB]
			if (equivalence[keyB] in information_proteins_blast) and (keyB in information_proteins_hmmer):
				if re.search(Hypotheticalpat, information_proteins_blast[equivalence[keyB]]['descr']):
					try:
						if re.search(Hypotheticalpat, information_proteins_hmmer[keyB]['descr']):
							singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
						else:
							singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
					except KeyError:
						singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
				else:
					try:
						if (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])):
							singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
						elif (float(information_proteins_blast[equivalence[keyB]]['pident'])<float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])):
							singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
						elif (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])):
							if (float(information_proteins_blast[equivalence[keyB]]['pident'])-float(information_proteins_hmmer[keyB]['pident']) >= args.diffid):
								singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
							else:
								singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
						else:
							if (float(information_proteins_hmmer[keyB]['pident'])-float(information_proteins_blast[equivalence[keyB]]['pident']) >= args.diffid):
								singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
							else:
								singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
					except KeyError:
						try:
							if (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])):
								singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
							else:
								singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
						except KeyError:
							singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
			elif equivalence[keyB] in information_proteins_blast:
				singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
			elif keyB in information_proteins_hmmer:
				try:
					singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
				except KeyError:
					singleprot['descr'] = 'Hypothetical protein'
			else:
				singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
			multipleprots[keyB] = singleprot
	else:
		keylist = equivalence.values()
		keylist.sort()
		for keyB in keylist:
			singleprot = {}
			singleprot['name'] = keyB
			try:
				if information_proteins_blast[keyB]['descr'] == None:
					singleprot['descr'] = 'Hypothetical protein'
				elif re.search(Hypotheticalpat, information_proteins_blast[keyB]['descr']):
					singleprot['descr'] = 'Conserved hypothetical protein'
				else:
					singleprot['descr'] = information_proteins_blast[keyB]['descr']
			except KeyError:
				singleprot['descr'] = 'Hypothetical protein'
			multipleprots[keyB] = singleprot

	#Storing protein information in memory
	with open("temp.faa", "rU") as protsfile:
		protsdict = {}
		for protseq in SeqIO.parse(protsfile, "fasta"):
			indprot = {}
			dataprot = protseq.description.split(' # ')
			indprot['translation'] = protseq.seq
			indprot['protein_id'] = dataprot[0]
			indprot['strand'] = int(dataprot[3])
			indprot['begin'] = int(dataprot[1])-1
			indprot['end'] = int(dataprot[2])
			for keyOmega in sorted(multipleprots):
				if multipleprots[keyOmega]['name'] == dataprot[0]:
					indprot['product'] = multipleprots[keyOmega]['descr']
			protsdict[dataprot[0]] = indprot

	# Predicting the rRNA sequences
	with open(newfile, "rU") as targetfasta, open("/dev/null", "w") as apocalypse:
		eprint("Running INFERNAL+RFAM to predict rRNA-like sequences in %s" % newfile)
		subprocess.call(["cmscan", "--rfam", "--cut_ga", "--nohmmonly", "--tblout", "rrnafile.csv", "--cpu", args.ncpus, args.rfamdatabase, newfile], stdout=apocalypse)

	#Storing rRNA information in memory
	with open("rrnafile.csv", "rU") as rrnafile:
		namedict = {"SSU_rRNA_archaea": "16s_rRNA", "SSU_rRNA_bacteria": "16s_rRNA", "SSU_rRNA_eukarya": "18s_rRNA", "SSU_rRNA_microsporidia": "16s_rRNA", "LSU_rRNA_archaea": "23s_rRNA", "LSU_rRNA_bacteria": "23s_rRNA", "LSU_rRNA_eukarya": "28s_rRNA", "5S_rRNA": "5s_rRNA"}
		rRNAdict = defaultdict(list)
		for line in rrnafile:
			if not line.startswith("#"):
				InfoLINE = re.sub("\s+", ",", line)
				line_splitted = InfoLINE.split(",")
				item_type = line_splitted[0]
				if item_type.startswith(('LSU', 'SSU', '5S')):
					strand = 0
					if line_splitted[9] == "+":
						strand = 1
					else:
						strand = -1
					rRNAdict[item_type].append({'score': float(line_splitted[14]), 'product': namedict[line_splitted[0]], 'begin': int(line_splitted[7]), 'end': int(line_splitted[8]), 'strand': strand})

		subunits = {'LSU': {'max_score': 0 }, 'SSU': {'max_score': 0 }, '5S': {'max_score': 0 }}
		for type_rRNA, rRNA_data in rRNAdict.items():
			max_score = max([item['score'] for item in rRNAdict[type_rRNA]])
			for item in ('LSU', 'SSU'):
				if type_rRNA.startswith(item):
					if max_score > subunits[item]['max_score']:
						subunits[item]['listdata'] = rRNA_data
						subunits[item]['max_score'] = max_score
			if type_rRNA.startswith('5S'):
				subunits['5S']['listdata'] = rRNA_data
				subunits['5S']['max_score'] = max_score

		for rRNA in subunits:
			i = 0
			try:
				lengthlist = len(subunits[rRNA]['listdata'])
			except KeyError:
				continue
			else:
				while i < lengthlist:
					eprint("%s harbours a %s from %i to %i" % (newfile, subunits[rRNA]['listdata'][i]['product'], int(subunits[rRNA]['listdata'][i]['begin']), int(subunits[rRNA]['listdata'][i]['end'])))
					i += 1

	# Predicting the tRNA sequences
	eprint("Running ARAGORN to predict tRNA-like sequences in %s" % newfile)
	genetictable = "-gc%s" % str(args.gcode)
	with open("trnafile.fasta", "w") as trnafile:
		if genomeshape['genomeshape'] == "circular":
			subprocess.call(["aragorn", "-c", "-fon", genetictable, newfile], stdout=trnafile)
		else:
			subprocess.call(["aragorn", "-l", "-fon", genetictable, newfile], stdout=trnafile)
	num_tRNA = len(list(SeqIO.parse("trnafile.fasta", "fasta")))
	eprint("ARAGORN was able to predict %i tRNAs in %s\n" % (num_tRNA, newfile))

	#Storing tRNA and tmRNA information in memory
	with open("trnafile.fasta", "rU") as trnafile:
		tRNAdict = {}
		tmRNAdict = {}
		for tRNAseq in SeqIO.parse(trnafile, "fasta"):
			indtRNA = {}
			indtmRNA = {}
			tRNA_information = tRNAseq.description.split(" ")
			tRNApat = re.compile("^tRNA-")
			if tRNA_information[1] == "tmRNA":
				if str(tRNA_information[2]) == "(Permuted)":
					indtmRNA['product'] = "tmRNA"
					tmRNA_coords = str(tRNA_information[3])
					Beginningrevcomppat = re.compile("^c")
					if re.match(Beginningrevcomppat, tmRNA_coords):
						indtmRNA['strand'] = -1
						tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",")
					else:
						indtmRNA['strand'] = 1
						tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",")
					indtmRNA['begin'] = int(tmRNA_coords[0])
					indtmRNA['end'] = int(tmRNA_coords[1])
					tmRNAdict[tRNAseq.id] = indtmRNA
				else:
					indtmRNA['product'] = "tmRNA"
					tmRNA_coords = str(tRNA_information[2])
					Beginningrevcomppat = re.compile("^c")
					if re.match(Beginningrevcomppat, tmRNA_coords):
						indtmRNA['strand'] = -1
						tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",")
					else:
						indtmRNA['strand'] = 1
						tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",")
					indtmRNA['begin'] = int(tmRNA_coords[0])
					indtmRNA['end'] = int(tmRNA_coords[1])
					tmRNAdict[tRNAseq.id] = indtmRNA
			elif re.match(tRNApat, tRNA_information[1]):
				indtRNA['product'] = re.sub("\(\w{3}\)", "",  tRNA_information[1])
				tRNA_coords = tRNA_information[2]
				Beginningrevcomppat = re.compile("^c")
				if re.match(Beginningrevcomppat, tRNA_coords):
					indtRNA['strand'] = -1
					tRNA_coords = tRNA_coords.replace("c[","").replace("]","").split(",")
				else:
					indtRNA['strand'] = 1
					tRNA_coords = tRNA_coords.replace("[","").replace("]","").split(",")
				indtRNA['begin'] = int(tRNA_coords[0])
				indtRNA['end'] = int(tRNA_coords[1])
				tRNAdict[tRNAseq.id] = indtRNA

	#Predicting CRISPR repeats and others
	eprint("Running PILERCR to predict CRISPR repeats in %s" % newfile)
	subprocess.call(["pilercr", "-in", newfile, "-out", "crisprfile.txt", "-noinfo", "-minrepeat", str(args.minrepeat), "-maxrepeat", str(args.maxrepeat), "-minspacer", str(args.minspacer), "-maxspacer", str(args.maxspacer)])
	eprint("Predicting repeats in the sequences using TRF and IRF")
	with open("/dev/null", "w") as stderr:
		subprocess.call(["trf", newfile, "2", "7", "7", "80", "10", "50", "500", "-h"], stderr=stderr)
		os.rename("%s.2.7.7.80.10.50.500.dat" % newfile, "trf_temp.dat")
	with open("/dev/null", "w") as stderr:
		subprocess.call(["irf", newfile, "2", "3", "5", "80", "10", "40", "500000", "10000", "-d", "-h"], stderr=stderr)
		os.rename("%s.2.3.5.80.10.40.500000.10000.dat" % newfile, "irf_temp.dat")

	# Storing CRISPR repeats information
	information_CRISPR = {}
	with open("crisprfile.txt", "rU") as crisprfile:
		for line in crisprfile:
			if "SUMMARY BY POSITION" in line:
				for line in crisprfile:
					information_crispr_repeat = {}
					try:
						patC = re.compile('^\s+(\d+)\s+.{16}\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d?\s+(\w+)')
						key, start, length, seq = re.match(patC, line).groups()
					except AttributeError:
						continue
					else:
						information_crispr_repeat['start'] = start
						information_crispr_repeat['end'] = int(start) + int(length)
						information_crispr_repeat['repeatseq'] = seq
						information_crispr_repeat['repeatend'] = int(start) + len(seq)
						information_CRISPR[key] = information_crispr_repeat

	# Storing tandem repeats information
	information_TRF = {}
	count = 1
	with open("trf_temp.dat", "rU") as trfile:
		for line in trfile:
			information_tandem_repeat = {}
			try:
				patT = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\.\d+\s')
				start, end = re.match(patT, line).groups()
			except AttributeError:
				continue
			else:
				information_tandem_repeat['start'] = start
				information_tandem_repeat['end'] = end
				information_TRF[count] = information_tandem_repeat
				count += 1

	# Storing inverted repeats information
	information_IRF = {}
	count = 1
	with open("irf_temp.dat", "rU") as irfile:
		for line in irfile:
			information_inverted_repeat = {}
			try:
				patI = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\s\d+')
				start, end = re.match(patI, line).groups()
			except AttributeError:
				continue
			else:
				information_inverted_repeat['start'] = start
				information_inverted_repeat['end'] = end
				information_IRF[count] = information_inverted_repeat
				count += 1

	# Creating a new Genbank and GFF file
	eprint("Creating the output files")
	newtempgbk = "%s.temp.gbk" % newfile
	with open(newfile, "rU") as basefile, open(newtempgbk, "w"):
		for record in SeqIO.parse(basefile, "fasta", IUPAC.ambiguous_dna):
			whole_sequence = SeqRecord(record.seq)
			whole_sequence.id = str(record.id)
			whole_sequence.annotations['data_file_division'] = args.typedata.upper()
			whole_sequence.annotations['date'] = strftime("%d-%b-%Y").upper()
			for protein in sorted(protsdict, key = stringSplitByNumbers):
				putative_start = int(protsdict[protein]['begin'])
				start_pos = SeqFeature.ExactPosition(putative_start)
				end_pos = SeqFeature.ExactPosition(protsdict[protein]['end'])
				feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=protsdict[protein]['strand'])
				qualifiersgene = OrderedDict([('locus_tag', protsdict[protein]['protein_id'])])
				new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = protsdict[protein]['strand'], qualifiers = qualifiersgene)
				whole_sequence.features.append(new_data_gene)
				qualifiers = [('locus_tag', protsdict[protein]['protein_id']), ('product', protsdict[protein]['product']), ('protein_id', protsdict[protein]['protein_id']), ('translation', protsdict[protein]['translation'])]
				feature_qualifiers = OrderedDict(qualifiers)
				new_data_cds = SeqFeature.SeqFeature(feature_location, type = "CDS", strand = protsdict[protein]['strand'], qualifiers = feature_qualifiers)
				whole_sequence.features.append(new_data_cds)
			for rRNA in sorted(subunits, key = stringSplitByNumbers):
				i = 0
				try:
					lengthlist = len(subunits[rRNA]['listdata'])
				except KeyError:
					continue
				else:
					while i < lengthlist:
						putative_start = int(subunits[rRNA]['listdata'][i]['begin'])
						start_pos = SeqFeature.ExactPosition(putative_start)
						end_pos = SeqFeature.ExactPosition(subunits[rRNA]['listdata'][i]['end'])
						feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=subunits[rRNA]['listdata'][i]['strand'])
						new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = subunits[rRNA]['listdata'][i]['strand'])
						whole_sequence.features.append(new_data_gene)
						qualifiers = [('product', subunits[rRNA]['listdata'][i]['product'])]
						feature_qualifiers = OrderedDict(qualifiers)
						new_data_rRNA = SeqFeature.SeqFeature(feature_location, type = "rRNA", strand = subunits[rRNA]['listdata'][i]['strand'], qualifiers = feature_qualifiers)
						whole_sequence.features.append(new_data_rRNA)
						i += 1
			for tRNA in sorted(tRNAdict, key = stringSplitByNumbers):
				putative_start = int(tRNAdict[tRNA]['begin'])
				start_pos = SeqFeature.ExactPosition(putative_start)
				end_pos = SeqFeature.ExactPosition(tRNAdict[tRNA]['end'])
				feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tRNAdict[tRNA]['strand'])
				new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tRNAdict[tRNA]['strand'])
				whole_sequence.features.append(new_data_gene)
				qualifiers = [('product', tRNAdict[tRNA]['product'])]
				feature_qualifiers = OrderedDict(qualifiers)
				new_data_tRNA = SeqFeature.SeqFeature(feature_location, type = "tRNA", strand = tRNAdict[tRNA]['strand'], qualifiers = feature_qualifiers)
				whole_sequence.features.append(new_data_tRNA)
			for tmRNA in sorted(tmRNAdict, key = stringSplitByNumbers):
				putative_start = int(tmRNAdict[tmRNA]['begin'])
				start_pos = SeqFeature.ExactPosition(putative_start)
				end_pos = SeqFeature.ExactPosition(tmRNAdict[tmRNA]['end'])
				feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tmRNAdict[tmRNA]['strand'])
				new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tmRNAdict[tmRNA]['strand'])
				whole_sequence.features.append(new_data_gene)
				qualifiers = [('product', tmRNAdict[tmRNA]['product'])]
				feature_qualifiers = OrderedDict(qualifiers)
				new_data_tmRNA = SeqFeature.SeqFeature(feature_location, type = "tmRNA", strand = tmRNAdict[tmRNA]['strand'], qualifiers = feature_qualifiers)
				whole_sequence.features.append(new_data_tmRNA)
			for CRISPR in sorted(information_CRISPR, key = stringSplitByNumbers):
				putative_start = int(information_CRISPR[CRISPR]['start'])
				start_pos = SeqFeature.ExactPosition(putative_start)
				end_pos = SeqFeature.ExactPosition(information_CRISPR[CRISPR]['end'])
				feature_location = SeqFeature.FeatureLocation(start_pos, end_pos)
				qualifiers = [('rpt_family', 'CRISPR'), ('rpt_type', 'direct'), ('rpt_unit_range', "%i..%i" % (int(information_CRISPR[CRISPR]['start']), int(information_CRISPR[CRISPR]['repeatend']))), ('rpt_unit_seq', information_CRISPR[CRISPR]['repeatseq'])]
				feature_qualifiers = OrderedDict(qualifiers)
				new_data_CRISPRrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers)
				whole_sequence.features.append(new_data_CRISPRrepeat)
			for tandem in sorted(information_TRF):
				putative_start = int(information_TRF[tandem]['start'])
				start_pos = SeqFeature.ExactPosition(putative_start)
				end_pos = SeqFeature.ExactPosition(information_TRF[tandem]['end'])
				feature_location = SeqFeature.FeatureLocation(start_pos, end_pos)
				qualifiers = [('rpt_type', 'direct')]
				feature_qualifiers = OrderedDict(qualifiers)
				new_data_tandemrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers)
				whole_sequence.features.append(new_data_tandemrepeat)
			for inverted in sorted(information_IRF):
				putative_start = int(information_IRF[inverted]['start'])
				start_pos = SeqFeature.ExactPosition(putative_start)
				end_pos = SeqFeature.ExactPosition(information_IRF[inverted]['end'])
				feature_location = SeqFeature.FeatureLocation(start_pos, end_pos)
				qualifiers = [('rpt_type', 'inverted')]
				feature_qualifiers = OrderedDict(qualifiers)
				new_data_invertedrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers)
				whole_sequence.features.append(new_data_invertedrepeat)
			SeqIO.write(whole_sequence, newtempgbk, "genbank")

	newgbk = "%s.gbk" % newfile
	with open(newtempgbk, "rU") as gbktempfile, open(newgbk, "w") as gbkrealfile:
		newpat = re.compile("D|RNA\s+(CON|PHG|VRL|BCT)")
		for line in gbktempfile:
			if line.startswith("LOCUS ") and re.search(newpat, line):
				if genomeshape['genomeshape'] == "linear":
					newline = re.sub("bp    DNA\s+", "bp    DNA     linear   ", line)
				else:
					newline = re.sub("bp    DNA\s+", "bp    DNA     circular ", line)
				gbkrealfile.write(newline)
			else:
				gbkrealfile.write(line)

	for f in glob.glob("*.temp.gbk"):
		os.remove(f)

	if args.gffprint==True:
		newgff = "%s.gff" % newfile
		with open(newgff, "w") as outgff, open(newgbk, "rU") as ingbk:
			GFF.write(SeqIO.parse(ingbk, "genbank"), outgff)

	# Removing intermediate files
	os.remove(newfile)
	os.remove("temp.faa")
	os.remove("temp_blast.csv")
	os.remove("crisprfile.txt")
	os.remove("trnafile.fasta")
	os.remove("rrnafile.csv")
	os.remove("trf_temp.dat")
	os.remove("irf_temp.dat")
	for f in glob.glob("SEQ*"):
		os.remove(f)

# Joining all GENBANK files into one
listgbk = sorted(glob.glob('CONTIG_*.gbk'))
gbkoutputfile = "%s.gbk" % root_output
with open(gbkoutputfile, 'w') as finalgbk:
	for fname in listgbk:
		with open(fname) as infile:
			for line in infile:
				finalgbk.write(line)

for tempgbk in glob.glob("CONTIG_*.gbk"):
	os.remove(tempgbk)

# Joining all GFF files into one
if args.gffprint==True:
	listgff = sorted(glob.glob('CONTIG_*.gff'))
	gffoutputfile = "%s.gff" % root_output
	with open(gffoutputfile, 'w') as finalgff:
		for fname in listgff:
			with open(fname) as infile:
				for line in infile:
					finalgff.write(line)
	for tempgff in glob.glob("CONTIG_*.gff"):
		os.remove(tempgff)

# Joining all TABLE files into one
listcsv = sorted(glob.glob('CONTIG_*.csv'))
tbloutputfile = "%s.csv" % root_output
with open(tbloutputfile, 'w') as finaltable:
	for fname in listcsv:
		with open(fname) as infile:
			for line in infile:
				finaltable.write(line)

for temptbl in glob.glob("CONTIG_*.csv"):
	os.remove(temptbl)

# Preparing sequences for GenBank submission (Original code from Wan Yu's gbk2tbl.py script [https://github.com/wanyuac/BINF_toolkit/blob/master/gbk2tbl.py])
allowed_qualifiers = ['locus_tag', 'gene', 'product', 'pseudo', 'protein_id', 'gene_desc', 'old_locus_tag', 'note', 'inference', 'organism', 'mol_type', 'strain', 'sub_species', 'isolation-source', 'country']
newfastafile = "%s.fasta" % root_output
newtablefile = "%s.tbl" % root_output
with open(args.modifiers, "rU") as modifiers, open(gbkoutputfile, "r") as genbank_fh, open(newfastafile, "w") as fasta_fh, open(newtablefile, "w") as feature_fh:
	info = modifiers.readline()
	wholelist = list(SeqIO.parse(genbank_fh, 'genbank'))
	for record in wholelist:
		if len(record) <= args.mincontigsize:
			eprint("WARNING: Skipping small contig %s" % record.id)
			continue
		record.description = "%s %s" % (record.id, info)
		SeqIO.write([record], fasta_fh, 'fasta')
		print('>Feature %s' % (record.name), file=feature_fh)
		for line in record.features:
			if line.strand == 1:
				print('%d\t%d\t%s' % (line.location.nofuzzy_start + 1, line.location.nofuzzy_end, line.type), file=feature_fh)
			else:
				print('%d\t%d\t%s' % (line.location.nofuzzy_end, line.location.nofuzzy_start + 1, line.type), file=feature_fh)
			for (key, values) in line.qualifiers.iteritems():
				if key not in allowed_qualifiers:
					continue
				for v in values:
					print('\t\t\t%s\t%s' % (key, v), file=feature_fh)

# Final statement
eprint("Genome annotation done!")
eprint("The GenBank file is %s" % gbkoutputfile)
if args.gffprint==True:
	eprint("The GFF3 file is %s" % gffoutputfile)
eprint("The table file for GenBank submission is %s" % tbloutputfile)
eprint("The FASTA file for GenBank submission is %s" % newfastafile)
eprint("The table file with all protein information is %s" % newtablefile)
author	vimalkumarvelayudhan
date	Wed, 28 Feb 2018 05:00:51 -0500
parents	231e4c669675
children