comparison VIGA.py @ 0:231e4c669675 draft

Initial commit - v0.10.3 git commit deeded0
author vimalkumarvelayudhan
date Tue, 27 Feb 2018 14:16:54 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:231e4c669675
1 #!/usr/bin/env python
2
3 # -*- coding: utf-8 -*-
4
5 # VIGA - De-novo VIral Genome Annotator
6 #
7 # Copyright (C) 2017 - Enrique Gonzalez-Tortuero
8 # Vimalkumar Velayudhan
9 #
10 # This program is free software: you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation, either version 3 of the License, or
13 # (at your option) any later version.
14 #
15 # This program is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
19 #
20 # You should have received a copy of the GNU General Public License
21 # along with this program. If not, see <http://www.gnu.org/licenses/>.
22
23 # Importing python libraries
24 from __future__ import print_function
25 import argparse
26 import csv
27 import fileinput
28 import fractions
29 import glob
30 import numpy
31 import os
32 import os.path
33 import re
34 import sys
35 import shutil
36 import subprocess
37 from BCBio import GFF
38 from Bio import SeqIO
39 from Bio import SeqFeature
40 from Bio.Alphabet import IUPAC
41 from Bio.Seq import Seq
42 from Bio.SeqFeature import FeatureLocation
43 from Bio.SeqRecord import SeqRecord
44 from Bio.SeqUtils.ProtParam import ProteinAnalysis
45 from collections import OrderedDict, defaultdict
46 from itertools import product
47 from scipy import signal
48 from time import strftime
49
50 # Preparing functions
51 def batch_iterator(iterator, batch_size):
52 entry = True
53 while entry:
54 batch = []
55 while len(batch) < batch_size:
56 try:
57 entry = iterator.next()
58 except StopIteration:
59 entry = None
60 if entry is None:
61 break
62 batch.append(entry)
63 if batch:
64 yield batch
65
66 def check_peaks(peaks, length):
67 # Checking if origin/terminus peaks are too close or too far apart. In that case, they are probably wrong
68 closest, farthest = int(length * float(0.45)), int(length * float(0.55))
69 pairs = []
70 for pair in list(product(*peaks)):
71 ### added this to make sure gets origin and ter right
72 tr, pk = sorted(list(pair), key = lambda x: x[1], reverse = False) # trough and peak
73 a = (tr[0] - pk[0]) % length
74 b = (pk[0] - tr[0]) % length
75 pt = abs(tr[1] - pk[1]) # distance between values
76 if (a <= farthest and a >= closest) or (b <=farthest and b >= closest):
77 pairs.append([pt, tr, pk])
78 if len(pairs) == 0:
79 return [False, False]
80 pt, tr, pk = sorted(pairs, reverse = True)[0]
81 return [tr[0], pk[0]]
82
83 def cmd_exists(cmd):
84 return subprocess.call("type " + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
85
86 def GCskew(name, length, seq, window, slide):
87 replacements = {'G':1, 'C':-1, 'A':0, 'T':0, 'N':0}
88 gmc = [] # G - C
89 for base in seq:
90 try:
91 gmc.append(replacements[base])
92 except:
93 gmc.append(0)
94 # convert to G + C
95 gpc = [abs(i) for i in gmc] # G + C
96 # calculate sliding windows for (G - C) and (G + C)
97 weights = numpy.ones(window)/window
98 gmc = [[i, c] for i, c in enumerate(signal.fftconvolve(gmc, weights, 'same').tolist())]
99 gpc = [[i, c] for i, c in enumerate(signal.fftconvolve(gpc, weights, 'same').tolist())]
100 # calculate gc skew and cummulative gc skew sum
101 skew = [[], []] # x and y for gc skew
102 c_skew = [[], []] # x and y for gc skew cummulative sums
103 cs = 0 # cummulative sum
104 # select windows to use based on slide
105 for i, m in gmc[0::slide]:
106 p = gpc[i][1]
107 if p == 0:
108 gcs = 0
109 else:
110 gcs = m/p
111 cs += gcs
112 skew[0].append(i)
113 c_skew[0].append(i)
114 skew[1].append(gcs)
115 c_skew[1].append(cs)
116 ori, ter = find_ori_ter(c_skew, length)
117 return ori, ter, skew, c_skew
118
119 def eprint(*args, **kwargs):
120 print(*args, file=sys.stderr, **kwargs)
121
122 def find_ori_ter(c_skew, length):
123 # Find origin and terminus of replication based on cumulative GC skew min and max peaks
124 c_skew_min = signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.less, order = 1)[0].tolist()
125 c_skew_max = signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.greater, order = 1)[0].tolist()
126 # return False if no peaks were detected
127 if len(c_skew_min) == 0 or len(c_skew_min) == 0:
128 return [False, False]
129 else:
130 c_skew_min = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_min]
131 c_skew_max = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_max]
132 ori, ter = check_peaks([c_skew_min, c_skew_max], length)
133 return ori, ter
134
135 def stringSplitByNumbers(x):
136 r = re.compile('(\d+)')
137 l = r.split(x)
138 return [int(y) if y.isdigit() else y for y in l]
139
140 # Defining the program version
141 version = "0.10.3"
142
143 # Processing the parameters
144 parser = argparse.ArgumentParser(description='VIGA is an automatic de novo VIral Genome Annotator.')
145 basic_group = parser.add_argument_group('Basic options for VIGA [REQUIRED]')
146
147 basic_group.add_argument("--input", dest="inputfile", type=str, required=True, help='Input file as a FASTA file', metavar="FASTAFILE")
148 basic_group.add_argument("--rfamdb", dest="rfamdatabase", type=str, required=True, help='RFAM Database that will be used for the ribosomal RNA prediction. RFAMDB should be in the format "/full/path/to/rfamdb/Rfam.cm" and must be compressed accordingly (see INFERNAL manual) before running the script.', metavar="RFAMDB")
149 basic_group.add_argument("--modifiers", dest="modifiers", type=str, required=True, help='Input file as a plain text file with the modifiers per every FASTA header according to SeqIn (https://www.ncbi.nlm.nih.gov/Sequin/modifiers.html). All modifiers must be written in a single line and are separated by a single space character. No space should be placed besides the = sign. For example: [organism=Serratia marcescens subsp. marcescens] [sub-species=marcescens] [strain=AH0650_Sm1] [topology=linear] [moltype=DNA] [tech=wgs] [gcode=11] [country=Australia] [isolation-source=sputum]. This line will be copied and printed along with the record name as the definition line of every contig sequence.', metavar="TEXTFILE")
150
151 advanced_group = parser.add_argument_group('Advanced options for VIGA [OPTIONAL]')
152 advanced_group.add_argument("--readlength", dest="read_length", type=int, default=101, help='Read length for the circularity prediction (default: 101 bp)', metavar="INT")
153 advanced_group.add_argument("--windowsize", dest="window", type=int, default=100, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 100 bp)', metavar="INT")
154 advanced_group.add_argument("--slidingsize", dest="slide", type=int, default=10, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 10 bp)', metavar="INT")
155 advanced_group.add_argument("--out", dest="rootoutput", type=str, help='Name of the outputs files (without extension)', metavar="OUTPUTNAME")
156 advanced_group.add_argument("--locus", dest="locus", type=str, default='LOC', help='Name of the sequences. If the input is a multiFASTA file, please put a general name as the program will add the number of the contig at the end of the name (Default: %(default)s)', metavar="STRING")
157 advanced_group.add_argument("--gff", dest="gffprint", action='store_true', default=False, help='Printing the output as GFF3 file (Default: False)')
158 advanced_group.add_argument("--threads", dest="ncpus", default=1, help='Number of threads/cpus (Default: %(default)s cpu)', metavar="INT")
159 advanced_group.add_argument("--nohmmer", dest="nohmmer", action='store_true', default=False, help='Running only BLAST to predict protein function. (Default: False)')
160 advanced_group.add_argument("--noblast", dest="noblast", action='store_true', default=False, help='Running DIAMOND instead of BLAST to predict protein function according to homology. This will be less sensitive but faster than BLAST. (Default: False)')
161 advanced_group.add_argument("--blastdb", dest="blastdatabase", type=str, help='BLAST Database that will be used for the protein function prediction. The database must be an amino acid one, not nucleotidic. This argument is mandatory if --noblast option is disabled', metavar="BLASTDB")
162 advanced_group.add_argument("--diamonddb", dest="diamonddatabase", type=str, help='DIAMOND Database that will be used for the protein function prediction. The database must be created from a amino acid FASTA file as indicated in https://github.com/bbuchfink/diamond. This argument is mandatory when --noblast option is enabled', metavar="DIAMONDDB")
163 advanced_group.add_argument("--blastevalue", dest="blastevalue", default=0.00001, help='BLAST/DIAMOND e-value threshold (Default: 0.00001)', metavar="FLOAT")
164 advanced_group.add_argument("--hmmdb", dest="hmmdatabase", type=str, help='PHMMER Database that will be used for the protein function prediction according to Hidden Markov Models. In this case, HMMDB must be in FASTA format (e.g. UniProt: "', metavar="HMMDB")
165 advanced_group.add_argument("--hmmerevalue", dest="hmmerevalue", default=0.001, help='PHMMER e-value threshold (Default: 0.001)', metavar="FLOAT")
166
167 type_choices = {'BCT': 'Prokaryotic chromosome', 'CON': 'Contig', 'PHG': 'Phages', 'VRL': 'Eukaryotic/Archaea virus'}
168 type_help = ('GenBank Division: One of the following codes - {0}. (Default: %(default)s)'.format(', '.join('{0} ({1})'.format(k, v) for k, v in type_choices.items())))
169 advanced_group.add_argument("--typedata", dest="typedata", type=str, default='CON', help=type_help, metavar="BCT|CON|VRL|PHG")
170
171 gcode_choices = {'1': 'Standard genetic code [Eukaryotic]', '2': 'Vertebrate mitochondrial code', '3': 'Yeast mitochondrial code', '4': 'Mycoplasma/Spiroplasma and Protozoan/mold/coelenterate mitochondrial code', '5': 'Invertebrate mitochondrial code', '6': 'Ciliate, dasycladacean and hexamita nuclear code', '9': 'Echinoderm/flatworm mitochondrial code', '10': 'Euplotid nuclear code', '11': 'Bacteria/Archaea/Phages/Plant plastid', '12': 'Alternative yeast nuclear code', '13': 'Ascidian mitochondrial code', '14': 'Alternative flatworm mitochondrial code', '16': 'Chlorophycean mitochondrial code', '21': 'Trematode mitochondrial code', '22': 'Scedenesmus obliquus mitochondrial code', '23': 'Thraustochytrium mitochondrial code', '24': 'Pterobranquia mitochondrial code', '25': 'Gracilibacteria & Candidate division SR1', '26': 'Pachysolen tannophilus nuclear code', '27': 'Karyorelict nuclear code', '28': 'Condylostoma nuclear code', '29': 'Mesodinium nuclear code', '30': 'Peritrich nuclear code', '31': 'Blastocrithidia nuclear code'}
172 gcode_help = ('Number of GenBank translation table. At this moment, the available options are {0}. (Default: %(default)s)'.format('{}'.format(', '.join('{0} ({1})'.format(k, v) for k, v in gcode_choices.items()))))
173 advanced_group.add_argument("--gcode", dest="gcode", type=str, default='11', help=gcode_help, metavar="NUMBER")
174
175 advanced_group.add_argument('--mincontigsize', dest="mincontigsize", type=int, default = 200, help = 'Minimum contig length to be considered in the final files (Default: 200 bp)', metavar="INT")
176 advanced_group.add_argument("--idthr", dest="idthreshold", default=50.00, help='ID threshold (Default: 50.0)', metavar="FLOAT")
177 advanced_group.add_argument("--coverthr", dest="covthreshold", default=50.00, help='Coverage threshold (Default: 50.0)', metavar="FLOAT")
178 advanced_group.add_argument("--diffid", dest="diffid", default=5.00, help='Max allowed difference between the ID percentages of BLAST and HMMER. (Default = 5.00; Not recommended to change such value)', metavar="FLOAT (>0.01)")
179 advanced_group.add_argument("--minrepeat", dest="minrepeat", type=int, default=16, help="Minimum repeat length for CRISPR detection (Default: 16)", metavar="INT")
180 advanced_group.add_argument("--maxrepeat", dest="maxrepeat", type=int, default=64, help="Maximum repeat length for CRISPR detection (Default: 64)")
181 advanced_group.add_argument("--minspacer", dest="minspacer", type=int, default=8, help="Minimum spacer length for CRISPR detection (Default: 8)")
182 advanced_group.add_argument("--maxspacer", dest="maxspacer", type=int, default=64, help="Maximum spacer length for CRISPR detection (Default: 64)")
183 advanced_group.add_argument("--blastexh", dest="blastexh", action='store_true', default=False, help='Use of exhaustive BLAST to predict the proteins by homology according to Fozo et al. (2010) Nucleic Acids Res (Default=FALSE)')
184
185 args = parser.parse_args()
186
187 root_output = args.rootoutput
188 if not root_output:
189 root_output = '{}_annotated'.format(os.path.splitext(args.inputfile)[0])
190
191 if args.noblast == False and args.blastdatabase == None:
192 sys.exit('You MUST specify BLAST database using the parameter --blastdb if you are not using --noblast option')
193
194 if args.noblast == True and args.diamonddatabase == None:
195 sys.exit('You MUST specify DIAMOND database using the parameter --diamonddb if you are using --noblast option')
196
197 if args.nohmmer == False and args.hmmdatabase == None:
198 sys.exit('You MUST specify HMMER database using the parameter --hmmdb if you are not using --nohmmer option')
199
200 # Printing the header of the program
201 eprint("This is VIGA %s" % str(version))
202 eprint("Written by Enrique Gonzalez Tortuero & Vimalkumar Velayudhan")
203 eprint("Homepage is https://github.com/EGTortuero/virannot")
204 eprint("Local time: ", strftime("%a, %d %b %Y %H:%M:%S"))
205 eprint("\n\n")
206
207 # checking the presence of the programs in the system
208
209 if not cmd_exists("lastz")==True:
210 sys.exit("You must install LASTZ before using this script")
211 elif not cmd_exists("cmscan")==True:
212 sys.exit("You must install INFERNAL before using this script")
213 elif not cmd_exists("prodigal")==True:
214 sys.exit("You must install prodigal before using this script")
215 elif not cmd_exists("parallel")==True:
216 sys.exit("You must install GNU Parallel before using this script")
217 elif not cmd_exists("blastp")==True:
218 sys.exit("You must install BLAST before using this script")
219 elif not cmd_exists("diamond")==True:
220 sys.exit("You must install DIAMOND before using this script")
221 elif not cmd_exists("phmmer")==True:
222 sys.exit("You must install HMMER 3 before using this script")
223 elif not cmd_exists("aragorn")==True:
224 sys.exit("You must install ARAGORN before using this script")
225 elif not cmd_exists("pilercr")==True:
226 sys.exit("You must install PILERCR before using this script")
227 elif not cmd_exists("trf")==True:
228 sys.exit("You must install Tandem Repeats Finder before using this script")
229 elif not cmd_exists("irf")==True:
230 sys.exit("You must install Inverted Repeats Finder before using this script")
231
232 eprint("Data type is {0} and GenBank translation table no is {1}\n".format(args.typedata, args.gcode))
233
234 # Correcting the original file (long headers problem + multiple FASTA files)
235 record_iter = SeqIO.parse(open(args.inputfile, "rU"),"fasta")
236 counter = 1
237 newnamessequences = {}
238 for i, batch in enumerate(batch_iterator(record_iter, 1)):
239 seq_index = "CONTIG_%i" % (i + 1)
240 filename = "%s.temp.fna" % seq_index
241 newfilename = "%s.fna" % seq_index
242 with open(filename, "w") as handle:
243 count = SeqIO.write(batch, filename, "fasta")
244
245 with open(filename, "rU") as original, open(newfilename, "w") as corrected:
246 sequences = SeqIO.parse(original, "fasta", IUPAC.ambiguous_dna)
247 for record in sequences:
248 original_name = record.id
249 record.id = "%s_%i" % (args.locus, counter)
250 record.description = record.description
251 counter += 1
252 newnamessequences[record.id] = original_name
253 eprint("WARNING: The name of the sequence %s was corrected as %s" % (original_name, record.id))
254 SeqIO.write(record, corrected, "fasta")
255
256 with open("logfile.txt", "w") as logfile:
257 logfile.write("#Original\tNew\n")
258 for oldname in sorted(newnamessequences, key = stringSplitByNumbers):
259 logfile.write("%s\t%s\n" % (oldname, newnamessequences[oldname]))
260 os.remove(filename)
261
262 for newfile in sorted(glob.glob("CONTIG_*.fna")):
263
264 # Predicting the shape of the contig (code based on Alex Crits-Christoph's find_circular.py script [https://github.com/alexcritschristoph/VICA/blob/master/find_circular.py])
265 eprint("Predicting the shape of the contig using LASTZ")
266 genomeshape = {}
267 with open(newfile, "rU") as targetfasta:
268 Sequence = SeqIO.parse(newfile, "fasta")
269 for record in Sequence:
270 seq_beginning = str(record.seq[0:args.read_length])
271 seq_ending = str(record.seq[len(record.seq)-args.read_length:len(record.seq)])
272 combined_seqs = SeqRecord(Seq(seq_beginning + seq_ending, IUPAC.ambiguous_dna), id = record.description)
273 SeqIO.write(combined_seqs, "temporal_circular.fasta", "fasta")
274 outputlastz = subprocess.check_output(["lastz", "temporal_circular.fasta", "--self", "--notrivial", "--nomirror", "--format=general-:start1,end1,start2,end2,score,strand1,strand2,identity,length1"])
275 resultslastz = outputlastz.split("\n")
276 for resultlastz in resultslastz:
277 if resultlastz != '':
278 start1 = resultlastz.split()[0]
279 end1 = resultlastz.split()[1]
280 start2 = resultlastz.split()[2]
281 end2 = resultlastz.split()[3]
282 strand1 = resultlastz.split()[5]
283 strand2 = resultlastz.split()[6]
284 identity = resultlastz.split()[7]
285 length = int(resultlastz.split()[9])
286 if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9:
287 genomeshape['genomeshape'] = "circular"
288 try:
289 if genomeshape['identity'] >= float(fractions.Fraction(identity)):
290 genomeshape['identity'] = float(fractions.Fraction(identity))
291 genomeshape['length'] = length
292 except KeyError:
293 genomeshape['identity'] = float(fractions.Fraction(identity))
294 genomeshape['length'] = length
295 else:
296 continue
297 if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9:
298 genomeshape['genomeshape'] = "circular"
299 try:
300 if genomeshape['identity'] >= float(fractions.Fraction(identity)):
301 genomeshape['identity'] = float(fractions.Fraction(identity))
302 genomeshape['length'] = length
303 except KeyError:
304 genomeshape['identity'] = float(fractions.Fraction(identity))
305 genomeshape['length'] = length
306 try:
307 if genomeshape['genomeshape'] == "":
308 genomeshape['genomeshape'] = "linear"
309 except KeyError:
310 genomeshape['genomeshape'] = "linear"
311 else:
312 genomeshape['genomeshape'] = "circular"
313 with open("temp.fasta", "w") as correctedcircular:
314 Corrseq = str(record.seq[int(genomeshape['length'])//2:-int(genomeshape['length'])//2])
315 Newseq = SeqRecord(Seq(Corrseq, IUPAC.ambiguous_dna), id = record.description)
316 SeqIO.write(Newseq, correctedcircular, "fasta")
317 os.rename("temp.fasta", "temp2.fasta")
318 eprint("LASTZ predicted that %s is %s\n" % (newfile, genomeshape['genomeshape']))
319 os.remove("temporal_circular.fasta")
320
321 # Calculate the cumulative GCskew in circular contigs to determine the origin of replication (Based on iRep -- Brown CT, Olm MR, Thomas BC, Banfield JF (2016) Measurement of bacterial replication rates in microbial communities. Nature Biotechnology 34: 1256-63.)
322 if genomeshape['genomeshape'] == "circular":
323 for record in SeqIO.parse("temp2.fasta", "fasta"):
324 length_contig = len(record.seq)
325 #if length < min_len:
326 # print('%s: Too Short' % (name), file=sys.stderr)
327 # continue
328 oric, term, gcskew, cgcskew = GCskew(record.id, length_contig, record.seq, args.window, args.slide)
329 valoric = oric
330 if oric == False:
331 oric, term = 'n/a', 'n/a'
332 else:
333 oric, term = '{:,}'.format(oric), '{:,}'.format(term)
334 eprint('%s -> Origin: %s Terminus: %s' % (record.id, oric, term))
335 #print('\t'.join(['# Name', 'Position', 'GC Skew', 'Cumulative GC Skew']))
336 #for i, pos in enumerate(gcskew[0]):
337 # out = [record.id, pos, gcskew[1][i], cgcskew[1][i]]
338 # print('\t'.join([str(i) for i in out]))
339 if valoric != False:
340 firstpartseq = str(record.seq[valoric:-1])
341 secondpartseq = str(record.seq[0:(valoric-1)])
342 combinedcorrectedseq = SeqRecord(Seq(firstpartseq + secondpartseq, IUPAC.ambiguous_dna), id = record.description)
343 SeqIO.write(combinedcorrectedseq, newfile, "fasta")
344 else:
345 eprint("As the program was unable to predict the origin of replication, %s was considered as is without correcting!" % record.id)
346 os.rename("temp2.fasta", newfile)
347 if os.path.isfile("temp2.fasta"):
348 os.remove("temp2.fasta")
349
350 # Predicting genes using PRODIGAL
351 eprint("\nRunning Prodigal to predict the genes in %s" % newfile)
352 for record in SeqIO.parse(newfile, "fasta"):
353 length_contig = len(record.seq)
354 if (length_contig >= 100000):
355 if genomeshape['genomeshape'] == 'linear':
356 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-c", "-q"])
357 else:
358 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-q"])
359 else:
360 if genomeshape['genomeshape'] == 'linear':
361 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-c", "-q"])
362 else:
363 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-q"])
364 num_seqs = len(list(SeqIO.parse("pretemp.faa", "fasta")))
365 eprint("PRODIGAL was able to predict %i genes in %s\n" % (num_seqs, newfile))
366
367 with open("pretemp.faa", "rU") as originalfaa, open("temp.faa", "w") as correctedfaa:
368 sequences = SeqIO.parse(originalfaa, "fasta")
369 for record in sequences:
370 record.seq = record.seq.rstrip("*")
371 SeqIO.write(record, correctedfaa, "fasta")
372
373 faa_file = "%s.faa" % newfile # TO DEBUG
374 shutil.copyfile("temp.faa", faa_file) # TO DEBUG
375 os.remove("pretemp.faa")
376
377 # Predicting the function of the proteins based on homology using BLAST
378 equivalence = {}
379 record_iter = SeqIO.parse(open("temp.faa"),"fasta")
380 for i, batch in enumerate(batch_iterator(record_iter, 1)):
381 seq_index = "SEQ_%i" % (i + 1)
382 filename = "%s.faa" % seq_index
383 with open(filename, "w") as handle:
384 count = SeqIO.write(batch, handle, "fasta")
385 equivalence[seq_index] = batch[0].id
386
387 if args.blastexh==True:
388 eprint("Running BLAST to predict the genes according to homology inference in %s using exhaustive mode (see Fozo et al. (2010) Nucleic Acids Res for details)" % newfile)
389 subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', '-word_size', '2', '-gapopen', '8', '-gapextend', '2', '-matrix', '"PAM70"', '-comp_based_stats', '"0"', "-num_threads", str(args.ncpus)])
390 eprint("Done. BLAST was done to predict the genes by homology\n")
391 blast_log = "%s.blast.log" % newfile # TO DEBUG
392 shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG
393 elif args.noblast==True:
394 eprint("Running DIAMOND to predict the genes according to homology inference in %s using default parameters" % newfile)
395 with open("temp.faa", "r") as tempfile:
396 first_line = tempfile.readline()
397 if first_line.startswith(">"):
398 subprocess.call(['diamond', 'blastp', '-q', 'temp.faa', '-d', args.diamonddatabase, '-e', str(args.blastevalue), '-f', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qlen', 'slen', 'qstart', 'qend', 'evalue', 'bitscore', 'stitle', '-o', 'temp_blast.csv', '-k', '10', "-p", str(args.ncpus), '--quiet'])
399 else:
400 open("temp_blast.csv", 'a').close()
401 blast_log = "%s.blast.log" % newfile # TO DEBUG
402 shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG
403 eprint("Done. DIAMOND was done to predict the genes by homology\n")
404 else:
405 eprint("Running BLAST to predict the genes according to homology inference in %s using default parameters" % newfile)
406 subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', "-num_threads", str(args.ncpus)])
407 blast_log = "%s.blast.log" % newfile
408 shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG
409 eprint("Done. BLAST was done to predict the genes by homology\n") # TO DEBUG
410
411 # Parsing the results from BLAST
412 with open("temp_blast.csv", "rU") as blastresults:
413 hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))')
414 reader = csv.DictReader(blastresults, delimiter='\t', fieldnames=['qseqid','sseqid','pident','length','qlen','slen','qstart','qend','evalue','bitscore','stitle'])
415 information_proteins_blast = {}
416 for row in reader:
417 perc_cover = round(100.00*(float(row['length'])/float(row['qlen'])),2)
418 perc_id = float(row['pident'])
419 infoprot_blast = {}
420 infoprot_blast['sseqid'] = row['sseqid']
421 infoprot_blast['pident'] = perc_id
422 infoprot_blast['pcover'] = perc_cover
423 infoprot_blast['evalue'] = row['evalue']
424 infoprot_blast['descr'] = row['stitle']
425 try:
426 data = information_proteins_blast[row['qseqid']]
427 except KeyError:
428 if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_cover) >= float(args.covthreshold) and float(row['evalue']) <= float(args.blastevalue):
429 information_proteins_blast[row['qseqid']] = infoprot_blast
430 else:
431 continue
432 else:
433 if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_id) >= float(infoprot_blast['pident']) and float(perc_cover) >= float(args.covthreshold) and float(perc_cover) >= float(infoprot_blast['pcover']) and float(row['evalue']) <= float(args.blastevalue):
434 information_proteins_blast[row['qseqid']] = infoprot_blast
435
436 ## Predicting the function of the proteins based on HMM predictions using phmmer
437 if args.nohmmer == False:
438 with open("commands.sh", "w") as commands:
439 for singleprot in sorted(glob.glob("SEQ_*.faa")):
440 hhmtable = "%s.tbl" % singleprot
441 eprint("Creating file to run parallel PHMMER")
442 eprint("Adding %s to run PHMMER." % singleprot)
443 line2write = ' '.join(["phmmer", "--cpu", "1", "--domtblout", hhmtable, "-E", str(args.hmmerevalue), "-o", "/dev/null", singleprot, args.hmmdatabase, '\n'])
444 commands.write(line2write)
445
446 eprint("Running parallel PHMMER")
447 subprocess.call(['parallel', '-j', str(args.ncpus)], stdin=open('commands.sh'))
448 eprint("Done. PHMMER was done to predict the function of the genes according to Hidden Markov Models\n")
449 os.remove("commands.sh")
450
451 # Parsing the results from HMMER
452 information_proteins_hmmer = {}
453 hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))')
454 for singletbl in sorted(glob.glob("*.faa.tbl")):
455 rootname = singletbl.replace(".faa.tbl", "")
456 with open(singletbl) as tblfile:
457 infoprot_hmmer = {}
458 for line in tblfile:
459 if line.startswith("#"):
460 continue
461 else:
462 try:
463 pat = re.compile("^(\S+)\s+\S\s+\d+\s+(\S+)\s+\S\s+(\d+)\s+((?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?)\s+\S+\s+\S+\s+\S+\s+\S+\s+(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s+\S+\s+\S+\s+\S+\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\S+\s+\S+\s+(\S+)\s+(.*)")
464 matchname, lociname, length, evaluehh, start, end, pident, description = re.match(pat, line).groups()
465 except AttributeError:
466 continue
467 else:
468 length = float(length)
469 pident = 100.00*float(pident)
470 protarea = 100.00*(((float(end)-1.00) - (float(start)-1.00))/length)
471 try:
472 data2 = infoprot_hmmer['lociname']
473 except KeyError:
474 if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00:
475 infoprot_hmmer['lociname'] = lociname
476 infoprot_hmmer['name'] = matchname
477 infoprot_hmmer['evalue'] = float(evaluehh)
478 infoprot_hmmer['pcover'] = float(protarea)
479 infoprot_hmmer['pident'] = float(pident)
480 infoprot_hmmer['descr'] = description
481 else:
482 try:
483 if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']:
484 infoprot_hmmer['lociname'] = lociname
485 infoprot_hmmer['name'] = matchname
486 infoprot_hmmer['evalue'] = float(evaluehh)
487 infoprot_hmmer['pcover'] = float(protarea)
488 infoprot_hmmer['pident'] = float(pident)
489 infoprot_hmmer['descr'] = description
490 except KeyError:
491 continue
492 else:
493 if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']:
494 infoprot_hmmer['lociname'] = lociname
495 infoprot_hmmer['name'] = matchname
496 infoprot_hmmer['evalue'] = float(evaluehh)
497 infoprot_hmmer['pcover'] = float(protarea)
498 infoprot_hmmer['pident'] = float(pident)
499 infoprot_hmmer['descr'] = description
500 information_proteins_hmmer[rootname] = infoprot_hmmer
501
502 #Storing protein information in memory
503 with open("temp.faa", "rU") as protsfile:
504 tempprotsdict = {}
505 for protseq in SeqIO.parse(protsfile, "fasta"):
506 tempindprot = {}
507 dataprot = protseq.description.split(' # ')
508 modseq = str(protseq.seq).replace("X","") # Removing all ambiguous amino acids to avoid problems with ProteinAnalysis module
509 analysed_seq = ProteinAnalysis(modseq)
510 tempindprot['length'] = len(protseq.seq)
511 tempindprot['isoelectricpoint'] = analysed_seq.isoelectric_point()
512 tempindprot['molweightkda'] = analysed_seq.molecular_weight()/1000.00
513 tempindprot['instability'] = analysed_seq.instability_index()
514 tempindprot['protein_id'] = dataprot[0]
515 tempindprot['strand'] = int(dataprot[3])
516 tempindprot['begin'] = int(dataprot[1])-1
517 tempindprot['end'] = int(dataprot[2])
518 tempprotsdict[dataprot[0]] = tempindprot
519
520 # Creation of table
521 debugtable = "%s.csv" % newfile
522 with open(debugtable, "w") as tablefile:
523 if args.nohmmer == False:
524 print("\t".join(["Identifier", "Start", "Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST", "ID_HMMER", "Descr_HMMER", "evalue_HMMER", "%ID_HMMER", "%Cover_HMMER"]), file=tablefile)
525 keylist = information_proteins_hmmer.keys()
526 keylist.sort()
527 for keyB in keylist:
528 keyB = keyB.replace(".faa.tbl", "")
529 try:
530 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), str(information_proteins_blast[equivalence[keyB]]['pcover']), information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile)
531 except KeyError:
532 try:
533 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), str(information_proteins_blast[equivalence[keyB]]['pcover']), "None", "None", "NA", "NA", "NA"]), file=tablefile)
534 except KeyError:
535 try:
536 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA", information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile)
537 except KeyError:
538 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA", "None", "None", "NA", "NA", "NA"]), file=tablefile)
539 else:
540 print("\t".join(["Identifier", "Start", "Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST"]), file=tablefile)
541 keylist = equivalence.values()
542 keylist.sort()
543 for keyB in keylist:
544 try:
545 print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), information_proteins_blast[keyB]['sseqid'], information_proteins_blast[keyB]['descr'], str(information_proteins_blast[keyB]['evalue']), str(information_proteins_blast[keyB]['pident']), str(information_proteins_blast[keyB]['pcover'])]), file=tablefile)
546 except KeyError:
547 print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), "None", "None", "NA", "NA", "NA"]), file=tablefile)
548
549 # Algorithm of decisions (which one: BLAST/HMMER?)
550 multipleprots = {}
551 Hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))')
552 if args.nohmmer == False:
553 keylist = information_proteins_hmmer.keys()
554 keylist.sort()
555 for keyB in keylist:
556 keyB = keyB.replace(".faa.tbl", "")
557 singleprot = {}
558 singleprot['name'] = equivalence[keyB]
559 if (equivalence[keyB] in information_proteins_blast) and (keyB in information_proteins_hmmer):
560 if re.search(Hypotheticalpat, information_proteins_blast[equivalence[keyB]]['descr']):
561 try:
562 if re.search(Hypotheticalpat, information_proteins_hmmer[keyB]['descr']):
563 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
564 else:
565 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
566 except KeyError:
567 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
568 else:
569 try:
570 if (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])):
571 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
572 elif (float(information_proteins_blast[equivalence[keyB]]['pident'])<float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])):
573 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
574 elif (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])):
575 if (float(information_proteins_blast[equivalence[keyB]]['pident'])-float(information_proteins_hmmer[keyB]['pident']) >= args.diffid):
576 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
577 else:
578 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
579 else:
580 if (float(information_proteins_hmmer[keyB]['pident'])-float(information_proteins_blast[equivalence[keyB]]['pident']) >= args.diffid):
581 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
582 else:
583 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
584 except KeyError:
585 try:
586 if (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])):
587 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
588 else:
589 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
590 except KeyError:
591 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
592 elif equivalence[keyB] in information_proteins_blast:
593 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
594 elif keyB in information_proteins_hmmer:
595 try:
596 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
597 except KeyError:
598 singleprot['descr'] = 'Hypothetical protein'
599 else:
600 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
601 multipleprots[keyB] = singleprot
602 else:
603 keylist = equivalence.values()
604 keylist.sort()
605 for keyB in keylist:
606 singleprot = {}
607 singleprot['name'] = keyB
608 try:
609 if information_proteins_blast[keyB]['descr'] == None:
610 singleprot['descr'] = 'Hypothetical protein'
611 elif re.search(Hypotheticalpat, information_proteins_blast[keyB]['descr']):
612 singleprot['descr'] = 'Conserved hypothetical protein'
613 else:
614 singleprot['descr'] = information_proteins_blast[keyB]['descr']
615 except KeyError:
616 singleprot['descr'] = 'Hypothetical protein'
617 multipleprots[keyB] = singleprot
618
619 #Storing protein information in memory
620 with open("temp.faa", "rU") as protsfile:
621 protsdict = {}
622 for protseq in SeqIO.parse(protsfile, "fasta"):
623 indprot = {}
624 dataprot = protseq.description.split(' # ')
625 indprot['translation'] = protseq.seq
626 indprot['protein_id'] = dataprot[0]
627 indprot['strand'] = int(dataprot[3])
628 indprot['begin'] = int(dataprot[1])-1
629 indprot['end'] = int(dataprot[2])
630 for keyOmega in sorted(multipleprots):
631 if multipleprots[keyOmega]['name'] == dataprot[0]:
632 indprot['product'] = multipleprots[keyOmega]['descr']
633 protsdict[dataprot[0]] = indprot
634
635 # Predicting the rRNA sequences
636 with open(newfile, "rU") as targetfasta, open("/dev/null", "w") as apocalypse:
637 eprint("Running INFERNAL+RFAM to predict rRNA-like sequences in %s" % newfile)
638 subprocess.call(["cmscan", "--rfam", "--cut_ga", "--nohmmonly", "--tblout", "rrnafile.csv", "--cpu", args.ncpus, args.rfamdatabase, newfile], stdout=apocalypse)
639
640 #Storing rRNA information in memory
641 with open("rrnafile.csv", "rU") as rrnafile:
642 namedict = {"SSU_rRNA_archaea": "16s_rRNA", "SSU_rRNA_bacteria": "16s_rRNA", "SSU_rRNA_eukarya": "18s_rRNA", "SSU_rRNA_microsporidia": "16s_rRNA", "LSU_rRNA_archaea": "23s_rRNA", "LSU_rRNA_bacteria": "23s_rRNA", "LSU_rRNA_eukarya": "28s_rRNA", "5S_rRNA": "5s_rRNA"}
643 rRNAdict = defaultdict(list)
644 for line in rrnafile:
645 if not line.startswith("#"):
646 InfoLINE = re.sub("\s+", ",", line)
647 line_splitted = InfoLINE.split(",")
648 item_type = line_splitted[0]
649 if item_type.startswith(('LSU', 'SSU', '5S')):
650 strand = 0
651 if line_splitted[9] == "+":
652 strand = 1
653 else:
654 strand = -1
655 rRNAdict[item_type].append({'score': float(line_splitted[14]), 'product': namedict[line_splitted[0]], 'begin': int(line_splitted[7]), 'end': int(line_splitted[8]), 'strand': strand})
656
657 subunits = {'LSU': {'max_score': 0 }, 'SSU': {'max_score': 0 }, '5S': {'max_score': 0 }}
658 for type_rRNA, rRNA_data in rRNAdict.items():
659 max_score = max([item['score'] for item in rRNAdict[type_rRNA]])
660 for item in ('LSU', 'SSU'):
661 if type_rRNA.startswith(item):
662 if max_score > subunits[item]['max_score']:
663 subunits[item]['listdata'] = rRNA_data
664 subunits[item]['max_score'] = max_score
665 if type_rRNA.startswith('5S'):
666 subunits['5S']['listdata'] = rRNA_data
667 subunits['5S']['max_score'] = max_score
668
669 for rRNA in subunits:
670 i = 0
671 try:
672 lengthlist = len(subunits[rRNA]['listdata'])
673 except KeyError:
674 continue
675 else:
676 while i < lengthlist:
677 eprint("%s harbours a %s from %i to %i" % (newfile, subunits[rRNA]['listdata'][i]['product'], int(subunits[rRNA]['listdata'][i]['begin']), int(subunits[rRNA]['listdata'][i]['end'])))
678 i += 1
679
680 # Predicting the tRNA sequences
681 eprint("Running ARAGORN to predict tRNA-like sequences in %s" % newfile)
682 genetictable = "-gc%s" % str(args.gcode)
683 with open("trnafile.fasta", "w") as trnafile:
684 if genomeshape['genomeshape'] == "circular":
685 subprocess.call(["aragorn", "-c", "-fon", genetictable, newfile], stdout=trnafile)
686 else:
687 subprocess.call(["aragorn", "-l", "-fon", genetictable, newfile], stdout=trnafile)
688 num_tRNA = len(list(SeqIO.parse("trnafile.fasta", "fasta")))
689 eprint("ARAGORN was able to predict %i tRNAs in %s\n" % (num_tRNA, newfile))
690
691 #Storing tRNA and tmRNA information in memory
692 with open("trnafile.fasta", "rU") as trnafile:
693 tRNAdict = {}
694 tmRNAdict = {}
695 for tRNAseq in SeqIO.parse(trnafile, "fasta"):
696 indtRNA = {}
697 indtmRNA = {}
698 tRNA_information = tRNAseq.description.split(" ")
699 tRNApat = re.compile("^tRNA-")
700 if tRNA_information[1] == "tmRNA":
701 if str(tRNA_information[2]) == "(Permuted)":
702 indtmRNA['product'] = "tmRNA"
703 tmRNA_coords = str(tRNA_information[3])
704 Beginningrevcomppat = re.compile("^c")
705 if re.match(Beginningrevcomppat, tmRNA_coords):
706 indtmRNA['strand'] = -1
707 tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",")
708 else:
709 indtmRNA['strand'] = 1
710 tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",")
711 indtmRNA['begin'] = int(tmRNA_coords[0])
712 indtmRNA['end'] = int(tmRNA_coords[1])
713 tmRNAdict[tRNAseq.id] = indtmRNA
714 else:
715 indtmRNA['product'] = "tmRNA"
716 tmRNA_coords = str(tRNA_information[2])
717 Beginningrevcomppat = re.compile("^c")
718 if re.match(Beginningrevcomppat, tmRNA_coords):
719 indtmRNA['strand'] = -1
720 tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",")
721 else:
722 indtmRNA['strand'] = 1
723 tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",")
724 indtmRNA['begin'] = int(tmRNA_coords[0])
725 indtmRNA['end'] = int(tmRNA_coords[1])
726 tmRNAdict[tRNAseq.id] = indtmRNA
727 elif re.match(tRNApat, tRNA_information[1]):
728 indtRNA['product'] = re.sub("\(\w{3}\)", "", tRNA_information[1])
729 tRNA_coords = tRNA_information[2]
730 Beginningrevcomppat = re.compile("^c")
731 if re.match(Beginningrevcomppat, tRNA_coords):
732 indtRNA['strand'] = -1
733 tRNA_coords = tRNA_coords.replace("c[","").replace("]","").split(",")
734 else:
735 indtRNA['strand'] = 1
736 tRNA_coords = tRNA_coords.replace("[","").replace("]","").split(",")
737 indtRNA['begin'] = int(tRNA_coords[0])
738 indtRNA['end'] = int(tRNA_coords[1])
739 tRNAdict[tRNAseq.id] = indtRNA
740
741 #Predicting CRISPR repeats and others
742 eprint("Running PILERCR to predict CRISPR repeats in %s" % newfile)
743 subprocess.call(["pilercr", "-in", newfile, "-out", "crisprfile.txt", "-noinfo", "-minrepeat", str(args.minrepeat), "-maxrepeat", str(args.maxrepeat), "-minspacer", str(args.minspacer), "-maxspacer", str(args.maxspacer)])
744 eprint("Predicting repeats in the sequences using TRF and IRF")
745 with open("/dev/null", "w") as stderr:
746 subprocess.call(["trf", newfile, "2", "7", "7", "80", "10", "50", "500", "-h"], stderr=stderr)
747 os.rename("%s.2.7.7.80.10.50.500.dat" % newfile, "trf_temp.dat")
748 with open("/dev/null", "w") as stderr:
749 subprocess.call(["irf", newfile, "2", "3", "5", "80", "10", "40", "500000", "10000", "-d", "-h"], stderr=stderr)
750 os.rename("%s.2.3.5.80.10.40.500000.10000.dat" % newfile, "irf_temp.dat")
751
752 # Storing CRISPR repeats information
753 information_CRISPR = {}
754 with open("crisprfile.txt", "rU") as crisprfile:
755 for line in crisprfile:
756 if "SUMMARY BY POSITION" in line:
757 for line in crisprfile:
758 information_crispr_repeat = {}
759 try:
760 patC = re.compile('^\s+(\d+)\s+.{16}\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d?\s+(\w+)')
761 key, start, length, seq = re.match(patC, line).groups()
762 except AttributeError:
763 continue
764 else:
765 information_crispr_repeat['start'] = start
766 information_crispr_repeat['end'] = int(start) + int(length)
767 information_crispr_repeat['repeatseq'] = seq
768 information_crispr_repeat['repeatend'] = int(start) + len(seq)
769 information_CRISPR[key] = information_crispr_repeat
770
771 # Storing tandem repeats information
772 information_TRF = {}
773 count = 1
774 with open("trf_temp.dat", "rU") as trfile:
775 for line in trfile:
776 information_tandem_repeat = {}
777 try:
778 patT = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\.\d+\s')
779 start, end = re.match(patT, line).groups()
780 except AttributeError:
781 continue
782 else:
783 information_tandem_repeat['start'] = start
784 information_tandem_repeat['end'] = end
785 information_TRF[count] = information_tandem_repeat
786 count += 1
787
788 # Storing inverted repeats information
789 information_IRF = {}
790 count = 1
791 with open("irf_temp.dat", "rU") as irfile:
792 for line in irfile:
793 information_inverted_repeat = {}
794 try:
795 patI = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\s\d+')
796 start, end = re.match(patI, line).groups()
797 except AttributeError:
798 continue
799 else:
800 information_inverted_repeat['start'] = start
801 information_inverted_repeat['end'] = end
802 information_IRF[count] = information_inverted_repeat
803 count += 1
804
805 # Creating a new Genbank and GFF file
806 eprint("Creating the output files")
807 newtempgbk = "%s.temp.gbk" % newfile
808 with open(newfile, "rU") as basefile, open(newtempgbk, "w"):
809 for record in SeqIO.parse(basefile, "fasta", IUPAC.ambiguous_dna):
810 whole_sequence = SeqRecord(record.seq)
811 whole_sequence.id = str(record.id)
812 whole_sequence.annotations['data_file_division'] = args.typedata.upper()
813 whole_sequence.annotations['date'] = strftime("%d-%b-%Y").upper()
814 for protein in sorted(protsdict, key = stringSplitByNumbers):
815 putative_start = int(protsdict[protein]['begin'])
816 start_pos = SeqFeature.ExactPosition(putative_start)
817 end_pos = SeqFeature.ExactPosition(protsdict[protein]['end'])
818 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=protsdict[protein]['strand'])
819 qualifiersgene = OrderedDict([('locus_tag', protsdict[protein]['protein_id'])])
820 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = protsdict[protein]['strand'], qualifiers = qualifiersgene)
821 whole_sequence.features.append(new_data_gene)
822 qualifiers = [('locus_tag', protsdict[protein]['protein_id']), ('product', protsdict[protein]['product']), ('protein_id', protsdict[protein]['protein_id']), ('translation', protsdict[protein]['translation'])]
823 feature_qualifiers = OrderedDict(qualifiers)
824 new_data_cds = SeqFeature.SeqFeature(feature_location, type = "CDS", strand = protsdict[protein]['strand'], qualifiers = feature_qualifiers)
825 whole_sequence.features.append(new_data_cds)
826 for rRNA in sorted(subunits, key = stringSplitByNumbers):
827 i = 0
828 try:
829 lengthlist = len(subunits[rRNA]['listdata'])
830 except KeyError:
831 continue
832 else:
833 while i < lengthlist:
834 putative_start = int(subunits[rRNA]['listdata'][i]['begin'])
835 start_pos = SeqFeature.ExactPosition(putative_start)
836 end_pos = SeqFeature.ExactPosition(subunits[rRNA]['listdata'][i]['end'])
837 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=subunits[rRNA]['listdata'][i]['strand'])
838 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = subunits[rRNA]['listdata'][i]['strand'])
839 whole_sequence.features.append(new_data_gene)
840 qualifiers = [('product', subunits[rRNA]['listdata'][i]['product'])]
841 feature_qualifiers = OrderedDict(qualifiers)
842 new_data_rRNA = SeqFeature.SeqFeature(feature_location, type = "rRNA", strand = subunits[rRNA]['listdata'][i]['strand'], qualifiers = feature_qualifiers)
843 whole_sequence.features.append(new_data_rRNA)
844 i += 1
845 for tRNA in sorted(tRNAdict, key = stringSplitByNumbers):
846 putative_start = int(tRNAdict[tRNA]['begin'])
847 start_pos = SeqFeature.ExactPosition(putative_start)
848 end_pos = SeqFeature.ExactPosition(tRNAdict[tRNA]['end'])
849 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tRNAdict[tRNA]['strand'])
850 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tRNAdict[tRNA]['strand'])
851 whole_sequence.features.append(new_data_gene)
852 qualifiers = [('product', tRNAdict[tRNA]['product'])]
853 feature_qualifiers = OrderedDict(qualifiers)
854 new_data_tRNA = SeqFeature.SeqFeature(feature_location, type = "tRNA", strand = tRNAdict[tRNA]['strand'], qualifiers = feature_qualifiers)
855 whole_sequence.features.append(new_data_tRNA)
856 for tmRNA in sorted(tmRNAdict, key = stringSplitByNumbers):
857 putative_start = int(tmRNAdict[tmRNA]['begin'])
858 start_pos = SeqFeature.ExactPosition(putative_start)
859 end_pos = SeqFeature.ExactPosition(tmRNAdict[tmRNA]['end'])
860 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tmRNAdict[tmRNA]['strand'])
861 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tmRNAdict[tmRNA]['strand'])
862 whole_sequence.features.append(new_data_gene)
863 qualifiers = [('product', tmRNAdict[tmRNA]['product'])]
864 feature_qualifiers = OrderedDict(qualifiers)
865 new_data_tmRNA = SeqFeature.SeqFeature(feature_location, type = "tmRNA", strand = tmRNAdict[tmRNA]['strand'], qualifiers = feature_qualifiers)
866 whole_sequence.features.append(new_data_tmRNA)
867 for CRISPR in sorted(information_CRISPR, key = stringSplitByNumbers):
868 putative_start = int(information_CRISPR[CRISPR]['start'])
869 start_pos = SeqFeature.ExactPosition(putative_start)
870 end_pos = SeqFeature.ExactPosition(information_CRISPR[CRISPR]['end'])
871 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos)
872 qualifiers = [('rpt_family', 'CRISPR'), ('rpt_type', 'direct'), ('rpt_unit_range', "%i..%i" % (int(information_CRISPR[CRISPR]['start']), int(information_CRISPR[CRISPR]['repeatend']))), ('rpt_unit_seq', information_CRISPR[CRISPR]['repeatseq'])]
873 feature_qualifiers = OrderedDict(qualifiers)
874 new_data_CRISPRrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers)
875 whole_sequence.features.append(new_data_CRISPRrepeat)
876 for tandem in sorted(information_TRF):
877 putative_start = int(information_TRF[tandem]['start'])
878 start_pos = SeqFeature.ExactPosition(putative_start)
879 end_pos = SeqFeature.ExactPosition(information_TRF[tandem]['end'])
880 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos)
881 qualifiers = [('rpt_type', 'direct')]
882 feature_qualifiers = OrderedDict(qualifiers)
883 new_data_tandemrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers)
884 whole_sequence.features.append(new_data_tandemrepeat)
885 for inverted in sorted(information_IRF):
886 putative_start = int(information_IRF[inverted]['start'])
887 start_pos = SeqFeature.ExactPosition(putative_start)
888 end_pos = SeqFeature.ExactPosition(information_IRF[inverted]['end'])
889 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos)
890 qualifiers = [('rpt_type', 'inverted')]
891 feature_qualifiers = OrderedDict(qualifiers)
892 new_data_invertedrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers)
893 whole_sequence.features.append(new_data_invertedrepeat)
894 SeqIO.write(whole_sequence, newtempgbk, "genbank")
895
896 newgbk = "%s.gbk" % newfile
897 with open(newtempgbk, "rU") as gbktempfile, open(newgbk, "w") as gbkrealfile:
898 newpat = re.compile("D|RNA\s+(CON|PHG|VRL|BCT)")
899 for line in gbktempfile:
900 if line.startswith("LOCUS ") and re.search(newpat, line):
901 if genomeshape['genomeshape'] == "linear":
902 newline = re.sub("bp DNA\s+", "bp DNA linear ", line)
903 else:
904 newline = re.sub("bp DNA\s+", "bp DNA circular ", line)
905 gbkrealfile.write(newline)
906 else:
907 gbkrealfile.write(line)
908
909 for f in glob.glob("*.temp.gbk"):
910 os.remove(f)
911
912 if args.gffprint==True:
913 newgff = "%s.gff" % newfile
914 with open(newgff, "w") as outgff, open(newgbk, "rU") as ingbk:
915 GFF.write(SeqIO.parse(ingbk, "genbank"), outgff)
916
917 # Removing intermediate files
918 os.remove(newfile)
919 os.remove("temp.faa")
920 os.remove("temp_blast.csv")
921 os.remove("crisprfile.txt")
922 os.remove("trnafile.fasta")
923 os.remove("rrnafile.csv")
924 os.remove("trf_temp.dat")
925 os.remove("irf_temp.dat")
926 for f in glob.glob("SEQ*"):
927 os.remove(f)
928
929 # Joining all GENBANK files into one
930 listgbk = sorted(glob.glob('CONTIG_*.gbk'))
931 gbkoutputfile = "%s.gbk" % root_output
932 with open(gbkoutputfile, 'w') as finalgbk:
933 for fname in listgbk:
934 with open(fname) as infile:
935 for line in infile:
936 finalgbk.write(line)
937
938 for tempgbk in glob.glob("CONTIG_*.gbk"):
939 os.remove(tempgbk)
940
941 # Joining all GFF files into one
942 if args.gffprint==True:
943 listgff = sorted(glob.glob('CONTIG_*.gff'))
944 gffoutputfile = "%s.gff" % root_output
945 with open(gffoutputfile, 'w') as finalgff:
946 for fname in listgff:
947 with open(fname) as infile:
948 for line in infile:
949 finalgff.write(line)
950 for tempgff in glob.glob("CONTIG_*.gff"):
951 os.remove(tempgff)
952
953 # Joining all TABLE files into one
954 listcsv = sorted(glob.glob('CONTIG_*.csv'))
955 tbloutputfile = "%s.csv" % root_output
956 with open(tbloutputfile, 'w') as finaltable:
957 for fname in listcsv:
958 with open(fname) as infile:
959 for line in infile:
960 finaltable.write(line)
961
962 for temptbl in glob.glob("CONTIG_*.csv"):
963 os.remove(temptbl)
964
965 # Preparing sequences for GenBank submission (Original code from Wan Yu's gbk2tbl.py script [https://github.com/wanyuac/BINF_toolkit/blob/master/gbk2tbl.py])
966 allowed_qualifiers = ['locus_tag', 'gene', 'product', 'pseudo', 'protein_id', 'gene_desc', 'old_locus_tag', 'note', 'inference', 'organism', 'mol_type', 'strain', 'sub_species', 'isolation-source', 'country']
967 newfastafile = "%s.fasta" % root_output
968 newtablefile = "%s.tbl" % root_output
969 with open(args.modifiers, "rU") as modifiers, open(gbkoutputfile, "r") as genbank_fh, open(newfastafile, "w") as fasta_fh, open(newtablefile, "w") as feature_fh:
970 info = modifiers.readline()
971 wholelist = list(SeqIO.parse(genbank_fh, 'genbank'))
972 for record in wholelist:
973 if len(record) <= args.mincontigsize:
974 eprint("WARNING: Skipping small contig %s" % record.id)
975 continue
976 record.description = "%s %s" % (record.id, info)
977 SeqIO.write([record], fasta_fh, 'fasta')
978 print('>Feature %s' % (record.name), file=feature_fh)
979 for line in record.features:
980 if line.strand == 1:
981 print('%d\t%d\t%s' % (line.location.nofuzzy_start + 1, line.location.nofuzzy_end, line.type), file=feature_fh)
982 else:
983 print('%d\t%d\t%s' % (line.location.nofuzzy_end, line.location.nofuzzy_start + 1, line.type), file=feature_fh)
984 for (key, values) in line.qualifiers.iteritems():
985 if key not in allowed_qualifiers:
986 continue
987 for v in values:
988 print('\t\t\t%s\t%s' % (key, v), file=feature_fh)
989
990 # Final statement
991 eprint("Genome annotation done!")
992 eprint("The GenBank file is %s" % gbkoutputfile)
993 if args.gffprint==True:
994 eprint("The GFF3 file is %s" % gffoutputfile)
995 eprint("The table file for GenBank submission is %s" % tbloutputfile)
996 eprint("The FASTA file for GenBank submission is %s" % newfastafile)
997 eprint("The table file with all protein information is %s" % newtablefile)