annotate VIGA.py @ 5:ff53c4ab7257 draft default tip

Uploaded
author vimalkumarvelayudhan
date Wed, 28 Feb 2018 05:00:51 -0500
parents 231e4c669675
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
1 #!/usr/bin/env python
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
2
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
3 # -*- coding: utf-8 -*-
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
4
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
5 # VIGA - De-novo VIral Genome Annotator
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
6 #
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
7 # Copyright (C) 2017 - Enrique Gonzalez-Tortuero
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
8 # Vimalkumar Velayudhan
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
9 #
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
10 # This program is free software: you can redistribute it and/or modify
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
11 # it under the terms of the GNU General Public License as published by
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
12 # the Free Software Foundation, either version 3 of the License, or
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
13 # (at your option) any later version.
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
14 #
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
15 # This program is distributed in the hope that it will be useful,
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
18 # GNU General Public License for more details.
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
19 #
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
20 # You should have received a copy of the GNU General Public License
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
21 # along with this program. If not, see <http://www.gnu.org/licenses/>.
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
22
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
23 # Importing python libraries
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
24 from __future__ import print_function
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
25 import argparse
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
26 import csv
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
27 import fileinput
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
28 import fractions
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
29 import glob
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
30 import numpy
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
31 import os
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
32 import os.path
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
33 import re
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
34 import sys
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
35 import shutil
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
36 import subprocess
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
37 from BCBio import GFF
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
38 from Bio import SeqIO
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
39 from Bio import SeqFeature
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
40 from Bio.Alphabet import IUPAC
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
41 from Bio.Seq import Seq
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
42 from Bio.SeqFeature import FeatureLocation
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
43 from Bio.SeqRecord import SeqRecord
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
44 from Bio.SeqUtils.ProtParam import ProteinAnalysis
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
45 from collections import OrderedDict, defaultdict
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
46 from itertools import product
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
47 from scipy import signal
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
48 from time import strftime
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
49
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
50 # Preparing functions
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
51 def batch_iterator(iterator, batch_size):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
52 entry = True
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
53 while entry:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
54 batch = []
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
55 while len(batch) < batch_size:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
56 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
57 entry = iterator.next()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
58 except StopIteration:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
59 entry = None
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
60 if entry is None:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
61 break
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
62 batch.append(entry)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
63 if batch:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
64 yield batch
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
65
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
66 def check_peaks(peaks, length):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
67 # Checking if origin/terminus peaks are too close or too far apart. In that case, they are probably wrong
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
68 closest, farthest = int(length * float(0.45)), int(length * float(0.55))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
69 pairs = []
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
70 for pair in list(product(*peaks)):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
71 ### added this to make sure gets origin and ter right
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
72 tr, pk = sorted(list(pair), key = lambda x: x[1], reverse = False) # trough and peak
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
73 a = (tr[0] - pk[0]) % length
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
74 b = (pk[0] - tr[0]) % length
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
75 pt = abs(tr[1] - pk[1]) # distance between values
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
76 if (a <= farthest and a >= closest) or (b <=farthest and b >= closest):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
77 pairs.append([pt, tr, pk])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
78 if len(pairs) == 0:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
79 return [False, False]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
80 pt, tr, pk = sorted(pairs, reverse = True)[0]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
81 return [tr[0], pk[0]]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
82
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
83 def cmd_exists(cmd):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
84 return subprocess.call("type " + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
85
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
86 def GCskew(name, length, seq, window, slide):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
87 replacements = {'G':1, 'C':-1, 'A':0, 'T':0, 'N':0}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
88 gmc = [] # G - C
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
89 for base in seq:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
90 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
91 gmc.append(replacements[base])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
92 except:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
93 gmc.append(0)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
94 # convert to G + C
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
95 gpc = [abs(i) for i in gmc] # G + C
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
96 # calculate sliding windows for (G - C) and (G + C)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
97 weights = numpy.ones(window)/window
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
98 gmc = [[i, c] for i, c in enumerate(signal.fftconvolve(gmc, weights, 'same').tolist())]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
99 gpc = [[i, c] for i, c in enumerate(signal.fftconvolve(gpc, weights, 'same').tolist())]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
100 # calculate gc skew and cummulative gc skew sum
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
101 skew = [[], []] # x and y for gc skew
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
102 c_skew = [[], []] # x and y for gc skew cummulative sums
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
103 cs = 0 # cummulative sum
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
104 # select windows to use based on slide
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
105 for i, m in gmc[0::slide]:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
106 p = gpc[i][1]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
107 if p == 0:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
108 gcs = 0
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
109 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
110 gcs = m/p
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
111 cs += gcs
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
112 skew[0].append(i)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
113 c_skew[0].append(i)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
114 skew[1].append(gcs)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
115 c_skew[1].append(cs)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
116 ori, ter = find_ori_ter(c_skew, length)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
117 return ori, ter, skew, c_skew
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
118
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
119 def eprint(*args, **kwargs):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
120 print(*args, file=sys.stderr, **kwargs)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
121
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
122 def find_ori_ter(c_skew, length):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
123 # Find origin and terminus of replication based on cumulative GC skew min and max peaks
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
124 c_skew_min = signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.less, order = 1)[0].tolist()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
125 c_skew_max = signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.greater, order = 1)[0].tolist()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
126 # return False if no peaks were detected
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
127 if len(c_skew_min) == 0 or len(c_skew_min) == 0:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
128 return [False, False]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
129 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
130 c_skew_min = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_min]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
131 c_skew_max = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_max]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
132 ori, ter = check_peaks([c_skew_min, c_skew_max], length)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
133 return ori, ter
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
134
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
135 def stringSplitByNumbers(x):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
136 r = re.compile('(\d+)')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
137 l = r.split(x)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
138 return [int(y) if y.isdigit() else y for y in l]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
139
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
140 # Defining the program version
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
141 version = "0.10.3"
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
142
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
143 # Processing the parameters
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
144 parser = argparse.ArgumentParser(description='VIGA is an automatic de novo VIral Genome Annotator.')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
145 basic_group = parser.add_argument_group('Basic options for VIGA [REQUIRED]')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
146
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
147 basic_group.add_argument("--input", dest="inputfile", type=str, required=True, help='Input file as a FASTA file', metavar="FASTAFILE")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
148 basic_group.add_argument("--rfamdb", dest="rfamdatabase", type=str, required=True, help='RFAM Database that will be used for the ribosomal RNA prediction. RFAMDB should be in the format "/full/path/to/rfamdb/Rfam.cm" and must be compressed accordingly (see INFERNAL manual) before running the script.', metavar="RFAMDB")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
149 basic_group.add_argument("--modifiers", dest="modifiers", type=str, required=True, help='Input file as a plain text file with the modifiers per every FASTA header according to SeqIn (https://www.ncbi.nlm.nih.gov/Sequin/modifiers.html). All modifiers must be written in a single line and are separated by a single space character. No space should be placed besides the = sign. For example: [organism=Serratia marcescens subsp. marcescens] [sub-species=marcescens] [strain=AH0650_Sm1] [topology=linear] [moltype=DNA] [tech=wgs] [gcode=11] [country=Australia] [isolation-source=sputum]. This line will be copied and printed along with the record name as the definition line of every contig sequence.', metavar="TEXTFILE")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
150
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
151 advanced_group = parser.add_argument_group('Advanced options for VIGA [OPTIONAL]')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
152 advanced_group.add_argument("--readlength", dest="read_length", type=int, default=101, help='Read length for the circularity prediction (default: 101 bp)', metavar="INT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
153 advanced_group.add_argument("--windowsize", dest="window", type=int, default=100, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 100 bp)', metavar="INT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
154 advanced_group.add_argument("--slidingsize", dest="slide", type=int, default=10, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 10 bp)', metavar="INT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
155 advanced_group.add_argument("--out", dest="rootoutput", type=str, help='Name of the outputs files (without extension)', metavar="OUTPUTNAME")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
156 advanced_group.add_argument("--locus", dest="locus", type=str, default='LOC', help='Name of the sequences. If the input is a multiFASTA file, please put a general name as the program will add the number of the contig at the end of the name (Default: %(default)s)', metavar="STRING")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
157 advanced_group.add_argument("--gff", dest="gffprint", action='store_true', default=False, help='Printing the output as GFF3 file (Default: False)')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
158 advanced_group.add_argument("--threads", dest="ncpus", default=1, help='Number of threads/cpus (Default: %(default)s cpu)', metavar="INT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
159 advanced_group.add_argument("--nohmmer", dest="nohmmer", action='store_true', default=False, help='Running only BLAST to predict protein function. (Default: False)')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
160 advanced_group.add_argument("--noblast", dest="noblast", action='store_true', default=False, help='Running DIAMOND instead of BLAST to predict protein function according to homology. This will be less sensitive but faster than BLAST. (Default: False)')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
161 advanced_group.add_argument("--blastdb", dest="blastdatabase", type=str, help='BLAST Database that will be used for the protein function prediction. The database must be an amino acid one, not nucleotidic. This argument is mandatory if --noblast option is disabled', metavar="BLASTDB")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
162 advanced_group.add_argument("--diamonddb", dest="diamonddatabase", type=str, help='DIAMOND Database that will be used for the protein function prediction. The database must be created from a amino acid FASTA file as indicated in https://github.com/bbuchfink/diamond. This argument is mandatory when --noblast option is enabled', metavar="DIAMONDDB")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
163 advanced_group.add_argument("--blastevalue", dest="blastevalue", default=0.00001, help='BLAST/DIAMOND e-value threshold (Default: 0.00001)', metavar="FLOAT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
164 advanced_group.add_argument("--hmmdb", dest="hmmdatabase", type=str, help='PHMMER Database that will be used for the protein function prediction according to Hidden Markov Models. In this case, HMMDB must be in FASTA format (e.g. UniProt: "', metavar="HMMDB")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
165 advanced_group.add_argument("--hmmerevalue", dest="hmmerevalue", default=0.001, help='PHMMER e-value threshold (Default: 0.001)', metavar="FLOAT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
166
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
167 type_choices = {'BCT': 'Prokaryotic chromosome', 'CON': 'Contig', 'PHG': 'Phages', 'VRL': 'Eukaryotic/Archaea virus'}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
168 type_help = ('GenBank Division: One of the following codes - {0}. (Default: %(default)s)'.format(', '.join('{0} ({1})'.format(k, v) for k, v in type_choices.items())))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
169 advanced_group.add_argument("--typedata", dest="typedata", type=str, default='CON', help=type_help, metavar="BCT|CON|VRL|PHG")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
170
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
171 gcode_choices = {'1': 'Standard genetic code [Eukaryotic]', '2': 'Vertebrate mitochondrial code', '3': 'Yeast mitochondrial code', '4': 'Mycoplasma/Spiroplasma and Protozoan/mold/coelenterate mitochondrial code', '5': 'Invertebrate mitochondrial code', '6': 'Ciliate, dasycladacean and hexamita nuclear code', '9': 'Echinoderm/flatworm mitochondrial code', '10': 'Euplotid nuclear code', '11': 'Bacteria/Archaea/Phages/Plant plastid', '12': 'Alternative yeast nuclear code', '13': 'Ascidian mitochondrial code', '14': 'Alternative flatworm mitochondrial code', '16': 'Chlorophycean mitochondrial code', '21': 'Trematode mitochondrial code', '22': 'Scedenesmus obliquus mitochondrial code', '23': 'Thraustochytrium mitochondrial code', '24': 'Pterobranquia mitochondrial code', '25': 'Gracilibacteria & Candidate division SR1', '26': 'Pachysolen tannophilus nuclear code', '27': 'Karyorelict nuclear code', '28': 'Condylostoma nuclear code', '29': 'Mesodinium nuclear code', '30': 'Peritrich nuclear code', '31': 'Blastocrithidia nuclear code'}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
172 gcode_help = ('Number of GenBank translation table. At this moment, the available options are {0}. (Default: %(default)s)'.format('{}'.format(', '.join('{0} ({1})'.format(k, v) for k, v in gcode_choices.items()))))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
173 advanced_group.add_argument("--gcode", dest="gcode", type=str, default='11', help=gcode_help, metavar="NUMBER")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
174
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
175 advanced_group.add_argument('--mincontigsize', dest="mincontigsize", type=int, default = 200, help = 'Minimum contig length to be considered in the final files (Default: 200 bp)', metavar="INT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
176 advanced_group.add_argument("--idthr", dest="idthreshold", default=50.00, help='ID threshold (Default: 50.0)', metavar="FLOAT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
177 advanced_group.add_argument("--coverthr", dest="covthreshold", default=50.00, help='Coverage threshold (Default: 50.0)', metavar="FLOAT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
178 advanced_group.add_argument("--diffid", dest="diffid", default=5.00, help='Max allowed difference between the ID percentages of BLAST and HMMER. (Default = 5.00; Not recommended to change such value)', metavar="FLOAT (>0.01)")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
179 advanced_group.add_argument("--minrepeat", dest="minrepeat", type=int, default=16, help="Minimum repeat length for CRISPR detection (Default: 16)", metavar="INT")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
180 advanced_group.add_argument("--maxrepeat", dest="maxrepeat", type=int, default=64, help="Maximum repeat length for CRISPR detection (Default: 64)")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
181 advanced_group.add_argument("--minspacer", dest="minspacer", type=int, default=8, help="Minimum spacer length for CRISPR detection (Default: 8)")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
182 advanced_group.add_argument("--maxspacer", dest="maxspacer", type=int, default=64, help="Maximum spacer length for CRISPR detection (Default: 64)")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
183 advanced_group.add_argument("--blastexh", dest="blastexh", action='store_true', default=False, help='Use of exhaustive BLAST to predict the proteins by homology according to Fozo et al. (2010) Nucleic Acids Res (Default=FALSE)')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
184
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
185 args = parser.parse_args()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
186
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
187 root_output = args.rootoutput
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
188 if not root_output:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
189 root_output = '{}_annotated'.format(os.path.splitext(args.inputfile)[0])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
190
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
191 if args.noblast == False and args.blastdatabase == None:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
192 sys.exit('You MUST specify BLAST database using the parameter --blastdb if you are not using --noblast option')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
193
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
194 if args.noblast == True and args.diamonddatabase == None:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
195 sys.exit('You MUST specify DIAMOND database using the parameter --diamonddb if you are using --noblast option')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
196
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
197 if args.nohmmer == False and args.hmmdatabase == None:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
198 sys.exit('You MUST specify HMMER database using the parameter --hmmdb if you are not using --nohmmer option')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
199
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
200 # Printing the header of the program
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
201 eprint("This is VIGA %s" % str(version))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
202 eprint("Written by Enrique Gonzalez Tortuero & Vimalkumar Velayudhan")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
203 eprint("Homepage is https://github.com/EGTortuero/virannot")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
204 eprint("Local time: ", strftime("%a, %d %b %Y %H:%M:%S"))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
205 eprint("\n\n")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
206
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
207 # checking the presence of the programs in the system
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
208
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
209 if not cmd_exists("lastz")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
210 sys.exit("You must install LASTZ before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
211 elif not cmd_exists("cmscan")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
212 sys.exit("You must install INFERNAL before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
213 elif not cmd_exists("prodigal")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
214 sys.exit("You must install prodigal before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
215 elif not cmd_exists("parallel")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
216 sys.exit("You must install GNU Parallel before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
217 elif not cmd_exists("blastp")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
218 sys.exit("You must install BLAST before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
219 elif not cmd_exists("diamond")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
220 sys.exit("You must install DIAMOND before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
221 elif not cmd_exists("phmmer")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
222 sys.exit("You must install HMMER 3 before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
223 elif not cmd_exists("aragorn")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
224 sys.exit("You must install ARAGORN before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
225 elif not cmd_exists("pilercr")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
226 sys.exit("You must install PILERCR before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
227 elif not cmd_exists("trf")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
228 sys.exit("You must install Tandem Repeats Finder before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
229 elif not cmd_exists("irf")==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
230 sys.exit("You must install Inverted Repeats Finder before using this script")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
231
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
232 eprint("Data type is {0} and GenBank translation table no is {1}\n".format(args.typedata, args.gcode))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
233
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
234 # Correcting the original file (long headers problem + multiple FASTA files)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
235 record_iter = SeqIO.parse(open(args.inputfile, "rU"),"fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
236 counter = 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
237 newnamessequences = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
238 for i, batch in enumerate(batch_iterator(record_iter, 1)):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
239 seq_index = "CONTIG_%i" % (i + 1)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
240 filename = "%s.temp.fna" % seq_index
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
241 newfilename = "%s.fna" % seq_index
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
242 with open(filename, "w") as handle:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
243 count = SeqIO.write(batch, filename, "fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
244
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
245 with open(filename, "rU") as original, open(newfilename, "w") as corrected:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
246 sequences = SeqIO.parse(original, "fasta", IUPAC.ambiguous_dna)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
247 for record in sequences:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
248 original_name = record.id
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
249 record.id = "%s_%i" % (args.locus, counter)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
250 record.description = record.description
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
251 counter += 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
252 newnamessequences[record.id] = original_name
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
253 eprint("WARNING: The name of the sequence %s was corrected as %s" % (original_name, record.id))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
254 SeqIO.write(record, corrected, "fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
255
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
256 with open("logfile.txt", "w") as logfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
257 logfile.write("#Original\tNew\n")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
258 for oldname in sorted(newnamessequences, key = stringSplitByNumbers):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
259 logfile.write("%s\t%s\n" % (oldname, newnamessequences[oldname]))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
260 os.remove(filename)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
261
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
262 for newfile in sorted(glob.glob("CONTIG_*.fna")):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
263
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
264 # Predicting the shape of the contig (code based on Alex Crits-Christoph's find_circular.py script [https://github.com/alexcritschristoph/VICA/blob/master/find_circular.py])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
265 eprint("Predicting the shape of the contig using LASTZ")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
266 genomeshape = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
267 with open(newfile, "rU") as targetfasta:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
268 Sequence = SeqIO.parse(newfile, "fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
269 for record in Sequence:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
270 seq_beginning = str(record.seq[0:args.read_length])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
271 seq_ending = str(record.seq[len(record.seq)-args.read_length:len(record.seq)])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
272 combined_seqs = SeqRecord(Seq(seq_beginning + seq_ending, IUPAC.ambiguous_dna), id = record.description)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
273 SeqIO.write(combined_seqs, "temporal_circular.fasta", "fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
274 outputlastz = subprocess.check_output(["lastz", "temporal_circular.fasta", "--self", "--notrivial", "--nomirror", "--format=general-:start1,end1,start2,end2,score,strand1,strand2,identity,length1"])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
275 resultslastz = outputlastz.split("\n")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
276 for resultlastz in resultslastz:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
277 if resultlastz != '':
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
278 start1 = resultlastz.split()[0]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
279 end1 = resultlastz.split()[1]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
280 start2 = resultlastz.split()[2]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
281 end2 = resultlastz.split()[3]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
282 strand1 = resultlastz.split()[5]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
283 strand2 = resultlastz.split()[6]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
284 identity = resultlastz.split()[7]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
285 length = int(resultlastz.split()[9])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
286 if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
287 genomeshape['genomeshape'] = "circular"
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
288 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
289 if genomeshape['identity'] >= float(fractions.Fraction(identity)):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
290 genomeshape['identity'] = float(fractions.Fraction(identity))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
291 genomeshape['length'] = length
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
292 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
293 genomeshape['identity'] = float(fractions.Fraction(identity))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
294 genomeshape['length'] = length
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
295 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
296 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
297 if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
298 genomeshape['genomeshape'] = "circular"
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
299 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
300 if genomeshape['identity'] >= float(fractions.Fraction(identity)):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
301 genomeshape['identity'] = float(fractions.Fraction(identity))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
302 genomeshape['length'] = length
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
303 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
304 genomeshape['identity'] = float(fractions.Fraction(identity))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
305 genomeshape['length'] = length
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
306 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
307 if genomeshape['genomeshape'] == "":
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
308 genomeshape['genomeshape'] = "linear"
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
309 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
310 genomeshape['genomeshape'] = "linear"
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
311 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
312 genomeshape['genomeshape'] = "circular"
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
313 with open("temp.fasta", "w") as correctedcircular:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
314 Corrseq = str(record.seq[int(genomeshape['length'])//2:-int(genomeshape['length'])//2])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
315 Newseq = SeqRecord(Seq(Corrseq, IUPAC.ambiguous_dna), id = record.description)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
316 SeqIO.write(Newseq, correctedcircular, "fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
317 os.rename("temp.fasta", "temp2.fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
318 eprint("LASTZ predicted that %s is %s\n" % (newfile, genomeshape['genomeshape']))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
319 os.remove("temporal_circular.fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
320
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
321 # Calculate the cumulative GCskew in circular contigs to determine the origin of replication (Based on iRep -- Brown CT, Olm MR, Thomas BC, Banfield JF (2016) Measurement of bacterial replication rates in microbial communities. Nature Biotechnology 34: 1256-63.)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
322 if genomeshape['genomeshape'] == "circular":
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
323 for record in SeqIO.parse("temp2.fasta", "fasta"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
324 length_contig = len(record.seq)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
325 #if length < min_len:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
326 # print('%s: Too Short' % (name), file=sys.stderr)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
327 # continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
328 oric, term, gcskew, cgcskew = GCskew(record.id, length_contig, record.seq, args.window, args.slide)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
329 valoric = oric
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
330 if oric == False:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
331 oric, term = 'n/a', 'n/a'
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
332 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
333 oric, term = '{:,}'.format(oric), '{:,}'.format(term)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
334 eprint('%s -> Origin: %s Terminus: %s' % (record.id, oric, term))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
335 #print('\t'.join(['# Name', 'Position', 'GC Skew', 'Cumulative GC Skew']))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
336 #for i, pos in enumerate(gcskew[0]):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
337 # out = [record.id, pos, gcskew[1][i], cgcskew[1][i]]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
338 # print('\t'.join([str(i) for i in out]))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
339 if valoric != False:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
340 firstpartseq = str(record.seq[valoric:-1])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
341 secondpartseq = str(record.seq[0:(valoric-1)])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
342 combinedcorrectedseq = SeqRecord(Seq(firstpartseq + secondpartseq, IUPAC.ambiguous_dna), id = record.description)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
343 SeqIO.write(combinedcorrectedseq, newfile, "fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
344 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
345 eprint("As the program was unable to predict the origin of replication, %s was considered as is without correcting!" % record.id)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
346 os.rename("temp2.fasta", newfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
347 if os.path.isfile("temp2.fasta"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
348 os.remove("temp2.fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
349
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
350 # Predicting genes using PRODIGAL
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
351 eprint("\nRunning Prodigal to predict the genes in %s" % newfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
352 for record in SeqIO.parse(newfile, "fasta"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
353 length_contig = len(record.seq)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
354 if (length_contig >= 100000):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
355 if genomeshape['genomeshape'] == 'linear':
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
356 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-c", "-q"])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
357 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
358 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-q"])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
359 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
360 if genomeshape['genomeshape'] == 'linear':
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
361 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-c", "-q"])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
362 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
363 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-q"])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
364 num_seqs = len(list(SeqIO.parse("pretemp.faa", "fasta")))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
365 eprint("PRODIGAL was able to predict %i genes in %s\n" % (num_seqs, newfile))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
366
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
367 with open("pretemp.faa", "rU") as originalfaa, open("temp.faa", "w") as correctedfaa:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
368 sequences = SeqIO.parse(originalfaa, "fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
369 for record in sequences:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
370 record.seq = record.seq.rstrip("*")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
371 SeqIO.write(record, correctedfaa, "fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
372
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
373 faa_file = "%s.faa" % newfile # TO DEBUG
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
374 shutil.copyfile("temp.faa", faa_file) # TO DEBUG
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
375 os.remove("pretemp.faa")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
376
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
377 # Predicting the function of the proteins based on homology using BLAST
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
378 equivalence = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
379 record_iter = SeqIO.parse(open("temp.faa"),"fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
380 for i, batch in enumerate(batch_iterator(record_iter, 1)):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
381 seq_index = "SEQ_%i" % (i + 1)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
382 filename = "%s.faa" % seq_index
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
383 with open(filename, "w") as handle:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
384 count = SeqIO.write(batch, handle, "fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
385 equivalence[seq_index] = batch[0].id
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
386
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
387 if args.blastexh==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
388 eprint("Running BLAST to predict the genes according to homology inference in %s using exhaustive mode (see Fozo et al. (2010) Nucleic Acids Res for details)" % newfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
389 subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', '-word_size', '2', '-gapopen', '8', '-gapextend', '2', '-matrix', '"PAM70"', '-comp_based_stats', '"0"', "-num_threads", str(args.ncpus)])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
390 eprint("Done. BLAST was done to predict the genes by homology\n")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
391 blast_log = "%s.blast.log" % newfile # TO DEBUG
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
392 shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
393 elif args.noblast==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
394 eprint("Running DIAMOND to predict the genes according to homology inference in %s using default parameters" % newfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
395 with open("temp.faa", "r") as tempfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
396 first_line = tempfile.readline()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
397 if first_line.startswith(">"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
398 subprocess.call(['diamond', 'blastp', '-q', 'temp.faa', '-d', args.diamonddatabase, '-e', str(args.blastevalue), '-f', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qlen', 'slen', 'qstart', 'qend', 'evalue', 'bitscore', 'stitle', '-o', 'temp_blast.csv', '-k', '10', "-p", str(args.ncpus), '--quiet'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
399 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
400 open("temp_blast.csv", 'a').close()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
401 blast_log = "%s.blast.log" % newfile # TO DEBUG
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
402 shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
403 eprint("Done. DIAMOND was done to predict the genes by homology\n")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
404 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
405 eprint("Running BLAST to predict the genes according to homology inference in %s using default parameters" % newfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
406 subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', "-num_threads", str(args.ncpus)])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
407 blast_log = "%s.blast.log" % newfile
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
408 shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
409 eprint("Done. BLAST was done to predict the genes by homology\n") # TO DEBUG
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
410
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
411 # Parsing the results from BLAST
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
412 with open("temp_blast.csv", "rU") as blastresults:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
413 hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
414 reader = csv.DictReader(blastresults, delimiter='\t', fieldnames=['qseqid','sseqid','pident','length','qlen','slen','qstart','qend','evalue','bitscore','stitle'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
415 information_proteins_blast = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
416 for row in reader:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
417 perc_cover = round(100.00*(float(row['length'])/float(row['qlen'])),2)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
418 perc_id = float(row['pident'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
419 infoprot_blast = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
420 infoprot_blast['sseqid'] = row['sseqid']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
421 infoprot_blast['pident'] = perc_id
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
422 infoprot_blast['pcover'] = perc_cover
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
423 infoprot_blast['evalue'] = row['evalue']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
424 infoprot_blast['descr'] = row['stitle']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
425 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
426 data = information_proteins_blast[row['qseqid']]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
427 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
428 if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_cover) >= float(args.covthreshold) and float(row['evalue']) <= float(args.blastevalue):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
429 information_proteins_blast[row['qseqid']] = infoprot_blast
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
430 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
431 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
432 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
433 if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_id) >= float(infoprot_blast['pident']) and float(perc_cover) >= float(args.covthreshold) and float(perc_cover) >= float(infoprot_blast['pcover']) and float(row['evalue']) <= float(args.blastevalue):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
434 information_proteins_blast[row['qseqid']] = infoprot_blast
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
435
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
436 ## Predicting the function of the proteins based on HMM predictions using phmmer
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
437 if args.nohmmer == False:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
438 with open("commands.sh", "w") as commands:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
439 for singleprot in sorted(glob.glob("SEQ_*.faa")):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
440 hhmtable = "%s.tbl" % singleprot
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
441 eprint("Creating file to run parallel PHMMER")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
442 eprint("Adding %s to run PHMMER." % singleprot)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
443 line2write = ' '.join(["phmmer", "--cpu", "1", "--domtblout", hhmtable, "-E", str(args.hmmerevalue), "-o", "/dev/null", singleprot, args.hmmdatabase, '\n'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
444 commands.write(line2write)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
445
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
446 eprint("Running parallel PHMMER")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
447 subprocess.call(['parallel', '-j', str(args.ncpus)], stdin=open('commands.sh'))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
448 eprint("Done. PHMMER was done to predict the function of the genes according to Hidden Markov Models\n")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
449 os.remove("commands.sh")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
450
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
451 # Parsing the results from HMMER
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
452 information_proteins_hmmer = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
453 hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
454 for singletbl in sorted(glob.glob("*.faa.tbl")):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
455 rootname = singletbl.replace(".faa.tbl", "")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
456 with open(singletbl) as tblfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
457 infoprot_hmmer = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
458 for line in tblfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
459 if line.startswith("#"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
460 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
461 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
462 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
463 pat = re.compile("^(\S+)\s+\S\s+\d+\s+(\S+)\s+\S\s+(\d+)\s+((?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?)\s+\S+\s+\S+\s+\S+\s+\S+\s+(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s+\S+\s+\S+\s+\S+\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\S+\s+\S+\s+(\S+)\s+(.*)")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
464 matchname, lociname, length, evaluehh, start, end, pident, description = re.match(pat, line).groups()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
465 except AttributeError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
466 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
467 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
468 length = float(length)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
469 pident = 100.00*float(pident)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
470 protarea = 100.00*(((float(end)-1.00) - (float(start)-1.00))/length)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
471 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
472 data2 = infoprot_hmmer['lociname']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
473 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
474 if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
475 infoprot_hmmer['lociname'] = lociname
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
476 infoprot_hmmer['name'] = matchname
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
477 infoprot_hmmer['evalue'] = float(evaluehh)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
478 infoprot_hmmer['pcover'] = float(protarea)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
479 infoprot_hmmer['pident'] = float(pident)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
480 infoprot_hmmer['descr'] = description
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
481 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
482 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
483 if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
484 infoprot_hmmer['lociname'] = lociname
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
485 infoprot_hmmer['name'] = matchname
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
486 infoprot_hmmer['evalue'] = float(evaluehh)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
487 infoprot_hmmer['pcover'] = float(protarea)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
488 infoprot_hmmer['pident'] = float(pident)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
489 infoprot_hmmer['descr'] = description
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
490 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
491 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
492 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
493 if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
494 infoprot_hmmer['lociname'] = lociname
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
495 infoprot_hmmer['name'] = matchname
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
496 infoprot_hmmer['evalue'] = float(evaluehh)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
497 infoprot_hmmer['pcover'] = float(protarea)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
498 infoprot_hmmer['pident'] = float(pident)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
499 infoprot_hmmer['descr'] = description
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
500 information_proteins_hmmer[rootname] = infoprot_hmmer
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
501
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
502 #Storing protein information in memory
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
503 with open("temp.faa", "rU") as protsfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
504 tempprotsdict = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
505 for protseq in SeqIO.parse(protsfile, "fasta"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
506 tempindprot = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
507 dataprot = protseq.description.split(' # ')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
508 modseq = str(protseq.seq).replace("X","") # Removing all ambiguous amino acids to avoid problems with ProteinAnalysis module
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
509 analysed_seq = ProteinAnalysis(modseq)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
510 tempindprot['length'] = len(protseq.seq)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
511 tempindprot['isoelectricpoint'] = analysed_seq.isoelectric_point()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
512 tempindprot['molweightkda'] = analysed_seq.molecular_weight()/1000.00
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
513 tempindprot['instability'] = analysed_seq.instability_index()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
514 tempindprot['protein_id'] = dataprot[0]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
515 tempindprot['strand'] = int(dataprot[3])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
516 tempindprot['begin'] = int(dataprot[1])-1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
517 tempindprot['end'] = int(dataprot[2])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
518 tempprotsdict[dataprot[0]] = tempindprot
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
519
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
520 # Creation of table
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
521 debugtable = "%s.csv" % newfile
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
522 with open(debugtable, "w") as tablefile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
523 if args.nohmmer == False:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
524 print("\t".join(["Identifier", "Start", "Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST", "ID_HMMER", "Descr_HMMER", "evalue_HMMER", "%ID_HMMER", "%Cover_HMMER"]), file=tablefile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
525 keylist = information_proteins_hmmer.keys()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
526 keylist.sort()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
527 for keyB in keylist:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
528 keyB = keyB.replace(".faa.tbl", "")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
529 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
530 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), str(information_proteins_blast[equivalence[keyB]]['pcover']), information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
531 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
532 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
533 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), str(information_proteins_blast[equivalence[keyB]]['pcover']), "None", "None", "NA", "NA", "NA"]), file=tablefile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
534 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
535 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
536 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA", information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
537 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
538 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA", "None", "None", "NA", "NA", "NA"]), file=tablefile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
539 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
540 print("\t".join(["Identifier", "Start", "Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST"]), file=tablefile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
541 keylist = equivalence.values()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
542 keylist.sort()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
543 for keyB in keylist:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
544 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
545 print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), information_proteins_blast[keyB]['sseqid'], information_proteins_blast[keyB]['descr'], str(information_proteins_blast[keyB]['evalue']), str(information_proteins_blast[keyB]['pident']), str(information_proteins_blast[keyB]['pcover'])]), file=tablefile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
546 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
547 print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), "None", "None", "NA", "NA", "NA"]), file=tablefile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
548
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
549 # Algorithm of decisions (which one: BLAST/HMMER?)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
550 multipleprots = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
551 Hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
552 if args.nohmmer == False:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
553 keylist = information_proteins_hmmer.keys()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
554 keylist.sort()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
555 for keyB in keylist:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
556 keyB = keyB.replace(".faa.tbl", "")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
557 singleprot = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
558 singleprot['name'] = equivalence[keyB]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
559 if (equivalence[keyB] in information_proteins_blast) and (keyB in information_proteins_hmmer):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
560 if re.search(Hypotheticalpat, information_proteins_blast[equivalence[keyB]]['descr']):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
561 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
562 if re.search(Hypotheticalpat, information_proteins_hmmer[keyB]['descr']):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
563 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
564 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
565 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
566 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
567 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
568 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
569 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
570 if (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
571 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
572 elif (float(information_proteins_blast[equivalence[keyB]]['pident'])<float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
573 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
574 elif (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
575 if (float(information_proteins_blast[equivalence[keyB]]['pident'])-float(information_proteins_hmmer[keyB]['pident']) >= args.diffid):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
576 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
577 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
578 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
579 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
580 if (float(information_proteins_hmmer[keyB]['pident'])-float(information_proteins_blast[equivalence[keyB]]['pident']) >= args.diffid):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
581 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
582 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
583 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
584 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
585 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
586 if (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
587 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
588 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
589 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
590 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
591 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
592 elif equivalence[keyB] in information_proteins_blast:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
593 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
594 elif keyB in information_proteins_hmmer:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
595 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
596 singleprot['descr'] = information_proteins_hmmer[keyB]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
597 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
598 singleprot['descr'] = 'Hypothetical protein'
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
599 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
600 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
601 multipleprots[keyB] = singleprot
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
602 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
603 keylist = equivalence.values()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
604 keylist.sort()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
605 for keyB in keylist:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
606 singleprot = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
607 singleprot['name'] = keyB
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
608 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
609 if information_proteins_blast[keyB]['descr'] == None:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
610 singleprot['descr'] = 'Hypothetical protein'
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
611 elif re.search(Hypotheticalpat, information_proteins_blast[keyB]['descr']):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
612 singleprot['descr'] = 'Conserved hypothetical protein'
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
613 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
614 singleprot['descr'] = information_proteins_blast[keyB]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
615 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
616 singleprot['descr'] = 'Hypothetical protein'
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
617 multipleprots[keyB] = singleprot
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
618
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
619 #Storing protein information in memory
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
620 with open("temp.faa", "rU") as protsfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
621 protsdict = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
622 for protseq in SeqIO.parse(protsfile, "fasta"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
623 indprot = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
624 dataprot = protseq.description.split(' # ')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
625 indprot['translation'] = protseq.seq
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
626 indprot['protein_id'] = dataprot[0]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
627 indprot['strand'] = int(dataprot[3])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
628 indprot['begin'] = int(dataprot[1])-1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
629 indprot['end'] = int(dataprot[2])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
630 for keyOmega in sorted(multipleprots):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
631 if multipleprots[keyOmega]['name'] == dataprot[0]:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
632 indprot['product'] = multipleprots[keyOmega]['descr']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
633 protsdict[dataprot[0]] = indprot
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
634
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
635 # Predicting the rRNA sequences
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
636 with open(newfile, "rU") as targetfasta, open("/dev/null", "w") as apocalypse:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
637 eprint("Running INFERNAL+RFAM to predict rRNA-like sequences in %s" % newfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
638 subprocess.call(["cmscan", "--rfam", "--cut_ga", "--nohmmonly", "--tblout", "rrnafile.csv", "--cpu", args.ncpus, args.rfamdatabase, newfile], stdout=apocalypse)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
639
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
640 #Storing rRNA information in memory
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
641 with open("rrnafile.csv", "rU") as rrnafile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
642 namedict = {"SSU_rRNA_archaea": "16s_rRNA", "SSU_rRNA_bacteria": "16s_rRNA", "SSU_rRNA_eukarya": "18s_rRNA", "SSU_rRNA_microsporidia": "16s_rRNA", "LSU_rRNA_archaea": "23s_rRNA", "LSU_rRNA_bacteria": "23s_rRNA", "LSU_rRNA_eukarya": "28s_rRNA", "5S_rRNA": "5s_rRNA"}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
643 rRNAdict = defaultdict(list)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
644 for line in rrnafile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
645 if not line.startswith("#"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
646 InfoLINE = re.sub("\s+", ",", line)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
647 line_splitted = InfoLINE.split(",")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
648 item_type = line_splitted[0]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
649 if item_type.startswith(('LSU', 'SSU', '5S')):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
650 strand = 0
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
651 if line_splitted[9] == "+":
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
652 strand = 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
653 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
654 strand = -1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
655 rRNAdict[item_type].append({'score': float(line_splitted[14]), 'product': namedict[line_splitted[0]], 'begin': int(line_splitted[7]), 'end': int(line_splitted[8]), 'strand': strand})
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
656
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
657 subunits = {'LSU': {'max_score': 0 }, 'SSU': {'max_score': 0 }, '5S': {'max_score': 0 }}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
658 for type_rRNA, rRNA_data in rRNAdict.items():
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
659 max_score = max([item['score'] for item in rRNAdict[type_rRNA]])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
660 for item in ('LSU', 'SSU'):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
661 if type_rRNA.startswith(item):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
662 if max_score > subunits[item]['max_score']:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
663 subunits[item]['listdata'] = rRNA_data
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
664 subunits[item]['max_score'] = max_score
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
665 if type_rRNA.startswith('5S'):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
666 subunits['5S']['listdata'] = rRNA_data
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
667 subunits['5S']['max_score'] = max_score
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
668
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
669 for rRNA in subunits:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
670 i = 0
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
671 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
672 lengthlist = len(subunits[rRNA]['listdata'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
673 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
674 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
675 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
676 while i < lengthlist:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
677 eprint("%s harbours a %s from %i to %i" % (newfile, subunits[rRNA]['listdata'][i]['product'], int(subunits[rRNA]['listdata'][i]['begin']), int(subunits[rRNA]['listdata'][i]['end'])))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
678 i += 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
679
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
680 # Predicting the tRNA sequences
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
681 eprint("Running ARAGORN to predict tRNA-like sequences in %s" % newfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
682 genetictable = "-gc%s" % str(args.gcode)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
683 with open("trnafile.fasta", "w") as trnafile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
684 if genomeshape['genomeshape'] == "circular":
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
685 subprocess.call(["aragorn", "-c", "-fon", genetictable, newfile], stdout=trnafile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
686 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
687 subprocess.call(["aragorn", "-l", "-fon", genetictable, newfile], stdout=trnafile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
688 num_tRNA = len(list(SeqIO.parse("trnafile.fasta", "fasta")))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
689 eprint("ARAGORN was able to predict %i tRNAs in %s\n" % (num_tRNA, newfile))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
690
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
691 #Storing tRNA and tmRNA information in memory
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
692 with open("trnafile.fasta", "rU") as trnafile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
693 tRNAdict = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
694 tmRNAdict = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
695 for tRNAseq in SeqIO.parse(trnafile, "fasta"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
696 indtRNA = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
697 indtmRNA = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
698 tRNA_information = tRNAseq.description.split(" ")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
699 tRNApat = re.compile("^tRNA-")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
700 if tRNA_information[1] == "tmRNA":
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
701 if str(tRNA_information[2]) == "(Permuted)":
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
702 indtmRNA['product'] = "tmRNA"
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
703 tmRNA_coords = str(tRNA_information[3])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
704 Beginningrevcomppat = re.compile("^c")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
705 if re.match(Beginningrevcomppat, tmRNA_coords):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
706 indtmRNA['strand'] = -1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
707 tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
708 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
709 indtmRNA['strand'] = 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
710 tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
711 indtmRNA['begin'] = int(tmRNA_coords[0])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
712 indtmRNA['end'] = int(tmRNA_coords[1])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
713 tmRNAdict[tRNAseq.id] = indtmRNA
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
714 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
715 indtmRNA['product'] = "tmRNA"
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
716 tmRNA_coords = str(tRNA_information[2])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
717 Beginningrevcomppat = re.compile("^c")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
718 if re.match(Beginningrevcomppat, tmRNA_coords):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
719 indtmRNA['strand'] = -1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
720 tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
721 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
722 indtmRNA['strand'] = 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
723 tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
724 indtmRNA['begin'] = int(tmRNA_coords[0])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
725 indtmRNA['end'] = int(tmRNA_coords[1])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
726 tmRNAdict[tRNAseq.id] = indtmRNA
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
727 elif re.match(tRNApat, tRNA_information[1]):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
728 indtRNA['product'] = re.sub("\(\w{3}\)", "", tRNA_information[1])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
729 tRNA_coords = tRNA_information[2]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
730 Beginningrevcomppat = re.compile("^c")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
731 if re.match(Beginningrevcomppat, tRNA_coords):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
732 indtRNA['strand'] = -1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
733 tRNA_coords = tRNA_coords.replace("c[","").replace("]","").split(",")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
734 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
735 indtRNA['strand'] = 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
736 tRNA_coords = tRNA_coords.replace("[","").replace("]","").split(",")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
737 indtRNA['begin'] = int(tRNA_coords[0])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
738 indtRNA['end'] = int(tRNA_coords[1])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
739 tRNAdict[tRNAseq.id] = indtRNA
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
740
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
741 #Predicting CRISPR repeats and others
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
742 eprint("Running PILERCR to predict CRISPR repeats in %s" % newfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
743 subprocess.call(["pilercr", "-in", newfile, "-out", "crisprfile.txt", "-noinfo", "-minrepeat", str(args.minrepeat), "-maxrepeat", str(args.maxrepeat), "-minspacer", str(args.minspacer), "-maxspacer", str(args.maxspacer)])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
744 eprint("Predicting repeats in the sequences using TRF and IRF")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
745 with open("/dev/null", "w") as stderr:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
746 subprocess.call(["trf", newfile, "2", "7", "7", "80", "10", "50", "500", "-h"], stderr=stderr)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
747 os.rename("%s.2.7.7.80.10.50.500.dat" % newfile, "trf_temp.dat")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
748 with open("/dev/null", "w") as stderr:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
749 subprocess.call(["irf", newfile, "2", "3", "5", "80", "10", "40", "500000", "10000", "-d", "-h"], stderr=stderr)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
750 os.rename("%s.2.3.5.80.10.40.500000.10000.dat" % newfile, "irf_temp.dat")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
751
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
752 # Storing CRISPR repeats information
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
753 information_CRISPR = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
754 with open("crisprfile.txt", "rU") as crisprfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
755 for line in crisprfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
756 if "SUMMARY BY POSITION" in line:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
757 for line in crisprfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
758 information_crispr_repeat = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
759 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
760 patC = re.compile('^\s+(\d+)\s+.{16}\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d?\s+(\w+)')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
761 key, start, length, seq = re.match(patC, line).groups()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
762 except AttributeError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
763 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
764 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
765 information_crispr_repeat['start'] = start
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
766 information_crispr_repeat['end'] = int(start) + int(length)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
767 information_crispr_repeat['repeatseq'] = seq
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
768 information_crispr_repeat['repeatend'] = int(start) + len(seq)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
769 information_CRISPR[key] = information_crispr_repeat
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
770
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
771 # Storing tandem repeats information
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
772 information_TRF = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
773 count = 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
774 with open("trf_temp.dat", "rU") as trfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
775 for line in trfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
776 information_tandem_repeat = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
777 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
778 patT = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\.\d+\s')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
779 start, end = re.match(patT, line).groups()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
780 except AttributeError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
781 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
782 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
783 information_tandem_repeat['start'] = start
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
784 information_tandem_repeat['end'] = end
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
785 information_TRF[count] = information_tandem_repeat
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
786 count += 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
787
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
788 # Storing inverted repeats information
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
789 information_IRF = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
790 count = 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
791 with open("irf_temp.dat", "rU") as irfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
792 for line in irfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
793 information_inverted_repeat = {}
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
794 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
795 patI = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\s\d+')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
796 start, end = re.match(patI, line).groups()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
797 except AttributeError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
798 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
799 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
800 information_inverted_repeat['start'] = start
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
801 information_inverted_repeat['end'] = end
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
802 information_IRF[count] = information_inverted_repeat
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
803 count += 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
804
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
805 # Creating a new Genbank and GFF file
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
806 eprint("Creating the output files")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
807 newtempgbk = "%s.temp.gbk" % newfile
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
808 with open(newfile, "rU") as basefile, open(newtempgbk, "w"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
809 for record in SeqIO.parse(basefile, "fasta", IUPAC.ambiguous_dna):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
810 whole_sequence = SeqRecord(record.seq)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
811 whole_sequence.id = str(record.id)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
812 whole_sequence.annotations['data_file_division'] = args.typedata.upper()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
813 whole_sequence.annotations['date'] = strftime("%d-%b-%Y").upper()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
814 for protein in sorted(protsdict, key = stringSplitByNumbers):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
815 putative_start = int(protsdict[protein]['begin'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
816 start_pos = SeqFeature.ExactPosition(putative_start)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
817 end_pos = SeqFeature.ExactPosition(protsdict[protein]['end'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
818 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=protsdict[protein]['strand'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
819 qualifiersgene = OrderedDict([('locus_tag', protsdict[protein]['protein_id'])])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
820 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = protsdict[protein]['strand'], qualifiers = qualifiersgene)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
821 whole_sequence.features.append(new_data_gene)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
822 qualifiers = [('locus_tag', protsdict[protein]['protein_id']), ('product', protsdict[protein]['product']), ('protein_id', protsdict[protein]['protein_id']), ('translation', protsdict[protein]['translation'])]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
823 feature_qualifiers = OrderedDict(qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
824 new_data_cds = SeqFeature.SeqFeature(feature_location, type = "CDS", strand = protsdict[protein]['strand'], qualifiers = feature_qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
825 whole_sequence.features.append(new_data_cds)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
826 for rRNA in sorted(subunits, key = stringSplitByNumbers):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
827 i = 0
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
828 try:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
829 lengthlist = len(subunits[rRNA]['listdata'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
830 except KeyError:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
831 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
832 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
833 while i < lengthlist:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
834 putative_start = int(subunits[rRNA]['listdata'][i]['begin'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
835 start_pos = SeqFeature.ExactPosition(putative_start)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
836 end_pos = SeqFeature.ExactPosition(subunits[rRNA]['listdata'][i]['end'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
837 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=subunits[rRNA]['listdata'][i]['strand'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
838 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = subunits[rRNA]['listdata'][i]['strand'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
839 whole_sequence.features.append(new_data_gene)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
840 qualifiers = [('product', subunits[rRNA]['listdata'][i]['product'])]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
841 feature_qualifiers = OrderedDict(qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
842 new_data_rRNA = SeqFeature.SeqFeature(feature_location, type = "rRNA", strand = subunits[rRNA]['listdata'][i]['strand'], qualifiers = feature_qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
843 whole_sequence.features.append(new_data_rRNA)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
844 i += 1
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
845 for tRNA in sorted(tRNAdict, key = stringSplitByNumbers):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
846 putative_start = int(tRNAdict[tRNA]['begin'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
847 start_pos = SeqFeature.ExactPosition(putative_start)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
848 end_pos = SeqFeature.ExactPosition(tRNAdict[tRNA]['end'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
849 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tRNAdict[tRNA]['strand'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
850 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tRNAdict[tRNA]['strand'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
851 whole_sequence.features.append(new_data_gene)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
852 qualifiers = [('product', tRNAdict[tRNA]['product'])]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
853 feature_qualifiers = OrderedDict(qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
854 new_data_tRNA = SeqFeature.SeqFeature(feature_location, type = "tRNA", strand = tRNAdict[tRNA]['strand'], qualifiers = feature_qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
855 whole_sequence.features.append(new_data_tRNA)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
856 for tmRNA in sorted(tmRNAdict, key = stringSplitByNumbers):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
857 putative_start = int(tmRNAdict[tmRNA]['begin'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
858 start_pos = SeqFeature.ExactPosition(putative_start)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
859 end_pos = SeqFeature.ExactPosition(tmRNAdict[tmRNA]['end'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
860 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tmRNAdict[tmRNA]['strand'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
861 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tmRNAdict[tmRNA]['strand'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
862 whole_sequence.features.append(new_data_gene)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
863 qualifiers = [('product', tmRNAdict[tmRNA]['product'])]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
864 feature_qualifiers = OrderedDict(qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
865 new_data_tmRNA = SeqFeature.SeqFeature(feature_location, type = "tmRNA", strand = tmRNAdict[tmRNA]['strand'], qualifiers = feature_qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
866 whole_sequence.features.append(new_data_tmRNA)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
867 for CRISPR in sorted(information_CRISPR, key = stringSplitByNumbers):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
868 putative_start = int(information_CRISPR[CRISPR]['start'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
869 start_pos = SeqFeature.ExactPosition(putative_start)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
870 end_pos = SeqFeature.ExactPosition(information_CRISPR[CRISPR]['end'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
871 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
872 qualifiers = [('rpt_family', 'CRISPR'), ('rpt_type', 'direct'), ('rpt_unit_range', "%i..%i" % (int(information_CRISPR[CRISPR]['start']), int(information_CRISPR[CRISPR]['repeatend']))), ('rpt_unit_seq', information_CRISPR[CRISPR]['repeatseq'])]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
873 feature_qualifiers = OrderedDict(qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
874 new_data_CRISPRrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
875 whole_sequence.features.append(new_data_CRISPRrepeat)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
876 for tandem in sorted(information_TRF):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
877 putative_start = int(information_TRF[tandem]['start'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
878 start_pos = SeqFeature.ExactPosition(putative_start)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
879 end_pos = SeqFeature.ExactPosition(information_TRF[tandem]['end'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
880 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
881 qualifiers = [('rpt_type', 'direct')]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
882 feature_qualifiers = OrderedDict(qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
883 new_data_tandemrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
884 whole_sequence.features.append(new_data_tandemrepeat)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
885 for inverted in sorted(information_IRF):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
886 putative_start = int(information_IRF[inverted]['start'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
887 start_pos = SeqFeature.ExactPosition(putative_start)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
888 end_pos = SeqFeature.ExactPosition(information_IRF[inverted]['end'])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
889 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
890 qualifiers = [('rpt_type', 'inverted')]
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
891 feature_qualifiers = OrderedDict(qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
892 new_data_invertedrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
893 whole_sequence.features.append(new_data_invertedrepeat)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
894 SeqIO.write(whole_sequence, newtempgbk, "genbank")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
895
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
896 newgbk = "%s.gbk" % newfile
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
897 with open(newtempgbk, "rU") as gbktempfile, open(newgbk, "w") as gbkrealfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
898 newpat = re.compile("D|RNA\s+(CON|PHG|VRL|BCT)")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
899 for line in gbktempfile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
900 if line.startswith("LOCUS ") and re.search(newpat, line):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
901 if genomeshape['genomeshape'] == "linear":
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
902 newline = re.sub("bp DNA\s+", "bp DNA linear ", line)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
903 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
904 newline = re.sub("bp DNA\s+", "bp DNA circular ", line)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
905 gbkrealfile.write(newline)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
906 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
907 gbkrealfile.write(line)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
908
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
909 for f in glob.glob("*.temp.gbk"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
910 os.remove(f)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
911
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
912 if args.gffprint==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
913 newgff = "%s.gff" % newfile
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
914 with open(newgff, "w") as outgff, open(newgbk, "rU") as ingbk:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
915 GFF.write(SeqIO.parse(ingbk, "genbank"), outgff)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
916
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
917 # Removing intermediate files
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
918 os.remove(newfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
919 os.remove("temp.faa")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
920 os.remove("temp_blast.csv")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
921 os.remove("crisprfile.txt")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
922 os.remove("trnafile.fasta")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
923 os.remove("rrnafile.csv")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
924 os.remove("trf_temp.dat")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
925 os.remove("irf_temp.dat")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
926 for f in glob.glob("SEQ*"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
927 os.remove(f)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
928
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
929 # Joining all GENBANK files into one
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
930 listgbk = sorted(glob.glob('CONTIG_*.gbk'))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
931 gbkoutputfile = "%s.gbk" % root_output
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
932 with open(gbkoutputfile, 'w') as finalgbk:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
933 for fname in listgbk:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
934 with open(fname) as infile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
935 for line in infile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
936 finalgbk.write(line)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
937
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
938 for tempgbk in glob.glob("CONTIG_*.gbk"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
939 os.remove(tempgbk)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
940
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
941 # Joining all GFF files into one
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
942 if args.gffprint==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
943 listgff = sorted(glob.glob('CONTIG_*.gff'))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
944 gffoutputfile = "%s.gff" % root_output
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
945 with open(gffoutputfile, 'w') as finalgff:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
946 for fname in listgff:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
947 with open(fname) as infile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
948 for line in infile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
949 finalgff.write(line)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
950 for tempgff in glob.glob("CONTIG_*.gff"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
951 os.remove(tempgff)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
952
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
953 # Joining all TABLE files into one
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
954 listcsv = sorted(glob.glob('CONTIG_*.csv'))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
955 tbloutputfile = "%s.csv" % root_output
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
956 with open(tbloutputfile, 'w') as finaltable:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
957 for fname in listcsv:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
958 with open(fname) as infile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
959 for line in infile:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
960 finaltable.write(line)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
961
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
962 for temptbl in glob.glob("CONTIG_*.csv"):
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
963 os.remove(temptbl)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
964
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
965 # Preparing sequences for GenBank submission (Original code from Wan Yu's gbk2tbl.py script [https://github.com/wanyuac/BINF_toolkit/blob/master/gbk2tbl.py])
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
966 allowed_qualifiers = ['locus_tag', 'gene', 'product', 'pseudo', 'protein_id', 'gene_desc', 'old_locus_tag', 'note', 'inference', 'organism', 'mol_type', 'strain', 'sub_species', 'isolation-source', 'country']
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
967 newfastafile = "%s.fasta" % root_output
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
968 newtablefile = "%s.tbl" % root_output
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
969 with open(args.modifiers, "rU") as modifiers, open(gbkoutputfile, "r") as genbank_fh, open(newfastafile, "w") as fasta_fh, open(newtablefile, "w") as feature_fh:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
970 info = modifiers.readline()
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
971 wholelist = list(SeqIO.parse(genbank_fh, 'genbank'))
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
972 for record in wholelist:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
973 if len(record) <= args.mincontigsize:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
974 eprint("WARNING: Skipping small contig %s" % record.id)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
975 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
976 record.description = "%s %s" % (record.id, info)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
977 SeqIO.write([record], fasta_fh, 'fasta')
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
978 print('>Feature %s' % (record.name), file=feature_fh)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
979 for line in record.features:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
980 if line.strand == 1:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
981 print('%d\t%d\t%s' % (line.location.nofuzzy_start + 1, line.location.nofuzzy_end, line.type), file=feature_fh)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
982 else:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
983 print('%d\t%d\t%s' % (line.location.nofuzzy_end, line.location.nofuzzy_start + 1, line.type), file=feature_fh)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
984 for (key, values) in line.qualifiers.iteritems():
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
985 if key not in allowed_qualifiers:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
986 continue
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
987 for v in values:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
988 print('\t\t\t%s\t%s' % (key, v), file=feature_fh)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
989
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
990 # Final statement
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
991 eprint("Genome annotation done!")
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
992 eprint("The GenBank file is %s" % gbkoutputfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
993 if args.gffprint==True:
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
994 eprint("The GFF3 file is %s" % gffoutputfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
995 eprint("The table file for GenBank submission is %s" % tbloutputfile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
996 eprint("The FASTA file for GenBank submission is %s" % newfastafile)
231e4c669675 Initial commit - v0.10.3 git commit deeded0
vimalkumarvelayudhan
parents:
diff changeset
997 eprint("The table file with all protein information is %s" % newtablefile)