Mercurial > repos > vimalkumarvelayudhan > viga
annotate VIGA.py @ 0:231e4c669675 draft
Initial commit - v0.10.3 git commit deeded0
| author | vimalkumarvelayudhan | 
|---|---|
| date | Tue, 27 Feb 2018 14:16:54 -0500 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 0 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 1 #!/usr/bin/env python | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 2 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 3 # -*- coding: utf-8 -*- | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 4 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 5 # VIGA - De-novo VIral Genome Annotator | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 6 # | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 7 # Copyright (C) 2017 - Enrique Gonzalez-Tortuero | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 8 # Vimalkumar Velayudhan | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 9 # | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 10 # This program is free software: you can redistribute it and/or modify | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 11 # it under the terms of the GNU General Public License as published by | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 12 # the Free Software Foundation, either version 3 of the License, or | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 13 # (at your option) any later version. | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 14 # | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 15 # This program is distributed in the hope that it will be useful, | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 16 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 18 # GNU General Public License for more details. | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 19 # | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 20 # You should have received a copy of the GNU General Public License | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 21 # along with this program. If not, see <http://www.gnu.org/licenses/>. | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 22 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 23 # Importing python libraries | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 24 from __future__ import print_function | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 25 import argparse | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 26 import csv | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 27 import fileinput | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 28 import fractions | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 29 import glob | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 30 import numpy | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 31 import os | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 32 import os.path | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 33 import re | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 34 import sys | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 35 import shutil | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 36 import subprocess | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 37 from BCBio import GFF | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 38 from Bio import SeqIO | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 39 from Bio import SeqFeature | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 40 from Bio.Alphabet import IUPAC | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 41 from Bio.Seq import Seq | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 42 from Bio.SeqFeature import FeatureLocation | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 43 from Bio.SeqRecord import SeqRecord | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 44 from Bio.SeqUtils.ProtParam import ProteinAnalysis | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 45 from collections import OrderedDict, defaultdict | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 46 from itertools import product | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 47 from scipy import signal | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 48 from time import strftime | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 49 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 50 # Preparing functions | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 51 def batch_iterator(iterator, batch_size): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 52 entry = True | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 53 while entry: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 54 batch = [] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 55 while len(batch) < batch_size: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 56 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 57 entry = iterator.next() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 58 except StopIteration: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 59 entry = None | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 60 if entry is None: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 61 break | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 62 batch.append(entry) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 63 if batch: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 64 yield batch | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 65 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 66 def check_peaks(peaks, length): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 67 # Checking if origin/terminus peaks are too close or too far apart. In that case, they are probably wrong | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 68 closest, farthest = int(length * float(0.45)), int(length * float(0.55)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 69 pairs = [] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 70 for pair in list(product(*peaks)): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 71 ### added this to make sure gets origin and ter right | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 72 tr, pk = sorted(list(pair), key = lambda x: x[1], reverse = False) # trough and peak | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 73 a = (tr[0] - pk[0]) % length | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 74 b = (pk[0] - tr[0]) % length | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 75 pt = abs(tr[1] - pk[1]) # distance between values | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 76 if (a <= farthest and a >= closest) or (b <=farthest and b >= closest): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 77 pairs.append([pt, tr, pk]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 78 if len(pairs) == 0: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 79 return [False, False] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 80 pt, tr, pk = sorted(pairs, reverse = True)[0] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 81 return [tr[0], pk[0]] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 82 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 83 def cmd_exists(cmd): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 84 return subprocess.call("type " + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 85 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 86 def GCskew(name, length, seq, window, slide): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 87 replacements = {'G':1, 'C':-1, 'A':0, 'T':0, 'N':0} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 88 gmc = [] # G - C | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 89 for base in seq: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 90 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 91 gmc.append(replacements[base]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 92 except: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 93 gmc.append(0) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 94 # convert to G + C | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 95 gpc = [abs(i) for i in gmc] # G + C | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 96 # calculate sliding windows for (G - C) and (G + C) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 97 weights = numpy.ones(window)/window | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 98 gmc = [[i, c] for i, c in enumerate(signal.fftconvolve(gmc, weights, 'same').tolist())] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 99 gpc = [[i, c] for i, c in enumerate(signal.fftconvolve(gpc, weights, 'same').tolist())] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 100 # calculate gc skew and cummulative gc skew sum | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 101 skew = [[], []] # x and y for gc skew | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 102 c_skew = [[], []] # x and y for gc skew cummulative sums | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 103 cs = 0 # cummulative sum | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 104 # select windows to use based on slide | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 105 for i, m in gmc[0::slide]: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 106 p = gpc[i][1] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 107 if p == 0: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 108 gcs = 0 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 109 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 110 gcs = m/p | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 111 cs += gcs | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 112 skew[0].append(i) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 113 c_skew[0].append(i) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 114 skew[1].append(gcs) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 115 c_skew[1].append(cs) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 116 ori, ter = find_ori_ter(c_skew, length) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 117 return ori, ter, skew, c_skew | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 118 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 119 def eprint(*args, **kwargs): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 120 print(*args, file=sys.stderr, **kwargs) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 121 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 122 def find_ori_ter(c_skew, length): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 123 # Find origin and terminus of replication based on cumulative GC skew min and max peaks | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 124 c_skew_min = signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.less, order = 1)[0].tolist() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 125 c_skew_max = signal.argrelextrema(numpy.asarray(c_skew[1]), numpy.greater, order = 1)[0].tolist() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 126 # return False if no peaks were detected | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 127 if len(c_skew_min) == 0 or len(c_skew_min) == 0: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 128 return [False, False] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 129 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 130 c_skew_min = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_min] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 131 c_skew_max = [[c_skew[0][i], c_skew[1][i]] for i in c_skew_max] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 132 ori, ter = check_peaks([c_skew_min, c_skew_max], length) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 133 return ori, ter | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 134 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 135 def stringSplitByNumbers(x): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 136 r = re.compile('(\d+)') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 137 l = r.split(x) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 138 return [int(y) if y.isdigit() else y for y in l] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 139 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 140 # Defining the program version | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 141 version = "0.10.3" | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 142 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 143 # Processing the parameters | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 144 parser = argparse.ArgumentParser(description='VIGA is an automatic de novo VIral Genome Annotator.') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 145 basic_group = parser.add_argument_group('Basic options for VIGA [REQUIRED]') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 146 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 147 basic_group.add_argument("--input", dest="inputfile", type=str, required=True, help='Input file as a FASTA file', metavar="FASTAFILE") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 148 basic_group.add_argument("--rfamdb", dest="rfamdatabase", type=str, required=True, help='RFAM Database that will be used for the ribosomal RNA prediction. RFAMDB should be in the format "/full/path/to/rfamdb/Rfam.cm" and must be compressed accordingly (see INFERNAL manual) before running the script.', metavar="RFAMDB") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 149 basic_group.add_argument("--modifiers", dest="modifiers", type=str, required=True, help='Input file as a plain text file with the modifiers per every FASTA header according to SeqIn (https://www.ncbi.nlm.nih.gov/Sequin/modifiers.html). All modifiers must be written in a single line and are separated by a single space character. No space should be placed besides the = sign. For example: [organism=Serratia marcescens subsp. marcescens] [sub-species=marcescens] [strain=AH0650_Sm1] [topology=linear] [moltype=DNA] [tech=wgs] [gcode=11] [country=Australia] [isolation-source=sputum]. This line will be copied and printed along with the record name as the definition line of every contig sequence.', metavar="TEXTFILE") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 150 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 151 advanced_group = parser.add_argument_group('Advanced options for VIGA [OPTIONAL]') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 152 advanced_group.add_argument("--readlength", dest="read_length", type=int, default=101, help='Read length for the circularity prediction (default: 101 bp)', metavar="INT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 153 advanced_group.add_argument("--windowsize", dest="window", type=int, default=100, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 100 bp)', metavar="INT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 154 advanced_group.add_argument("--slidingsize", dest="slide", type=int, default=10, help='Window size used to determine the origin of replication in circular contigs according to the cumulative GC skew (default: 10 bp)', metavar="INT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 155 advanced_group.add_argument("--out", dest="rootoutput", type=str, help='Name of the outputs files (without extension)', metavar="OUTPUTNAME") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 156 advanced_group.add_argument("--locus", dest="locus", type=str, default='LOC', help='Name of the sequences. If the input is a multiFASTA file, please put a general name as the program will add the number of the contig at the end of the name (Default: %(default)s)', metavar="STRING") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 157 advanced_group.add_argument("--gff", dest="gffprint", action='store_true', default=False, help='Printing the output as GFF3 file (Default: False)') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 158 advanced_group.add_argument("--threads", dest="ncpus", default=1, help='Number of threads/cpus (Default: %(default)s cpu)', metavar="INT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 159 advanced_group.add_argument("--nohmmer", dest="nohmmer", action='store_true', default=False, help='Running only BLAST to predict protein function. (Default: False)') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 160 advanced_group.add_argument("--noblast", dest="noblast", action='store_true', default=False, help='Running DIAMOND instead of BLAST to predict protein function according to homology. This will be less sensitive but faster than BLAST. (Default: False)') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 161 advanced_group.add_argument("--blastdb", dest="blastdatabase", type=str, help='BLAST Database that will be used for the protein function prediction. The database must be an amino acid one, not nucleotidic. This argument is mandatory if --noblast option is disabled', metavar="BLASTDB") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 162 advanced_group.add_argument("--diamonddb", dest="diamonddatabase", type=str, help='DIAMOND Database that will be used for the protein function prediction. The database must be created from a amino acid FASTA file as indicated in https://github.com/bbuchfink/diamond. This argument is mandatory when --noblast option is enabled', metavar="DIAMONDDB") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 163 advanced_group.add_argument("--blastevalue", dest="blastevalue", default=0.00001, help='BLAST/DIAMOND e-value threshold (Default: 0.00001)', metavar="FLOAT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 164 advanced_group.add_argument("--hmmdb", dest="hmmdatabase", type=str, help='PHMMER Database that will be used for the protein function prediction according to Hidden Markov Models. In this case, HMMDB must be in FASTA format (e.g. UniProt: "', metavar="HMMDB") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 165 advanced_group.add_argument("--hmmerevalue", dest="hmmerevalue", default=0.001, help='PHMMER e-value threshold (Default: 0.001)', metavar="FLOAT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 166 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 167 type_choices = {'BCT': 'Prokaryotic chromosome', 'CON': 'Contig', 'PHG': 'Phages', 'VRL': 'Eukaryotic/Archaea virus'} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 168 type_help = ('GenBank Division: One of the following codes - {0}. (Default: %(default)s)'.format(', '.join('{0} ({1})'.format(k, v) for k, v in type_choices.items()))) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 169 advanced_group.add_argument("--typedata", dest="typedata", type=str, default='CON', help=type_help, metavar="BCT|CON|VRL|PHG") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 170 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 171 gcode_choices = {'1': 'Standard genetic code [Eukaryotic]', '2': 'Vertebrate mitochondrial code', '3': 'Yeast mitochondrial code', '4': 'Mycoplasma/Spiroplasma and Protozoan/mold/coelenterate mitochondrial code', '5': 'Invertebrate mitochondrial code', '6': 'Ciliate, dasycladacean and hexamita nuclear code', '9': 'Echinoderm/flatworm mitochondrial code', '10': 'Euplotid nuclear code', '11': 'Bacteria/Archaea/Phages/Plant plastid', '12': 'Alternative yeast nuclear code', '13': 'Ascidian mitochondrial code', '14': 'Alternative flatworm mitochondrial code', '16': 'Chlorophycean mitochondrial code', '21': 'Trematode mitochondrial code', '22': 'Scedenesmus obliquus mitochondrial code', '23': 'Thraustochytrium mitochondrial code', '24': 'Pterobranquia mitochondrial code', '25': 'Gracilibacteria & Candidate division SR1', '26': 'Pachysolen tannophilus nuclear code', '27': 'Karyorelict nuclear code', '28': 'Condylostoma nuclear code', '29': 'Mesodinium nuclear code', '30': 'Peritrich nuclear code', '31': 'Blastocrithidia nuclear code'} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 172 gcode_help = ('Number of GenBank translation table. At this moment, the available options are {0}. (Default: %(default)s)'.format('{}'.format(', '.join('{0} ({1})'.format(k, v) for k, v in gcode_choices.items())))) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 173 advanced_group.add_argument("--gcode", dest="gcode", type=str, default='11', help=gcode_help, metavar="NUMBER") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 174 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 175 advanced_group.add_argument('--mincontigsize', dest="mincontigsize", type=int, default = 200, help = 'Minimum contig length to be considered in the final files (Default: 200 bp)', metavar="INT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 176 advanced_group.add_argument("--idthr", dest="idthreshold", default=50.00, help='ID threshold (Default: 50.0)', metavar="FLOAT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 177 advanced_group.add_argument("--coverthr", dest="covthreshold", default=50.00, help='Coverage threshold (Default: 50.0)', metavar="FLOAT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 178 advanced_group.add_argument("--diffid", dest="diffid", default=5.00, help='Max allowed difference between the ID percentages of BLAST and HMMER. (Default = 5.00; Not recommended to change such value)', metavar="FLOAT (>0.01)") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 179 advanced_group.add_argument("--minrepeat", dest="minrepeat", type=int, default=16, help="Minimum repeat length for CRISPR detection (Default: 16)", metavar="INT") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 180 advanced_group.add_argument("--maxrepeat", dest="maxrepeat", type=int, default=64, help="Maximum repeat length for CRISPR detection (Default: 64)") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 181 advanced_group.add_argument("--minspacer", dest="minspacer", type=int, default=8, help="Minimum spacer length for CRISPR detection (Default: 8)") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 182 advanced_group.add_argument("--maxspacer", dest="maxspacer", type=int, default=64, help="Maximum spacer length for CRISPR detection (Default: 64)") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 183 advanced_group.add_argument("--blastexh", dest="blastexh", action='store_true', default=False, help='Use of exhaustive BLAST to predict the proteins by homology according to Fozo et al. (2010) Nucleic Acids Res (Default=FALSE)') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 184 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 185 args = parser.parse_args() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 186 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 187 root_output = args.rootoutput | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 188 if not root_output: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 189 root_output = '{}_annotated'.format(os.path.splitext(args.inputfile)[0]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 190 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 191 if args.noblast == False and args.blastdatabase == None: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 192 sys.exit('You MUST specify BLAST database using the parameter --blastdb if you are not using --noblast option') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 193 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 194 if args.noblast == True and args.diamonddatabase == None: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 195 sys.exit('You MUST specify DIAMOND database using the parameter --diamonddb if you are using --noblast option') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 196 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 197 if args.nohmmer == False and args.hmmdatabase == None: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 198 sys.exit('You MUST specify HMMER database using the parameter --hmmdb if you are not using --nohmmer option') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 199 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 200 # Printing the header of the program | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 201 eprint("This is VIGA %s" % str(version)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 202 eprint("Written by Enrique Gonzalez Tortuero & Vimalkumar Velayudhan") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 203 eprint("Homepage is https://github.com/EGTortuero/virannot") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 204 eprint("Local time: ", strftime("%a, %d %b %Y %H:%M:%S")) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 205 eprint("\n\n") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 206 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 207 # checking the presence of the programs in the system | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 208 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 209 if not cmd_exists("lastz")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 210 sys.exit("You must install LASTZ before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 211 elif not cmd_exists("cmscan")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 212 sys.exit("You must install INFERNAL before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 213 elif not cmd_exists("prodigal")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 214 sys.exit("You must install prodigal before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 215 elif not cmd_exists("parallel")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 216 sys.exit("You must install GNU Parallel before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 217 elif not cmd_exists("blastp")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 218 sys.exit("You must install BLAST before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 219 elif not cmd_exists("diamond")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 220 sys.exit("You must install DIAMOND before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 221 elif not cmd_exists("phmmer")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 222 sys.exit("You must install HMMER 3 before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 223 elif not cmd_exists("aragorn")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 224 sys.exit("You must install ARAGORN before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 225 elif not cmd_exists("pilercr")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 226 sys.exit("You must install PILERCR before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 227 elif not cmd_exists("trf")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 228 sys.exit("You must install Tandem Repeats Finder before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 229 elif not cmd_exists("irf")==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 230 sys.exit("You must install Inverted Repeats Finder before using this script") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 231 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 232 eprint("Data type is {0} and GenBank translation table no is {1}\n".format(args.typedata, args.gcode)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 233 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 234 # Correcting the original file (long headers problem + multiple FASTA files) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 235 record_iter = SeqIO.parse(open(args.inputfile, "rU"),"fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 236 counter = 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 237 newnamessequences = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 238 for i, batch in enumerate(batch_iterator(record_iter, 1)): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 239 seq_index = "CONTIG_%i" % (i + 1) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 240 filename = "%s.temp.fna" % seq_index | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 241 newfilename = "%s.fna" % seq_index | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 242 with open(filename, "w") as handle: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 243 count = SeqIO.write(batch, filename, "fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 244 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 245 with open(filename, "rU") as original, open(newfilename, "w") as corrected: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 246 sequences = SeqIO.parse(original, "fasta", IUPAC.ambiguous_dna) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 247 for record in sequences: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 248 original_name = record.id | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 249 record.id = "%s_%i" % (args.locus, counter) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 250 record.description = record.description | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 251 counter += 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 252 newnamessequences[record.id] = original_name | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 253 eprint("WARNING: The name of the sequence %s was corrected as %s" % (original_name, record.id)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 254 SeqIO.write(record, corrected, "fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 255 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 256 with open("logfile.txt", "w") as logfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 257 logfile.write("#Original\tNew\n") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 258 for oldname in sorted(newnamessequences, key = stringSplitByNumbers): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 259 logfile.write("%s\t%s\n" % (oldname, newnamessequences[oldname])) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 260 os.remove(filename) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 261 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 262 for newfile in sorted(glob.glob("CONTIG_*.fna")): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 263 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 264 # Predicting the shape of the contig (code based on Alex Crits-Christoph's find_circular.py script [https://github.com/alexcritschristoph/VICA/blob/master/find_circular.py]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 265 eprint("Predicting the shape of the contig using LASTZ") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 266 genomeshape = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 267 with open(newfile, "rU") as targetfasta: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 268 Sequence = SeqIO.parse(newfile, "fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 269 for record in Sequence: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 270 seq_beginning = str(record.seq[0:args.read_length]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 271 seq_ending = str(record.seq[len(record.seq)-args.read_length:len(record.seq)]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 272 combined_seqs = SeqRecord(Seq(seq_beginning + seq_ending, IUPAC.ambiguous_dna), id = record.description) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 273 SeqIO.write(combined_seqs, "temporal_circular.fasta", "fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 274 outputlastz = subprocess.check_output(["lastz", "temporal_circular.fasta", "--self", "--notrivial", "--nomirror", "--format=general-:start1,end1,start2,end2,score,strand1,strand2,identity,length1"]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 275 resultslastz = outputlastz.split("\n") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 276 for resultlastz in resultslastz: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 277 if resultlastz != '': | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 278 start1 = resultlastz.split()[0] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 279 end1 = resultlastz.split()[1] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 280 start2 = resultlastz.split()[2] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 281 end2 = resultlastz.split()[3] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 282 strand1 = resultlastz.split()[5] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 283 strand2 = resultlastz.split()[6] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 284 identity = resultlastz.split()[7] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 285 length = int(resultlastz.split()[9]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 286 if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 287 genomeshape['genomeshape'] = "circular" | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 288 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 289 if genomeshape['identity'] >= float(fractions.Fraction(identity)): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 290 genomeshape['identity'] = float(fractions.Fraction(identity)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 291 genomeshape['length'] = length | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 292 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 293 genomeshape['identity'] = float(fractions.Fraction(identity)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 294 genomeshape['length'] = length | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 295 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 296 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 297 if strand1 == strand2 and length > 0.4 * args.read_length and float(fractions.Fraction(identity)) > 0.95 and int(start1) < 5 and int(start2) > args.read_length and int(end1) < args.read_length and int(end2) > args.read_length * 2 * 0.9: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 298 genomeshape['genomeshape'] = "circular" | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 299 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 300 if genomeshape['identity'] >= float(fractions.Fraction(identity)): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 301 genomeshape['identity'] = float(fractions.Fraction(identity)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 302 genomeshape['length'] = length | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 303 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 304 genomeshape['identity'] = float(fractions.Fraction(identity)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 305 genomeshape['length'] = length | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 306 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 307 if genomeshape['genomeshape'] == "": | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 308 genomeshape['genomeshape'] = "linear" | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 309 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 310 genomeshape['genomeshape'] = "linear" | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 311 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 312 genomeshape['genomeshape'] = "circular" | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 313 with open("temp.fasta", "w") as correctedcircular: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 314 Corrseq = str(record.seq[int(genomeshape['length'])//2:-int(genomeshape['length'])//2]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 315 Newseq = SeqRecord(Seq(Corrseq, IUPAC.ambiguous_dna), id = record.description) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 316 SeqIO.write(Newseq, correctedcircular, "fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 317 os.rename("temp.fasta", "temp2.fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 318 eprint("LASTZ predicted that %s is %s\n" % (newfile, genomeshape['genomeshape'])) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 319 os.remove("temporal_circular.fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 320 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 321 # Calculate the cumulative GCskew in circular contigs to determine the origin of replication (Based on iRep -- Brown CT, Olm MR, Thomas BC, Banfield JF (2016) Measurement of bacterial replication rates in microbial communities. Nature Biotechnology 34: 1256-63.) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 322 if genomeshape['genomeshape'] == "circular": | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 323 for record in SeqIO.parse("temp2.fasta", "fasta"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 324 length_contig = len(record.seq) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 325 #if length < min_len: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 326 # print('%s: Too Short' % (name), file=sys.stderr) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 327 # continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 328 oric, term, gcskew, cgcskew = GCskew(record.id, length_contig, record.seq, args.window, args.slide) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 329 valoric = oric | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 330 if oric == False: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 331 oric, term = 'n/a', 'n/a' | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 332 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 333 oric, term = '{:,}'.format(oric), '{:,}'.format(term) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 334 eprint('%s -> Origin: %s Terminus: %s' % (record.id, oric, term)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 335 #print('\t'.join(['# Name', 'Position', 'GC Skew', 'Cumulative GC Skew'])) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 336 #for i, pos in enumerate(gcskew[0]): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 337 # out = [record.id, pos, gcskew[1][i], cgcskew[1][i]] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 338 # print('\t'.join([str(i) for i in out])) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 339 if valoric != False: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 340 firstpartseq = str(record.seq[valoric:-1]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 341 secondpartseq = str(record.seq[0:(valoric-1)]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 342 combinedcorrectedseq = SeqRecord(Seq(firstpartseq + secondpartseq, IUPAC.ambiguous_dna), id = record.description) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 343 SeqIO.write(combinedcorrectedseq, newfile, "fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 344 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 345 eprint("As the program was unable to predict the origin of replication, %s was considered as is without correcting!" % record.id) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 346 os.rename("temp2.fasta", newfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 347 if os.path.isfile("temp2.fasta"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 348 os.remove("temp2.fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 349 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 350 # Predicting genes using PRODIGAL | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 351 eprint("\nRunning Prodigal to predict the genes in %s" % newfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 352 for record in SeqIO.parse(newfile, "fasta"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 353 length_contig = len(record.seq) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 354 if (length_contig >= 100000): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 355 if genomeshape['genomeshape'] == 'linear': | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 356 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-c", "-q"]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 357 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 358 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-g", args.gcode, "-q"]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 359 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 360 if genomeshape['genomeshape'] == 'linear': | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 361 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-c", "-q"]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 362 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 363 subprocess.call(["prodigal", "-a", "pretemp.faa", "-i", newfile, "-o", "/dev/null", "-p", "meta", "-g", args.gcode, "-q"]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 364 num_seqs = len(list(SeqIO.parse("pretemp.faa", "fasta"))) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 365 eprint("PRODIGAL was able to predict %i genes in %s\n" % (num_seqs, newfile)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 366 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 367 with open("pretemp.faa", "rU") as originalfaa, open("temp.faa", "w") as correctedfaa: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 368 sequences = SeqIO.parse(originalfaa, "fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 369 for record in sequences: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 370 record.seq = record.seq.rstrip("*") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 371 SeqIO.write(record, correctedfaa, "fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 372 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 373 faa_file = "%s.faa" % newfile # TO DEBUG | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 374 shutil.copyfile("temp.faa", faa_file) # TO DEBUG | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 375 os.remove("pretemp.faa") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 376 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 377 # Predicting the function of the proteins based on homology using BLAST | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 378 equivalence = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 379 record_iter = SeqIO.parse(open("temp.faa"),"fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 380 for i, batch in enumerate(batch_iterator(record_iter, 1)): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 381 seq_index = "SEQ_%i" % (i + 1) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 382 filename = "%s.faa" % seq_index | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 383 with open(filename, "w") as handle: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 384 count = SeqIO.write(batch, handle, "fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 385 equivalence[seq_index] = batch[0].id | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 386 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 387 if args.blastexh==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 388 eprint("Running BLAST to predict the genes according to homology inference in %s using exhaustive mode (see Fozo et al. (2010) Nucleic Acids Res for details)" % newfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 389 subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', '-word_size', '2', '-gapopen', '8', '-gapextend', '2', '-matrix', '"PAM70"', '-comp_based_stats', '"0"', "-num_threads", str(args.ncpus)]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 390 eprint("Done. BLAST was done to predict the genes by homology\n") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 391 blast_log = "%s.blast.log" % newfile # TO DEBUG | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 392 shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 393 elif args.noblast==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 394 eprint("Running DIAMOND to predict the genes according to homology inference in %s using default parameters" % newfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 395 with open("temp.faa", "r") as tempfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 396 first_line = tempfile.readline() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 397 if first_line.startswith(">"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 398 subprocess.call(['diamond', 'blastp', '-q', 'temp.faa', '-d', args.diamonddatabase, '-e', str(args.blastevalue), '-f', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qlen', 'slen', 'qstart', 'qend', 'evalue', 'bitscore', 'stitle', '-o', 'temp_blast.csv', '-k', '10', "-p", str(args.ncpus), '--quiet']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 399 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 400 open("temp_blast.csv", 'a').close() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 401 blast_log = "%s.blast.log" % newfile # TO DEBUG | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 402 shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 403 eprint("Done. DIAMOND was done to predict the genes by homology\n") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 404 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 405 eprint("Running BLAST to predict the genes according to homology inference in %s using default parameters" % newfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 406 subprocess.call(['blastp', '-query', "temp.faa", '-db', args.blastdatabase, '-evalue', str(args.blastevalue), '-outfmt', '6 qseqid sseqid pident length qlen slen qstart qend evalue bitscore stitle', '-out', 'temp_blast.csv', '-max_target_seqs', '10', "-num_threads", str(args.ncpus)]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 407 blast_log = "%s.blast.log" % newfile | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 408 shutil.copyfile("temp_blast.csv", blast_log) # TO DEBUG | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 409 eprint("Done. BLAST was done to predict the genes by homology\n") # TO DEBUG | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 410 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 411 # Parsing the results from BLAST | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 412 with open("temp_blast.csv", "rU") as blastresults: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 413 hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 414 reader = csv.DictReader(blastresults, delimiter='\t', fieldnames=['qseqid','sseqid','pident','length','qlen','slen','qstart','qend','evalue','bitscore','stitle']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 415 information_proteins_blast = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 416 for row in reader: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 417 perc_cover = round(100.00*(float(row['length'])/float(row['qlen'])),2) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 418 perc_id = float(row['pident']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 419 infoprot_blast = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 420 infoprot_blast['sseqid'] = row['sseqid'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 421 infoprot_blast['pident'] = perc_id | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 422 infoprot_blast['pcover'] = perc_cover | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 423 infoprot_blast['evalue'] = row['evalue'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 424 infoprot_blast['descr'] = row['stitle'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 425 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 426 data = information_proteins_blast[row['qseqid']] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 427 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 428 if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_cover) >= float(args.covthreshold) and float(row['evalue']) <= float(args.blastevalue): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 429 information_proteins_blast[row['qseqid']] = infoprot_blast | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 430 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 431 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 432 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 433 if not re.search(hypotheticalpat, infoprot_blast['descr']) and float(perc_id) >= float(args.idthreshold) and float(perc_id) >= float(infoprot_blast['pident']) and float(perc_cover) >= float(args.covthreshold) and float(perc_cover) >= float(infoprot_blast['pcover']) and float(row['evalue']) <= float(args.blastevalue): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 434 information_proteins_blast[row['qseqid']] = infoprot_blast | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 435 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 436 ## Predicting the function of the proteins based on HMM predictions using phmmer | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 437 if args.nohmmer == False: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 438 with open("commands.sh", "w") as commands: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 439 for singleprot in sorted(glob.glob("SEQ_*.faa")): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 440 hhmtable = "%s.tbl" % singleprot | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 441 eprint("Creating file to run parallel PHMMER") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 442 eprint("Adding %s to run PHMMER." % singleprot) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 443 line2write = ' '.join(["phmmer", "--cpu", "1", "--domtblout", hhmtable, "-E", str(args.hmmerevalue), "-o", "/dev/null", singleprot, args.hmmdatabase, '\n']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 444 commands.write(line2write) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 445 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 446 eprint("Running parallel PHMMER") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 447 subprocess.call(['parallel', '-j', str(args.ncpus)], stdin=open('commands.sh')) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 448 eprint("Done. PHMMER was done to predict the function of the genes according to Hidden Markov Models\n") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 449 os.remove("commands.sh") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 450 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 451 # Parsing the results from HMMER | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 452 information_proteins_hmmer = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 453 hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 454 for singletbl in sorted(glob.glob("*.faa.tbl")): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 455 rootname = singletbl.replace(".faa.tbl", "") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 456 with open(singletbl) as tblfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 457 infoprot_hmmer = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 458 for line in tblfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 459 if line.startswith("#"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 460 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 461 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 462 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 463 pat = re.compile("^(\S+)\s+\S\s+\d+\s+(\S+)\s+\S\s+(\d+)\s+((?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?)\s+\S+\s+\S+\s+\S+\s+\S+\s+(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s+\S+\s+\S+\s+\S+\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\S+\s+\S+\s+(\S+)\s+(.*)") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 464 matchname, lociname, length, evaluehh, start, end, pident, description = re.match(pat, line).groups() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 465 except AttributeError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 466 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 467 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 468 length = float(length) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 469 pident = 100.00*float(pident) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 470 protarea = 100.00*(((float(end)-1.00) - (float(start)-1.00))/length) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 471 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 472 data2 = infoprot_hmmer['lociname'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 473 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 474 if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 475 infoprot_hmmer['lociname'] = lociname | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 476 infoprot_hmmer['name'] = matchname | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 477 infoprot_hmmer['evalue'] = float(evaluehh) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 478 infoprot_hmmer['pcover'] = float(protarea) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 479 infoprot_hmmer['pident'] = float(pident) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 480 infoprot_hmmer['descr'] = description | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 481 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 482 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 483 if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 484 infoprot_hmmer['lociname'] = lociname | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 485 infoprot_hmmer['name'] = matchname | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 486 infoprot_hmmer['evalue'] = float(evaluehh) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 487 infoprot_hmmer['pcover'] = float(protarea) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 488 infoprot_hmmer['pident'] = float(pident) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 489 infoprot_hmmer['descr'] = description | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 490 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 491 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 492 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 493 if not re.search(hypotheticalpat, description) and float(protarea) >= float(args.covthreshold) and float(evaluehh) <= float(args.hmmerevalue) and float(pident) >= 50.00 and float(pident) >= infoprot_hmmer['pident']: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 494 infoprot_hmmer['lociname'] = lociname | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 495 infoprot_hmmer['name'] = matchname | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 496 infoprot_hmmer['evalue'] = float(evaluehh) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 497 infoprot_hmmer['pcover'] = float(protarea) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 498 infoprot_hmmer['pident'] = float(pident) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 499 infoprot_hmmer['descr'] = description | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 500 information_proteins_hmmer[rootname] = infoprot_hmmer | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 501 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 502 #Storing protein information in memory | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 503 with open("temp.faa", "rU") as protsfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 504 tempprotsdict = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 505 for protseq in SeqIO.parse(protsfile, "fasta"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 506 tempindprot = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 507 dataprot = protseq.description.split(' # ') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 508 modseq = str(protseq.seq).replace("X","") # Removing all ambiguous amino acids to avoid problems with ProteinAnalysis module | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 509 analysed_seq = ProteinAnalysis(modseq) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 510 tempindprot['length'] = len(protseq.seq) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 511 tempindprot['isoelectricpoint'] = analysed_seq.isoelectric_point() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 512 tempindprot['molweightkda'] = analysed_seq.molecular_weight()/1000.00 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 513 tempindprot['instability'] = analysed_seq.instability_index() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 514 tempindprot['protein_id'] = dataprot[0] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 515 tempindprot['strand'] = int(dataprot[3]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 516 tempindprot['begin'] = int(dataprot[1])-1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 517 tempindprot['end'] = int(dataprot[2]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 518 tempprotsdict[dataprot[0]] = tempindprot | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 519 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 520 # Creation of table | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 521 debugtable = "%s.csv" % newfile | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 522 with open(debugtable, "w") as tablefile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 523 if args.nohmmer == False: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 524 print("\t".join(["Identifier", "Start", "Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST", "ID_HMMER", "Descr_HMMER", "evalue_HMMER", "%ID_HMMER", "%Cover_HMMER"]), file=tablefile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 525 keylist = information_proteins_hmmer.keys() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 526 keylist.sort() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 527 for keyB in keylist: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 528 keyB = keyB.replace(".faa.tbl", "") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 529 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 530 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), str(information_proteins_blast[equivalence[keyB]]['pcover']), information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 531 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 532 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 533 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), information_proteins_blast[equivalence[keyB]]['sseqid'], information_proteins_blast[equivalence[keyB]]['descr'], str(information_proteins_blast[equivalence[keyB]]['evalue']), str(information_proteins_blast[equivalence[keyB]]['pident']), str(information_proteins_blast[equivalence[keyB]]['pcover']), "None", "None", "NA", "NA", "NA"]), file=tablefile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 534 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 535 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 536 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA", information_proteins_hmmer[keyB]['name'], information_proteins_hmmer[keyB]['descr'], str(information_proteins_hmmer[keyB]['evalue']), str(information_proteins_hmmer[keyB]['pident']), str(information_proteins_hmmer[keyB]['pcover'])]), file=tablefile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 537 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 538 print("\t".join([equivalence[keyB], str(tempprotsdict[equivalence[keyB]]['begin']), str(tempprotsdict[equivalence[keyB]]['end']), str(tempprotsdict[equivalence[keyB]]['strand']), str(tempprotsdict[equivalence[keyB]]['length']), str(tempprotsdict[equivalence[keyB]]['isoelectricpoint']), str(tempprotsdict[equivalence[keyB]]['molweightkda']), str(tempprotsdict[equivalence[keyB]]['instability']), "None", "None", "NA", "NA", "NA", "None", "None", "NA", "NA", "NA"]), file=tablefile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 539 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 540 print("\t".join(["Identifier", "Start", "Stop", "Strand", "size_aa", "pI", "Mol_weight_kDa", "Instability_index", "ID_BLAST", "Descr_BLAST", "evalue_BLAST", "%ID_BLAST", "%Cover_BLAST"]), file=tablefile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 541 keylist = equivalence.values() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 542 keylist.sort() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 543 for keyB in keylist: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 544 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 545 print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), information_proteins_blast[keyB]['sseqid'], information_proteins_blast[keyB]['descr'], str(information_proteins_blast[keyB]['evalue']), str(information_proteins_blast[keyB]['pident']), str(information_proteins_blast[keyB]['pcover'])]), file=tablefile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 546 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 547 print("\t".join([keyB, str(tempprotsdict[keyB]['begin']), str(tempprotsdict[keyB]['end']), str(tempprotsdict[keyB]['strand']), str(tempprotsdict[keyB]['length']), str(tempprotsdict[keyB]['isoelectricpoint']), str(tempprotsdict[keyB]['molweightkda']), str(tempprotsdict[keyB]['instability']), "None", "None", "NA", "NA", "NA"]), file=tablefile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 548 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 549 # Algorithm of decisions (which one: BLAST/HMMER?) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 550 multipleprots = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 551 Hypotheticalpat = re.compile(r'(((((?i)hypothetical)|(?i)uncharacteri(z|s)ed|(?i)predicted)) protein)|((?i)ORF|((?i)unnamed protein product|(?i)gp\d+))') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 552 if args.nohmmer == False: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 553 keylist = information_proteins_hmmer.keys() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 554 keylist.sort() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 555 for keyB in keylist: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 556 keyB = keyB.replace(".faa.tbl", "") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 557 singleprot = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 558 singleprot['name'] = equivalence[keyB] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 559 if (equivalence[keyB] in information_proteins_blast) and (keyB in information_proteins_hmmer): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 560 if re.search(Hypotheticalpat, information_proteins_blast[equivalence[keyB]]['descr']): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 561 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 562 if re.search(Hypotheticalpat, information_proteins_hmmer[keyB]['descr']): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 563 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 564 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 565 singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 566 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 567 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 568 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 569 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 570 if (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 571 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 572 elif (float(information_proteins_blast[equivalence[keyB]]['pident'])<float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 573 singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 574 elif (float(information_proteins_blast[equivalence[keyB]]['pident'])>float(information_proteins_hmmer[keyB]['pident'])) and (float(information_proteins_blast[equivalence[keyB]]['pcover'])<float(information_proteins_hmmer[keyB]['pcover'])): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 575 if (float(information_proteins_blast[equivalence[keyB]]['pident'])-float(information_proteins_hmmer[keyB]['pident']) >= args.diffid): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 576 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 577 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 578 singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 579 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 580 if (float(information_proteins_hmmer[keyB]['pident'])-float(information_proteins_blast[equivalence[keyB]]['pident']) >= args.diffid): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 581 singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 582 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 583 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 584 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 585 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 586 if (float(information_proteins_blast[equivalence[keyB]]['pcover'])>float(information_proteins_hmmer[keyB]['pcover'])): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 587 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 588 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 589 singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 590 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 591 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 592 elif equivalence[keyB] in information_proteins_blast: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 593 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 594 elif keyB in information_proteins_hmmer: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 595 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 596 singleprot['descr'] = information_proteins_hmmer[keyB]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 597 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 598 singleprot['descr'] = 'Hypothetical protein' | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 599 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 600 singleprot['descr'] = information_proteins_blast[equivalence[keyB]]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 601 multipleprots[keyB] = singleprot | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 602 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 603 keylist = equivalence.values() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 604 keylist.sort() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 605 for keyB in keylist: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 606 singleprot = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 607 singleprot['name'] = keyB | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 608 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 609 if information_proteins_blast[keyB]['descr'] == None: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 610 singleprot['descr'] = 'Hypothetical protein' | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 611 elif re.search(Hypotheticalpat, information_proteins_blast[keyB]['descr']): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 612 singleprot['descr'] = 'Conserved hypothetical protein' | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 613 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 614 singleprot['descr'] = information_proteins_blast[keyB]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 615 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 616 singleprot['descr'] = 'Hypothetical protein' | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 617 multipleprots[keyB] = singleprot | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 618 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 619 #Storing protein information in memory | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 620 with open("temp.faa", "rU") as protsfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 621 protsdict = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 622 for protseq in SeqIO.parse(protsfile, "fasta"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 623 indprot = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 624 dataprot = protseq.description.split(' # ') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 625 indprot['translation'] = protseq.seq | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 626 indprot['protein_id'] = dataprot[0] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 627 indprot['strand'] = int(dataprot[3]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 628 indprot['begin'] = int(dataprot[1])-1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 629 indprot['end'] = int(dataprot[2]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 630 for keyOmega in sorted(multipleprots): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 631 if multipleprots[keyOmega]['name'] == dataprot[0]: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 632 indprot['product'] = multipleprots[keyOmega]['descr'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 633 protsdict[dataprot[0]] = indprot | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 634 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 635 # Predicting the rRNA sequences | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 636 with open(newfile, "rU") as targetfasta, open("/dev/null", "w") as apocalypse: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 637 eprint("Running INFERNAL+RFAM to predict rRNA-like sequences in %s" % newfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 638 subprocess.call(["cmscan", "--rfam", "--cut_ga", "--nohmmonly", "--tblout", "rrnafile.csv", "--cpu", args.ncpus, args.rfamdatabase, newfile], stdout=apocalypse) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 639 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 640 #Storing rRNA information in memory | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 641 with open("rrnafile.csv", "rU") as rrnafile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 642 namedict = {"SSU_rRNA_archaea": "16s_rRNA", "SSU_rRNA_bacteria": "16s_rRNA", "SSU_rRNA_eukarya": "18s_rRNA", "SSU_rRNA_microsporidia": "16s_rRNA", "LSU_rRNA_archaea": "23s_rRNA", "LSU_rRNA_bacteria": "23s_rRNA", "LSU_rRNA_eukarya": "28s_rRNA", "5S_rRNA": "5s_rRNA"} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 643 rRNAdict = defaultdict(list) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 644 for line in rrnafile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 645 if not line.startswith("#"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 646 InfoLINE = re.sub("\s+", ",", line) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 647 line_splitted = InfoLINE.split(",") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 648 item_type = line_splitted[0] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 649 if item_type.startswith(('LSU', 'SSU', '5S')): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 650 strand = 0 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 651 if line_splitted[9] == "+": | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 652 strand = 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 653 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 654 strand = -1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 655 rRNAdict[item_type].append({'score': float(line_splitted[14]), 'product': namedict[line_splitted[0]], 'begin': int(line_splitted[7]), 'end': int(line_splitted[8]), 'strand': strand}) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 656 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 657 subunits = {'LSU': {'max_score': 0 }, 'SSU': {'max_score': 0 }, '5S': {'max_score': 0 }} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 658 for type_rRNA, rRNA_data in rRNAdict.items(): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 659 max_score = max([item['score'] for item in rRNAdict[type_rRNA]]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 660 for item in ('LSU', 'SSU'): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 661 if type_rRNA.startswith(item): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 662 if max_score > subunits[item]['max_score']: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 663 subunits[item]['listdata'] = rRNA_data | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 664 subunits[item]['max_score'] = max_score | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 665 if type_rRNA.startswith('5S'): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 666 subunits['5S']['listdata'] = rRNA_data | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 667 subunits['5S']['max_score'] = max_score | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 668 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 669 for rRNA in subunits: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 670 i = 0 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 671 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 672 lengthlist = len(subunits[rRNA]['listdata']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 673 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 674 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 675 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 676 while i < lengthlist: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 677 eprint("%s harbours a %s from %i to %i" % (newfile, subunits[rRNA]['listdata'][i]['product'], int(subunits[rRNA]['listdata'][i]['begin']), int(subunits[rRNA]['listdata'][i]['end']))) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 678 i += 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 679 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 680 # Predicting the tRNA sequences | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 681 eprint("Running ARAGORN to predict tRNA-like sequences in %s" % newfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 682 genetictable = "-gc%s" % str(args.gcode) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 683 with open("trnafile.fasta", "w") as trnafile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 684 if genomeshape['genomeshape'] == "circular": | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 685 subprocess.call(["aragorn", "-c", "-fon", genetictable, newfile], stdout=trnafile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 686 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 687 subprocess.call(["aragorn", "-l", "-fon", genetictable, newfile], stdout=trnafile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 688 num_tRNA = len(list(SeqIO.parse("trnafile.fasta", "fasta"))) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 689 eprint("ARAGORN was able to predict %i tRNAs in %s\n" % (num_tRNA, newfile)) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 690 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 691 #Storing tRNA and tmRNA information in memory | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 692 with open("trnafile.fasta", "rU") as trnafile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 693 tRNAdict = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 694 tmRNAdict = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 695 for tRNAseq in SeqIO.parse(trnafile, "fasta"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 696 indtRNA = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 697 indtmRNA = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 698 tRNA_information = tRNAseq.description.split(" ") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 699 tRNApat = re.compile("^tRNA-") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 700 if tRNA_information[1] == "tmRNA": | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 701 if str(tRNA_information[2]) == "(Permuted)": | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 702 indtmRNA['product'] = "tmRNA" | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 703 tmRNA_coords = str(tRNA_information[3]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 704 Beginningrevcomppat = re.compile("^c") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 705 if re.match(Beginningrevcomppat, tmRNA_coords): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 706 indtmRNA['strand'] = -1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 707 tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 708 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 709 indtmRNA['strand'] = 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 710 tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 711 indtmRNA['begin'] = int(tmRNA_coords[0]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 712 indtmRNA['end'] = int(tmRNA_coords[1]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 713 tmRNAdict[tRNAseq.id] = indtmRNA | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 714 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 715 indtmRNA['product'] = "tmRNA" | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 716 tmRNA_coords = str(tRNA_information[2]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 717 Beginningrevcomppat = re.compile("^c") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 718 if re.match(Beginningrevcomppat, tmRNA_coords): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 719 indtmRNA['strand'] = -1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 720 tmRNA_coords = tmRNA_coords.replace("c[","").replace("]","").split(",") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 721 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 722 indtmRNA['strand'] = 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 723 tmRNA_coords = tmRNA_coords.replace("[","").replace("]","").split(",") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 724 indtmRNA['begin'] = int(tmRNA_coords[0]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 725 indtmRNA['end'] = int(tmRNA_coords[1]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 726 tmRNAdict[tRNAseq.id] = indtmRNA | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 727 elif re.match(tRNApat, tRNA_information[1]): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 728 indtRNA['product'] = re.sub("\(\w{3}\)", "", tRNA_information[1]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 729 tRNA_coords = tRNA_information[2] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 730 Beginningrevcomppat = re.compile("^c") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 731 if re.match(Beginningrevcomppat, tRNA_coords): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 732 indtRNA['strand'] = -1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 733 tRNA_coords = tRNA_coords.replace("c[","").replace("]","").split(",") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 734 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 735 indtRNA['strand'] = 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 736 tRNA_coords = tRNA_coords.replace("[","").replace("]","").split(",") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 737 indtRNA['begin'] = int(tRNA_coords[0]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 738 indtRNA['end'] = int(tRNA_coords[1]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 739 tRNAdict[tRNAseq.id] = indtRNA | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 740 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 741 #Predicting CRISPR repeats and others | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 742 eprint("Running PILERCR to predict CRISPR repeats in %s" % newfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 743 subprocess.call(["pilercr", "-in", newfile, "-out", "crisprfile.txt", "-noinfo", "-minrepeat", str(args.minrepeat), "-maxrepeat", str(args.maxrepeat), "-minspacer", str(args.minspacer), "-maxspacer", str(args.maxspacer)]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 744 eprint("Predicting repeats in the sequences using TRF and IRF") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 745 with open("/dev/null", "w") as stderr: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 746 subprocess.call(["trf", newfile, "2", "7", "7", "80", "10", "50", "500", "-h"], stderr=stderr) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 747 os.rename("%s.2.7.7.80.10.50.500.dat" % newfile, "trf_temp.dat") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 748 with open("/dev/null", "w") as stderr: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 749 subprocess.call(["irf", newfile, "2", "3", "5", "80", "10", "40", "500000", "10000", "-d", "-h"], stderr=stderr) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 750 os.rename("%s.2.3.5.80.10.40.500000.10000.dat" % newfile, "irf_temp.dat") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 751 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 752 # Storing CRISPR repeats information | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 753 information_CRISPR = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 754 with open("crisprfile.txt", "rU") as crisprfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 755 for line in crisprfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 756 if "SUMMARY BY POSITION" in line: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 757 for line in crisprfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 758 information_crispr_repeat = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 759 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 760 patC = re.compile('^\s+(\d+)\s+.{16}\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d?\s+(\w+)') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 761 key, start, length, seq = re.match(patC, line).groups() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 762 except AttributeError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 763 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 764 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 765 information_crispr_repeat['start'] = start | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 766 information_crispr_repeat['end'] = int(start) + int(length) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 767 information_crispr_repeat['repeatseq'] = seq | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 768 information_crispr_repeat['repeatend'] = int(start) + len(seq) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 769 information_CRISPR[key] = information_crispr_repeat | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 770 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 771 # Storing tandem repeats information | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 772 information_TRF = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 773 count = 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 774 with open("trf_temp.dat", "rU") as trfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 775 for line in trfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 776 information_tandem_repeat = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 777 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 778 patT = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\.\d+\s') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 779 start, end = re.match(patT, line).groups() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 780 except AttributeError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 781 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 782 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 783 information_tandem_repeat['start'] = start | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 784 information_tandem_repeat['end'] = end | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 785 information_TRF[count] = information_tandem_repeat | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 786 count += 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 787 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 788 # Storing inverted repeats information | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 789 information_IRF = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 790 count = 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 791 with open("irf_temp.dat", "rU") as irfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 792 for line in irfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 793 information_inverted_repeat = {} | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 794 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 795 patI = re.compile('^(\d+)\s(\d+)\s\d+\s\d+\s\d+') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 796 start, end = re.match(patI, line).groups() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 797 except AttributeError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 798 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 799 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 800 information_inverted_repeat['start'] = start | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 801 information_inverted_repeat['end'] = end | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 802 information_IRF[count] = information_inverted_repeat | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 803 count += 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 804 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 805 # Creating a new Genbank and GFF file | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 806 eprint("Creating the output files") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 807 newtempgbk = "%s.temp.gbk" % newfile | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 808 with open(newfile, "rU") as basefile, open(newtempgbk, "w"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 809 for record in SeqIO.parse(basefile, "fasta", IUPAC.ambiguous_dna): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 810 whole_sequence = SeqRecord(record.seq) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 811 whole_sequence.id = str(record.id) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 812 whole_sequence.annotations['data_file_division'] = args.typedata.upper() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 813 whole_sequence.annotations['date'] = strftime("%d-%b-%Y").upper() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 814 for protein in sorted(protsdict, key = stringSplitByNumbers): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 815 putative_start = int(protsdict[protein]['begin']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 816 start_pos = SeqFeature.ExactPosition(putative_start) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 817 end_pos = SeqFeature.ExactPosition(protsdict[protein]['end']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 818 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=protsdict[protein]['strand']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 819 qualifiersgene = OrderedDict([('locus_tag', protsdict[protein]['protein_id'])]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 820 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = protsdict[protein]['strand'], qualifiers = qualifiersgene) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 821 whole_sequence.features.append(new_data_gene) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 822 qualifiers = [('locus_tag', protsdict[protein]['protein_id']), ('product', protsdict[protein]['product']), ('protein_id', protsdict[protein]['protein_id']), ('translation', protsdict[protein]['translation'])] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 823 feature_qualifiers = OrderedDict(qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 824 new_data_cds = SeqFeature.SeqFeature(feature_location, type = "CDS", strand = protsdict[protein]['strand'], qualifiers = feature_qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 825 whole_sequence.features.append(new_data_cds) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 826 for rRNA in sorted(subunits, key = stringSplitByNumbers): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 827 i = 0 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 828 try: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 829 lengthlist = len(subunits[rRNA]['listdata']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 830 except KeyError: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 831 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 832 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 833 while i < lengthlist: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 834 putative_start = int(subunits[rRNA]['listdata'][i]['begin']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 835 start_pos = SeqFeature.ExactPosition(putative_start) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 836 end_pos = SeqFeature.ExactPosition(subunits[rRNA]['listdata'][i]['end']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 837 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=subunits[rRNA]['listdata'][i]['strand']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 838 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = subunits[rRNA]['listdata'][i]['strand']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 839 whole_sequence.features.append(new_data_gene) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 840 qualifiers = [('product', subunits[rRNA]['listdata'][i]['product'])] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 841 feature_qualifiers = OrderedDict(qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 842 new_data_rRNA = SeqFeature.SeqFeature(feature_location, type = "rRNA", strand = subunits[rRNA]['listdata'][i]['strand'], qualifiers = feature_qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 843 whole_sequence.features.append(new_data_rRNA) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 844 i += 1 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 845 for tRNA in sorted(tRNAdict, key = stringSplitByNumbers): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 846 putative_start = int(tRNAdict[tRNA]['begin']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 847 start_pos = SeqFeature.ExactPosition(putative_start) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 848 end_pos = SeqFeature.ExactPosition(tRNAdict[tRNA]['end']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 849 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tRNAdict[tRNA]['strand']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 850 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tRNAdict[tRNA]['strand']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 851 whole_sequence.features.append(new_data_gene) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 852 qualifiers = [('product', tRNAdict[tRNA]['product'])] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 853 feature_qualifiers = OrderedDict(qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 854 new_data_tRNA = SeqFeature.SeqFeature(feature_location, type = "tRNA", strand = tRNAdict[tRNA]['strand'], qualifiers = feature_qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 855 whole_sequence.features.append(new_data_tRNA) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 856 for tmRNA in sorted(tmRNAdict, key = stringSplitByNumbers): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 857 putative_start = int(tmRNAdict[tmRNA]['begin']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 858 start_pos = SeqFeature.ExactPosition(putative_start) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 859 end_pos = SeqFeature.ExactPosition(tmRNAdict[tmRNA]['end']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 860 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos, strand=tmRNAdict[tmRNA]['strand']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 861 new_data_gene = SeqFeature.SeqFeature(feature_location, type = "gene", strand = tmRNAdict[tmRNA]['strand']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 862 whole_sequence.features.append(new_data_gene) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 863 qualifiers = [('product', tmRNAdict[tmRNA]['product'])] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 864 feature_qualifiers = OrderedDict(qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 865 new_data_tmRNA = SeqFeature.SeqFeature(feature_location, type = "tmRNA", strand = tmRNAdict[tmRNA]['strand'], qualifiers = feature_qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 866 whole_sequence.features.append(new_data_tmRNA) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 867 for CRISPR in sorted(information_CRISPR, key = stringSplitByNumbers): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 868 putative_start = int(information_CRISPR[CRISPR]['start']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 869 start_pos = SeqFeature.ExactPosition(putative_start) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 870 end_pos = SeqFeature.ExactPosition(information_CRISPR[CRISPR]['end']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 871 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 872 qualifiers = [('rpt_family', 'CRISPR'), ('rpt_type', 'direct'), ('rpt_unit_range', "%i..%i" % (int(information_CRISPR[CRISPR]['start']), int(information_CRISPR[CRISPR]['repeatend']))), ('rpt_unit_seq', information_CRISPR[CRISPR]['repeatseq'])] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 873 feature_qualifiers = OrderedDict(qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 874 new_data_CRISPRrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 875 whole_sequence.features.append(new_data_CRISPRrepeat) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 876 for tandem in sorted(information_TRF): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 877 putative_start = int(information_TRF[tandem]['start']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 878 start_pos = SeqFeature.ExactPosition(putative_start) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 879 end_pos = SeqFeature.ExactPosition(information_TRF[tandem]['end']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 880 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 881 qualifiers = [('rpt_type', 'direct')] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 882 feature_qualifiers = OrderedDict(qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 883 new_data_tandemrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 884 whole_sequence.features.append(new_data_tandemrepeat) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 885 for inverted in sorted(information_IRF): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 886 putative_start = int(information_IRF[inverted]['start']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 887 start_pos = SeqFeature.ExactPosition(putative_start) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 888 end_pos = SeqFeature.ExactPosition(information_IRF[inverted]['end']) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 889 feature_location = SeqFeature.FeatureLocation(start_pos, end_pos) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 890 qualifiers = [('rpt_type', 'inverted')] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 891 feature_qualifiers = OrderedDict(qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 892 new_data_invertedrepeat = SeqFeature.SeqFeature(feature_location, type = "repeat_region", qualifiers = feature_qualifiers) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 893 whole_sequence.features.append(new_data_invertedrepeat) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 894 SeqIO.write(whole_sequence, newtempgbk, "genbank") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 895 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 896 newgbk = "%s.gbk" % newfile | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 897 with open(newtempgbk, "rU") as gbktempfile, open(newgbk, "w") as gbkrealfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 898 newpat = re.compile("D|RNA\s+(CON|PHG|VRL|BCT)") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 899 for line in gbktempfile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 900 if line.startswith("LOCUS ") and re.search(newpat, line): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 901 if genomeshape['genomeshape'] == "linear": | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 902 newline = re.sub("bp DNA\s+", "bp DNA linear ", line) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 903 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 904 newline = re.sub("bp DNA\s+", "bp DNA circular ", line) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 905 gbkrealfile.write(newline) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 906 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 907 gbkrealfile.write(line) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 908 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 909 for f in glob.glob("*.temp.gbk"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 910 os.remove(f) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 911 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 912 if args.gffprint==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 913 newgff = "%s.gff" % newfile | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 914 with open(newgff, "w") as outgff, open(newgbk, "rU") as ingbk: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 915 GFF.write(SeqIO.parse(ingbk, "genbank"), outgff) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 916 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 917 # Removing intermediate files | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 918 os.remove(newfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 919 os.remove("temp.faa") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 920 os.remove("temp_blast.csv") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 921 os.remove("crisprfile.txt") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 922 os.remove("trnafile.fasta") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 923 os.remove("rrnafile.csv") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 924 os.remove("trf_temp.dat") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 925 os.remove("irf_temp.dat") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 926 for f in glob.glob("SEQ*"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 927 os.remove(f) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 928 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 929 # Joining all GENBANK files into one | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 930 listgbk = sorted(glob.glob('CONTIG_*.gbk')) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 931 gbkoutputfile = "%s.gbk" % root_output | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 932 with open(gbkoutputfile, 'w') as finalgbk: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 933 for fname in listgbk: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 934 with open(fname) as infile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 935 for line in infile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 936 finalgbk.write(line) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 937 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 938 for tempgbk in glob.glob("CONTIG_*.gbk"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 939 os.remove(tempgbk) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 940 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 941 # Joining all GFF files into one | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 942 if args.gffprint==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 943 listgff = sorted(glob.glob('CONTIG_*.gff')) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 944 gffoutputfile = "%s.gff" % root_output | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 945 with open(gffoutputfile, 'w') as finalgff: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 946 for fname in listgff: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 947 with open(fname) as infile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 948 for line in infile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 949 finalgff.write(line) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 950 for tempgff in glob.glob("CONTIG_*.gff"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 951 os.remove(tempgff) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 952 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 953 # Joining all TABLE files into one | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 954 listcsv = sorted(glob.glob('CONTIG_*.csv')) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 955 tbloutputfile = "%s.csv" % root_output | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 956 with open(tbloutputfile, 'w') as finaltable: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 957 for fname in listcsv: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 958 with open(fname) as infile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 959 for line in infile: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 960 finaltable.write(line) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 961 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 962 for temptbl in glob.glob("CONTIG_*.csv"): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 963 os.remove(temptbl) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 964 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 965 # Preparing sequences for GenBank submission (Original code from Wan Yu's gbk2tbl.py script [https://github.com/wanyuac/BINF_toolkit/blob/master/gbk2tbl.py]) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 966 allowed_qualifiers = ['locus_tag', 'gene', 'product', 'pseudo', 'protein_id', 'gene_desc', 'old_locus_tag', 'note', 'inference', 'organism', 'mol_type', 'strain', 'sub_species', 'isolation-source', 'country'] | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 967 newfastafile = "%s.fasta" % root_output | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 968 newtablefile = "%s.tbl" % root_output | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 969 with open(args.modifiers, "rU") as modifiers, open(gbkoutputfile, "r") as genbank_fh, open(newfastafile, "w") as fasta_fh, open(newtablefile, "w") as feature_fh: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 970 info = modifiers.readline() | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 971 wholelist = list(SeqIO.parse(genbank_fh, 'genbank')) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 972 for record in wholelist: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 973 if len(record) <= args.mincontigsize: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 974 eprint("WARNING: Skipping small contig %s" % record.id) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 975 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 976 record.description = "%s %s" % (record.id, info) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 977 SeqIO.write([record], fasta_fh, 'fasta') | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 978 print('>Feature %s' % (record.name), file=feature_fh) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 979 for line in record.features: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 980 if line.strand == 1: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 981 print('%d\t%d\t%s' % (line.location.nofuzzy_start + 1, line.location.nofuzzy_end, line.type), file=feature_fh) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 982 else: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 983 print('%d\t%d\t%s' % (line.location.nofuzzy_end, line.location.nofuzzy_start + 1, line.type), file=feature_fh) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 984 for (key, values) in line.qualifiers.iteritems(): | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 985 if key not in allowed_qualifiers: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 986 continue | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 987 for v in values: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 988 print('\t\t\t%s\t%s' % (key, v), file=feature_fh) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 989 | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 990 # Final statement | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 991 eprint("Genome annotation done!") | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 992 eprint("The GenBank file is %s" % gbkoutputfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 993 if args.gffprint==True: | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 994 eprint("The GFF3 file is %s" % gffoutputfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 995 eprint("The table file for GenBank submission is %s" % tbloutputfile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 996 eprint("The FASTA file for GenBank submission is %s" % newfastafile) | 
| 
231e4c669675
Initial commit - v0.10.3 git commit deeded0
 vimalkumarvelayudhan parents: diff
changeset | 997 eprint("The table file with all protein information is %s" % newtablefile) | 
