annotate TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ 15:255c852351c5 draft

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:36:44 -0400
parents feef9a0db09d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1 # Copyright INRA (Institut National de la Recherche Agronomique)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
2 # http://www.inra.fr
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
3 # http://urgi.versailles.inra.fr
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
4 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
5 # This software is governed by the CeCILL license under French law and
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
6 # abiding by the rules of distribution of free software. You can use,
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
7 # modify and/ or redistribute the software under the terms of the CeCILL
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
8 # license as circulated by CEA, CNRS and INRIA at the following URL
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
9 # "http://www.cecill.info".
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
10 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
11 # As a counterpart to the access to the source code and rights to copy,
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
12 # modify and redistribute granted by the license, users are provided only
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
13 # with a limited warranty and the software's author, the holder of the
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
14 # economic rights, and the successive licensors have only limited
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
15 # liability.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
16 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
17 # In this respect, the user's attention is drawn to the risks associated
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
18 # with loading, using, modifying and/or developing or reproducing the
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
19 # software by the user in light of its specific status of free software,
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
20 # that may mean that it is complicated to manipulate, and that also
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
21 # therefore means that it is reserved for developers and experienced
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
22 # professionals having in-depth computer knowledge. Users are therefore
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
23 # encouraged to load and test the software's suitability as regards their
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
24 # requirements in conditions enabling the security of their systems and/or
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
25 # data to be ensured and, more generally, to use and operate it in the
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
26 # same conditions as regards security.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
27 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
28 # The fact that you are presently reading this means that you have had
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
29 # knowledge of the CeCILL license and that you accept its terms.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
30
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
31
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
32 import os
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
33 import re
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
34 import sys
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
35 import math
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
36 import glob
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
37 import string
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
38 import shutil
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
39 from commons.core.utils.FileUtils import FileUtils
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
40 from commons.core.seq.Bioseq import Bioseq
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
41 from commons.core.seq.BioseqDB import BioseqDB
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
42 from commons.core.coord.MapUtils import MapUtils
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
43 from commons.core.coord.Range import Range
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
44 from commons.core.coord.ConvCoord import ConvCoord
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
45 from commons.core.parsing.FastaParser import FastaParser
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
46 from commons.core.checker.CheckerUtils import CheckerUtils
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
47 from commons.core.launcher.LauncherUtils import LauncherUtils
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
48
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
49
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
50 ## Static methods for fasta file manipulation
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
51 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
52 class FastaUtils( object ):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
53
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
54 ## Count the number of sequences in the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
55 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
56 # @param inFile name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
57 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
58 # @return integer number of sequences in the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
59 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
60 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
61 def dbSize( inFile ):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
62 nbSeq = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
63 with open(inFile) as fH:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
64 for line in fH:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
65 if line[0] == ">":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
66 nbSeq += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
67
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
68 return nbSeq
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
69
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
70
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
71 ## Compute the cumulative sequence length in the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
72 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
73 # @param inFile handler of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
74 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
75 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
76 def dbCumLength( inFile ):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
77 cumLength = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
78 for line in inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
79 if line[0] != ">":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
80 cumLength += len(string.rstrip(line))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
81
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
82 return cumLength
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
83
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
84
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
85 ## Return a list with the length of each sequence in the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
86 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
87 # @param inFile string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
88 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
89 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
90 def dbLengths(inFile):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
91 lLengths = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
92 currentLength = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
93 with open(inFile) as fH:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
94 for line in fH:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
95 if line[0] == ">":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
96 if currentLength != 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
97 lLengths.append(currentLength)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
98 currentLength = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
99 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
100 currentLength += len(line[:-1])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
101 lLengths.append(currentLength)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
102
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
103 return lLengths
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
104
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
105
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
106 ## Retrieve the sequence headers present in the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
107 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
108 # @param inFile string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
109 # @param verbose integer level of verbosity
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
110 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
111 # @return list of sequence headers
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
112 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
113 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
114 def dbHeaders(inFile, verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
115 lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
116 if verbose:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
117 for header in lHeaders:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
118 print header
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
119
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
120 return lHeaders
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
121
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
122
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
123 ## Cut a data bank into chunks according to the input parameters
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
124 # If a sequence is shorter than the threshold, it is only renamed (not cut)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
125 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
126 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
127 # @param chkLgth string chunk length (in bp, default=200000)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
128 # @param chkOver string chunk overlap (in bp, default=10000)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
129 # @param wordN string N stretch word length (default=11, 0 for no detection)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
130 # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
131 # @param clean boolean remove 'cut' and 'Nstretch' files
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
132 # @param verbose integer (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
133 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
134 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
135 def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
136 nbSeq = FastaUtils.dbSize(inFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
137 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
138 print "cut the %i input sequences with cutterDB..." % nbSeq
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
139 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
140
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
141 prg = "cutterDB"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
142 cmd = prg
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
143 cmd += " -l %s" % chkLgth
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
144 cmd += " -o %s" % chkOver
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
145 cmd += " -w %s" % wordN
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
146 cmd += " %s" % inFileName
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
147 returnStatus = os.system(cmd)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
148 if returnStatus != 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
149 msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
150 sys.stderr.write("%s\n" % msg)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
151 sys.exit(1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
152
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
153 nbChunks = FastaUtils.dbSize("%s_cut" % inFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
154 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
155 print "done (%i chunks)" % ( nbChunks )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
156 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
157
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
158 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
159 print "rename the headers..."
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
160 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
161
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
162 if outFilePrefix == "":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
163 outFastaName = inFileName + "_chunks.fa"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
164 outMapName = inFileName + "_chunks.map"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
165 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
166 outFastaName = outFilePrefix + ".fa"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
167 outMapName = outFilePrefix + ".map"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
168
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
169 with open("%s_cut" % inFileName) as inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
170 outFasta = open(outFastaName, "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
171 outMap = open(outMapName, "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
172
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
173 for line in inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
174 if line[0] == ">":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
175 if verbose > 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
176 print "rename '%s'" % (line[:-1]); sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
177 data = line[:-1].split(" ")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
178 seqID = data[0].split(">")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
179 newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks))))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
180 oldHeader = data[2]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
181 seqStart = data[4].split("..")[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
182 seqEnd = data[4].split("..")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
183 outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
184 outFasta.write(">%s\n" % newHeader)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
185
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
186 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
187 outFasta.write(line.upper())
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
188
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
189 outFasta.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
190 outMap.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
191
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
192 #stats on .Nstretch.map file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
193 genomeLength = FastaUtils.dbCumLength(open(inFileName))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
194 NstretchMapFile = inFileName + ".Nstretch.map"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
195 outNstrechStats = open('%s.NstretchStats.txt' % inFileName , "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
196 if FileUtils.isEmpty(NstretchMapFile) or not FileUtils.isRessourceExists(NstretchMapFile):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
197 outNstrechStats.write("No N in stretch length > %s\n" % wordN)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
198 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
199 with open(NstretchMapFile) as f:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
200 dHeader2lLengths = {}
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
201 for line in f:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
202 data = line.rstrip().split()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
203 header = data[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
204 length = int(data[3]) - int(data[2]) + 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
205 if header not in dHeader2lLengths:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
206 dHeader2lLengths[header] = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
207 dHeader2lLengths[header].append(length)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
208
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
209 for header in sorted(dHeader2lLengths):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
210 lLengths = dHeader2lLengths[header]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
211 outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths)))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
212
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
213 cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
214
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
215 NstretchPrct = float(cumulAllStretch)/genomeLength*100
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
216 outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
217 outNstrechStats.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
218
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
219 if clean == True:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
220 os.remove(inFileName + "_cut")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
221 os.remove(NstretchMapFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
222
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
223
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
224 ## Split the input fasta file in several output files
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
225 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
226 # @param inFile string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
227 # @param nbSeqPerBatch integer number of sequences per output file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
228 # @param newDir boolean put the sequences in a new directory called 'batches'
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
229 # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
230 # @param prefix prefix in output file name
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
231 # @param verbose integer verbosity level (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
232 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
233 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
234 def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
235 if not os.path.exists(inFile):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
236 msg = "ERROR: file '%s' doesn't exist" % inFile
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
237 sys.stderr.write("%s\n" % msg)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
238 sys.exit(1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
239
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
240 nbSeq = FastaUtils.dbSize(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
241
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
242 nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch)))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
243 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
244 print "save the %i input sequences into %i batches" % (nbSeq, nbBatches)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
245 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
246
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
247 if nbSeqPerBatch != 1 and useSeqHeader:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
248 useSeqHeader = False
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
249
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
250 if newDir == True:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
251 if os.path.exists("batches"):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
252 shutil.rmtree("batches")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
253 os.mkdir("batches")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
254 os.chdir("batches")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
255 os.system("ln -s ../%s ." % inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
256
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
257 with open(inFile) as inFileHandler:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
258 countBatch = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
259 countSeq = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
260 for line in inFileHandler:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
261 if line[0] == ">":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
262 countSeq += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
263 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
264 try:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
265 outFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
266 except: pass
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
267 countBatch += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
268 if useSeqHeader:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
269 outFileName = "%s.fa" % (line[1:-1].replace(" ", "_"))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
270 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
271 outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches))))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
272 outFile = open(outFileName, "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
273
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
274 if verbose > 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
275 print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
276 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
277 outFile.write(line)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
278
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
279 if newDir:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
280 os.remove(os.path.basename(inFile))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
281 os.chdir("..")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
282
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
283
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
284 ## Split the input fasta file in several output files
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
285 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
286 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
287 # @param maxSize integer max cumulative length for each output file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
288 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
289 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
290 def splitFastaFileInBatches(inFileName, maxSize = 200000):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
291 iBioseqDB = BioseqDB(inFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
292 lHeadersSizeTuples = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
293 for iBioseq in iBioseqDB.db:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
294 lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
295
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
296 lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
297 os.mkdir("batches")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
298 os.chdir("batches")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
299
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
300 iterator = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
301 for lHeader in lHeadersList:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
302 iterator += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
303 with open("batch_%s.fa" % iterator, 'w') as f:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
304 for header in lHeader :
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
305 iBioseq = iBioseqDB.fetch(header)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
306 iBioseq.write(f)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
307 os.chdir("..")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
308
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
309
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
310 ## Split the input fasta file in several output files according to their cluster identifier
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
311 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
312 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
313 # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
314 # @param simplifyHeader boolean simplify the headers
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
315 # @param createDir boolean put the sequences in different directories
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
316 # @param outPrefix string prefix of the output files (default='seqCluster')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
317 # @param verbose integer (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
318 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
319 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
320 def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
321 if not os.path.exists(inFileName):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
322 print "ERROR: %s doesn't exist" % inFileName
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
323 sys.exit(1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
324
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
325 inFile = open(inFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
326
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
327 line = inFile.readline()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
328 if line:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
329 name = line.split(" ")[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
330 if "Cluster" in name:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
331 clusterID = name.split("Cluster")[1].split("Mb")[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
332 seqID = name.split("Mb")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
333 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
334 clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
335 if "Q" in name.split("Gr")[0]:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
336 seqID = name.split("Gr")[0].split("MbQ")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
337 elif "S" in name:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
338 seqID = name.split("Gr")[0].split("MbS")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
339 sClusterIDs = set( [ clusterID ] )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
340 if simplifyHeader == True:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
341 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
342 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
343 header = line[1:-1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
344 if createDir == True:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
345 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
346 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
347 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
348 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
349 outFile = open( outFileName, "w" )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
350 outFile.write( ">%s\n" % ( header ) )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
351 prevClusterID = clusterID
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
352
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
353 line = inFile.readline()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
354 while line:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
355 if line[0] == ">":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
356 name = line.split(" ")[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
357 if "Cluster" in name:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
358 clusterID = name.split("Cluster")[1].split("Mb")[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
359 seqID = name.split("Mb")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
360 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
361 clusterID = name.split("Cl")[0].split("Gr")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
362 if "Q" in name.split("Gr")[0]:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
363 seqID = name.split("Gr")[0].split("MbQ")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
364 elif "S" in name:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
365 seqID = name.split("Gr")[0].split("MbS")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
366
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
367 if clusterID != prevClusterID:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
368 outFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
369
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
370 if simplifyHeader == True:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
371 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
372 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
373 header = line[1:-1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
374
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
375 if createDir == True:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
376 os.chdir( ".." )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
377 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
378 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
379 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
380
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
381 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
382 if not os.path.exists( outFileName ):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
383 outFile = open( outFileName, "w" )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
384 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
385 if clusterID != prevClusterID:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
386 outFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
387 outFile = open( outFileName, "a" )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
388 outFile.write( ">%s\n" % ( header ) )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
389 prevClusterID = clusterID
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
390 sClusterIDs.add( clusterID )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
391
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
392 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
393 outFile.write( line )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
394
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
395 line = inFile.readline()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
396
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
397 outFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
398 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
399 print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
400
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
401 if createDir == True:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
402 os.chdir("..")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
403 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
404 print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
405
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
406
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
407 ## Filter a fasta file in two fasta files using the length of each sequence as a criterion
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
408 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
409 # @param len_min integer length sequence criterion to filter
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
410 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
411 # @param verbose integer (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
412 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
413 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
414 def dbLengthFilter(len_min, inFileName, verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
415 file_db = open(inFileName,)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
416 file_dbInf = open(inFileName + ".Inf" + str(len_min), "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
417 file_dbSup = open(inFileName + ".Sup" + str(len_min), "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
418 seq = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
419 numseq = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
420 nbsaveInf = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
421 nbsaveSup = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
422 seq.read(file_db)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
423 while seq.getHeader():
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
424 l = seq.getLength()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
425 numseq = numseq + 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
426 if l >= len_min:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
427 seq.write(file_dbSup)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
428 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
429 nbsaveSup = nbsaveSup + 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
430 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
431 seq.write(file_dbInf)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
432 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
433 nbsaveInf = nbsaveInf + 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
434 seq.read(file_db)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
435
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
436 file_db.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
437 file_dbInf.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
438 file_dbSup.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
439 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
440 print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
441
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
442
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
443 ## Extract the longest sequences from a fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
444 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
445 # @param num integer maximum number of sequences in the output file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
446 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
447 # @param outFileName string name of the output fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
448 # @param minThresh integer minimum length threshold (default=0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
449 # @param verbose integer (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
450 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
451 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
452 def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
453 bsDB = BioseqDB(inFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
454 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
455 print "nb of input sequences: %i" % bsDB.getSize()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
456
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
457 if outFileName == "":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
458 outFileName = inFileName + ".best" + str(num)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
459 outFile = open( outFileName, "w" )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
460
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
461 if bsDB.getSize()==0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
462 return 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
463
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
464 num = int(num)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
465 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
466 print "keep the %i longest sequences" % num
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
467 if minThresh > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
468 print "with length > %i bp" % minThresh
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
469 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
470
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
471 tmpLSeqLgth = bsDB.getListOfSequencesLength()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
472 tmpLSeqLgth.sort(reverse = True)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
473
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
474 # select the longests
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
475 lSeqLgth = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
476 for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
477 if tmpLSeqLgth[i] >= minThresh:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
478 lSeqLgth.append( tmpLSeqLgth[i] )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
479 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
480 print "selected max length: %i" % max(lSeqLgth)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
481 print "selected min length: %i" % min(lSeqLgth)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
482 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
483
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
484 # save the longest
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
485 inFile = open( inFileName )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
486 seqNum = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
487 nbSave = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
488 for bs in bsDB.db:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
489 seqNum += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
490 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
491 bs.write( outFile )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
492 if verbose > 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
493 print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
494 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
495 nbSave += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
496 if nbSave == num:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
497 break
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
498 inFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
499 outFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
500 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
501 print nbSave, "saved sequences in ", outFileName
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
502 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
503
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
504 return 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
505
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
506
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
507 ## Extract all the sequence headers from a fasta file and write them in a new file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
508 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
509 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
510 # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
511 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
512 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
513 def dbExtractSeqHeaders(inFileName, outFileName = ""):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
514 if not outFileName:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
515 outFileName = inFileName + ".headers"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
516
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
517 with open(outFileName, "w") as f:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
518 for header in FastaUtils.dbHeaders(inFileName):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
519 f.write("%s\n" % header)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
520
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
521 return 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
522
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
523
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
524 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
525 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
526 # @param pattern regular expression to search in headers
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
527 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
528 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
529 # @param verbose integer verbosity level (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
530 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
531 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
532 def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
533 if not pattern:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
534 return
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
535
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
536 if not outFileName:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
537 outFileName = inFileName + '.extracted'
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
538 outFile = open(outFileName, 'w')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
539
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
540 patternTosearch = re.compile(pattern)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
541 bioseq = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
542 bioseqNb = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
543 savedBioseqNb = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
544 inFile = open(inFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
545 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
546 while bioseq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
547 bioseqNb = bioseqNb + 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
548 m = patternTosearch.search(bioseq.header)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
549 if m:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
550 bioseq.write(outFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
551 if verbose > 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
552 print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!'
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
553 savedBioseqNb = savedBioseqNb + 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
554 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
555 inFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
556
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
557 outFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
558
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
559 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
560 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
561
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
562
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
563 ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
564 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
565 # @param patternFileName string file containing regular expression to search in headers
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
566 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
567 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
568 # @param verbose integer verbosity level (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
569 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
570 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
571 def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
572 if not patternFileName:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
573 print "ERROR: no file of pattern"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
574 sys.exit(1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
575
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
576 bioseq = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
577 bioseqNb = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
578 savedBioseqNb = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
579 lHeaders = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
580
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
581 with open(inFileName) as inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
582 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
583 while bioseq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
584 lHeaders.append(bioseq.header)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
585 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
586
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
587 lHeadersToKeep = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
588 with open(patternFileName) as patternFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
589 for pattern in patternFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
590 pattern = pattern.rstrip()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
591 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
592 print "pattern: ", pattern; sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
593
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
594 patternToSearch = re.compile(pattern)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
595 lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
596
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
597 if not outFileName:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
598 outFileName = inFileName + ".extracted"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
599
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
600 with open(outFileName, "w") as outFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
601 with open(inFileName) as inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
602 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
603 while bioseq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
604 bioseqNb += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
605 if bioseq.header in lHeadersToKeep:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
606 bioseq.write(outFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
607 savedBioseqNb += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
608 if verbose > 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
609 print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
610 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
611
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
612 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
613 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
614
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
615
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
616 ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
617 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
618 # @param pattern regular expression to search in headers
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
619 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
620 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
621 # @param verbose integer verbosity level (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
622 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
623 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
624 def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
625 if not pattern:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
626 return
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
627
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
628 patternToSearch = re.compile(pattern)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
629
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
630 if outFileName == "":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
631 outFileName = inFileName + '.cleaned'
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
632
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
633 with open(outFileName, 'w') as outFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
634 bioseq = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
635 bioseqNb = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
636 savedBioseqNb = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
637 with open(inFileName) as inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
638 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
639 while bioseq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
640 bioseqNb += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
641 if not patternToSearch.search(bioseq.header):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
642 bioseq.write(outFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
643 savedBioseqNb += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
644 if verbose > 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
645 print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
646 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
647
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
648 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
649 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
650
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
651
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
652 ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
653 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
654 # @param patternFileName string file containing regular expression to search in headers
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
655 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
656 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
657 # @param verbose integer verbosity level (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
658 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
659 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
660 def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
661 if not patternFileName:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
662 print "ERROR: no file of pattern"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
663 sys.exit(1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
664
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
665 bioseq = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
666 bioseqNb = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
667 savedBioseqNb = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
668 lHeaders = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
669 with open(inFileName) as inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
670 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
671 while bioseq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
672 lHeaders.append(bioseq.header)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
673 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
674
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
675 with open(patternFileName) as patternFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
676 lHeadersToRemove = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
677 for pattern in patternFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
678 pattern = pattern.rstrip()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
679 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
680 print "pattern: ", pattern; sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
681
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
682 patternToSearch = re.compile(pattern)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
683 lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
684
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
685 if not outFileName:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
686 outFileName = inFileName + '.cleaned'
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
687
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
688 with open(outFileName, 'w') as outFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
689 bioseqNum = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
690 with open(inFileName) as inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
691 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
692 while bioseq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
693 bioseqNum += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
694 if bioseq.header not in lHeadersToRemove:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
695 bioseq.write(outFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
696 savedBioseqNb += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
697 if verbose > 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
698 print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
699 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
700
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
701 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
702 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
703
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
704
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
705 ## Find sequence's ORFs from a fasta file and write them in a tab file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
706 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
707 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
708 # @param orfMaxNb integer Select orfMaxNb best ORFs
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
709 # @param orfMinLength integer Keep ORFs with length > orfMinLength
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
710 # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
711 # @param verbose integer verbosity level (default = 0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
712 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
713 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
714 def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
715 if not outFileName:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
716 outFileName = inFileName + ".ORF.map"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
717 outFile = open(outFileName, "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
718
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
719 bioseq = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
720 bioseqNb = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
721
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
722 inFile = open(inFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
723 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
724 while bioseq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
725 bioseq.upCase()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
726 bioseqNb += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
727 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
728 print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]'
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
729
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
730 orf = bioseq.findORF()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
731 bestOrf = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
732 for i in orf.keys():
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
733 orfLen = len(orf[i])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
734 for j in xrange(1, orfLen):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
735 start = orf[i][j - 1] + 4
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
736 end = orf[i][j] + 3
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
737 if end - start >= orfMinLength:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
738 bestOrf.append((end - start, i + 1, start, end))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
739
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
740 bioseq.reverseComplement()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
741
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
742 orf = bioseq.findORF()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
743 seqLen = bioseq.getLength()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
744 for i in orf.keys():
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
745 orfLen = len(orf[i])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
746 for j in xrange(1, orfLen):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
747 start = seqLen - orf[i][j - 1] - 3
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
748 end = seqLen - orf[i][j] - 2
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
749 if start - end >= orfMinLength:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
750 bestOrf.append((start - end, (i + 1) * -1, start, end))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
751
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
752 bestOrf.sort(reverse = True)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
753 bestOrfNb = len(bestOrf)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
754 if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
755 bestOrfNb = orfMaxNb
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
756 for i in xrange(0, bestOrfNb):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
757 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
758 print bestOrf[i]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
759 outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
760 "|" + str(bestOrf[i][0]), bioseq.header,
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
761 bestOrf[i][2], bestOrf[i][3]))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
762 bioseq.read(inFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
763
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
764 inFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
765 outFile.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
766
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
767 return 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
768
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
769
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
770 ## Sort sequences by increasing length (write a new file)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
771 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
772 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
773 # @param outFileName string name of the output fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
774 # @param verbose integer verbosity level
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
775 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
776 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
777 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
778 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
779 print "sort sequences by increasing length"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
780 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
781 if not os.path.exists(inFileName):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
782 print "ERROR: file '%s' doesn't exist" % (inFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
783 sys.exit(1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
784
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
785 # read each seq one by one
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
786 # save them in distinct temporary files
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
787 # with their length in the name
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
788 inFileHandler = open(inFileName, "r")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
789 countSeq = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
790 bs = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
791 bs.read(inFileHandler)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
792 while bs.header:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
793 countSeq += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
794 tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
795 bs.appendBioseqInFile(tmpFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
796 if verbose > 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
797 print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
798 bs.header = ""
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
799 bs.sequence = ""
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
800 bs.read(inFileHandler)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
801 inFileHandler.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
802
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
803 # sort temporary file names
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
804 # concatenate them into the output file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
805 if os.path.exists(outFileName):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
806 os.remove(outFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
807 lFiles = glob.glob("*bp_*nb")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
808 lFiles.sort(key = lambda s:int(s.split("bp_")[0]))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
809 for fileName in lFiles:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
810 cmd = "cat %s >> %s" % (fileName, outFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
811 returnValue = os.system(cmd)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
812 if returnValue != 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
813 print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
814 sys.exit(1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
815 os.remove(fileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
816
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
817 return 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
818
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
819
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
820 ## Sort sequences by header
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
821 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
822 # @param inFileName string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
823 # @param outFileName string name of the output fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
824 # @param verbose integer verbosity level
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
825 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
826 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
827 def sortSequencesByHeader(inFileName, outFileName = ""):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
828 if outFileName == "":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
829 outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
830 iBioseqDB = BioseqDB(inFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
831 with open(outFileName, "w") as f:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
832 for header in sorted(iBioseqDB.getHeaderList()):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
833 iBioseq = iBioseqDB.fetch(header)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
834 iBioseq.write(f)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
835
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
836
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
837 ## Return a dictionary which keys are the headers and values the length of the sequences
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
838 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
839 # @param inFile string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
840 # @param verbose integer verbosity level
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
841 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
842 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
843 def getLengthPerHeader(inFile, verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
844 dHeader2Length = {}
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
845
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
846 with open(inFile) as inFileHandler:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
847 currentSeqHeader = ""
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
848 currentSeqLength = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
849 for line in inFileHandler:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
850 if line[0] == ">":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
851 if currentSeqHeader != "":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
852 dHeader2Length[currentSeqHeader] = currentSeqLength
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
853 currentSeqLength = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
854 currentSeqHeader = line[1:-1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
855 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
856 print "current header: %s" % currentSeqHeader
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
857 sys.stdout.flush()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
858 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
859 currentSeqLength += len(line.replace("\n", ""))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
860 dHeader2Length[currentSeqHeader] = currentSeqLength
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
861
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
862 return dHeader2Length
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
863
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
864
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
865 ## Convert headers from a fasta file having chunk coordinates
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
866 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
867 # @param inFile string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
868 # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
869 # @param outFile string name of the output file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
870 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
871 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
872 def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
873 inFileHandler = open(inFile, "r")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
874 outFileHandler = open(outFile, "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
875 dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
876 iConvCoord = ConvCoord()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
877 for line in inFileHandler:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
878 if line[0] == ">":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
879 if "{Fragment}" in line:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
880 chkName = line.split(" ")[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
881 chrName = dChunk2Map[chkName].seqname
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
882 lCoordPairs = line.split(" ")[3].split(",")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
883 lRangesOnChk = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
884 for i in lCoordPairs:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
885 iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
886 lRangesOnChk.append(iRange)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
887 lRangesOnChr = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
888 for iRange in lRangesOnChk:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
889 lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
890 newHeader = line[1:-1].split(" ")[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
891 newHeader += " %s" % chrName
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
892 newHeader += " {Fragment}"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
893 newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
894 for iRange in lRangesOnChr[1:]:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
895 newHeader += ",%i..%i" % (iRange.start, iRange.end)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
896 outFileHandler.write(">%s\n" % newHeader)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
897 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
898 chkName = line.split("_")[1].split(" ")[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
899 chrName = dChunk2Map[chkName].seqname
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
900 coords = line[line.find("[")+1 : line.find("]")]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
901 start = int(coords.split(",")[0])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
902 end = int(coords.split(",")[1])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
903 iRangeOnChk = Range(chkName, start, end)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
904 iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
905 newHeader = line[1:-1].split("_")[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
906 newHeader += " %s" % chrName
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
907 newHeader += " %s" % line[line.find("(") : line.find(")")+1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
908 newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
909 outFileHandler.write(">%s\n" % newHeader)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
910 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
911 outFileHandler.write(line)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
912 inFileHandler.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
913 outFileHandler.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
914
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
915
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
916 ## Convert a fasta file to a length file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
917 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
918 # @param inFile string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
919 # @param outFile string name of the output file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
920 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
921 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
922 def convertFastaToLength(inFile, outFile = ""):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
923 if not outFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
924 outFile = "%s.length" % inFile
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
925
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
926 if inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
927 with open(inFile) as inFH:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
928 with open(outFile, "w") as outFH:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
929 bioseq = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
930 bioseq.read(inFH)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
931 while bioseq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
932 seqLen = bioseq.getLength()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
933 outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
934 bioseq.read(inFH)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
935
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
936
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
937 ## Convert a fasta file to a seq file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
938 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
939 # @param inFile string name of the input fasta file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
940 # @param outFile string name of the output file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
941 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
942 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
943 def convertFastaToSeq(inFile, outFile = ""):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
944 if not outFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
945 outFile = "%s.seq" % inFile
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
946
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
947 if inFile:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
948 with open(inFile) as inFH:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
949 with open(outFile, "w") as outFH:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
950 bioseq = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
951 bioseq.read(inFH)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
952 while bioseq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
953 seqLen = bioseq.getLength()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
954 outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
955 bioseq.sequence, bioseq.header, seqLen))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
956 bioseq.read(inFH)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
957
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
958
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
959 ## Splice an input fasta file using coordinates in a Map file
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
960 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
961 # @note the coordinates should be merged beforehand!
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
962 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
963 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
964 def spliceFromCoords(genomeFile, coordFile, obsFile):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
965 genomeFileHandler = open(genomeFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
966 obsFileHandler = open(obsFile, "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
967 dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
968
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
969 bs = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
970 bs.read(genomeFileHandler)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
971 while bs.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
972
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
973 if dChr2Maps.has_key(bs.header):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
974 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
975 splicedSeq = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
976 currentSite = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
977 for iMap in lCoords:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
978 minSplice = iMap.getMin() - 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
979 if minSplice > currentSite:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
980 splicedSeq += bs.sequence[currentSite : minSplice]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
981 if currentSite <= iMap.getMax():
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
982 currentSite = iMap.getMax()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
983 splicedSeq += bs.sequence[currentSite:]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
984 bs.sequence = "".join(splicedSeq)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
985 bs.write(obsFileHandler)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
986 bs.read(genomeFileHandler)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
987
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
988 genomeFileHandler.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
989 obsFileHandler.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
990
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
991
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
992 ## Shuffle input sequences (single file or files in a directory)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
993 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
994 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
995 def dbShuffle(inData, outData, verbose = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
996 if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
997 prg = "esl-shuffle"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
998 else : prg = "shuffle"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
999 genericCmd = prg + " --seed 1 -d INPUT > OUTPUT"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1000 if os.path.isfile(inData):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1001 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1002 print "shuffle input file '%s'" % inData
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1003 cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1004 print cmd
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1005 returnStatus = os.system(cmd)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1006 if returnStatus:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1007 sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1008 sys.exit(1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1009
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1010 elif os.path.isdir(inData):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1011 if verbose > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1012 print "shuffle files in input directory '%s'" % inData
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1013 if os.path.exists(outData):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1014 shutil.rmtree(outData)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1015 os.mkdir(outData)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1016 lInputFiles = glob.glob("%s/*.fa" % inData)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1017 nbFastaFiles = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1018 for inputFile in lInputFiles:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1019 nbFastaFiles += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1020 if verbose > 1:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1021 print "%3i / %3i" % (nbFastaFiles, len(lInputFiles))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1022 fastaBaseName = os.path.basename(inputFile)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1023 prefix = os.path.splitext(fastaBaseName)[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1024 cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1025 returnStatus = os.system(cmd)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1026 if returnStatus:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1027 sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1028 sys.exit(1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1029
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1030
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1031 ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1032 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1033 # @param inClusterFileName string input cluster file name
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1034 # @param inFastaFileName string input fasta file name
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1035 # @param outFileName string output file name
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1036 # @param verbosity integer verbosity
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1037 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1038 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1039 def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1040 dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1041 iFastaParser = FastaParser(inFastaFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1042 with open(outFileName, "w") as f:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1043 for iSequence in iFastaParser.getIterator():
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1044
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1045 header = iSequence.getName()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1046 if dHeader2ClusterClusterMember.get(header):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1047 cluster = dHeader2ClusterClusterMember[header][0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1048 member = dHeader2ClusterClusterMember[header][1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1049 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1050 clusterIdForSingletonCluster += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1051 cluster = clusterIdForSingletonCluster
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1052 member = 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1053
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1054 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1055 iSequence.setName(newHeader)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1056 f.write(iSequence.printFasta())
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1057
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1058 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1059 def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1060 dHeader2ClusterClusterMember = {}
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1061 clusterId = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1062 with open(inClusterFileName) as f:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1063 for line in f:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1064 lineWithoutLastChar = line.rstrip()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1065 lHeaders = lineWithoutLastChar.split("\t")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1066 clusterId += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1067 if verbosity > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1068 print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1069 memberId = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1070 for header in lHeaders:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1071 memberId += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1072 dHeader2ClusterClusterMember[header] = (clusterId, memberId)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1073 if verbosity > 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1074 print "%i clusters" % clusterId
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1075 return dHeader2ClusterClusterMember, clusterId
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1076
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1077 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1078 def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1079 """
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1080 Write a map file from fasta output of clustering tool.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1081 Warning: only works if input fasta headers are formated like LTRharvest fasta output.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1082 """
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1083 if not outMapFileName:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1084 outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1085
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1086 fileDb = open(fastaFileNameFromClustering)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1087 fileMap = open(outMapFileName, "w")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1088 seq = Bioseq()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1089 numseq = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1090 seq.read(fileDb)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1091 while seq.sequence:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1092 numseq = numseq + 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1093 ID = seq.header.split(' ')[0].split('_')[0]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1094 chunk = seq.header.split(' ')[0].split('_')[1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1095 start = seq.header.split(' ')[-1].split(',')[0][1:]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1096 end = seq.header.split(' ')[-1].split(',')[1][:-1]
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1097 line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1098 fileMap.write("%s\n" % line)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1099 seq.read(fileDb)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1100
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1101 fileDb.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1102 fileMap.close()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1103 print "saved in %s" % outMapFileName
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1104
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1105 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1106 def getNstretchesRangesList(fastaFileName, nbN = 2):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1107 lNstretchesRanges = []
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1108 if nbN != 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1109 iBSDB = BioseqDB(fastaFileName)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1110 for iBS in iBSDB.db:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1111 nbNFound = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1112 start = 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1113 pos = 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1114 lastPos = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1115
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1116 while pos <= iBS.getLength():
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1117 if nbNFound == 0:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1118 start = pos
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1119
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1120 while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1121 nbNFound += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1122 pos += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1123 lastPos = pos
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1124
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1125 if pos - lastPos >= nbN:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1126 if nbNFound >= nbN:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1127 lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1128 nbNFound = 0
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1129 pos += 1
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1130
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1131 if nbNFound >= nbN:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1132 lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1133
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1134 lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1135
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1136 return lNstretchesRanges
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1137
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1138
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1139 @staticmethod
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1140 def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1141 lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1142
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1143 outFormat = outFormat.lower()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1144 if outFormat in ["gff", "gff3"]:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1145 outFormat = "gff3"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1146 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1147 outFormat = "map"
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1148
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1149 if outFileName == "":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1150 outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1151
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1152 with open(outFileName, "w") as fH:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1153 if outFormat == "gff3":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1154 fH.write("##gff-version 3\n")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1155 for iRange in lNstretchesRanges:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1156 seq = iRange.getSeqname()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1157 start = iRange.getStart()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1158 end = iRange.getEnd()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1159 if outFormat == "gff3":
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1160 fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1161 else:
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1162 fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1163