comparison smRtools.py @ 0:b996480cd604 draft

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author drosofff
date Wed, 27 May 2015 17:19:15 -0400
parents
children ca3845fb0b31
comparison
equal deleted inserted replaced
-1:000000000000 0:b996480cd604
1 #!/usr/bin/python
2 # version 1 7-5-2012 unification of the SmRNAwindow class
3
4 import sys, subprocess
5 from collections import defaultdict
6 from numpy import mean, median, std
7 from scipy import stats
8
def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
    '''Return a dictionary {fasta_identifier: sequence} extracted from a bowtie
    index. `index` must be the path prefix of a bowtie index.
    bowtie-inspect -a 0 outputs each sequence on a single line, but multi-line
    records are still parsed correctly.'''
    p = subprocess.Popen(args=["bowtie-inspect","-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines
    outputlines = p.stdout.readlines()
    p.wait()
    item_dic = {}
    current_item = None
    stringlist = []
    for line in outputlines:
        if (line[0] == ">"):
            if current_item is not None:
                item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item
            current_item = line[1:].rstrip().split()[0] # take the first word before space because bowtie splits headers !
            stringlist = []
        else:
            stringlist.append(line.rstrip() )
    if current_item is not None: # flush the last item; also tolerates empty bowtie-inspect output (the original raised NameError there)
        item_dic[current_item] = "".join(stringlist)
    return item_dic
28
def get_fasta_headers (index):
    '''Return a dictionary whose keys are the fasta identifiers of a bowtie
    index (values are all set to 1 and carry no meaning).'''
    inspect_proc = subprocess.Popen(args=["bowtie-inspect","-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # -n prints one header per line
    header_lines = inspect_proc.stdout.readlines()
    inspect_proc.wait()
    headers = {}
    for raw_line in header_lines:
        first_word = raw_line.rstrip().split()[0] # keep the first word only: bowtie truncates headers at whitespace
        headers[first_word] = 1
    return headers
38
39
def get_file_sample (file, numberoflines):
    '''Return `numberoflines` randomly sampled lines of `file` (without line
    terminators), or an error string when the file has fewer lines than
    requested.'''
    import random # local import: the module header does not import random (the original relied on the caller importing it)
    with open(file) as F: # with-block guarantees the handle is closed (the original leaked it)
        fullfile = F.read().splitlines()
    if len(fullfile) < numberoflines:
        return "sample size exceeds file size"
    return random.sample(fullfile, numberoflines)
48
def get_fasta_from_history (file):
    '''Parse a fasta file into a dictionary {identifier: sequence}. Only the
    first whitespace-separated word of each header is kept, mirroring how
    bowtie truncates fasta headers.'''
    item_dic = {}
    current_item = None
    stringlist = []
    with open (file, "r") as F: # with-block guarantees the handle is closed (the original leaked it)
        for line in F:
            if (line[0] == ">"):
                if current_item is not None:
                    item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item
                current_item = line[1:-1].split()[0] # take the first word before space because bowtie splits headers !
                stringlist = []
            else:
                stringlist.append(line[:-1])
    if current_item is not None: # flush the last item; also tolerates an empty file (the original raised NameError there)
        item_dic[current_item] = "".join(stringlist)
    return item_dic
64
def antipara (sequence):
    '''Return the reverse complement of a DNA sequence (N maps to N).'''
    complement = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
    return "".join(complement[nucleotide] for nucleotide in reversed(sequence))
69
def RNAtranslate (sequence):
    '''Return the RNA form of a DNA sequence: any base that is not A, G, C
    or N becomes U (in practice the Ts).'''
    rna_chars = []
    for base in sequence:
        rna_chars.append(base if base in "AGCN" else "U")
    return "".join(rna_chars)
def DNAtranslate (sequence):
    '''Return the DNA form of an RNA sequence: any base that is not A, G, C
    or N becomes T (in practice the Us).'''
    return "".join(map(lambda base: base if base in "AGCN" else "T", sequence))
74
def RNAfold (sequence_list):
    '''Run RNAfold --noPS on a list of sequences and return a dictionary
    mapping each DNA-translated sequence to its folding energy (as a string).'''
    rnafold_input = "\n".join(sequence_list)
    process = subprocess.Popen(args=["RNAfold","--noPS"], stdin= subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    raw_output = process.communicate(rnafold_input)[0]
    process.wait()
    lines = raw_output.split("\n")
    if lines and not lines[-1]:
        del lines[-1] # drop the trailing empty line produced by the final newline
    parsed = []
    for line in lines:
        first_char = line[0]
        if first_char in "NATUGC": # sequence line
            parsed.append(DNAtranslate(line))
        if first_char in "(.)": # structure line: the free energy is the trailing "( -x.xx)" field
            energy_field = line.split("(")[-1]
            parsed.append(str(float(energy_field[:-1]))) # strip the closing parenthesis before conversion
    # alternating sequence/energy entries -> {sequence: energy}
    return dict(zip(parsed[::2], parsed[1::2]))
93
def extractsubinstance (start, end, instance):
    '''Return a new SmRNAwindow spanning [start, end] (1-based, inclusive) of
    `instance`, carrying over the reads mapped inside that window.
    Kept as a function external to the class to save memory.'''
    subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start)
    subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1)
    for i in range(start, end+1):
        # dict.has_key() replaced by the `in` operator (has_key is deprecated and removed in Python 3)
        if i in instance.readDict: # forward reads
            subinstance.readDict[i] = instance.readDict[i]
        if -i in instance.readDict: # reverse reads are stored under negative offsets
            subinstance.readDict[-i] = instance.readDict[-i]
    return subinstance
105
class HandleSmRNAwindows:
    '''Builds one SmRNAwindow instance per item of the genome reference and
    feeds each of them with the reads found in an alignment file (tabular
    bowtie output, or sam/bam through pysam).'''

    def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined", size_inf=None, size_sup=1000, norm=1.0):
        # size_inf/size_sup: optional read-size window; filtering is active only when size_inf is truthy
        # norm: normalization factor forwarded to every SmRNAwindow
        self.biosample = biosample
        self.alignmentFile = alignmentFile
        self.alignmentFileFormat = alignmentFileFormat # can be "tabular" or "sam"
        self.genomeRefFile = genomeRefFile
        self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource"
        self.alignedReads = 0
        self.instanceDict = {}
        self.size_inf=size_inf
        self.size_sup=size_sup
        self.norm=norm
        if genomeRefFormat == "bowtieIndex":
            self.itemDict = get_fasta (genomeRefFile)
        elif genomeRefFormat == "fastaSource":
            self.itemDict = get_fasta_from_history (genomeRefFile)
        for item in self.itemDict:
            self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample, norm=self.norm) # create as many instances as there is items
        self.readfile()

    def readfile (self) :
        '''Parse self.alignmentFile and dispatch every read (optionally size
        filtered) to the SmRNAwindow of its reference item; returns
        self.instanceDict. NOTE(review): a read mapped to an item absent from
        the reference raises KeyError here — confirm inputs are always aligned
        against the same reference.'''
        if self.alignmentFileFormat == "tabular":
            F = open (self.alignmentFile, "r")
            for line in F:
                fields = line.split()
                polarity = fields[1]
                gene = fields[2]
                offset = int(fields[3])
                size = len (fields[4])
                if self.size_inf: # size filtering requested
                    if (size>=self.size_inf and size<= self.size_sup):
                        self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
                        self.alignedReads += 1
                else:
                    self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
                    self.alignedReads += 1
            F.close()
            return self.instanceDict
        # legacy text-based sam parser, kept for reference; sam files are now handled by the pysam branch below
        # elif self.alignmentFileFormat == "sam":
        #     F = open (self.alignmentFile, "r")
        #     dict = {"0":"+", "16":"-"}
        #     for line in F:
        #         if line[0]=='@':
        #             continue
        #         fields = line.split()
        #         if fields[2] == "*": continue
        #         polarity = dict[fields[1]]
        #         gene = fields[2]
        #         offset = int(fields[3])
        #         size = len (fields[9])
        #         if self.size_inf:
        #             if (size>=self.size_inf and size<= self.size_sup):
        #                 self.instanceDict[gene].addread (polarity, offset, size)
        #                 self.alignedReads += 1
        #         else:
        #             self.instanceDict[gene].addread (polarity, offset, size)
        #             self.alignedReads += 1
        #     F.close()
        elif self.alignmentFileFormat == "bam" or self.alignmentFileFormat == "sam":
            import pysam
            samfile = pysam.Samfile(self.alignmentFile)
            for read in samfile:
                if read.tid == -1:
                    continue # filter out unaligned reads
                if read.is_reverse:
                    polarity="-"
                else:
                    polarity="+"
                gene = samfile.getrname(read.tid)
                offset = read.pos
                size = read.qlen
                if self.size_inf: # size filtering requested
                    if (size>=self.size_inf and size<= self.size_sup):
                        self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
                        self.alignedReads += 1
                else:
                    self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
                    self.alignedReads += 1
            return self.instanceDict

    # legacy implementation kept for reference
    # def size_histogram (self):
    #     size_dict={}
    #     size_dict['F']= defaultdict (int)
    #     size_dict['R']= defaultdict (int)
    #     size_dict['both'] = defaultdict (int)
    #     for item in self.instanceDict:
    #         buffer_dict_F = self.instanceDict[item].size_histogram()['F']
    #         buffer_dict_R = self.instanceDict[item].size_histogram()['R']
    #         for size in buffer_dict_F:
    #             size_dict['F'][size] += buffer_dict_F[size]
    #         for size in buffer_dict_R:
    #             size_dict['R'][size] -= buffer_dict_R[size]
    #     allSizeKeys = list (set (size_dict['F'].keys() + size_dict['R'].keys() ) )
    #     for size in allSizeKeys:
    #         size_dict['both'][size] = size_dict['F'][size] + size_dict['R'][size]
    #     return size_dict
    def size_histogram (self): # in HandleSmRNAwindows
        '''refactored on 7-9-2014 to debug size_histogram tool.
        Merge the per-window size histograms into global "F", "R" and "both"
        histograms; reverse counts are negative (see SmRNAwindow.size_histogram).'''
        size_dict={}
        size_dict['F']= defaultdict (float)
        size_dict['R']= defaultdict (float)
        size_dict['both'] = defaultdict (float)
        for item in self.instanceDict:
            buffer_dict = self.instanceDict[item].size_histogram()
            for polarity in ["F", "R"]:
                for size in buffer_dict[polarity]:
                    size_dict[polarity][size] += buffer_dict[polarity][size]
            for size in buffer_dict["both"]:
                size_dict["both"][size] += buffer_dict["F"][size] - buffer_dict["R"][size]
        return size_dict

    def CountFeatures (self, GFF3="path/to/file"):
        '''Count reads per feature type listed in a GFF3 annotation file.
        NOTE(review): readcount is called with method="destructive", which
        consumes the readDict of the windows — the handler cannot be reused
        for further counting afterwards; confirm callers expect this.'''
        featureDict = defaultdict(int)
        F = open (GFF3, "r")
        for line in F:
            if line[0] == "#": continue
            fields = line[:-1].split()
            chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6]
            featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive")
        F.close()
        return featureDict
227
class SmRNAwindow:
    '''Small RNA read container for one reference sequence (a "window").

    Reads are stored in self.readDict as {offset: [size1, size2, ...]}:
    forward reads under their positive 1-based 5' offset, reverse reads under
    the NEGATIVE of their 5' end coordinate (offset + size - 1).'''

    def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined", norm=1.0):
        self.biosample = biosample
        self.sequence = sequence
        self.gene = gene
        self.windowoffset = windowoffset # 1-based coordinate of the first base of the window
        self.size = len(sequence)
        self.readDict = defaultdict(list) # with a {+/-offset:[size1, size2, ...], ...} structure
        self.matchedreadsUp = 0 # running total of forward reads
        self.matchedreadsDown = 0 # running total of reverse reads
        self.norm=norm # normalization factor used by readplot and size_histogram

    def addread (self, polarity, offset, size):
        '''Register a single read. ATTENTION: `offset` must already be 1-based;
        the 0-based to 1-based conversion is done during read parsing, not here.'''
        if polarity == "+":
            self.readDict[offset].append(size)
            self.matchedreadsUp += 1
        else:
            self.readDict[-(offset + size -1)].append(size) # reverse reads are keyed by the negative of their 5' end coordinate
            self.matchedreadsDown += 1
        return

    def barycenter (self, upstream_coord=None, downstream_coord=None):
        '''Return (forward, reverse) barycenters: read-count weighted average
        coordinates rescaled by the window size.
        refactored 24-12-2013 to save memory and introduce offset filtering,
        see the readcount method for further discussion; a list of tuples
        replaces the dictionary structure to save memory too.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset+self.size-1
        window_size = downstream_coord - upstream_coord +1
        def weigthAverage (TuppleList):
            # weighted average of (coordinate, weight) pairs; 0 when there is no weight
            weightSum = 0
            PonderWeightSum = 0
            for coordinate, weight in TuppleList: # loop variable renamed: the original shadowed the builtin `tuple`
                PonderWeightSum += coordinate * weight
                weightSum += weight
            if weightSum > 0:
                return PonderWeightSum / float(weightSum)
            else:
                return 0
        forwardTuppleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both forward and in the proper offset window
        reverseTuppleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both reverse and in the proper offset window
        Fbarycenter = (weigthAverage (forwardTuppleList) - upstream_coord) / window_size
        Rbarycenter = (weigthAverage (reverseTuppleList) - upstream_coord) / window_size
        return Fbarycenter, Rbarycenter

    def correlation_mapper (self, reference, window_size):
        '''Map Spearman correlation with a sliding window (26-2-2013).
        `reference` is a text file whose columns 2 and 3 hold forward and
        reverse counts. Returns [coordinate, r_forward, r_reverse] triples for
        windows where either correlation exceeds 0.2.'''
        if window_size > self.size:
            return []
        F=open(reference, "r")
        reference_forward = []
        reference_reverse = []
        for line in F:
            fields=line.split()
            reference_forward.append(int(float(fields[1])))
            reference_reverse.append(int(float(fields[2])))
        F.close()
        local_object_forward=[]
        local_object_reverse=[]
        ## Dict to list for the local object
        for i in range(1, self.size+1):
            local_object_forward.append(len(self.readDict[i]))
            local_object_reverse.append(len(self.readDict[-i]))
        ## start compiling results by slides
        results=[]
        for coordinate in range(self.size - window_size):
            local_forward=local_object_forward[coordinate:coordinate + window_size]
            local_reverse=local_object_reverse[coordinate:coordinate + window_size]
            if sum(local_forward) == 0 or sum(local_reverse) == 0:
                continue
            try:
                reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward)
                reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse)
                if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2):
                    results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]])
            except: # scipy may raise on degenerate inputs; such windows are simply skipped
                pass
        return results

    def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"):
        '''Count the reads of [upstream_coord, downstream_coord] whose size is
        in [size_inf, size_sup]. `polarity` is "both", "forward" or "reverse".
        method="destructive" deletes the visited offsets from readDict (used by
        CountFeatures so that a read is counted only once).
        refactored 24-12-2013: default coordinates cannot refer to the instance
        (defaults are evaluated before instantiation), hence the None-then-test
        trick.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset+self.size-1
        # fast paths using the running totals — only valid for the whole window
        # AND when no size filtering is requested (the original returned early
        # even with non-default size bounds, silently ignoring the filter)
        if size_inf == 0 and size_sup == 1000:
            if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both":
                return self.matchedreadsUp + self.matchedreadsDown
            if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward":
                return self.matchedreadsUp
            if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse":
                return self.matchedreadsDown
        n=0
        if polarity == "both":
            for offset in range(upstream_coord, downstream_coord+1):
                if offset in self.readDict: # has_key() is removed in Python 3
                    for read in self.readDict[offset]:
                        if (read>=size_inf and read<= size_sup):
                            n += 1
                    if method != "conservative":
                        del self.readDict[offset] ## Careful! precludes re-use of the self.readDict dictionary !!!!!! TEST
                if -offset in self.readDict:
                    for read in self.readDict[-offset]:
                        if (read>=size_inf and read<= size_sup):
                            n += 1
                    if method != "conservative":
                        del self.readDict[-offset]
            return n
        elif polarity == "forward":
            for offset in range(upstream_coord, downstream_coord+1):
                if offset in self.readDict:
                    for read in self.readDict[offset]:
                        if (read>=size_inf and read<= size_sup):
                            n += 1
            return n
        elif polarity == "reverse":
            for offset in range(upstream_coord, downstream_coord+1):
                if -offset in self.readDict:
                    for read in self.readDict[-offset]:
                        if (read>=size_inf and read<= size_sup):
                            n += 1
            return n

    def readsizes (self):
        '''Return a dictionary of read counts keyed by read size, with the
        missing sizes between min and max filled with 0.'''
        dicsize = {}
        for offset in self.readDict:
            for size in self.readDict[offset]:
                dicsize[size] = dicsize.get(size, 0) + 1
        # bug fix: the original looped on `offset` but wrote dicsize[size],
        # repeatedly touching the last `size` value and leaving gaps unfilled
        for size in range (min(dicsize.keys()), max(dicsize.keys())+1):
            dicsize[size] = dicsize.get(size, 0) # to fill missing sizes with null values
        return dicsize

    def size_histogram(self, minquery=None, maxquery=None): # in SmRNAwindow
        '''refactored on 7-9-2014 to debug size_histogram tool.
        Return {"F", "R", "both"} histograms of read sizes scaled by self.norm;
        reverse counts are stored as negative values. When minquery is given,
        every size of [minquery, maxquery] is present in each histogram.'''
        norm=self.norm
        size_dict={}
        size_dict['F']= defaultdict (float)
        size_dict['R']= defaultdict (float)
        size_dict['both']= defaultdict (float)
        for offset in self.readDict:
            for size in self.readDict[offset]:
                if offset < 0:
                    size_dict['R'][size] = size_dict['R'][size] - 1*norm # reverse counts are negative
                else:
                    size_dict['F'][size] = size_dict['F'][size] + 1*norm
        ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate !
        if not (size_dict['F']) and (not size_dict['R']):
            size_dict['F'][21] = 0
            size_dict['R'][21] = 0
        ##
        # set union replaces the Python 2-only concatenation of two .keys() lists
        allSizeKeys = list (set (size_dict['F'].keys()) | set (size_dict['R'].keys()))
        for size in allSizeKeys:
            size_dict['both'][size] = size_dict['F'][size] - size_dict['R'][size]
        if minquery:
            for polarity in size_dict.keys():
                for size in range(minquery, maxquery+1):
                    if not size in size_dict[polarity]:
                        size_dict[polarity][size]=0
        return size_dict

    def statsizes (self, upstream_coord=None, downstream_coord=None):
        '''Return (mean, median, standard deviation) of the read sizes found in
        [upstream_coord, downstream_coord] (defaults: the whole window; see the
        readcount method for the None-then-test rationale).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset+self.size-1
        L = []
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                L.append(size)
        meansize = mean(L)
        stdv = std(L)
        mediansize = median(L)
        return meansize, mediansize, stdv

    def foldEnergy (self, upstream_coord=None, downstream_coord=None):
        '''Return the RNAfold folding energy (float) of the window
        sub-sequence (defaults: the whole window; see readcount).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset+self.size-1
        Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ])
        return float(Energy[self.sequence[upstream_coord-1:downstream_coord]])

    def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None):
        '''Return the percentage of reads (size in `size_scope`, an iterable)
        whose 5' base is a U (T on the genome), or "." when no read matches.
        Reverse-read start bases are complemented before counting.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset+self.size-1
        freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0}
        convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                if size in size_scope:
                    startbase = self.sequence[abs(offset)-self.windowoffset]
                    if offset < 0:
                        startbase = convertDic[startbase]
                    freqDic[startbase] += 1
        base_sum = float ( sum( freqDic.values()) )
        if base_sum == 0:
            return "."
        else:
            return freqDic["T"] / base_sum * 100

    def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None):
        '''Strand-aware version of Ufreq: return "forward | reverse" U
        percentages ("." for a polarity with no matching read).
        `size_scope` must be an iterable of read sizes.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset+self.size-1
        freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0}
        convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                if size in size_scope:
                    startbase = self.sequence[abs(offset)-self.windowoffset]
                    if offset < 0:
                        startbase = convertDic[startbase]
                        freqDic[startbase+"rev"] += 1
                    else:
                        freqDic[startbase+"for"] += 1
        forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"])
        reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"])
        if forward_sum == 0 and reverse_sum == 0:
            return ". | ."
        elif reverse_sum == 0:
            return "%s | ." % (freqDic["Tfor"] / forward_sum * 100)
        elif forward_sum == 0:
            return ". | %s" % (freqDic["Trev"] / reverse_sum * 100)
        else:
            return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100)

    def readplot (self):
        '''Return "gene<TAB>offset<TAB>count<TAB>polarity" lines; counts are
        scaled by self.norm and reverse counts are reported as negative.'''
        norm=self.norm
        readmap = {}
        for offset in self.readDict.keys():
            readmap[abs(offset)] = ( len(self.readDict.get(-abs(offset),[]))*norm , len(self.readDict.get(abs(offset),[]))*norm )
        mylist = []
        for offset in sorted(readmap):
            if readmap[offset][1] != 0:
                mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") )
            if readmap[offset][0] != 0:
                mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") )
        ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate !
        if not mylist:
            mylist.append("%s\t%s\t%s\t%s" % (self.gene, 1, 0, "F") )
        ###
        return mylist

    def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None):
        '''Used by the MirParser tool. Compute per-base coverage of the
        dominant polarity over [upstream_coord, downstream_coord] and return
        tab-separated lines: biosample, window, position, relative position,
        coverage, relative coverage, major polarity.'''
        upstream_coord = upstream_coord or 1
        downstream_coord = downstream_coord or self.size
        windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord)
        forORrev_coverage = dict ([(i,0) for i in range(1, downstream_coord-upstream_coord+1)])
        totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward")
        totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse")
        if totalforward > totalreverse:
            majorcoverage = "forward"
            for offset in self.readDict.keys():
                if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ):
                    for read in self.readDict[offset]:
                        for i in range(read):
                            try:
                                forORrev_coverage[offset-upstream_coord+1+i] += 1
                            except KeyError:
                                continue # a sense read may span over the downstream limit
        else:
            majorcoverage = "reverse"
            for offset in self.readDict.keys():
                if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ):
                    for read in self.readDict[offset]:
                        for i in range(read):
                            try:
                                forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage
                            except KeyError:
                                continue # an antisense read may span over the upstream limit
        output_list = []
        maximum = max (forORrev_coverage.values()) or 1 # avoid a zero division when the window is empty
        for n in sorted (forORrev_coverage):
            output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage))
        return "\n".join(output_list)

    def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None):
        '''Overlap ("ping-pong") signature: for every relative overlap i of
        `scope` (a python iterable of relative offsets), count the pairings
        between query reads (size in [minquery, maxquery]) and target reads
        (size in [mintarget, maxtarget]); optional z-score normalization.
        See readcount for the coordinate-defaulting trick.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset+self.size-1
        query_range = range (minquery, maxquery+1)
        target_range = range (mintarget, maxtarget+1)
        Query_table = {}
        Target_table = {}
        frequency_table = dict ([(i, 0) for i in scope])
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                if size in query_range:
                    Query_table[offset] = Query_table.get(offset, 0) + 1
                if size in target_range:
                    Target_table[offset] = Target_table.get(offset, 0) + 1
        for offset in Query_table:
            for i in scope:
                frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0))
        if minquery==mintarget and maxquery==maxtarget: ## division by 2 when query and target groups are identical (26/11/2013), see signature_options.py and lattice_signature.py
            frequency_table = dict([(i,frequency_table[i]//2) for i in frequency_table]) # floor division preserves the original Python 2 integer `/` semantics
        if zscore == "yes":
            z_mean = mean(list(frequency_table.values()) ) # list() so numpy also accepts Python 3 dict views
            z_std = std(list(frequency_table.values()) )
            if z_std == 0:
                frequency_table = dict([(i,0) for i in frequency_table] )
            else:
                frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] )
        return frequency_table

    def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None):
        '''Hannon-style overlap signature: each query read contributes a
        frequency normalized by its number of targets and by the total number
        of query reads. `scope` is an iterable of relative offsets; see
        readcount for the coordinate-defaulting trick.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset+self.size-1
        query_range = range (minquery, maxquery+1)
        target_range = range (mintarget, maxtarget+1)
        Query_table = {}
        Target_table = {}
        Total_Query_Numb = 0
        general_frequency_table = dict ([(i,0) for i in scope])
        ## filtering the appropriate reads for the study
        for offset in self.readDict:
            if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
            for size in self.readDict[offset]:
                if size in query_range:
                    Query_table[offset] = Query_table.get(offset, 0) + 1
                    Total_Query_Numb += 1
                if size in target_range:
                    Target_table[offset] = Target_table.get(offset, 0) + 1
        for offset in Query_table:
            frequency_table = dict ([(i,0) for i in scope])
            number_of_targets = 0
            for i in scope:
                frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0)
                number_of_targets += Target_table.get(-offset -i +1, 0)
            for i in scope:
                try:
                    general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i]
                except ZeroDivisionError : # a query read without any target contributes nothing
                    continue
        return general_frequency_table

    def phasing (self, size_range, scope):
        '''Autocorrelation-like phasing signal for reads with size in
        `size_range`; `scope` must be a python iterable of relative offsets.'''
        read_table = {}
        total_read_number = 0
        general_frequency_table = dict ([(i, 0) for i in scope])
        ## read input filtering
        for offset in self.readDict:
            for size in self.readDict[offset]:
                if size in size_range:
                    read_table[offset] = read_table.get(offset, 0) + 1
                    total_read_number += 1
        ## per offset read phasing computing
        for offset in read_table:
            frequency_table = dict ([(i, 0) for i in scope]) # local frequency table
            number_of_targets = 0
            for i in scope:
                if offset > 0:
                    frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0)
                    number_of_targets += read_table.get(offset + i, 0)
                else:
                    frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0)
                    number_of_targets += read_table.get(offset - i, 0)
            ## inclusion of local frequency table in the general frequency table (all offsets average)
            for i in scope:
                try:
                    general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i]
                except ZeroDivisionError :
                    continue
        return general_frequency_table

    def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
        '''Return the z-score-normalized signature over `scope` (a python
        iterable of relative offsets); all zeros when the signature is flat.'''
        frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
        frequency_list = [frequency_table[i] for i in sorted (frequency_table)]
        if std(frequency_list):
            meanlist = mean(frequency_list)
            stdlist = std(frequency_list)
            z_list = [(i-meanlist)/stdlist for i in frequency_list]
            return dict (zip (sorted(frequency_table), z_list) )
        else:
            return dict (zip (sorted(frequency_table), [0 for i in frequency_table]) )

    def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
        '''Return the signature expressed as a percentage of the total number
        of reads in the query/target size ranges.'''
        frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
        # set union replaces the Python 2-only concatenation of two range() lists
        # NOTE(review): these ranges exclude maxquery/maxtarget themselves, unlike
        # the rest of the class — confirm this is intended
        total = float(sum ([self.readsizes().get(i,0) for i in set(range(minquery,maxquery)) | set(range(mintarget,maxtarget))]) )
        if total == 0:
            return dict( [(i,0) for i in scope])
        return dict( [(i, frequency_table[i]/total*100) for i in scope])

    def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget):
        '''Return the polarity-prefixed RNA sequences of the query/target read
        pairs overlapping by `overlap` nucleotides (one query matched with one
        target, up to the smaller of the two pools).'''
        queryhash = defaultdict(list)
        targethash = defaultdict(list)
        query_range = range (int(minquery), int(maxquery)+1)
        target_range = range (int(mintarget), int(maxtarget)+1)
        paired_sequences = []
        for offset in self.readDict: # selection of data
            for size in self.readDict[offset]:
                if size in query_range:
                    queryhash[offset].append(size)
                if size in target_range:
                    targethash[offset].append(size)
        for offset in queryhash:
            matched_offset = -offset - overlap + 1 # the original computed the same value in both polarity branches
            if targethash[matched_offset]:
                paired = min ( len(queryhash[offset]), len(targethash[matched_offset]) )
                if offset >= 0:
                    for i in range (paired):
                        paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) )
                        paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) )
                if offset < 0:
                    for i in range (paired):
                        paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) )
                        paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) )
        return paired_sequences

    def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget):
        '''Like pairer, but report ALL pairable reads on each side instead of
        min-matched pairs.'''
        queryhash = defaultdict(list)
        targethash = defaultdict(list)
        query_range = range (int(minquery), int(maxquery)+1)
        target_range = range (int(mintarget), int(maxtarget)+1)
        paired_sequences = []
        for offset in self.readDict: # selection of data
            for size in self.readDict[offset]:
                if size in query_range:
                    queryhash[offset].append(size)
                if size in target_range:
                    targethash[offset].append(size)
        for offset in queryhash:
            matched_offset = -offset - overlap + 1
            if targethash[matched_offset]:
                if offset >= 0:
                    for i in queryhash[offset]:
                        paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+i]) )
                    for i in targethash[matched_offset]:
                        paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-i+1:-matched_offset+1]) ) )
                if offset < 0:
                    for i in queryhash[offset]:
                        paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-i+1:-offset+1]) ) )
                    for i in targethash[matched_offset]:
                        paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+i] ) )
        return paired_sequences

    def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget):
        ''' revision of pairable on 3-12-2012, with focus on the offset shift
        problem (bowtie uses 1-based coordinates whereas python strings are
        0-based); returns bowtie-like tabular lines for the pairable query reads.'''
        queryhash = defaultdict(list)
        targethash = defaultdict(list)
        query_range = range (int(minquery), int(maxquery)+1)
        target_range = range (int(mintarget), int(maxtarget)+1)
        bowtie_output = []
        for offset in self.readDict: # selection of data
            for size in self.readDict[offset]:
                if size in query_range:
                    queryhash[offset].append(size)
                if size in target_range:
                    targethash[offset].append(size)
        counter = 0
        for offset in queryhash:
            matched_offset = -offset - overlap + 1
            if targethash[matched_offset]:
                if offset >= 0:
                    for i in queryhash[offset]:
                        counter += 1
                        bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i]) ) # mind the 1-based/0-based offset shift
                if offset < 0:
                    for i in queryhash[offset]:
                        counter += 1
                        bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # mind the 1-based/0-based offset shift
        return bowtie_output
734
735
def __main__(bowtie_index_path, bowtie_output_path):
    '''Standalone entry point: load the reference sequences from a bowtie
    index, fill one SmRNAwindow per gene from a tabular bowtie output, and
    print the 19 nt-overlapping 19-23 nt read pairs of each gene.
    NOTE(review): unlike HandleSmRNAwindows.readfile, no 0- to 1-based offset
    correction (offset+1) is applied here — confirm this is intended.'''
    sequenceDic = get_fasta (bowtie_index_path)
    objDic = {}
    F = open (bowtie_output_path, "r") # F is the bowtie output taken as input
    for line in F:
        fields = line.split()
        polarity = fields[1]
        gene = fields[2]
        offset = int(fields[3])
        size = len (fields[4])
        try:
            objDic[gene].addread (polarity, offset, size)
        except KeyError: # first read of this gene: create its window on the fly
            objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
            objDic[gene].addread (polarity, offset, size)
    F.close()
    for gene in objDic:
        print gene, objDic[gene].pairer(19,19,23,19,23)

if __name__ == "__main__" : __main__(sys.argv[1], sys.argv[2])