comparison data_manager_build_alfa_indexes/data_manager/ALFA.py @ 4:6f0be85be8fb draft

Uploaded
author charles-bernard
date Thu, 27 Oct 2016 06:49:58 -0400
parents
children
comparison
equal deleted inserted replaced
3:4ada30780811 4:6f0be85be8fb
1 #!/usr/bin/python
2 #-*- coding: utf-8 -*-
3
4 __author__ = 'noel & bahin'
5 ''' <decription> '''
6
7 import argparse
8 import os
9 import numpy
10 import sys
11 import subprocess
12 import matplotlib.pyplot as plt
13 import matplotlib.cm as cmx
14 import matplotlib.colors as colors
15 import matplotlib.patheffects as PathEffects
16 import re
17 from matplotlib.backends.backend_pdf import PdfPages
18 # To correctly embbed the texts when saving plots in svg format
19 import matplotlib
20 matplotlib.rcParams['svg.fonttype'] = 'none'
21
22 ##########################################################################
23 # FUNCTIONS #
24 ##########################################################################
25
26 def init_dict(d, key, init):
27 if key not in d:
28 d[key] = init
29
30 def get_chromosome_lengths(args):
31 """
32 Parse the file containing the chromosomes lengths.
33 If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes (
34 """
35 lengths={}
36 gtf_chrom_names=set()
37 force_get_lengths = False
38 # If the user provided the chromosome length file
39 if args.chr_len:
40 with open(args.chr_len, 'r') as chr_len_file:
41 for line in chr_len_file:
42 lengths[line.split('\t')[0]] = int(line.rstrip().split('\t')[1])
43 with open(args.annotation,'r') as gtf_file:
44 for line in gtf_file:
45 if not line.startswith('#'):
46 chrom = line.split('\t')[0]
47 if chrom not in gtf_chrom_names:
48 gtf_chrom_names.add(chrom)
49 for chrom in lengths.keys():
50 if chrom not in gtf_chrom_names:
51 print "Warning: at least one chromosome name ('"+chrom+"') of the file '"+args.chr_len+"'does not match any chromosome name if GTF and was ignored."
52 #del lengths[chrom]
53 break
54 for chrom in gtf_chrom_names:
55 if force_get_lengths: break
56 if chrom not in lengths.keys():
57 print "WARNING: chromosome name '"+chrom+"' was found in gtf and does not match any chromosome name provided in",args.chr_len+". "
58 print "\t=> The chromosome lenghts will be approximated using annotations in the GTF file."
59 continue_value =""
60 while continue_value not in {"yes","y","no","n"}:
61 continue_value = raw_input("\tDo you want to continue ('yes' or 'y')?\n\tElse write 'no' or 'n' to exit the script and check your file of lengths.\n")
62 if continue_value == "no" or continue_value == "n":
63 sys.exit("Exiting")
64 elif continue_value == "yes" or continue_value == "y":
65 force_get_lengths = True
66 break
67 print "Error: use 'yes/y/no/n' only"
68 if not force_get_lengths:
69 return lengths
70 # Otherwise, (or if at least one chromosome was missing in chromosome lengths file) we consider the end of the last annotation of the chromosome in the GTF file as the chromosome length
71 with open(args.annotation, 'r') as gtf_file:
72 for line in gtf_file:
73 if not line.startswith('#'):
74 chrom = line.split('\t')[0]
75 end = int(line.split('\t')[4])
76 init_dict(lengths, chrom, 0)
77 lengths[chrom] = max(lengths[chrom], end)
78 if force_get_lengths:
79 print "The chromosome lenghts have been approximated using the last annotations in the GTF file."
80 return lengths
81
82 def write_feature_on_index(feat,chrom, start, stop, sign, stranded_genome_index, unstranded_genome_index=None):
83 grouped_by_biotype_features = []
84 for biotype,categs in feat.iteritems():
85 categ_list=[]
86 for cat in set(categs):
87 categ_list.append(cat)
88 grouped_by_biotype_features.append(":".join((str(biotype),",".join(categ_list))))
89 stranded_genome_index.write('\t'.join((chrom, start, stop, sign,''))+'\t'.join(grouped_by_biotype_features)+'\n')
90 if unstranded_genome_index :
91 unstranded_genome_index.write('\t'.join((chrom, start, stop, '.',''))+'\t'.join(grouped_by_biotype_features)+'\n')
92
93
94 def add_info(cpt, feat_values, start, stop, chrom=None, unstranded_genome_index=None, stranded_genome_index = None , biotype_prios=None, coverage=1, categ_prios=None):
95 """
96 From an annotated genomic interval (start/stop positions and associated feature : one or more category(ies)/biotype pair(s) )
97 - If a file is provided: write the information at the end of the index file being generated;
98 - else : browse the features and update the counts of categories/biotypes found in the genome.
99 """
100 ## Writing in the file is it was provided
101 if stranded_genome_index :
102 unstranded_features = None
103 # If only one strand has a feature, this feature will directly be written on the unstranded index
104 if feat_values[0] == {} :
105 # A interval with no feature corresponds to a region annotated only on the reverse strand : update 'antisense' counts
106 stranded_genome_index.write('\t'.join((chrom, start, stop, '+','antisense\n')))
107 write_feature_on_index(feat_values[1], chrom ,start, stop, '-', stranded_genome_index, unstranded_genome_index)
108 else :
109 if feat_values[1] == {} :
110 write_feature_on_index(feat_values[0], chrom ,start, stop, '+', stranded_genome_index, unstranded_genome_index)
111 stranded_genome_index.write('\t'.join((chrom, start, stop, '-','antisense\n')))
112
113 # Else, the unstranded index should contain the union of plus and minus features
114 else :
115 write_feature_on_index(feat_values[0], chrom ,start, stop, '+', stranded_genome_index)
116 write_feature_on_index(feat_values[1], chrom ,start, stop, '-', stranded_genome_index)
117 unstranded_feat = dict(feat_values[0], **feat_values[1])
118 for name in set(feat_values[0]) & set(feat_values[1]):
119 unstranded_feat[name]+=feat_values[0][name]
120 write_feature_on_index(unstranded_feat, chrom ,start, stop, '.', unstranded_genome_index)
121
122 ## Increasing category counter(s)
123 else :
124 # Default behavior if no biotype priorities : category with the highest priority for each found biotype has the same weight (1/n_biotypes)
125 if not biotype_prios:
126 nb_feat = len(feat_values)
127 # For every categ(s)/biotype pairs
128 for feat in feat_values:
129 cur_prio = 0
130 selected_categ = []
131 # Separate categorie(s) and biotype
132 try:
133 biot,cats = feat.split(":")
134 # Error if feature equal "antisense" : update the 'antisense/antisense' counts
135 except ValueError :
136 try :
137 cpt[(feat,feat)] += (int(stop) - int(start)) * coverage / float(nb_feat)
138 except :
139 cpt[(feat,feat)] = (int(stop) - int(start)) * coverage / float(nb_feat)
140 return
141 # Browse the categories and get only the one(s) with highest priority
142 for cat in cats.split(','):
143 try: prio = prios[cat]
144 except:
145 #TODO Find a way to add unknown categories
146 if cat not in unknown_feature:
147 print >> sys.stderr, "Warning: Unknown categorie %s found and ignored\n...\r" %cat,
148 unknown_feature.append(cat)
149 continue
150 if prio > cur_prio :
151 cur_prio = prio
152 selected_categ = [cat]
153 if prio == cur_prio :
154 if cat != selected_categ :
155 try:
156 if cat not in selected_categ :
157 selected_categ.append(cat)
158 except TypeError :
159 selected_categ = [selected_categ,cat]
160 # Increase each counts by the coverage divided by the number of categories and biotypes
161 nb_cats = len(selected_categ)
162 for cat in selected_categ :
163 try:
164 cpt[(cat,biot)] += (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
165 except KeyError:
166 cpt[(cat,biot)] = (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
167 #else :
168 #cpt[(cats,biot)] = (int(stop) - int(start)) / float(nb_feat) * coverage
169 # Else, apply biotype selection according to the priority set
170 else:
171 #TODO Add an option to pass biotype priorities and handle it
172 pass
173
174 def print_chrom(features_dict, chrom, stranded_index_file, unstranded_index_file, cpt_genome):
175 with open(unstranded_index_file,'a') as findex, open(stranded_index_file,'a') as fstrandedindex:
176 # Initialization of variables : start position of the first interval and associated features for +/- strands
177 start = ""
178 for pos in sorted(features_dict['+'].keys()):
179 if start != "":
180 add_info(cpt_genome, [features_plus,features_minus], str(start), str(pos), chrom, stranded_genome_index = fstrandedindex, unstranded_genome_index = findex)
181 start = pos
182 features_plus = features_dict['+'][pos]
183 features_minus = features_dict['-'][pos]
184
185
186 def create_genome_index(annotation, unstranded_genome_index, stranded_genome_index,cpt_genome,prios,biotypes,chrom_sizes):
187 ''' Create an index of the genome annotations and save it in a file'''
188 print '\n### Generating genome indexes\n',
189 sys.stdout.flush()
190 # Initializations
191 intervals_dict = 0
192 max_value = -1
193 prev_chrom = ''
194 reverse_strand = {'+':'-','-':'+'}
195 i = 0 # Line counter
196 # Write the chromosomes lengths as comment lines before the genome index
197 with open(unstranded_genome_index,'w') as findex, open(stranded_genome_index,'w') as fstrandedindex:
198 for key,value in chrom_sizes.items():
199 findex.write("#%s\t%s\n" %(key, value))
200 fstrandedindex.write("#%s\t%s\n" %(key, value))
201 # Running through the GTF file and writing into genome index files
202 with open(annotation, 'r') as gtf_file:
203 for line in gtf_file:
204 # Print messages after X processed lines
205 i += 1
206 if i % 100000 == 0:
207 print >> sys.stderr, '\r%s line processed...' %str(i)
208 print '\r \r. . .',
209 sys.stdout.flush()
210 elif i % 20000 == 0:
211 print '\r \r. . .',
212 sys.stdout.flush()
213 elif i % 2000 == 0:
214 print '.',
215 sys.stdout.flush()
216 # Processing lines except comment ones
217 if not line.startswith('#'):
218 # Getting the line infos
219 line_split=line[:-1].split('\t')
220 chrom = line_split[0]
221 cat=line_split[2]
222 start = int(line_split[3]) - 1
223 stop = int(line_split[4])
224 strand = line_split[6]
225 antisense = reverse_strand[strand]
226 biotype=line_split[8].split('biotype')[1].split(';')[0].strip('" ')
227 feat = [(cat,biotype)]
228 # Registering chromosome info in the genome index file if this is a new chromosome or a annotation not overlapping previously recorded features
229 if start > max_value or chrom != prev_chrom:
230 # Write the previous features
231 if intervals_dict != 0:
232 if chrom != prev_chrom :
233 print_chrom(intervals_dict, prev_chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
234 print "\rChromosome '" + prev_chrom + "' registered."
235 else:
236 print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
237 prev_chrom = chrom
238 # (Re)Initializing the chromosome lists
239 intervals_dict = {strand:{start:{biotype:[cat]}, stop:{}},antisense:{start:{},stop:{}}}
240 max_value = stop
241
242 # Update the dictionary which represents intervals for every disctinct annotation
243 else:
244 # Get intervals on the same strand as the current feature
245 stranded_intervals = intervals_dict[strand]
246 start_added = False
247 for pos in sorted(stranded_intervals.keys()):
248 #print pos
249 #print stranded_intervals[pos]
250 #print
251 # While the position is below the feature's interval, store the features
252 if pos < start :
253 cur_cat_dict = dict(stranded_intervals[pos])
254 cur_antisense_dict = dict(intervals_dict[antisense][pos])
255
256 # If the start position already exists: update it by addind the new feature
257 elif pos == start :
258 cur_cat_dict = dict(stranded_intervals[pos])
259 cur_antisense_dict = dict(intervals_dict[antisense][pos])
260 #print "cur",cur_cat_dict
261 try : stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype]+[cat]
262 except KeyError: stranded_intervals[pos][biotype] = [cat]
263 start_added = True
264 #print "cur",cur_cat_dict
265
266 elif pos > start :
267 # Create a new entry for the start position if necessary
268 if not start_added :
269 #print "cur",cur_cat_dict
270 stranded_intervals[start] = dict(cur_cat_dict)
271 stranded_intervals[start][biotype] = [cat]
272 #stranded_intervals[pos][biotype].append(cat)
273 intervals_dict[antisense][start] = cur_antisense_dict
274 start_added = True
275 #print "cur",cur_cat_dict
276 # While the position is below the stop, just add the new feature
277 if pos < stop :
278 cur_cat_dict = dict(stranded_intervals[pos])
279 cur_antisense_dict = dict(intervals_dict[antisense][pos])
280 try: stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype] + [cat]
281 except KeyError: stranded_intervals[pos][biotype] = [cat]
282 # Close the created interval : create an entry at the stop position and restore the features
283 elif pos > stop :
284 stranded_intervals[stop] = dict(cur_cat_dict)
285 intervals_dict[antisense][stop] = cur_antisense_dict
286 break
287 else :
288 break
289 #try:
290 #cur_cat_dict = list(stranded_intervals[pos][biotype])
291 #except KeyError: cur_cat_dict = list()
292 #print stranded_intervals[pos]
293 #print
294 # Extend the dictionary if necessary
295 if stop > max_value:
296 max_value = stop
297 stranded_intervals[stop] = {}
298 intervals_dict[antisense][stop] = {}
299 #except KeyError:
300 #print intervals_dict
301 #quit()
302 #intervals_dict[strand] = {strand:{start:{biotype:[cat]}, stop:{}},antisense:{start:{},stop:{}}}
303 #continue
304 #for sign in ['-','+']:
305 #print sign
306 #for key,val in sorted(intervals_dict[sign].iteritems()):
307 #print key,'\t',val
308 #print
309 #print "-------\n"
310
311
312 #Store the categories of the last chromosome
313 print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
314 print "\rChromosome '" + prev_chrom + "' registered.\nDone!"
315
316
317 def create_bedgraph_files(bams,strandness):
318 samples_files = []
319 labels = []
320 print "\n### Generating the bedgraph files"
321 for n in range(0, len(bams), 2):
322 print "\rProcessing '%s'\n..." %bams[n],
323 sys.stdout.flush()
324 #Get the label for this sample
325 label = bams[n+1]
326 #Modify it to contain only alphanumeric caracters (avoid files generation with dangerous names)
327 modified_label = "_".join(re.findall(r"[\w']+", label))
328 if strandness in ["reverse","fr-secondstrand"]:
329 subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True)
330 subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True)
331 elif strandness in ["forward","fr-firststrand"]:
332 subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True)
333 subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True)
334 else :
335 subprocess.call('bedtools genomecov -bg -split -ibam ' + bams[n] + ' > ' + modified_label + '.bedgraph', shell=True)
336 samples_files.append(modified_label)
337 labels.append(label)
338 print "\rDone!"
339 return samples_files, labels
340
341 def read_gtf(gtf_index_file, sign):
342 global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF
343 strand = ""
344 while strand != sign :
345 gtf_line = gtf_index_file.readline()
346 # If the GTF file is finished
347 if not gtf_line:
348 endGTF = True
349 return endGTF
350 splitline = gtf_line.rstrip().split('\t')
351 try: strand = splitline[3]
352 # strand information can not be found in the file file header
353 except IndexError: pass
354 gtf_chrom = splitline[0]
355 gtf_features = splitline[4:]
356 gtf_start, gtf_stop = map(int, splitline[1:3])
357 return endGTF
358
359 def read_counts_files(counts_files):
360 cpt={}
361 cpt_genome={}
362 labels=[]
363 for fcounts in counts_files:
364 label=os.path.splitext(os.path.basename(fcounts))[0]
365 labels.append(label)
366 cpt[label]={}
367 with open (fcounts,"r") as f:
368 for line in f:
369 if line[0]=="#":
370 continue
371 line_split=line[:-1].split('\t')
372 feature=tuple(line_split[0].split(','))
373 cpt[label][feature]=float(line_split[1])
374 cpt_genome[feature]=float(line_split[2])
375 return cpt,cpt_genome,labels
376
377
378 def get_chromosome_names_in_index(genome_index):
379 chrom_list = []
380 with open(genome_index, 'r') as findex:
381 chrom = ""
382 for line in findex:
383 cur_chrom = line.split('\t')[0]
384 if cur_chrom == chrom:
385 pass
386 else:
387 chrom = cur_chrom
388 if chrom not in chrom_list:
389 chrom_list.append(chrom)
390 return set(chrom_list)
391
392
393 def intersect_bedgraphs_and_index_to_counts_categories(samples_files,samples_names,prios,genome_index, strandness, biotype_prios = None):
394 global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_cat, endGTF
395 print "\n### Intersecting files with indexes"
396 unknown_chrom = []
397 cpt = {} # Counter for the nucleotides in the BAM input file(s)
398 for n in range(len(samples_files)):
399 sample_file=samples_files[n]
400 sample_name=samples_names[n]
401 # Initializing the category counter dict for this sample
402 init_dict(cpt, sample_name, {})
403 if strandness == "unstranded":
404 strands = [("",".")]
405 else:
406 strands = [('.plus','+'), ('.minus','-')]
407
408 # Intersecting the BEDGRAPH and genome index files
409 print "\rProcessing '%s'\n. . ." %sample_file,
410 sys.stdout.flush()
411
412 for strand,sign in strands:
413 prev_chrom = ''
414 endGTF = False # Reaching the next chr or the end of the GTF index
415 intergenic_adds = 0.0
416 i = 0
417 i_chgt = 0
418 with open(sample_file + strand + '.bedgraph', 'r') as bam_count_file:
419 # Running through the BEDGRAPH file
420 for bam_line in bam_count_file:
421 i += 1
422 if i % 10000 == 0:
423 print ".",
424 sys.stdout.flush()
425 if i % 100000 == 0:
426 print "\r \r. . .",
427 sys.stdout.flush()
428 # Getting the BAM line info
429 bam_chrom = bam_line.split('\t')[0]
430 bam_start, bam_stop, bam_cpt = map(float, bam_line.split('\t')[1:4])
431 # Skip the line if the chromosome is not in the index
432 if bam_chrom not in chrom_list:
433 if bam_chrom not in unknown_chrom:
434 unknown_chrom.append(bam_chrom)
435 print "\r \r Chromosome '" + bam_chrom + "' not found in index."
436 continue
437 # If this is a new chromosome (or the first one)
438 if bam_chrom != prev_chrom:
439 i_chgt = i
440 intergenic_adds = 0.0
441 # (Re)opening the GTF index and looking for the first line of the matching chr
442 try: gtf_index_file.close()
443 except UnboundLocalError: pass
444 gtf_index_file = open(genome_index, 'r')
445 endGTF = False
446 read_gtf(gtf_index_file, sign)
447 while bam_chrom != gtf_chrom:
448 read_gtf(gtf_index_file, sign)
449 if endGTF:
450 break
451 prev_chrom = bam_chrom
452
453 # Looking for the first matching annotation in the GTF index
454 while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop):
455 read_gtf(gtf_index_file, sign)
456 if gtf_chrom != bam_chrom:
457 endGTF = True
458 # Processing BAM lines before the first GTF annotation if there are
459 if bam_start < gtf_start:
460 # Increase the 'intergenic' category counter with all or part of the BAM interval
461 try:
462 intergenic_adds += min(bam_stop,gtf_start)-bam_start
463 cpt[sample_name][('intergenic','intergenic')] += (min(bam_stop, gtf_start) - bam_start) * bam_cpt
464 except KeyError:
465 cpt[sample_name][('intergenic','intergenic')] = (min(bam_stop, gtf_start) - bam_start) * bam_cpt
466 # Go to next line if the BAM interval was totally upstream the first GTF annotation, carry on with the remaining part otherwise
467 if endGTF or (bam_stop <= gtf_start):
468 continue
469 else:
470 bam_start = gtf_start
471
472 # We can start the crossover
473 while not endGTF:
474 # Update category counter
475 add_info(cpt[sample_name], gtf_features, bam_start, min(bam_stop,gtf_stop), coverage = bam_cpt, categ_prios = prios)
476 # Read the next GTF file line if the BAM line is not entirely covered
477 if bam_stop > gtf_stop:
478 # Update the BAM start pointer
479 bam_start = gtf_stop
480 endGTF = read_gtf(gtf_index_file, sign)
481 # If we read a new chromosome in the GTF file then it is considered finished
482 if bam_chrom != gtf_chrom:
483 endGTF = True
484 if endGTF:
485 break
486 else:
487 # Next if the BAM line is entirely covered
488 bam_start = bam_stop
489 break
490
491 # Processing the end of the BAM line if necessary
492 if endGTF and (bam_stop > bam_start):
493 try:
494 cpt[sample_name][('intergenic','intergenic')] += (bam_stop - bam_start) * bam_cpt
495 except KeyError:
496 cpt[sample_name][('intergenic','intergenic')] = (bam_stop - bam_start) * bam_cpt
497 gtf_index_file.close()
498 print "\r \rDone!"
499 return cpt
500
501 def write_counts_in_files(cpt,genome_counts):
502 for label,dico in cpt.items():
503 label = "_".join(re.findall(r"[\w']+", label))
504 with open(label+".categories_counts","w") as fout:
505 fout.write("#Category,biotype\tCounts_in_bam\tSize_in_genome\n" )
506 for feature,counts in dico.items():
507 fout.write("%s\t%s\t%s\n" %(','.join(feature),counts,genome_counts[feature]))
508
509 def recategorize_the_counts(cpt,cpt_genome,final):
510 final_cat_cpt={}
511 final_genome_cpt={}
512 for f in cpt:
513 #print "\nFinal categories for",f,"sample"
514 final_cat_cpt[f]={}
515 for final_cat in final:
516 tot = 0
517 tot_genome=0
518 for cat in final[final_cat]:
519 tot += cpt[f][cat]
520 tot_genome+=cpt_genome[cat]
521 #output_file.write('\t'.join((final_cat, str(tot))) + '\n')
522 #print '\t'.join((final_cat, str(tot)))
523 final_cat_cpt[f][final_cat]=tot
524 final_genome_cpt[final_cat]=tot_genome
525 return final_cat_cpt,final_genome_cpt
526
527 def group_counts_by_categ(cpt,cpt_genome,final,selected_biotype):
528 final_cat_cpt={}
529 final_genome_cpt={}
530 filtered_cat_cpt = {}
531 for f in cpt:
532 final_cat_cpt[f]={}
533 filtered_cat_cpt[f] = {}
534 for final_cat in final:
535 tot = 0
536 tot_filter = 0
537 tot_genome=0
538 for cat in final[final_cat]:
539 for key,value in cpt[f].items():
540 if key[0] == cat :
541 tot += value
542 tot_genome+=cpt_genome[key]
543 if key[1] == selected_biotype :
544 tot_filter += value
545 #output_file.write('\t'.join((final_cat, str(tot))) + '\n')
546 #print '\t'.join((final_cat, str(tot)))
547 final_cat_cpt[f][final_cat]=tot
548 if tot_genome == 0:
549 final_genome_cpt[final_cat]= 1e-100
550 else:
551 final_genome_cpt[final_cat]=tot_genome
552 filtered_cat_cpt[f][final_cat]=tot_filter
553 #if 'antisense' in final_genome_cpt: final_genome_cpt['antisense'] = 0
554 return final_cat_cpt,final_genome_cpt,filtered_cat_cpt
555
556 def group_counts_by_biotype(cpt,cpt_genome,biotypes):
557 final_cpt={}
558 final_genome_cpt={}
559 for f in cpt:
560 final_cpt[f]={}
561 for biot in biotypes:
562 tot = 0
563 tot_genome=0
564 try:
565 for final_biot in biotypes[biot]:
566 for key,value in cpt[f].items():
567 if key[1] == final_biot :
568 tot += value
569 #if key[1] != 'antisense':
570 tot_genome+=cpt_genome[key]
571 except:
572 for key,value in cpt[f].items():
573 if key[1] == biot :
574 tot += value
575 tot_genome+=cpt_genome[key]
576 if tot != 0:
577 final_cpt[f][biot]=tot
578 final_genome_cpt[biot]=tot_genome
579 return final_cpt,final_genome_cpt
580
581 #def get_cmap(N):
582 #'''Returns a function that maps each index in 0, 1, ... N-1 to a distinct
583 #RGB color.'''
584 #color_norm = colors.Normalize(vmin=0, vmax=N-1)
585 #scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
586 #def map_index_to_rgb_color(index):
587 #return scalar_map.to_rgba(index)
588 #return map_index_to_rgb_color
589
590 def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type, title) :
591 ### Initialization
592 fig = plt.figure(figsize=(13,9))
593 ax1 = plt.subplot2grid((2,4),(0,0),colspan=2)
594 ax2 = plt.subplot2grid((2,4),(1,0),colspan=2)
595 cmap= plt.get_cmap('Spectral')
596 cols=[cmap(x) for x in xrange(0,256,256/n_cat)]
597 if title:
598 ax1.set_title(title+"in: %s" %samples_names[0])
599 else :
600 ax1.set_title(counts_type+" distribution in mapped reads in: %s" %samples_names[0])
601 ax2.set_title('Normalized counts of '+counts_type)
602
603
604 ### Barplots
605 #First barplot: percentage of reads in each categorie
606 ax1.bar(index, percentages, bar_width,
607 color=cols)
608 #Second barplot: enrichment relative to the genome for each categ
609 # (the reads count in a categ is divided by the categ size in the genome)
610 ax2.bar(index_enrichment, enrichment, bar_width,
611 color=cols,)
612 ### Piecharts
613 pielabels = [ordered_categs[i] if percentages[i]>0.025 else '' for i in xrange(n_cat)]
614 sum_enrichment = numpy.sum(enrichment)
615 pielabels_enrichment = [ordered_categs[i] if enrichment[i]/sum_enrichment>0.025 else '' for i in xrange(n_cat)]
616 # Categories piechart
617 ax3 = plt.subplot2grid((2,4),(0,2))
618 pie_wedge_collection, texts = ax3.pie(percentages,labels=pielabels, shadow=True, colors=cols)
619 # Enrichment piechart
620 ax4 = plt.subplot2grid((2,4),(1,2))
621 pie_wedge_collection, texts = ax4.pie(enrichment,labels=pielabels_enrichment, shadow=True, colors=cols)
622 # Add legends (append percentages to labels)
623 labels = [" ".join((ordered_categs[i],"({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))]
624 ax3.legend(pie_wedge_collection,labels,loc='center',fancybox=True, shadow=True,prop={'size':'medium'}, bbox_to_anchor=(1.7,0.5))
625 labels = [" ".join((ordered_categs[i],"({:.1%})".format(enrichment[i]/sum_enrichment))) for i in range(len(ordered_categs))]# if ordered_categs[i] != 'antisense']
626 ax4.legend(pie_wedge_collection,labels,loc='center',fancybox=True, shadow=True,prop={'size':'medium'},bbox_to_anchor=(1.7,0.5))
627 # Set aspect ratio to be equal so that pie is drawn as a circle
628 ax3.set_aspect('equal')
629 ax4.set_aspect('equal')
630 return fig, ax1, ax2
631
632 def make_plot(ordered_categs,samples_names,categ_counts,genome_counts,pdf, counts_type, threshold, title = None ,svg = None, png = None):
633 # From ordered_categs, keep only the features (categs or biotypes) that we can find in at least one sample.
634 existing_categs = set()
635 for sample in categ_counts.values():
636 existing_categs |= set(sample.keys())
637 ordered_categs = filter(existing_categs.__contains__, ordered_categs)
638 n_cat = len(ordered_categs)
639 n_exp=len(samples_names)
640 ##Initialization of the matrix of counts (nrow=nb_experiements, ncol=nb_categorie)
641 counts=numpy.matrix(numpy.zeros(shape=(n_exp,n_cat)))
642 for exp in xrange(len(samples_names)):
643 for cat in xrange(len(ordered_categs)):
644 try: counts[exp,cat]=categ_counts[samples_names[exp]][ordered_categs[cat]]
645 except: pass
646
647 ##Normalize the categorie sizes by the total size to get percentages
648 sizes=[]
649 sizes_sum=0
650 for cat in ordered_categs:
651 sizes.append(genome_counts[cat])
652 sizes_sum+=genome_counts[cat]
653 if 'antisense' in ordered_categs:
654 antisense_pos = ordered_categs.index('antisense')
655 sizes[antisense_pos] = 1e-100
656 for cpt in xrange(len(sizes)):
657 sizes[cpt]/=float(sizes_sum)
658
659 ## Create array which contains the percentage of reads in each categ for every sample
660 percentages=numpy.array(counts/numpy.sum(counts,axis=1))
661 ## Create the enrichment array (counts divided by the categorie sizes in the genome)
662 enrichment=numpy.array(percentages/sizes)
663 if 'antisense_pos' in locals():
664 for i in xrange(len(samples_names)):
665 enrichment[i][antisense_pos] = 0
666 #enrichment=numpy.log(numpy.array(percentages/sizes))
667 for exp in xrange(n_exp):
668 for i in xrange(n_cat):
669 val = enrichment[exp][i]
670 if val > 1:
671 enrichment[exp][i] = val-1
672 elif val == 1 or val == 0:
673 enrichment[exp][i] = 0
674 else:
675 enrichment[exp][i] = -1/val+1
676
677 #### Finally, produce the plot
678
679 ##Get the colors from the colormap
680 ncolor=16
681 cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5", "#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"]
682 if n_exp > ncolor:
683 cmap = plt.get_cmap('Set3',n_exp)
684 cmap = [cmap(i) for i in xrange(n_exp)]
685
686 ## Parameters for the plot
687 opacity = 1
688 #Create a vector which contains the position of each bar
689 index = numpy.arange(n_cat)
690 #Size of the bars (depends on the categs number)
691 bar_width = 0.9/n_exp
692
693
694 ##Initialise the subplot
695 # if there is only one sample, also plot piecharts
696 #if n_exp == 1 and counts_type.lower() == 'categories':
697 #fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title)
698 ## If more than one sample
699 #else:
700 fig, (ax1,ax2) = plt.subplots(2,figsize=(5+(n_cat+2*n_exp)/3,10))
701 # Store the bars objects for enrichment plot
702 rects = []
703 #For each sample/experiment
704 for i in range(n_exp):
705 #First barplot: percentage of reads in each categorie
706 ax1.bar(index+i*bar_width, percentages[i], bar_width,
707 alpha=opacity,
708 color=cmap[i],
709 label=samples_names[i], edgecolor='#FFFFFF', lw=0)
710 #Second barplot: enrichment relative to the genome for each categ
711 # (the reads count in a categ is divided by the categ size in the genome)
712 rects.append( ax2.bar(index+i*bar_width, enrichment[i], bar_width,
713 alpha=opacity,
714 color=cmap[i],
715 label=samples_names[i], edgecolor=cmap[i], lw=0))
716
717 ## Graphical options for the plot
718 # Adding of the legend
719 ax1.legend(loc='best',frameon=False)
720 #ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True)
721 # Main titles
722 if title:
723 ax1.set_title(title)
724 else :
725 ax1.set_title(counts_type+" distribution in mapped reads")
726 ax2.set_title('Normalized counts of '+counts_type)
727
728 # Adding enrichment baseline
729 #ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5')
730 # Axes limits
731 ax1.set_xlim(-0.1,len(ordered_categs)+0.1)
732 if len(sizes) == 1: ax1.set_xlim(-2,3)
733 ax2.set_xlim(ax1.get_xlim())
734 # Set axis limits (max/min values + 5% margin)
735 ax2_ymin = []
736 ax2_ymax = []
737 for sample_values in enrichment:
738 ax2_ymin.append(min(sample_values))
739 ax2_ymax.append(max(sample_values))
740 ax2_ymax = max(ax2_ymax)
741 ax2_ymin = min(ax2_ymin)
742 margin_top, margin_bottom = (abs(0.05*ax2_ymax), abs(0.05*ax2_ymin))
743 ax1.set_ylim(0,ax1.get_ylim()[1]*1.05)
744 if threshold:
745 threshold_bottom = -abs(float(threshold[0]))+1
746 threshold_top = float(threshold[1])-1
747
748 for i in xrange(n_exp):
749 for y in xrange(n_cat):
750 val = enrichment[i][y]
751 if not numpy.isnan(val) and not (threshold_bottom < val < threshold_top):
752 rect = rects[i][y]
753 rect_height = rect.get_height()
754 if rect.get_y() < 0:
755 diff = rect_height + threshold_bottom
756 rect.set_y(threshold_bottom)
757 ax2_ymin = threshold_bottom
758 margin_bottom = 0
759 else:
760 diff = rect_height - threshold_top
761 ax2_ymax = threshold_top
762 margin_top = 0
763 rect.set_height(rect.get_height()-diff)
764 if margin_top != 0 and margin_bottom != 0:
765 margin_top, margin_bottom = [max(margin_top, margin_bottom) for i in xrange(2)]
766 ax2.set_ylim(ax2_ymin-margin_bottom,ax2_ymax+margin_top)
767 # Y axis title
768 ax1.set_ylabel('Proportion of reads (%)')
769 ax2.set_ylabel('Enrichment relative to genome')
770 # X axis title
771 ax1.set_xlabel(counts_type)
772 ax2.set_xlabel(counts_type)
773 # Add the categories on the x-axis
774 ax1.set_xticks(index + bar_width*n_exp/2)
775 ax2.set_xticks(index + bar_width*n_exp/2)
776 if counts_type.lower() != 'categories':
777 ax1.set_xticklabels(ordered_categs,rotation='30',ha='right')
778 ax2.set_xticklabels(ordered_categs,rotation='30',ha='right')
779 else:
780 ax1.set_xticklabels(ordered_categs)
781 ax2.set_xticklabels(ordered_categs)
782 # Display fractions values in percentages
783 ax1.set_yticklabels([str(int(i*100)) for i in ax1.get_yticks()])
784 # Correct y-axis ticks labels for enrichment subplot
785 #ax2.set_yticklabels([str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()])
786 yticks = list(ax2.get_yticks())
787 yticks = [ yticks[i]-1 if yticks[i]>9 else yticks[i]+1 if yticks[i]<-9 else yticks[i] for i in xrange(len(yticks))]
788 ax2.set_yticks(yticks)
789 ax2.set_yticklabels([str(int(i+1))+"$^{+1}$" if i>0 and i%1==0 else str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(int(-(i-1)))+"$^{-1}$" if i<0 and i%1==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()])
790 #ax2.set_yticklabels([i+1 if i>0 else 1 if i==0 else "$\\frac{1}{%s}$" %-(i-1) for i in ax2.get_yticks()])
791 # Change appearance of 'antisense' bars on enrichment plot since we cannot calculate an enrichment for this artificial category
792 if 'antisense_pos' in locals(): #ax2.text(antisense_pos+bar_width/2,ax2.get_ylim()[1]/10,'NA')
793 for i in xrange(n_exp) :
794 rect = rects[i][antisense_pos]
795 rect.set_y(ax2.get_ylim()[0])
796 rect.set_height(ax2.get_ylim()[1] - ax2.get_ylim()[0])
797 rect.set_hatch('/')
798 rect.set_fill(False)
799 rect.set_linewidth(0)
800 #rect.set_color('lightgrey')
801 #rect.set_edgecolor('#EDEDED')
802 rect.set_color('#EDEDED')
803 ax2.text(index[antisense_pos] + bar_width*n_exp/2 - 0.1, (ax2_ymax+ax2_ymin)/2, 'NA')
804 # Add text for features absent in sample
805 for i in xrange(n_exp):
806 for y in xrange(n_cat):
807 if percentages[i][y] == 0:
808 txt = ax1.text(y + bar_width*(i+0.5), 0.02, 'Absent in sample', rotation = 'vertical', color = cmap[i], horizontalalignment ='center', verticalalignment = 'bottom')
809 txt.set_path_effects([PathEffects.Stroke(linewidth=0.5),PathEffects.Normal()])
810 elif enrichment[i][y] == 0:
811 rects[i][y].set_linewidth(1)
812
813 # Remove top/right/bottom axes
814 for ax in [ax1,ax2]:
815 ax.spines['top'].set_visible(False)
816 ax.spines['right'].set_visible(False)
817 ax.spines['bottom'].set_visible(False)
818 ax.tick_params(axis='x', which='both', bottom='on', top='off', labelbottom='on')
819 ax.tick_params(axis='y', which='both', left='on', right='off', labelleft='on')
820
821
822
823 #ax1.legend(loc='center right', bbox_to_anchor=(1.2, 0),fancybox=True, shadow=True)
824 ## Showing the plot
825 plt.tight_layout()
826 fig.subplots_adjust(wspace=0.2)
827 if pdf:
828 pdf.savefig()
829 plt.close()
830 elif svg:
831 if svg == True:
832 plt.savefig(counts_type+'.svg')
833 else :
834 if os.path.splitext(svg)[1] == '.svg':
835 plt.savefig('.'.join((os.path.splitext(svg)[0],counts_type,'svg')))
836 else:
837 plt.savefig('.'.join((svg,counts_type,'svg')))
838 elif png:
839 if png == True:
840 plt.savefig(counts_type+'.png')
841 else :
842 if os.path.splitext(png)[1] == '.png':
843 plt.savefig('.'.join((os.path.splitext(png)[0],counts_type,'png')))
844 else:
845 plt.savefig('.'.join((png,counts_type,'png')))
846 else:
847 plt.show()
848 ## Save on disk it as a png image
849 #fig.savefig('image_output.png', dpi=300, format='png', bbox_extra_artists=(lgd,), bbox_inches='tight')
850
851
852 def filter_categs_on_biotype(selected_biotype,cpt) :
853 filtered_cpt = {}
854 for sample in cpt:
855 filtered_cpt[sample] = {}
856 for feature,count in cpt[sample].items():
857 if feature[1] == selected_biotype:
858 filtered_cpt[sample][feature[0]] = count
859 return filtered_cpt
860
861
862 ##########################################################################
863 # MAIN #
864 ##########################################################################
865
866 def usage_message(name=None):
867 return '''
868 Generate genome indexes:
869 python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
870 [--chr_len CHR_LENGTHS_FILE]
871 Process BAM file(s):
872 python ALFA.py -g GENOME_INDEX -i BAM1 LABEL1 [BAM2 LABEL2 ...]
873 [--bedgraph] [-s STRAND]
874 [-n] [--pdf output.pdf]
875 [-d {1,2,3,4}] [-t YMIN YMAX]
876 Index genome + process BAM:
877 python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
878 -i BAM1 LABEL1 [BAM2 LABEL2 ...]
879 [--chr_len CHR_LENGTHS_FILE]
880 [--bedgraph][-s STRAND]
881 [-n] [--pdf output.pdf]
882 [-d {1,2,3,4}] [-t YMIN YMAX]
883
884 Process previously created ALFA counts file(s):
885 python ALFA.py -c COUNTS1 [COUNTS2 ...]
886 [-s STRAND]
887 [-n] [--pdf output.pdf]
888 [-d {1,2,3,4}] [-t YMIN YMAX]
889
890 '''
891
892
893 #### Parse command line arguments and store them in 'options'
894 parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, usage=usage_message())
895 parser.add_argument('--version', action='version', version='version 1.0', help="show program's version number and exit\n\n-----------\n\n")
896 # Options concernant l'index
897 parser.add_argument('-g','--genome_index', help="Genome index files path and basename for existing index, or path and basename for new index creation\n\n")
898 parser.add_argument('-a','--annotation', metavar = "GTF_FILE", help='Genomic annotations file (GTF format)\n\n')
899 parser.add_argument('--chr_len', help='Tabulated file containing chromosome names and lengths\n\n-----------\n\n')
900
901 # Options pour l'étape d'intersection
902 parser.add_argument('-i','--input','--bam', dest='input', metavar=('BAM_FILE1 LABEL1',""), nargs='+', help='Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n')
903 parser.add_argument('--bedgraph', action='store_const',default = False, const = True, help="Use this options if your input file(s) is(are) already in bedgraph format\n\n")
904 parser.add_argument('-c','--counts',metavar=('COUNTS_FILE',""), nargs='+', help="Use this options instead of '-i/--input' to provide ALFA counts files as input \ninstead of bam/bedgraph files.\n\n")
905 parser.add_argument('-s','--strandness', dest="strandness", nargs=1, action = 'store', default = ['unstranded'], choices = ['unstranded','forward','reverse','fr-firststrand','fr-secondstrand'], metavar="", help ="Library orientation. Choose within: 'unstranded', 'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. (Default: 'unstranded')\n\n-----------\n\n")
906
907 # Options concernant le plot
908 parser.add_argument('-biotype_filter',nargs=1,help=argparse.SUPPRESS)#"Make an extra plot of categories distribution using only counts of the specified biotype.")
909 parser.add_argument('-d','--categories_depth', type=int, default='3', choices=range(1,5), help = "Use this option to set the hierarchical level that will be considered in the GTF file (default=3): \n(1) gene,intergenic; \n(2) intron,exon,intergenic; \n(3) 5'UTR,CDS,3'UTR,intron,intergenic; \n(4) start_codon,5'UTR,CDS,3'UTR,stop_codon,intron,intergenic. \n\n")
910 parser.add_argument('--pdf', nargs='?', default=False, help="Save produced plots in PDF format at specified path ('categories_plots.pdf' if no argument provided)\n\n")
911 parser.add_argument('--png', nargs='?', default=False, const=True, help="Save produced plots in PNG format with provided argument as basename \nor 'categories.png' and 'biotypes.png' if no argument provided\n\n")
912 parser.add_argument('--svg', nargs='?', default=False, const=True, help="Save produced plots in SVG format with provided argument as basename \nor 'categories.svg' and 'biotypes.svg' if no argument provided\n\n")
913 parser.add_argument('-n','--no_plot', dest='quiet', action='store_const', default=False, const=True, help="Do not show plots\n\n")
914 parser.add_argument('-t','--threshold', dest='threshold', nargs = 2, metavar=("ymin","ymax"), type=float , help="Set axis limits for enrichment plots\n\n")
915
916 if len(sys.argv)==1:
917 parser.print_usage()
918 sys.exit(1)
919
920 options = parser.parse_args()
921
922 def required_arg(arg, aliases):
923 if not arg:
924 print >> sys.stderr, "\nError: %s argument is missing.\n" %aliases
925 parser.print_usage()
926 sys.exit()
927
928 def check_files_enxtension(files):
929 return
930
931 # Booleans for steps to be executed
932 make_index = False
933 intersect_reads = False
934 process_counts = False
935
936 #### Check arguments conformity and define which steps have to be perfomed
937 if options.counts :
938 # Aucun autre argument requis
939 # Vérifier extension input
940
941 # Action : Faire le plot uniquement
942 process_counts = True
943 else:
944 if options.annotation :
945 # Vérifier si présence -gi
946 if options.genome_index :
947 genome_index_basename = options.genome_index
948 else:
949 genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
950 # Vérifier si un fichier existe déjà:
951 if os.path.isfile(genome_index_basename+".stranded.index") :
952 if options.input:
953 print >> sys.stderr, "\nWarning: a index file named '%s' already exists and will be used. If you want to create a new index, please delete this file or specify an other path." %(genome_index_basename+".stranded.index")
954 else:
955 sys.exit("\nError: a index file named %s already exists. If you want to create a new index, please delete this file or specify an other path.\n" %(genome_index_basename+".stranded.index"))
956 # sinon -> action : index à faire
957 else :
958 make_index = True
959 # si l'index n'est pas à faire :
960 if options.input:
961 # Arguments requis: input, genome_index
962 if 'genome_index_basename' not in locals():
963 required_arg(options.genome_index, "-g/--genome_index")
964 genome_index_basename = options.genome_index
965 required_arg(options.input, "-i/--input/--bam")
966 for i in xrange(0, len(options.input), 2):
967 try :
968 extension = os.path.splitext(options.input[i+1])[1]
969 if extension == ".bam" or extension == '.bedgraph' or extension == '.bg':
970 sys.exit("Error: it seems input files and associated labels are not correctly provided.\n\
971 Make sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")
972 except:
973 sys.exit("Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")
974
975 intersect_reads = True
976 # Vérifier input's extension
977 #TODO
978 if not (options.counts or options.input or options.annotation):
979 sys.exit("\nError : some arguments are missing At least '-a', '-c' or '-i' is required. Please refer to help (-h/--help) and usage cases for more details.\n")
980 if not options.counts:
981 # Declare genome_index variables
982 stranded_genome_index = genome_index_basename+".stranded.index"
983 unstranded_genome_index = genome_index_basename+".unstranded.index"
984 if options.strandness[0] == "unstranded":
985 genome_index = unstranded_genome_index
986 else:
987 genome_index = stranded_genome_index
988
989
990
991 #### Initialization of some variables
992
993 # Initializing the category priority order, coding biotypes and the final list
994 prios = {'start_codon': 7, 'stop_codon': 7, 'five_prime_utr': 6, 'three_prime_utr': 6, 'UTR': 6, 'CDS': 5, 'exon': 4, 'transcript': 3, 'gene': 2, 'antisense': 1,'intergenic': 0}
995
996 biotype_prios = None
997 #biotype_prios = {"protein_coding":1, "miRNA":2}
998
999 # Possibles groups of categories to plot
1000 categs_group1={'start': ['start_codon'], '5UTR': ['five_prime_utr','UTR'], 'CDS': ['CDS', 'exon'], '3UTR': ['three_prime_utr'], 'stop': ['stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
1001 categs_group2={'5UTR': ['five_prime_utr', 'UTR'], 'CDS': ['CDS', 'exon','start_codon','stop_codon'], '3UTR': ['three_prime_utr'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
1002 categs_group3={'exons': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon','start_codon','stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
1003 categs_group4={'gene': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon','start_codon','stop_codon', 'transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
1004 categs_groups = [categs_group4,categs_group3,categs_group2,categs_group1] # Order and merging for the final plot
1005 cat_list = ['5UTR', 'start', 'CDS', 'stop', '3UTR', 'exons', 'introns', 'gene', 'intergenic', 'antisense']
1006
1007
1008 # biotypes list
1009 biotypes ={'protein_coding','polymorphic_pseudogene','TR_C_gene','TR_D_gene','TR_J_gene','TR_V_gene','IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene',"3prime_overlapping_ncrna","lincRNA","macro_lncRNA","miRNA","misc_RNA","Mt_rRNA","Mt_tRNA","processed_transcript","ribozyme","rRNA","scaRNA","sense_intronic","sense_overlapping","snoRNA","snRNA","sRNA","TEC","vaultRNA","antisense","transcribed_processed_pseudogene","transcribed_unitary_pseudogene","transcribed_unprocessed_pseudogene","translated_unprocessed_pseudogene","TR_J_pseudogene","TR_V_pseudogene","unitary_pseudogene","unprocessed_pseudogene","processed_pseudogene","IG_C_pseudogene","IG_J_pseudogene","IG_V_pseudogene","pseudogene","ncRNA","tRNA"} # Type: set (to access quickly)
1010
1011 # Grouping of biotypes:
1012 biotypes_group1={'protein_coding':['protein_coding'],'pseudogenes':['polymorphic_pseudogene',"transcribed_processed_pseudogene","transcribed_unitary_pseudogene","transcribed_unprocessed_pseudogene","translated_unprocessed_pseudogene","TR_J_pseudogene","TR_V_pseudogene","unitary_pseudogene","unprocessed_pseudogene","processed_pseudogene","IG_C_pseudogene","IG_J_pseudogene","IG_V_pseudogene","pseudogene"],'TR':['TR_C_gene','TR_D_gene','TR_J_gene','TR_V_gene'],'IG':['IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene'],\
1013 'MT_RNA':["Mt_rRNA","Mt_tRNA"],\
1014 'ncRNA':["lincRNA","macro_lncRNA","3prime_overlapping_ncrna","ncRNA"],\
1015 "others":["misc_RNA","processed_transcript","ribozyme","scaRNA","sense_intronic","sense_overlapping","TEC","vaultRNA"],
1016 "antisense":["antisense"]}
1017 for biot in ["miRNA","snoRNA","snRNA","rRNA","sRNA","tRNA"]:
1018 biotypes_group1[biot]=[biot]
1019
1020 # # Initializing the unkown features lits
1021 unknown_feature = []
1022
1023 # Initializing the genome category counter dict
1024 cpt_genome = {}
1025
1026
1027 if process_counts :
1028 #### If input files are the categories counts, just load them and continue to recategorization step
1029 cpt,cpt_genome,samples_names = read_counts_files(options.counts)
1030 else:
1031 #### Create genome index if needed and get the sizes of categories
1032 if make_index :
1033 #### Get the chromosome lengths
1034 lengths = get_chromosome_lengths(options)
1035 # Generating the genome index files if the user didn't provide them
1036 create_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios, biotypes, lengths)
1037
1038
1039 #print '\nChr lengths:', lengths
1040
1041 if intersect_reads:
1042 # If the indexes already exist, read them to compute the sizes of the categories in the genome and retrieve the chromosome lengths
1043 if not make_index :
1044 print "\n### Reading genome indexes\n...\r",
1045 sys.stdout.flush()
1046 lengths={}
1047 with open(genome_index, 'r') as genome_index_file:
1048 for line in genome_index_file:
1049 if line[0] == "#":
1050 lengths[line.split('\t')[0][1:]] = int(line.split('\t')[1])
1051 else :
1052 add_info(cpt_genome, line.rstrip().split('\t')[4:], line.split('\t')[1], line.split('\t')[2], biotype_prios = None, categ_prios = prios)
1053
1054 #### Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals
1055 cpt_genome[('intergenic','intergenic')] = sum(lengths.itervalues()) - sum([v for x,v in cpt_genome.iteritems() if x != ('antisense','antisense')])
1056 if not make_index :
1057 print "Done!"
1058 #print '\nGenome category counts:'
1059 #for key,val in cpt_genome.iteritems():
1060 #print key,"\t",val
1061
1062
1063 #### Create the Bedgraph files if needed and get the files list
1064
1065 if not options.bedgraph:
1066 # Generating the BEDGRAPH files is the user provided BAM file(s) and get the samples labels (this names will be used in the plot legend)
1067 samples_files, samples_names = create_bedgraph_files(options.input,options.strandness[0])
1068 else:
1069 # Just initialize the files list with the bedgraph paths
1070 samples_files = [options.input[i] for i in range(0,len(options.input),2)]
1071 # and get the labels
1072 samples_names = [options.input[i] for i in range(1,len(options.input),2)]
1073 #### Retrieving chromosome names saved in index
1074 chrom_list = get_chromosome_names_in_index(genome_index)
1075 #### Processing the BEDGRAPH files: intersecting the bedgraph with the genome index and count the number of aligned positions in each category
1076 cpt = intersect_bedgraphs_and_index_to_counts_categories(samples_files,samples_names,prios,genome_index, options.strandness[0], biotype_prios = None)
1077
1078 #### Write the counts on disk
1079 write_counts_in_files(cpt,cpt_genome)
1080
1081 if not (intersect_reads or process_counts) or (options.quiet and options.pdf == False):
1082 quit("\n### End of program")
1083 print "\n### Generating plots"
1084 # Updating the biotypes lists (biotypes and 'biotype_group1'): adding the 'unknow biotypes' found in gtf/index
1085 if unknown_feature == []: # 'unknown_feature' is define only during the index generation
1086 # Browse the feature to determine whether some biotypes are 'unknown'
1087 for sample,counts in cpt.items():
1088 for (cat,biot) in counts:
1089 if biot not in biotypes and cat not in unknown_feature:
1090 unknown_feature.append(biot)
1091 for new_biot in unknown_feature:
1092 biotypes.add(new_biot)
1093 biotypes_group1["others"].append(new_biot)
1094 biotypes = sorted(biotypes)
1095 # move antisense categ to the end of the list
1096 biotypes.remove('antisense')
1097 biotypes.append('antisense')
1098 biotypes_group1 = sorted(biotypes_group1)
1099
1100
1101 #print '\nCounts for every category/biotype pair: ',cpt
1102
1103 # Generating plots
1104 if options.pdf != False:
1105 if options.pdf == None:
1106 options.pdf = "categories_plots.pdf"
1107 pdf = PdfPages(options.pdf)
1108 else:
1109 pdf = False
1110
1111 selected_biotype = None
1112 if options.biotype_filter:
1113 options.biotype_filter = options.biotype_filter[0]
1114 for sample in cpt:
1115 for feature in cpt[sample]:
1116 biotype = feature[1]
1117 if options.biotype_filter.lower() == biotype.lower():
1118 selected_biotype=biotype
1119 break
1120 if selected_biotype == None :
1121 print "\nError: biotype '"+options.biotype_filter+"' not found. Please check the biotype name and that this biotype exists in your sample(s)."
1122 sys.exit()
1123
1124 #Print a warning message if the UTRs are not specified as 5' or 3' (they will be ploted as 5'UTR)
1125 if 'UTR' in [categ[0] for counts in cpt.values() for categ in counts.keys()]:
1126 print '''\nWARNING: (some) 5'UTR/3'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5'UTR"\n'''
1127
1128 #### Make the plot by categories
1129 #### Recategorizing with the final categories
1130 final_cats=categs_groups[options.categories_depth-1]
1131 final_cat_cpt,final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt,cpt_genome,final_cats,selected_biotype)
1132 #### Display the distribution of specified categories (or biotypes) in samples on a barplot
1133 # Remove the 'antisense' category if the library type is 'unstranded'
1134 for dic in cpt.values():
1135 if ('antisense','antisense') in dic.keys(): break
1136 else:
1137 cat_list.remove('antisense')
1138 make_plot(cat_list,samples_names,final_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold, svg = options.svg, png = options.png)
1139 if selected_biotype :
1140 make_plot(cat_list,samples_names,filtered_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold,title="Categories distribution for '"+selected_biotype+"' biotype", svg = options.svg, png = options.png)
1141
1142 #### Make the plot by biotypes
1143 #### Recategorizing with the final categories
1144 final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes)
1145 #### Display the distribution of specified categories (or biotypes) in samples on a barplot
1146 make_plot(biotypes,samples_names,final_cat_cpt,final_genome_cpt,pdf, "biotypes",options.threshold, svg = options.svg, png = options.png)
1147
1148
1149
1150 ##### Recategorizing with the final categories
1151 #final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1)
1152 ##### Display the distribution of specified categories (or biotypes) in samples on a barplot
1153 #make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \n(biotypes are grouped by 'family')", svg = options.svg, png = options.png)
1154
1155
1156 if options.pdf:
1157 pdf.close()
1158 print "\n### Plots saved in pdf file: %s" %options.pdf
1159
1160 print "\n### End of program"