annotate ALFA/ALFA.py @ 32:b26aec436ab5 draft

Uploaded
author charles-bernard
date Thu, 21 Dec 2017 09:29:26 -0500
parents a1e2ab10b317
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1 #!/usr/bin/python
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
2 #-*- coding: utf-8 -*-
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
3
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
4 __author__ = 'noel & bahin'
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
5 ''' <decription> '''
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
6
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
7 import argparse
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
8 import os
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
9 import numpy
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
10 import sys
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
11 import subprocess
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
12 import matplotlib.pyplot as plt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
13 import matplotlib.cm as cmx
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
14 import matplotlib.colors as colors
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
15 import matplotlib.patheffects as PathEffects
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
16 import re
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
17 from matplotlib.backends.backend_pdf import PdfPages
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
18 # To correctly embbed the texts when saving plots in svg format
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
19 import matplotlib
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
20 matplotlib.rcParams['svg.fonttype'] = 'none'
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
21
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
22 ##########################################################################
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
23 # FUNCTIONS #
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
24 ##########################################################################
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
25
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
26 def init_dict(d, key, init):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
27 if key not in d:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
28 d[key] = init
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
29
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
30 def get_chromosome_lengths(args):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
31 """
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
32 Parse the file containing the chromosomes lengths.
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
33 If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes (
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
34 """
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
35 lengths={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
36 gtf_chrom_names=set()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
37 force_get_lengths = False
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
38 # If the user provided the chromosome length file
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
39 if args.chr_len:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
40 with open(args.chr_len, 'r') as chr_len_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
41 for line in chr_len_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
42 lengths[line.split('\t')[0]] = int(line.rstrip().split('\t')[1])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
43 with open(args.annotation,'r') as gtf_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
44 for line in gtf_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
45 if not line.startswith('#'):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
46 chrom = line.split('\t')[0]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
47 if chrom not in gtf_chrom_names:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
48 gtf_chrom_names.add(chrom)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
49 for chrom in lengths.keys():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
50 if chrom not in gtf_chrom_names:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
51 print "Warning: at least one chromosome name ('"+chrom+"') of the file '"+args.chr_len+"'does not match any chromosome name if GTF and was ignored."
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
52 #del lengths[chrom]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
53 break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
54 for chrom in gtf_chrom_names:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
55 if force_get_lengths: break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
56 if chrom not in lengths.keys():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
57 print "WARNING: chromosome name '"+chrom+"' was found in gtf and does not match any chromosome name provided in",args.chr_len+". "
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
58 print "\t=> The chromosome lenghts will be approximated using annotations in the GTF file."
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
59 continue_value =""
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
60 while continue_value not in {"yes","y","no","n"}:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
61 continue_value = raw_input("\tDo you want to continue ('yes' or 'y')?\n\tElse write 'no' or 'n' to exit the script and check your file of lengths.\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
62 if continue_value == "no" or continue_value == "n":
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
63 sys.exit("Exiting")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
64 elif continue_value == "yes" or continue_value == "y":
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
65 force_get_lengths = True
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
66 break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
67 print "Error: use 'yes/y/no/n' only"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
68 if not force_get_lengths:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
69 return lengths
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
70 # Otherwise, (or if at least one chromosome was missing in chromosome lengths file) we consider the end of the last annotation of the chromosome in the GTF file as the chromosome length
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
71 with open(args.annotation, 'r') as gtf_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
72 for line in gtf_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
73 if not line.startswith('#'):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
74 chrom = line.split('\t')[0]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
75 end = int(line.split('\t')[4])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
76 init_dict(lengths, chrom, 0)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
77 lengths[chrom] = max(lengths[chrom], end)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
78 if force_get_lengths:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
79 print "The chromosome lenghts have been approximated using the last annotations in the GTF file."
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
80 return lengths
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
81
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
82 def write_feature_on_index(feat,chrom, start, stop, sign, stranded_genome_index, unstranded_genome_index=None):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
83 grouped_by_biotype_features = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
84 for biotype,categs in feat.iteritems():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
85 categ_list=[]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
86 for cat in set(categs):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
87 categ_list.append(cat)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
88 grouped_by_biotype_features.append(":".join((str(biotype),",".join(categ_list))))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
89 stranded_genome_index.write('\t'.join((chrom, start, stop, sign,''))+'\t'.join(grouped_by_biotype_features)+'\n')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
90 if unstranded_genome_index :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
91 unstranded_genome_index.write('\t'.join((chrom, start, stop, '.',''))+'\t'.join(grouped_by_biotype_features)+'\n')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
92
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
93
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
94 def add_info(cpt, feat_values, start, stop, chrom=None, unstranded_genome_index=None, stranded_genome_index = None , biotype_prios=None, coverage=1, categ_prios=None):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
95 """
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
96 From an annotated genomic interval (start/stop positions and associated feature : one or more category(ies)/biotype pair(s) )
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
97 - If a file is provided: write the information at the end of the index file being generated;
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
98 - else : browse the features and update the counts of categories/biotypes found in the genome.
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
99 """
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
100 ## Writing in the file is it was provided
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
101 if stranded_genome_index :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
102 unstranded_features = None
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
103 # If only one strand has a feature, this feature will directly be written on the unstranded index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
104 if feat_values[0] == {} :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
105 # A interval with no feature corresponds to a region annotated only on the reverse strand : update 'antisense' counts
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
106 stranded_genome_index.write('\t'.join((chrom, start, stop, '+','antisense\n')))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
107 write_feature_on_index(feat_values[1], chrom ,start, stop, '-', stranded_genome_index, unstranded_genome_index)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
108 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
109 if feat_values[1] == {} :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
110 write_feature_on_index(feat_values[0], chrom ,start, stop, '+', stranded_genome_index, unstranded_genome_index)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
111 stranded_genome_index.write('\t'.join((chrom, start, stop, '-','antisense\n')))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
112
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
113 # Else, the unstranded index should contain the union of plus and minus features
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
114 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
115 write_feature_on_index(feat_values[0], chrom ,start, stop, '+', stranded_genome_index)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
116 write_feature_on_index(feat_values[1], chrom ,start, stop, '-', stranded_genome_index)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
117 unstranded_feat = dict(feat_values[0], **feat_values[1])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
118 for name in set(feat_values[0]) & set(feat_values[1]):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
119 unstranded_feat[name]+=feat_values[0][name]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
120 write_feature_on_index(unstranded_feat, chrom ,start, stop, '.', unstranded_genome_index)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
121
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
122 ## Increasing category counter(s)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
123 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
124 # Default behavior if no biotype priorities : category with the highest priority for each found biotype has the same weight (1/n_biotypes)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
125 if not biotype_prios:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
126 nb_feat = len(feat_values)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
127 # For every categ(s)/biotype pairs
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
128 for feat in feat_values:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
129 cur_prio = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
130 selected_categ = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
131 # Separate categorie(s) and biotype
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
132 try:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
133 biot,cats = feat.split(":")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
134 # Error if feature equal "antisense" : update the 'antisense/antisense' counts
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
135 except ValueError :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
136 try :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
137 cpt[(feat,feat)] += (int(stop) - int(start)) * coverage / float(nb_feat)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
138 except :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
139 cpt[(feat,feat)] = (int(stop) - int(start)) * coverage / float(nb_feat)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
140 return
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
141 # Browse the categories and get only the one(s) with highest priority
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
142 for cat in cats.split(','):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
143 try: prio = prios[cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
144 except:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
145 #TODO Find a way to add unknown categories
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
146 if cat not in unknown_feature:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
147 print >> sys.stderr, "Warning: Unknown categorie %s found and ignored\n...\r" %cat,
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
148 unknown_feature.append(cat)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
149 continue
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
150 if prio > cur_prio :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
151 cur_prio = prio
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
152 selected_categ = [cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
153 if prio == cur_prio :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
154 if cat != selected_categ :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
155 try:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
156 if cat not in selected_categ :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
157 selected_categ.append(cat)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
158 except TypeError :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
159 selected_categ = [selected_categ,cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
160 # Increase each counts by the coverage divided by the number of categories and biotypes
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
161 nb_cats = len(selected_categ)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
162 for cat in selected_categ :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
163 try:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
164 cpt[(cat,biot)] += (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
165 except KeyError:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
166 cpt[(cat,biot)] = (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
167 #else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
168 #cpt[(cats,biot)] = (int(stop) - int(start)) / float(nb_feat) * coverage
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
169 # Else, apply biotype selection according to the priority set
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
170 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
171 #TODO Add an option to pass biotype priorities and handle it
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
172 pass
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
173
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
174 def print_chrom(features_dict, chrom, stranded_index_file, unstranded_index_file, cpt_genome):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
175 with open(unstranded_index_file,'a') as findex, open(stranded_index_file,'a') as fstrandedindex:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
176 # Initialization of variables : start position of the first interval and associated features for +/- strands
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
177 start = ""
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
178 for pos in sorted(features_dict['+'].keys()):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
179 if start != "":
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
180 add_info(cpt_genome, [features_plus,features_minus], str(start), str(pos), chrom, stranded_genome_index = fstrandedindex, unstranded_genome_index = findex)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
181 start = pos
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
182 features_plus = features_dict['+'][pos]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
183 features_minus = features_dict['-'][pos]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
184
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
185
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
186 def create_genome_index(annotation, unstranded_genome_index, stranded_genome_index,cpt_genome,prios,biotypes,chrom_sizes):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
187 ''' Create an index of the genome annotations and save it in a file'''
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
188 print '\n### Generating genome indexes\n',
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
189 sys.stdout.flush()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
190 # Initializations
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
191 intervals_dict = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
192 max_value = -1
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
193 prev_chrom = ''
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
194 reverse_strand = {'+':'-','-':'+'}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
195 i = 0 # Line counter
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
196 # Write the chromosomes lengths as comment lines before the genome index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
197 with open(unstranded_genome_index,'w') as findex, open(stranded_genome_index,'w') as fstrandedindex:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
198 for key,value in chrom_sizes.items():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
199 findex.write("#%s\t%s\n" %(key, value))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
200 fstrandedindex.write("#%s\t%s\n" %(key, value))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
201 # Running through the GTF file and writing into genome index files
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
202 with open(annotation, 'r') as gtf_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
203 for line in gtf_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
204 # Print messages after X processed lines
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
205 i += 1
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
206 if i % 100000 == 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
207 print >> sys.stderr, '\r%s line processed...' %str(i)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
208 print '\r \r. . .',
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
209 sys.stdout.flush()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
210 elif i % 20000 == 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
211 print '\r \r. . .',
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
212 sys.stdout.flush()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
213 elif i % 2000 == 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
214 print '.',
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
215 sys.stdout.flush()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
216 # Processing lines except comment ones
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
217 if not line.startswith('#'):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
218 # Getting the line infos
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
219 line_split=line[:-1].split('\t')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
220 chrom = line_split[0]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
221 cat=line_split[2]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
222 start = int(line_split[3]) - 1
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
223 stop = int(line_split[4])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
224 strand = line_split[6]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
225 antisense = reverse_strand[strand]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
226 biotype=line_split[8].split('biotype')[1].split(';')[0].strip('" ')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
227 feat = [(cat,biotype)]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
228 # Registering chromosome info in the genome index file if this is a new chromosome or a annotation not overlapping previously recorded features
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
229 if start > max_value or chrom != prev_chrom:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
230 # Write the previous features
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
231 if intervals_dict != 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
232 if chrom != prev_chrom :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
233 print_chrom(intervals_dict, prev_chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
234 print "\rChromosome '" + prev_chrom + "' registered."
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
235 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
236 print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
237 prev_chrom = chrom
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
238 # (Re)Initializing the chromosome lists
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
239 intervals_dict = {strand:{start:{biotype:[cat]}, stop:{}},antisense:{start:{},stop:{}}}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
240 max_value = stop
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
241
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
242 # Update the dictionary which represents intervals for every disctinct annotation
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
243 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
244 # Get intervals on the same strand as the current feature
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
245 stranded_intervals = intervals_dict[strand]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
246 start_added = False
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
247 for pos in sorted(stranded_intervals.keys()):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
248 #print pos
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
249 #print stranded_intervals[pos]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
250 #print
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
251 # While the position is below the feature's interval, store the features
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
252 if pos < start :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
253 cur_cat_dict = dict(stranded_intervals[pos])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
254 cur_antisense_dict = dict(intervals_dict[antisense][pos])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
255
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
256 # If the start position already exists: update it by addind the new feature
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
257 elif pos == start :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
258 cur_cat_dict = dict(stranded_intervals[pos])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
259 cur_antisense_dict = dict(intervals_dict[antisense][pos])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
260 #print "cur",cur_cat_dict
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
261 try : stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype]+[cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
262 except KeyError: stranded_intervals[pos][biotype] = [cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
263 start_added = True
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
264 #print "cur",cur_cat_dict
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
265
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
266 elif pos > start :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
267 # Create a new entry for the start position if necessary
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
268 if not start_added :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
269 #print "cur",cur_cat_dict
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
270 stranded_intervals[start] = dict(cur_cat_dict)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
271 stranded_intervals[start][biotype] = [cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
272 #stranded_intervals[pos][biotype].append(cat)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
273 intervals_dict[antisense][start] = cur_antisense_dict
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
274 start_added = True
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
275 #print "cur",cur_cat_dict
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
276 # While the position is below the stop, just add the new feature
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
277 if pos < stop :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
278 cur_cat_dict = dict(stranded_intervals[pos])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
279 cur_antisense_dict = dict(intervals_dict[antisense][pos])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
280 try: stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype] + [cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
281 except KeyError: stranded_intervals[pos][biotype] = [cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
282 # Close the created interval : create an entry at the stop position and restore the features
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
283 elif pos > stop :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
284 stranded_intervals[stop] = dict(cur_cat_dict)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
285 intervals_dict[antisense][stop] = cur_antisense_dict
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
286 break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
287 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
288 break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
289 #try:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
290 #cur_cat_dict = list(stranded_intervals[pos][biotype])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
291 #except KeyError: cur_cat_dict = list()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
292 #print stranded_intervals[pos]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
293 #print
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
294 # Extend the dictionary if necessary
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
295 if stop > max_value:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
296 max_value = stop
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
297 stranded_intervals[stop] = {}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
298 intervals_dict[antisense][stop] = {}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
299 #except KeyError:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
300 #print intervals_dict
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
301 #quit()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
302 #intervals_dict[strand] = {strand:{start:{biotype:[cat]}, stop:{}},antisense:{start:{},stop:{}}}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
303 #continue
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
304 #for sign in ['-','+']:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
305 #print sign
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
306 #for key,val in sorted(intervals_dict[sign].iteritems()):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
307 #print key,'\t',val
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
308 #print
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
309 #print "-------\n"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
310
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
311
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
312 #Store the categories of the last chromosome
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
313 print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
314 print "\rChromosome '" + prev_chrom + "' registered.\nDone!"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
315
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
316
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
317 def create_bedgraph_files(bams,strandness):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
318 samples_files = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
319 labels = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
320 print "\n### Generating the bedgraph files"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
321 for n in range(0, len(bams), 2):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
322 print "\rProcessing '%s'\n..." %bams[n],
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
323 sys.stdout.flush()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
324 #Get the label for this sample
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
325 label = bams[n+1]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
326 #Modify it to contain only alphanumeric caracters (avoid files generation with dangerous names)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
327 modified_label = "_".join(re.findall(r"[\w']+", label))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
328 if strandness in ["reverse","fr-secondstrand"]:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
329 subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
330 subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
331 elif strandness in ["forward","fr-firststrand"]:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
332 subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
333 subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
334 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
335 subprocess.call('bedtools genomecov -bg -split -ibam ' + bams[n] + ' > ' + modified_label + '.bedgraph', shell=True)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
336 samples_files.append(modified_label)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
337 labels.append(label)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
338 print "\rDone!"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
339 return samples_files, labels
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
340
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
341 def read_gtf(gtf_index_file, sign):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
342 global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
343 strand = ""
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
344 while strand != sign :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
345 gtf_line = gtf_index_file.readline()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
346 # If the GTF file is finished
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
347 if not gtf_line:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
348 endGTF = True
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
349 return endGTF
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
350 splitline = gtf_line.rstrip().split('\t')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
351 try: strand = splitline[3]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
352 # strand information can not be found in the file file header
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
353 except IndexError: pass
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
354 gtf_chrom = splitline[0]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
355 gtf_features = splitline[4:]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
356 gtf_start, gtf_stop = map(int, splitline[1:3])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
357 return endGTF
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
358
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
359 def read_counts_files(counts_files):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
360 cpt={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
361 cpt_genome={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
362 labels=[]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
363 for fcounts in counts_files:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
364 label=os.path.splitext(os.path.basename(fcounts))[0]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
365 labels.append(label)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
366 cpt[label]={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
367 with open (fcounts,"r") as f:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
368 for line in f:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
369 if line[0]=="#":
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
370 continue
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
371 line_split=line[:-1].split('\t')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
372 feature=tuple(line_split[0].split(','))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
373 cpt[label][feature]=float(line_split[1])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
374 cpt_genome[feature]=float(line_split[2])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
375 return cpt,cpt_genome,labels
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
376
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
377
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
378 def get_chromosome_names_in_index(genome_index):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
379 chrom_list = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
380 with open(genome_index, 'r') as findex:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
381 chrom = ""
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
382 for line in findex:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
383 cur_chrom = line.split('\t')[0]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
384 if cur_chrom == chrom:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
385 pass
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
386 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
387 chrom = cur_chrom
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
388 if chrom not in chrom_list:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
389 chrom_list.append(chrom)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
390 return set(chrom_list)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
391
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
392
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
393 def intersect_bedgraphs_and_index_to_counts_categories(samples_files,samples_names,prios,genome_index, strandness, biotype_prios = None):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
394 global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_cat, endGTF
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
395 print "\n### Intersecting files with indexes"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
396 unknown_chrom = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
397 cpt = {} # Counter for the nucleotides in the BAM input file(s)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
398 for n in range(len(samples_files)):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
399 sample_file=samples_files[n]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
400 sample_name=samples_names[n]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
401 # Initializing the category counter dict for this sample
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
402 init_dict(cpt, sample_name, {})
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
403 if strandness == "unstranded":
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
404 strands = [("",".")]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
405 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
406 strands = [('.plus','+'), ('.minus','-')]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
407
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
408 # Intersecting the BEDGRAPH and genome index files
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
409 print "\rProcessing '%s'\n. . ." %sample_file,
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
410 sys.stdout.flush()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
411
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
412 for strand,sign in strands:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
413 prev_chrom = ''
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
414 endGTF = False # Reaching the next chr or the end of the GTF index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
415 intergenic_adds = 0.0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
416 i = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
417 i_chgt = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
418 with open(sample_file + strand + '.bedgraph', 'r') as bam_count_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
419 # Running through the BEDGRAPH file
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
420 for bam_line in bam_count_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
421 i += 1
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
422 if i % 10000 == 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
423 print ".",
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
424 sys.stdout.flush()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
425 if i % 100000 == 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
426 print "\r \r. . .",
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
427 sys.stdout.flush()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
428 # Getting the BAM line info
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
429 bam_chrom = bam_line.split('\t')[0]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
430 bam_start, bam_stop, bam_cpt = map(float, bam_line.split('\t')[1:4])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
431 # Skip the line if the chromosome is not in the index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
432 if bam_chrom not in chrom_list:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
433 if bam_chrom not in unknown_chrom:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
434 unknown_chrom.append(bam_chrom)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
435 print "\r \r Chromosome '" + bam_chrom + "' not found in index."
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
436 continue
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
437 # If this is a new chromosome (or the first one)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
438 if bam_chrom != prev_chrom:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
439 i_chgt = i
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
440 intergenic_adds = 0.0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
441 # (Re)opening the GTF index and looking for the first line of the matching chr
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
442 try: gtf_index_file.close()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
443 except UnboundLocalError: pass
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
444 gtf_index_file = open(genome_index, 'r')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
445 endGTF = False
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
446 read_gtf(gtf_index_file, sign)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
447 while bam_chrom != gtf_chrom:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
448 read_gtf(gtf_index_file, sign)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
449 if endGTF:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
450 break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
451 prev_chrom = bam_chrom
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
452
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
453 # Looking for the first matching annotation in the GTF index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
454 while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
455 read_gtf(gtf_index_file, sign)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
456 if gtf_chrom != bam_chrom:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
457 endGTF = True
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
458 # Processing BAM lines before the first GTF annotation if there are
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
459 if bam_start < gtf_start:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
460 # Increase the 'intergenic' category counter with all or part of the BAM interval
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
461 try:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
462 intergenic_adds += min(bam_stop,gtf_start)-bam_start
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
463 cpt[sample_name][('intergenic','intergenic')] += (min(bam_stop, gtf_start) - bam_start) * bam_cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
464 except KeyError:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
465 cpt[sample_name][('intergenic','intergenic')] = (min(bam_stop, gtf_start) - bam_start) * bam_cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
466 # Go to next line if the BAM interval was totally upstream the first GTF annotation, carry on with the remaining part otherwise
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
467 if endGTF or (bam_stop <= gtf_start):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
468 continue
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
469 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
470 bam_start = gtf_start
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
471
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
472 # We can start the crossover
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
473 while not endGTF:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
474 # Update category counter
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
475 add_info(cpt[sample_name], gtf_features, bam_start, min(bam_stop,gtf_stop), coverage = bam_cpt, categ_prios = prios)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
476 # Read the next GTF file line if the BAM line is not entirely covered
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
477 if bam_stop > gtf_stop:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
478 # Update the BAM start pointer
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
479 bam_start = gtf_stop
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
480 endGTF = read_gtf(gtf_index_file, sign)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
481 # If we read a new chromosome in the GTF file then it is considered finished
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
482 if bam_chrom != gtf_chrom:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
483 endGTF = True
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
484 if endGTF:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
485 break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
486 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
487 # Next if the BAM line is entirely covered
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
488 bam_start = bam_stop
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
489 break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
490
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
491 # Processing the end of the BAM line if necessary
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
492 if endGTF and (bam_stop > bam_start):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
493 try:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
494 cpt[sample_name][('intergenic','intergenic')] += (bam_stop - bam_start) * bam_cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
495 except KeyError:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
496 cpt[sample_name][('intergenic','intergenic')] = (bam_stop - bam_start) * bam_cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
497 gtf_index_file.close()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
498 print "\r \rDone!"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
499 return cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
500
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
501 def write_counts_in_files(cpt,genome_counts):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
502 for label,dico in cpt.items():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
503 label = "_".join(re.findall(r"[\w']+", label))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
504 with open(label+".categories_counts","w") as fout:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
505 fout.write("#Category,biotype\tCounts_in_bam\tSize_in_genome\n" )
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
506 for feature,counts in dico.items():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
507 fout.write("%s\t%s\t%s\n" %(','.join(feature),counts,genome_counts[feature]))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
508
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
509 def recategorize_the_counts(cpt,cpt_genome,final):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
510 final_cat_cpt={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
511 final_genome_cpt={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
512 for f in cpt:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
513 #print "\nFinal categories for",f,"sample"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
514 final_cat_cpt[f]={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
515 for final_cat in final:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
516 tot = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
517 tot_genome=0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
518 for cat in final[final_cat]:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
519 tot += cpt[f][cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
520 tot_genome+=cpt_genome[cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
521 #output_file.write('\t'.join((final_cat, str(tot))) + '\n')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
522 #print '\t'.join((final_cat, str(tot)))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
523 final_cat_cpt[f][final_cat]=tot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
524 final_genome_cpt[final_cat]=tot_genome
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
525 return final_cat_cpt,final_genome_cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
526
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
527 def group_counts_by_categ(cpt,cpt_genome,final,selected_biotype):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
528 final_cat_cpt={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
529 final_genome_cpt={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
530 filtered_cat_cpt = {}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
531 for f in cpt:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
532 final_cat_cpt[f]={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
533 filtered_cat_cpt[f] = {}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
534 for final_cat in final:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
535 tot = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
536 tot_filter = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
537 tot_genome=0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
538 for cat in final[final_cat]:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
539 for key,value in cpt[f].items():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
540 if key[0] == cat :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
541 tot += value
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
542 tot_genome+=cpt_genome[key]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
543 if key[1] == selected_biotype :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
544 tot_filter += value
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
545 #output_file.write('\t'.join((final_cat, str(tot))) + '\n')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
546 #print '\t'.join((final_cat, str(tot)))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
547 final_cat_cpt[f][final_cat]=tot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
548 if tot_genome == 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
549 final_genome_cpt[final_cat]= 1e-100
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
550 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
551 final_genome_cpt[final_cat]=tot_genome
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
552 filtered_cat_cpt[f][final_cat]=tot_filter
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
553 #if 'antisense' in final_genome_cpt: final_genome_cpt['antisense'] = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
554 return final_cat_cpt,final_genome_cpt,filtered_cat_cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
555
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
556 def group_counts_by_biotype(cpt,cpt_genome,biotypes):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
557 final_cpt={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
558 final_genome_cpt={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
559 for f in cpt:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
560 final_cpt[f]={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
561 for biot in biotypes:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
562 tot = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
563 tot_genome=0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
564 try:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
565 for final_biot in biotypes[biot]:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
566 for key,value in cpt[f].items():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
567 if key[1] == final_biot :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
568 tot += value
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
569 #if key[1] != 'antisense':
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
570 tot_genome+=cpt_genome[key]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
571 except:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
572 for key,value in cpt[f].items():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
573 if key[1] == biot :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
574 tot += value
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
575 tot_genome+=cpt_genome[key]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
576 if tot != 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
577 final_cpt[f][biot]=tot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
578 final_genome_cpt[biot]=tot_genome
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
579 return final_cpt,final_genome_cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
580
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
581 #def get_cmap(N):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
582 #'''Returns a function that maps each index in 0, 1, ... N-1 to a distinct
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
583 #RGB color.'''
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
584 #color_norm = colors.Normalize(vmin=0, vmax=N-1)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
585 #scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
586 #def map_index_to_rgb_color(index):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
587 #return scalar_map.to_rgba(index)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
588 #return map_index_to_rgb_color
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
589
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
590 def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type, title) :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
591 ### Initialization
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
592 fig = plt.figure(figsize=(13,9))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
593 ax1 = plt.subplot2grid((2,4),(0,0),colspan=2)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
594 ax2 = plt.subplot2grid((2,4),(1,0),colspan=2)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
595 cmap= plt.get_cmap('Spectral')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
596 cols=[cmap(x) for x in xrange(0,256,256/n_cat)]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
597 if title:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
598 ax1.set_title(title+"in: %s" %samples_names[0])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
599 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
600 ax1.set_title(counts_type+" distribution in mapped reads in: %s" %samples_names[0])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
601 ax2.set_title('Normalized counts of '+counts_type)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
602
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
603
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
604 ### Barplots
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
605 #First barplot: percentage of reads in each categorie
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
606 ax1.bar(index, percentages, bar_width,
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
607 color=cols)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
608 #Second barplot: enrichment relative to the genome for each categ
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
609 # (the reads count in a categ is divided by the categ size in the genome)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
610 ax2.bar(index_enrichment, enrichment, bar_width,
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
611 color=cols,)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
612 ### Piecharts
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
613 pielabels = [ordered_categs[i] if percentages[i]>0.025 else '' for i in xrange(n_cat)]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
614 sum_enrichment = numpy.sum(enrichment)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
615 pielabels_enrichment = [ordered_categs[i] if enrichment[i]/sum_enrichment>0.025 else '' for i in xrange(n_cat)]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
616 # Categories piechart
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
617 ax3 = plt.subplot2grid((2,4),(0,2))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
618 pie_wedge_collection, texts = ax3.pie(percentages,labels=pielabels, shadow=True, colors=cols)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
619 # Enrichment piechart
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
620 ax4 = plt.subplot2grid((2,4),(1,2))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
621 pie_wedge_collection, texts = ax4.pie(enrichment,labels=pielabels_enrichment, shadow=True, colors=cols)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
622 # Add legends (append percentages to labels)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
623 labels = [" ".join((ordered_categs[i],"({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
624 ax3.legend(pie_wedge_collection,labels,loc='center',fancybox=True, shadow=True,prop={'size':'medium'}, bbox_to_anchor=(1.7,0.5))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
625 labels = [" ".join((ordered_categs[i],"({:.1%})".format(enrichment[i]/sum_enrichment))) for i in range(len(ordered_categs))]# if ordered_categs[i] != 'antisense']
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
626 ax4.legend(pie_wedge_collection,labels,loc='center',fancybox=True, shadow=True,prop={'size':'medium'},bbox_to_anchor=(1.7,0.5))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
627 # Set aspect ratio to be equal so that pie is drawn as a circle
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
628 ax3.set_aspect('equal')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
629 ax4.set_aspect('equal')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
630 return fig, ax1, ax2
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
631
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
632 def make_plot(ordered_categs,samples_names,categ_counts,genome_counts,pdf, counts_type, threshold, title = None ,svg = None, png = None):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
633 # From ordered_categs, keep only the features (categs or biotypes) that we can find in at least one sample.
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
634 existing_categs = set()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
635 for sample in categ_counts.values():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
636 existing_categs |= set(sample.keys())
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
637 ordered_categs = filter(existing_categs.__contains__, ordered_categs)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
638 n_cat = len(ordered_categs)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
639 n_exp=len(samples_names)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
640 ##Initialization of the matrix of counts (nrow=nb_experiements, ncol=nb_categorie)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
641 counts=numpy.matrix(numpy.zeros(shape=(n_exp,n_cat)))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
642 for exp in xrange(len(samples_names)):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
643 for cat in xrange(len(ordered_categs)):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
644 try: counts[exp,cat]=categ_counts[samples_names[exp]][ordered_categs[cat]]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
645 except: pass
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
646
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
647 ##Normalize the categorie sizes by the total size to get percentages
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
648 sizes=[]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
649 sizes_sum=0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
650 for cat in ordered_categs:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
651 sizes.append(genome_counts[cat])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
652 sizes_sum+=genome_counts[cat]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
653 if 'antisense' in ordered_categs:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
654 antisense_pos = ordered_categs.index('antisense')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
655 sizes[antisense_pos] = 1e-100
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
656 for cpt in xrange(len(sizes)):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
657 sizes[cpt]/=float(sizes_sum)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
658
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
659 ## Create array which contains the percentage of reads in each categ for every sample
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
660 percentages=numpy.array(counts/numpy.sum(counts,axis=1))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
661 ## Create the enrichment array (counts divided by the categorie sizes in the genome)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
662 enrichment=numpy.array(percentages/sizes)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
663 if 'antisense_pos' in locals():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
664 for i in xrange(len(samples_names)):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
665 enrichment[i][antisense_pos] = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
666 #enrichment=numpy.log(numpy.array(percentages/sizes))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
667 for exp in xrange(n_exp):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
668 for i in xrange(n_cat):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
669 val = enrichment[exp][i]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
670 if val > 1:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
671 enrichment[exp][i] = val-1
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
672 elif val == 1 or val == 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
673 enrichment[exp][i] = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
674 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
675 enrichment[exp][i] = -1/val+1
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
676
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
677 #### Finally, produce the plot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
678
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
679 ##Get the colors from the colormap
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
680 ncolor=16
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
681 cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5", "#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
682 if n_exp > ncolor:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
683 cmap = plt.get_cmap('Set3',n_exp)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
684 cmap = [cmap(i) for i in xrange(n_exp)]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
685
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
686 ## Parameters for the plot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
687 opacity = 1
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
688 #Create a vector which contains the position of each bar
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
689 index = numpy.arange(n_cat)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
690 #Size of the bars (depends on the categs number)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
691 bar_width = 0.9/n_exp
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
692
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
693
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
694 ##Initialise the subplot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
695 # if there is only one sample, also plot piecharts
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
696 #if n_exp == 1 and counts_type.lower() == 'categories':
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
697 #fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
698 ## If more than one sample
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
699 #else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
700 fig, (ax1,ax2) = plt.subplots(2,figsize=(5+(n_cat+2*n_exp)/3,10))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
701 # Store the bars objects for enrichment plot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
702 rects = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
703 #For each sample/experiment
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
704 for i in range(n_exp):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
705 #First barplot: percentage of reads in each categorie
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
706 ax1.bar(index+i*bar_width, percentages[i], bar_width,
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
707 alpha=opacity,
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
708 color=cmap[i],
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
709 label=samples_names[i], edgecolor='#FFFFFF', lw=0)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
710 #Second barplot: enrichment relative to the genome for each categ
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
711 # (the reads count in a categ is divided by the categ size in the genome)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
712 rects.append( ax2.bar(index+i*bar_width, enrichment[i], bar_width,
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
713 alpha=opacity,
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
714 color=cmap[i],
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
715 label=samples_names[i], edgecolor=cmap[i], lw=0))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
716
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
717 ## Graphical options for the plot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
718 # Adding of the legend
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
719 ax1.legend(loc='best',frameon=False)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
720 #ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
721 # Main titles
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
722 if title:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
723 ax1.set_title(title)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
724 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
725 ax1.set_title(counts_type+" distribution in mapped reads")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
726 ax2.set_title('Normalized counts of '+counts_type)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
727
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
728 # Adding enrichment baseline
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
729 #ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
730 # Axes limits
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
731 ax1.set_xlim(-0.1,len(ordered_categs)+0.1)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
732 if len(sizes) == 1: ax1.set_xlim(-2,3)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
733 ax2.set_xlim(ax1.get_xlim())
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
734 # Set axis limits (max/min values + 5% margin)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
735 ax2_ymin = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
736 ax2_ymax = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
737 for sample_values in enrichment:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
738 ax2_ymin.append(min(sample_values))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
739 ax2_ymax.append(max(sample_values))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
740 ax2_ymax = max(ax2_ymax)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
741 ax2_ymin = min(ax2_ymin)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
742 margin_top, margin_bottom = (abs(0.05*ax2_ymax), abs(0.05*ax2_ymin))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
743 ax1.set_ylim(0,ax1.get_ylim()[1]*1.05)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
744 if threshold:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
745 threshold_bottom = -abs(float(threshold[0]))+1
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
746 threshold_top = float(threshold[1])-1
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
747
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
748 for i in xrange(n_exp):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
749 for y in xrange(n_cat):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
750 val = enrichment[i][y]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
751 if not numpy.isnan(val) and not (threshold_bottom < val < threshold_top):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
752 rect = rects[i][y]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
753 rect_height = rect.get_height()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
754 if rect.get_y() < 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
755 diff = rect_height + threshold_bottom
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
756 rect.set_y(threshold_bottom)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
757 ax2_ymin = threshold_bottom
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
758 margin_bottom = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
759 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
760 diff = rect_height - threshold_top
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
761 ax2_ymax = threshold_top
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
762 margin_top = 0
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
763 rect.set_height(rect.get_height()-diff)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
764 if margin_top != 0 and margin_bottom != 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
765 margin_top, margin_bottom = [max(margin_top, margin_bottom) for i in xrange(2)]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
766 ax2.set_ylim(ax2_ymin-margin_bottom,ax2_ymax+margin_top)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
767 # Y axis title
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
768 ax1.set_ylabel('Proportion of reads (%)')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
769 ax2.set_ylabel('Enrichment relative to genome')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
770 # X axis title
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
771 ax1.set_xlabel(counts_type)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
772 ax2.set_xlabel(counts_type)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
773 # Add the categories on the x-axis
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
774 ax1.set_xticks(index + bar_width*n_exp/2)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
775 ax2.set_xticks(index + bar_width*n_exp/2)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
776 if counts_type.lower() != 'categories':
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
777 ax1.set_xticklabels(ordered_categs,rotation='30',ha='right')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
778 ax2.set_xticklabels(ordered_categs,rotation='30',ha='right')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
779 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
780 ax1.set_xticklabels(ordered_categs)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
781 ax2.set_xticklabels(ordered_categs)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
782 # Display fractions values in percentages
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
783 ax1.set_yticklabels([str(int(i*100)) for i in ax1.get_yticks()])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
784 # Correct y-axis ticks labels for enrichment subplot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
785 #ax2.set_yticklabels([str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
786 yticks = list(ax2.get_yticks())
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
787 yticks = [ yticks[i]-1 if yticks[i]>9 else yticks[i]+1 if yticks[i]<-9 else yticks[i] for i in xrange(len(yticks))]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
788 ax2.set_yticks(yticks)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
789 ax2.set_yticklabels([str(int(i+1))+"$^{+1}$" if i>0 and i%1==0 else str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(int(-(i-1)))+"$^{-1}$" if i<0 and i%1==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
790 #ax2.set_yticklabels([i+1 if i>0 else 1 if i==0 else "$\\frac{1}{%s}$" %-(i-1) for i in ax2.get_yticks()])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
791 # Change appearance of 'antisense' bars on enrichment plot since we cannot calculate an enrichment for this artificial category
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
792 if 'antisense_pos' in locals(): #ax2.text(antisense_pos+bar_width/2,ax2.get_ylim()[1]/10,'NA')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
793 for i in xrange(n_exp) :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
794 rect = rects[i][antisense_pos]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
795 rect.set_y(ax2.get_ylim()[0])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
796 rect.set_height(ax2.get_ylim()[1] - ax2.get_ylim()[0])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
797 rect.set_hatch('/')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
798 rect.set_fill(False)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
799 rect.set_linewidth(0)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
800 #rect.set_color('lightgrey')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
801 #rect.set_edgecolor('#EDEDED')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
802 rect.set_color('#EDEDED')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
803 ax2.text(index[antisense_pos] + bar_width*n_exp/2 - 0.1, (ax2_ymax+ax2_ymin)/2, 'NA')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
804 # Add text for features absent in sample
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
805 for i in xrange(n_exp):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
806 for y in xrange(n_cat):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
807 if percentages[i][y] == 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
808 txt = ax1.text(y + bar_width*(i+0.5), 0.02, 'Absent in sample', rotation = 'vertical', color = cmap[i], horizontalalignment ='center', verticalalignment = 'bottom')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
809 txt.set_path_effects([PathEffects.Stroke(linewidth=0.5),PathEffects.Normal()])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
810 elif enrichment[i][y] == 0:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
811 rects[i][y].set_linewidth(1)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
812
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
813 # Remove top/right/bottom axes
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
814 for ax in [ax1,ax2]:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
815 ax.spines['top'].set_visible(False)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
816 ax.spines['right'].set_visible(False)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
817 ax.spines['bottom'].set_visible(False)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
818 ax.tick_params(axis='x', which='both', bottom='on', top='off', labelbottom='on')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
819 ax.tick_params(axis='y', which='both', left='on', right='off', labelleft='on')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
820
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
821
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
822
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
823 #ax1.legend(loc='center right', bbox_to_anchor=(1.2, 0),fancybox=True, shadow=True)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
824 ## Showing the plot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
825 plt.tight_layout()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
826 fig.subplots_adjust(wspace=0.2)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
827 if pdf:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
828 pdf.savefig()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
829 plt.close()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
830 elif svg:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
831 if svg == True:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
832 plt.savefig(counts_type+'.svg')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
833 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
834 if os.path.splitext(svg)[1] == '.svg':
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
835 plt.savefig('.'.join((os.path.splitext(svg)[0],counts_type,'svg')))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
836 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
837 plt.savefig('.'.join((svg,counts_type,'svg')))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
838 elif png:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
839 if png == True:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
840 plt.savefig(counts_type+'.png')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
841 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
842 if os.path.splitext(png)[1] == '.png':
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
843 plt.savefig('.'.join((os.path.splitext(png)[0],counts_type,'png')))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
844 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
845 plt.savefig('.'.join((png,counts_type,'png')))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
846 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
847 plt.show()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
848 ## Save on disk it as a png image
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
849 #fig.savefig('image_output.png', dpi=300, format='png', bbox_extra_artists=(lgd,), bbox_inches='tight')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
850
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
851
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
852 def filter_categs_on_biotype(selected_biotype,cpt) :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
853 filtered_cpt = {}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
854 for sample in cpt:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
855 filtered_cpt[sample] = {}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
856 for feature,count in cpt[sample].items():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
857 if feature[1] == selected_biotype:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
858 filtered_cpt[sample][feature[0]] = count
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
859 return filtered_cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
860
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
861
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
862 ##########################################################################
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
863 # MAIN #
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
864 ##########################################################################
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
865
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
866 def usage_message(name=None):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
867 return '''
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
868 Generate genome indexes:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
869 python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
870 [--chr_len CHR_LENGTHS_FILE]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
871 Process BAM file(s):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
872 python ALFA.py -g GENOME_INDEX -i BAM1 LABEL1 [BAM2 LABEL2 ...]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
873 [--bedgraph] [-s STRAND]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
874 [-n] [--pdf output.pdf]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
875 [-d {1,2,3,4}] [-t YMIN YMAX]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
876 Index genome + process BAM:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
877 python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
878 -i BAM1 LABEL1 [BAM2 LABEL2 ...]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
879 [--chr_len CHR_LENGTHS_FILE]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
880 [--bedgraph][-s STRAND]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
881 [-n] [--pdf output.pdf]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
882 [-d {1,2,3,4}] [-t YMIN YMAX]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
883
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
884 Process previously created ALFA counts file(s):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
885 python ALFA.py -c COUNTS1 [COUNTS2 ...]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
886 [-s STRAND]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
887 [-n] [--pdf output.pdf]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
888 [-d {1,2,3,4}] [-t YMIN YMAX]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
889
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
890 '''
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
891
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
892
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
893 #### Parse command line arguments and store them in 'options'
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
894 parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, usage=usage_message())
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
895 parser.add_argument('--version', action='version', version='version 1.0', help="show program's version number and exit\n\n-----------\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
896 # Options concernant l'index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
897 parser.add_argument('-g','--genome_index', help="Genome index files path and basename for existing index, or path and basename for new index creation\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
898 parser.add_argument('-a','--annotation', metavar = "GTF_FILE", help='Genomic annotations file (GTF format)\n\n')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
899 parser.add_argument('--chr_len', help='Tabulated file containing chromosome names and lengths\n\n-----------\n\n')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
900
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
901 # Options pour l'étape d'intersection
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
902 parser.add_argument('-i','--input','--bam', dest='input', metavar=('BAM_FILE1 LABEL1',""), nargs='+', help='Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
903 parser.add_argument('--bedgraph', action='store_const',default = False, const = True, help="Use this options if your input file(s) is(are) already in bedgraph format\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
904 parser.add_argument('-c','--counts',metavar=('COUNTS_FILE',""), nargs='+', help="Use this options instead of '-i/--input' to provide ALFA counts files as input \ninstead of bam/bedgraph files.\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
905 parser.add_argument('-s','--strandness', dest="strandness", nargs=1, action = 'store', default = ['unstranded'], choices = ['unstranded','forward','reverse','fr-firststrand','fr-secondstrand'], metavar="", help ="Library orientation. Choose within: 'unstranded', 'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. (Default: 'unstranded')\n\n-----------\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
906
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
907 # Options concernant le plot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
908 parser.add_argument('-biotype_filter',nargs=1,help=argparse.SUPPRESS)#"Make an extra plot of categories distribution using only counts of the specified biotype.")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
909 parser.add_argument('-d','--categories_depth', type=int, default='3', choices=range(1,5), help = "Use this option to set the hierarchical level that will be considered in the GTF file (default=3): \n(1) gene,intergenic; \n(2) intron,exon,intergenic; \n(3) 5'UTR,CDS,3'UTR,intron,intergenic; \n(4) start_codon,5'UTR,CDS,3'UTR,stop_codon,intron,intergenic. \n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
910 parser.add_argument('--pdf', nargs='?', default=False, help="Save produced plots in PDF format at specified path ('categories_plots.pdf' if no argument provided)\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
911 parser.add_argument('--png', nargs='?', default=False, const=True, help="Save produced plots in PNG format with provided argument as basename \nor 'categories.png' and 'biotypes.png' if no argument provided\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
912 parser.add_argument('--svg', nargs='?', default=False, const=True, help="Save produced plots in SVG format with provided argument as basename \nor 'categories.svg' and 'biotypes.svg' if no argument provided\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
913 parser.add_argument('-n','--no_plot', dest='quiet', action='store_const', default=False, const=True, help="Do not show plots\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
914 parser.add_argument('-t','--threshold', dest='threshold', nargs = 2, metavar=("ymin","ymax"), type=float , help="Set axis limits for enrichment plots\n\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
915
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
916 if len(sys.argv)==1:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
917 parser.print_usage()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
918 sys.exit(1)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
919
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
920 options = parser.parse_args()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
921
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
922 def required_arg(arg, aliases):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
923 if not arg:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
924 print >> sys.stderr, "\nError: %s argument is missing.\n" %aliases
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
925 parser.print_usage()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
926 sys.exit()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
927
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
928 def check_files_enxtension(files):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
929 return
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
930
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
931 # Booleans for steps to be executed
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
932 make_index = False
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
933 intersect_reads = False
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
934 process_counts = False
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
935
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
936 #### Check arguments conformity and define which steps have to be perfomed
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
937 if options.counts :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
938 # Aucun autre argument requis
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
939 # Vérifier extension input
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
940
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
941 # Action : Faire le plot uniquement
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
942 process_counts = True
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
943 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
944 if options.annotation :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
945 # Vérifier si présence -gi
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
946 if options.genome_index :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
947 genome_index_basename = options.genome_index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
948 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
949 genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
950 # Vérifier si un fichier existe déjà:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
951 if os.path.isfile(genome_index_basename+".stranded.index") :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
952 if options.input:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
953 print >> sys.stderr, "\nWarning: a index file named '%s' already exists and will be used. If you want to create a new index, please delete this file or specify an other path." %(genome_index_basename+".stranded.index")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
954 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
955 sys.exit("\nError: a index file named %s already exists. If you want to create a new index, please delete this file or specify an other path.\n" %(genome_index_basename+".stranded.index"))
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
956 # sinon -> action : index à faire
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
957 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
958 make_index = True
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
959 # si l'index n'est pas à faire :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
960 if options.input:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
961 # Arguments requis: input, genome_index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
962 if 'genome_index_basename' not in locals():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
963 required_arg(options.genome_index, "-g/--genome_index")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
964 genome_index_basename = options.genome_index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
965 required_arg(options.input, "-i/--input/--bam")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
966 for i in xrange(0, len(options.input), 2):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
967 try :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
968 extension = os.path.splitext(options.input[i+1])[1]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
969 if extension == ".bam" or extension == '.bedgraph' or extension == '.bg':
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
970 sys.exit("Error: it seems input files and associated labels are not correctly provided.\n\
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
971 Make sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
972 except:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
973 sys.exit("Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
974
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
975 intersect_reads = True
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
976 # Vérifier input's extension
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
977 #TODO
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
978 if not (options.counts or options.input or options.annotation):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
979 sys.exit("\nError : some arguments are missing At least '-a', '-c' or '-i' is required. Please refer to help (-h/--help) and usage cases for more details.\n")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
980 if not options.counts:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
981 # Declare genome_index variables
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
982 stranded_genome_index = genome_index_basename+".stranded.index"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
983 unstranded_genome_index = genome_index_basename+".unstranded.index"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
984 if options.strandness[0] == "unstranded":
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
985 genome_index = unstranded_genome_index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
986 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
987 genome_index = stranded_genome_index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
988
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
989
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
990
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
991 #### Initialization of some variables
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
992
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
993 # Initializing the category priority order, coding biotypes and the final list
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
994 prios = {'start_codon': 7, 'stop_codon': 7, 'five_prime_utr': 6, 'three_prime_utr': 6, 'UTR': 6, 'CDS': 5, 'exon': 4, 'transcript': 3, 'gene': 2, 'antisense': 1,'intergenic': 0}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
995
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
996 biotype_prios = None
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
997 #biotype_prios = {"protein_coding":1, "miRNA":2}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
998
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
999 # Possibles groups of categories to plot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1000 categs_group1={'start': ['start_codon'], '5UTR': ['five_prime_utr','UTR'], 'CDS': ['CDS', 'exon'], '3UTR': ['three_prime_utr'], 'stop': ['stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1001 categs_group2={'5UTR': ['five_prime_utr', 'UTR'], 'CDS': ['CDS', 'exon','start_codon','stop_codon'], '3UTR': ['three_prime_utr'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1002 categs_group3={'exons': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon','start_codon','stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1003 categs_group4={'gene': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon','start_codon','stop_codon', 'transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1004 categs_groups = [categs_group4,categs_group3,categs_group2,categs_group1] # Order and merging for the final plot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1005 cat_list = ['5UTR', 'start', 'CDS', 'stop', '3UTR', 'exons', 'introns', 'gene', 'intergenic', 'antisense']
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1006
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1007
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1008 # biotypes list
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1009 biotypes ={'protein_coding','polymorphic_pseudogene','TR_C_gene','TR_D_gene','TR_J_gene','TR_V_gene','IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene',"3prime_overlapping_ncrna","lincRNA","macro_lncRNA","miRNA","misc_RNA","Mt_rRNA","Mt_tRNA","processed_transcript","ribozyme","rRNA","scaRNA","sense_intronic","sense_overlapping","snoRNA","snRNA","sRNA","TEC","vaultRNA","antisense","transcribed_processed_pseudogene","transcribed_unitary_pseudogene","transcribed_unprocessed_pseudogene","translated_unprocessed_pseudogene","TR_J_pseudogene","TR_V_pseudogene","unitary_pseudogene","unprocessed_pseudogene","processed_pseudogene","IG_C_pseudogene","IG_J_pseudogene","IG_V_pseudogene","pseudogene","ncRNA","tRNA"} # Type: set (to access quickly)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1010
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1011 # Grouping of biotypes:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1012 biotypes_group1={'protein_coding':['protein_coding'],'pseudogenes':['polymorphic_pseudogene',"transcribed_processed_pseudogene","transcribed_unitary_pseudogene","transcribed_unprocessed_pseudogene","translated_unprocessed_pseudogene","TR_J_pseudogene","TR_V_pseudogene","unitary_pseudogene","unprocessed_pseudogene","processed_pseudogene","IG_C_pseudogene","IG_J_pseudogene","IG_V_pseudogene","pseudogene"],'TR':['TR_C_gene','TR_D_gene','TR_J_gene','TR_V_gene'],'IG':['IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene'],\
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1013 'MT_RNA':["Mt_rRNA","Mt_tRNA"],\
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1014 'ncRNA':["lincRNA","macro_lncRNA","3prime_overlapping_ncrna","ncRNA"],\
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1015 "others":["misc_RNA","processed_transcript","ribozyme","scaRNA","sense_intronic","sense_overlapping","TEC","vaultRNA"],
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1016 "antisense":["antisense"]}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1017 for biot in ["miRNA","snoRNA","snRNA","rRNA","sRNA","tRNA"]:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1018 biotypes_group1[biot]=[biot]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1019
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1020 # # Initializing the unkown features lits
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1021 unknown_feature = []
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1022
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1023 # Initializing the genome category counter dict
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1024 cpt_genome = {}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1025
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1026
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1027 if process_counts :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1028 #### If input files are the categories counts, just load them and continue to recategorization step
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1029 cpt,cpt_genome,samples_names = read_counts_files(options.counts)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1030 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1031 #### Create genome index if needed and get the sizes of categories
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1032 if make_index :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1033 #### Get the chromosome lengths
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1034 lengths = get_chromosome_lengths(options)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1035 # Generating the genome index files if the user didn't provide them
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1036 create_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios, biotypes, lengths)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1037
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1038
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1039 #print '\nChr lengths:', lengths
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1040
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1041 if intersect_reads:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1042 # If the indexes already exist, read them to compute the sizes of the categories in the genome and retrieve the chromosome lengths
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1043 if not make_index :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1044 print "\n### Reading genome indexes\n...\r",
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1045 sys.stdout.flush()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1046 lengths={}
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1047 with open(genome_index, 'r') as genome_index_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1048 for line in genome_index_file:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1049 if line[0] == "#":
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1050 lengths[line.split('\t')[0][1:]] = int(line.split('\t')[1])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1051 else :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1052 add_info(cpt_genome, line.rstrip().split('\t')[4:], line.split('\t')[1], line.split('\t')[2], biotype_prios = None, categ_prios = prios)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1053
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1054 #### Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1055 cpt_genome[('intergenic','intergenic')] = sum(lengths.itervalues()) - sum([v for x,v in cpt_genome.iteritems() if x != ('antisense','antisense')])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1056 if not make_index :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1057 print "Done!"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1058 #print '\nGenome category counts:'
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1059 #for key,val in cpt_genome.iteritems():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1060 #print key,"\t",val
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1061
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1062
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1063 #### Create the Bedgraph files if needed and get the files list
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1064
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1065 if not options.bedgraph:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1066 # Generating the BEDGRAPH files is the user provided BAM file(s) and get the samples labels (this names will be used in the plot legend)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1067 samples_files, samples_names = create_bedgraph_files(options.input,options.strandness[0])
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1068 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1069 # Just initialize the files list with the bedgraph paths
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1070 samples_files = [options.input[i] for i in range(0,len(options.input),2)]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1071 # and get the labels
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1072 samples_names = [options.input[i] for i in range(1,len(options.input),2)]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1073 #### Retrieving chromosome names saved in index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1074 chrom_list = get_chromosome_names_in_index(genome_index)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1075 #### Processing the BEDGRAPH files: intersecting the bedgraph with the genome index and count the number of aligned positions in each category
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1076 cpt = intersect_bedgraphs_and_index_to_counts_categories(samples_files,samples_names,prios,genome_index, options.strandness[0], biotype_prios = None)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1077
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1078 #### Write the counts on disk
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1079 write_counts_in_files(cpt,cpt_genome)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1080
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1081 if not (intersect_reads or process_counts) or (options.quiet and options.pdf == False):
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1082 quit("\n### End of program")
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1083 print "\n### Generating plots"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1084 # Updating the biotypes lists (biotypes and 'biotype_group1'): adding the 'unknow biotypes' found in gtf/index
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1085 if unknown_feature == []: # 'unknown_feature' is define only during the index generation
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1086 # Browse the feature to determine whether some biotypes are 'unknown'
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1087 for sample,counts in cpt.items():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1088 for (cat,biot) in counts:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1089 if biot not in biotypes and cat not in unknown_feature:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1090 unknown_feature.append(biot)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1091 for new_biot in unknown_feature:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1092 biotypes.add(new_biot)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1093 biotypes_group1["others"].append(new_biot)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1094 biotypes = sorted(biotypes)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1095 # move antisense categ to the end of the list
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1096 biotypes.remove('antisense')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1097 biotypes.append('antisense')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1098 biotypes_group1 = sorted(biotypes_group1)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1099
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1100
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1101 #print '\nCounts for every category/biotype pair: ',cpt
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1102
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1103 # Generating plots
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1104 if options.pdf != False:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1105 if options.pdf == None:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1106 options.pdf = "categories_plots.pdf"
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1107 pdf = PdfPages(options.pdf)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1108 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1109 pdf = False
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1110
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1111 selected_biotype = None
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1112 if options.biotype_filter:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1113 options.biotype_filter = options.biotype_filter[0]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1114 for sample in cpt:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1115 for feature in cpt[sample]:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1116 biotype = feature[1]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1117 if options.biotype_filter.lower() == biotype.lower():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1118 selected_biotype=biotype
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1119 break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1120 if selected_biotype == None :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1121 print "\nError: biotype '"+options.biotype_filter+"' not found. Please check the biotype name and that this biotype exists in your sample(s)."
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1122 sys.exit()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1123
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1124 #Print a warning message if the UTRs are not specified as 5' or 3' (they will be ploted as 5'UTR)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1125 if 'UTR' in [categ[0] for counts in cpt.values() for categ in counts.keys()]:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1126 print '''\nWARNING: (some) 5'UTR/3'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5'UTR"\n'''
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1127
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1128 #### Make the plot by categories
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1129 #### Recategorizing with the final categories
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1130 final_cats=categs_groups[options.categories_depth-1]
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1131 final_cat_cpt,final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt,cpt_genome,final_cats,selected_biotype)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1132 #### Display the distribution of specified categories (or biotypes) in samples on a barplot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1133 # Remove the 'antisense' category if the library type is 'unstranded'
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1134 for dic in cpt.values():
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1135 if ('antisense','antisense') in dic.keys(): break
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1136 else:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1137 cat_list.remove('antisense')
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1138 make_plot(cat_list,samples_names,final_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold, svg = options.svg, png = options.png)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1139 if selected_biotype :
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1140 make_plot(cat_list,samples_names,filtered_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold,title="Categories distribution for '"+selected_biotype+"' biotype", svg = options.svg, png = options.png)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1141
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1142 #### Make the plot by biotypes
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1143 #### Recategorizing with the final categories
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1144 final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1145 #### Display the distribution of specified categories (or biotypes) in samples on a barplot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1146 make_plot(biotypes,samples_names,final_cat_cpt,final_genome_cpt,pdf, "biotypes",options.threshold, svg = options.svg, png = options.png)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1147
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1148
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1149
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1150 ##### Recategorizing with the final categories
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1151 #final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1152 ##### Display the distribution of specified categories (or biotypes) in samples on a barplot
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1153 #make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \n(biotypes are grouped by 'family')", svg = options.svg, png = options.png)
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1154
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1155
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1156 if options.pdf:
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1157 pdf.close()
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1158 print "\n### Plots saved in pdf file: %s" %options.pdf
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1159
a1e2ab10b317 Uploaded
charles-bernard
parents:
diff changeset
1160 print "\n### End of program"