Mercurial > repos > devteam > short_reads_figure_score
comparison short_reads_figure_score.py @ 0:b52b9c7aabd9 draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 12:35:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b52b9c7aabd9 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 boxplot: | |
4 - box: first quartile and third quartile | |
5 - line inside the box: median | |
6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile | |
7 IQR = third quartile - first quartile | |
8 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line. | |
9 """ | |
10 | |
11 import os, sys, math, tempfile, re | |
12 from rpy import * | |
13 | |
14 assert sys.version_info[:2] >= ( 2, 4 ) | |
15 | |
16 def stop_err( msg ): | |
17 sys.stderr.write( "%s\n" % msg ) | |
18 sys.exit() | |
19 | |
20 def merge_to_20_datapoints( score ): | |
21 number_of_points = 20 | |
22 read_length = len( score ) | |
23 step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) | |
24 scores = [] | |
25 point = 1 | |
26 point_sum = 0 | |
27 step_average = 0 | |
28 score_points = 0 | |
29 | |
30 for i in xrange( 1, read_length ): | |
31 if i < ( point * step ): | |
32 point_sum += int( score[i] ) | |
33 step_average += 1 | |
34 else: | |
35 point_avg = point_sum * 1.0 / step_average | |
36 scores.append( point_avg ) | |
37 point += 1 | |
38 point_sum = 0 | |
39 step_average = 0 | |
40 if step_average > 0: | |
41 point_avg = point_sum * 1.0 / step_average | |
42 scores.append( point_avg ) | |
43 if len( scores ) > number_of_points: | |
44 last_avg = 0 | |
45 for j in xrange( number_of_points - 1, len( scores ) ): | |
46 last_avg += scores[j] | |
47 last_avg = last_avg / ( len(scores) - number_of_points + 1 ) | |
48 else: | |
49 last_avg = scores[-1] | |
50 score_points = [] | |
51 for k in range( number_of_points - 1 ): | |
52 score_points.append( scores[k] ) | |
53 score_points.append( last_avg ) | |
54 return score_points | |
55 | |
56 def __main__(): | |
57 | |
58 invalid_lines = 0 | |
59 | |
60 infile_score_name = sys.argv[1].strip() | |
61 outfile_R_name = sys.argv[2].strip() | |
62 | |
63 infile_name = infile_score_name | |
64 | |
65 # Determine tabular or fasta format within the first 100 lines | |
66 seq_method = None | |
67 data_type = None | |
68 for i, line in enumerate( file( infile_name ) ): | |
69 line = line.rstrip( '\r\n' ) | |
70 if not line or line.startswith( '#' ): | |
71 continue | |
72 if data_type == None: | |
73 if line.startswith( '>' ): | |
74 data_type = 'fasta' | |
75 continue | |
76 elif len( line.split( '\t' ) ) > 0: | |
77 fields = line.split() | |
78 for score in fields: | |
79 try: | |
80 int( score ) | |
81 data_type = 'tabular' | |
82 seq_method = 'solexa' | |
83 break | |
84 except: | |
85 break | |
86 elif data_type == 'fasta': | |
87 fields = line.split() | |
88 for score in fields: | |
89 try: | |
90 int( score ) | |
91 seq_method = '454' | |
92 break | |
93 except: | |
94 break | |
95 if i == 100: | |
96 break | |
97 | |
98 if data_type is None: | |
99 stop_err( 'This tool can only use fasta data or tabular data.' ) | |
100 if seq_method is None: | |
101 stop_err( 'Invalid data for fasta format.') | |
102 | |
103 # Determine fixed length or variable length within the first 100 lines | |
104 read_length = 0 | |
105 variable_length = False | |
106 if seq_method == 'solexa': | |
107 for i, line in enumerate( file( infile_name ) ): | |
108 line = line.rstrip( '\r\n' ) | |
109 if not line or line.startswith( '#' ): | |
110 continue | |
111 scores = line.split('\t') | |
112 if read_length == 0: | |
113 read_length = len( scores ) | |
114 if read_length != len( scores ): | |
115 variable_length = True | |
116 break | |
117 if i == 100: | |
118 break | |
119 elif seq_method == '454': | |
120 score = '' | |
121 for i, line in enumerate( file( infile_name ) ): | |
122 line = line.rstrip( '\r\n' ) | |
123 if not line or line.startswith( '#' ): | |
124 continue | |
125 if line.startswith( '>' ): | |
126 if len( score ) > 0: | |
127 score = score.split() | |
128 if read_length == 0: | |
129 read_length = len( score ) | |
130 if read_length != len( score ): | |
131 variable_length = True | |
132 break | |
133 score = '' | |
134 else: | |
135 score = score + ' ' + line | |
136 if i == 100: | |
137 break | |
138 | |
139 if variable_length: | |
140 number_of_points = 20 | |
141 else: | |
142 number_of_points = read_length | |
143 read_length_threshold = 100 # minimal read length for 454 file | |
144 score_points = [] | |
145 score_matrix = [] | |
146 invalid_scores = 0 | |
147 | |
148 if seq_method == 'solexa': | |
149 for i, line in enumerate( open( infile_name ) ): | |
150 line = line.rstrip( '\r\n' ) | |
151 if not line or line.startswith( '#' ): | |
152 continue | |
153 tmp_array = [] | |
154 scores = line.split( '\t' ) | |
155 for bases in scores: | |
156 nuc_errors = bases.split() | |
157 try: | |
158 nuc_errors[0] = int( nuc_errors[0] ) | |
159 nuc_errors[1] = int( nuc_errors[1] ) | |
160 nuc_errors[2] = int( nuc_errors[2] ) | |
161 nuc_errors[3] = int( nuc_errors[3] ) | |
162 big = max( nuc_errors ) | |
163 except: | |
164 #print 'Invalid numbers in the file. Skipped.' | |
165 invalid_scores += 1 | |
166 big = 0 | |
167 tmp_array.append( big ) | |
168 score_points.append( tmp_array ) | |
169 elif seq_method == '454': | |
170 # skip the last fasta sequence | |
171 score = '' | |
172 for i, line in enumerate( open( infile_name ) ): | |
173 line = line.rstrip( '\r\n' ) | |
174 if not line or line.startswith( '#' ): | |
175 continue | |
176 if line.startswith( '>' ): | |
177 if len( score ) > 0: | |
178 score = ['0'] + score.split() | |
179 read_length = len( score ) | |
180 tmp_array = [] | |
181 if not variable_length: | |
182 score.pop(0) | |
183 score_points.append( score ) | |
184 tmp_array = score | |
185 elif read_length > read_length_threshold: | |
186 score_points_tmp = merge_to_20_datapoints( score ) | |
187 score_points.append( score_points_tmp ) | |
188 tmp_array = score_points_tmp | |
189 score = '' | |
190 else: | |
191 score = "%s %s" % ( score, line ) | |
192 if len( score ) > 0: | |
193 score = ['0'] + score.split() | |
194 read_length = len( score ) | |
195 if not variable_length: | |
196 score.pop(0) | |
197 score_points.append( score ) | |
198 elif read_length > read_length_threshold: | |
199 score_points_tmp = merge_to_20_datapoints( score ) | |
200 score_points.append( score_points_tmp ) | |
201 tmp_array = score_points_tmp | |
202 | |
203 # reverse the matrix, for R | |
204 for i in range( number_of_points - 1 ): | |
205 tmp_array = [] | |
206 for j in range( len( score_points ) ): | |
207 try: | |
208 tmp_array.append( int( score_points[j][i] ) ) | |
209 except: | |
210 invalid_lines += 1 | |
211 score_matrix.append( tmp_array ) | |
212 | |
213 # generate pdf figures | |
214 #outfile_R_pdf = outfile_R_name | |
215 #r.pdf( outfile_R_pdf ) | |
216 outfile_R_png = outfile_R_name | |
217 r.bitmap( outfile_R_png ) | |
218 | |
219 title = "boxplot of quality scores" | |
220 empty_score_matrix_columns = 0 | |
221 for i, subset in enumerate( score_matrix ): | |
222 if not subset: | |
223 empty_score_matrix_columns += 1 | |
224 score_matrix[i] = [0] | |
225 | |
226 if not variable_length: | |
227 r.boxplot( score_matrix, xlab="location in read length", main=title ) | |
228 else: | |
229 r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title ) | |
230 x_old_range = [] | |
231 x_new_range = [] | |
232 step = read_length_threshold / number_of_points | |
233 for i in xrange( 0, read_length_threshold, step ): | |
234 x_old_range.append( ( i / step ) ) | |
235 x_new_range.append( i ) | |
236 r.axis( 1, x_old_range, x_new_range ) | |
237 r.dev_off() | |
238 | |
239 if invalid_scores > 0: | |
240 print 'Skipped %d invalid scores. ' % invalid_scores | |
241 if invalid_lines > 0: | |
242 print 'Skipped %d invalid lines. ' % invalid_lines | |
243 if empty_score_matrix_columns > 0: | |
244 print '%d missing scores in score_matrix. ' % empty_score_matrix_columns | |
245 | |
246 r.quit(save = "no") | |
247 | |
248 if __name__=="__main__":__main__() |