0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 boxplot:
|
|
4 - box: first quartile and third quartile
|
|
5 - line inside the box: median
|
|
6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
|
|
7 IQR = third quartile - first quartile
|
|
8 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
|
|
9 """
|
|
10
|
|
11 import os, sys, math, tempfile, re
|
|
12 from rpy import *
|
|
13
|
|
14 assert sys.version_info[:2] >= ( 2, 4 )
|
|
15
|
|
16 def stop_err( msg ):
|
|
17 sys.stderr.write( "%s\n" % msg )
|
|
18 sys.exit()
|
|
19
|
|
20 def merge_to_20_datapoints( score ):
|
|
21 number_of_points = 20
|
|
22 read_length = len( score )
|
|
23 step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )
|
|
24 scores = []
|
|
25 point = 1
|
|
26 point_sum = 0
|
|
27 step_average = 0
|
|
28 score_points = 0
|
|
29
|
|
30 for i in xrange( 1, read_length ):
|
|
31 if i < ( point * step ):
|
|
32 point_sum += int( score[i] )
|
|
33 step_average += 1
|
|
34 else:
|
|
35 point_avg = point_sum * 1.0 / step_average
|
|
36 scores.append( point_avg )
|
|
37 point += 1
|
|
38 point_sum = 0
|
|
39 step_average = 0
|
|
40 if step_average > 0:
|
|
41 point_avg = point_sum * 1.0 / step_average
|
|
42 scores.append( point_avg )
|
|
43 if len( scores ) > number_of_points:
|
|
44 last_avg = 0
|
|
45 for j in xrange( number_of_points - 1, len( scores ) ):
|
|
46 last_avg += scores[j]
|
|
47 last_avg = last_avg / ( len(scores) - number_of_points + 1 )
|
|
48 else:
|
|
49 last_avg = scores[-1]
|
|
50 score_points = []
|
|
51 for k in range( number_of_points - 1 ):
|
|
52 score_points.append( scores[k] )
|
|
53 score_points.append( last_avg )
|
|
54 return score_points
|
|
55
|
|
56 def __main__():
|
|
57
|
|
58 invalid_lines = 0
|
|
59
|
|
60 infile_score_name = sys.argv[1].strip()
|
|
61 outfile_R_name = sys.argv[2].strip()
|
|
62
|
|
63 infile_name = infile_score_name
|
|
64
|
|
65 # Determine tabular or fasta format within the first 100 lines
|
|
66 seq_method = None
|
|
67 data_type = None
|
|
68 for i, line in enumerate( file( infile_name ) ):
|
|
69 line = line.rstrip( '\r\n' )
|
|
70 if not line or line.startswith( '#' ):
|
|
71 continue
|
|
72 if data_type == None:
|
|
73 if line.startswith( '>' ):
|
|
74 data_type = 'fasta'
|
|
75 continue
|
|
76 elif len( line.split( '\t' ) ) > 0:
|
|
77 fields = line.split()
|
|
78 for score in fields:
|
|
79 try:
|
|
80 int( score )
|
|
81 data_type = 'tabular'
|
|
82 seq_method = 'solexa'
|
|
83 break
|
|
84 except:
|
|
85 break
|
|
86 elif data_type == 'fasta':
|
|
87 fields = line.split()
|
|
88 for score in fields:
|
|
89 try:
|
|
90 int( score )
|
|
91 seq_method = '454'
|
|
92 break
|
|
93 except:
|
|
94 break
|
|
95 if i == 100:
|
|
96 break
|
|
97
|
|
98 if data_type is None:
|
|
99 stop_err( 'This tool can only use fasta data or tabular data.' )
|
|
100 if seq_method is None:
|
|
101 stop_err( 'Invalid data for fasta format.')
|
|
102
|
|
103 # Determine fixed length or variable length within the first 100 lines
|
|
104 read_length = 0
|
|
105 variable_length = False
|
|
106 if seq_method == 'solexa':
|
|
107 for i, line in enumerate( file( infile_name ) ):
|
|
108 line = line.rstrip( '\r\n' )
|
|
109 if not line or line.startswith( '#' ):
|
|
110 continue
|
|
111 scores = line.split('\t')
|
|
112 if read_length == 0:
|
|
113 read_length = len( scores )
|
|
114 if read_length != len( scores ):
|
|
115 variable_length = True
|
|
116 break
|
|
117 if i == 100:
|
|
118 break
|
|
119 elif seq_method == '454':
|
|
120 score = ''
|
|
121 for i, line in enumerate( file( infile_name ) ):
|
|
122 line = line.rstrip( '\r\n' )
|
|
123 if not line or line.startswith( '#' ):
|
|
124 continue
|
|
125 if line.startswith( '>' ):
|
|
126 if len( score ) > 0:
|
|
127 score = score.split()
|
|
128 if read_length == 0:
|
|
129 read_length = len( score )
|
|
130 if read_length != len( score ):
|
|
131 variable_length = True
|
|
132 break
|
|
133 score = ''
|
|
134 else:
|
|
135 score = score + ' ' + line
|
|
136 if i == 100:
|
|
137 break
|
|
138
|
|
139 if variable_length:
|
|
140 number_of_points = 20
|
|
141 else:
|
|
142 number_of_points = read_length
|
|
143 read_length_threshold = 100 # minimal read length for 454 file
|
|
144 score_points = []
|
|
145 score_matrix = []
|
|
146 invalid_scores = 0
|
|
147
|
|
148 if seq_method == 'solexa':
|
|
149 for i, line in enumerate( open( infile_name ) ):
|
|
150 line = line.rstrip( '\r\n' )
|
|
151 if not line or line.startswith( '#' ):
|
|
152 continue
|
|
153 tmp_array = []
|
|
154 scores = line.split( '\t' )
|
|
155 for bases in scores:
|
|
156 nuc_errors = bases.split()
|
|
157 try:
|
|
158 nuc_errors[0] = int( nuc_errors[0] )
|
|
159 nuc_errors[1] = int( nuc_errors[1] )
|
|
160 nuc_errors[2] = int( nuc_errors[2] )
|
|
161 nuc_errors[3] = int( nuc_errors[3] )
|
|
162 big = max( nuc_errors )
|
|
163 except:
|
|
164 #print 'Invalid numbers in the file. Skipped.'
|
|
165 invalid_scores += 1
|
|
166 big = 0
|
|
167 tmp_array.append( big )
|
|
168 score_points.append( tmp_array )
|
|
169 elif seq_method == '454':
|
|
170 # skip the last fasta sequence
|
|
171 score = ''
|
|
172 for i, line in enumerate( open( infile_name ) ):
|
|
173 line = line.rstrip( '\r\n' )
|
|
174 if not line or line.startswith( '#' ):
|
|
175 continue
|
|
176 if line.startswith( '>' ):
|
|
177 if len( score ) > 0:
|
|
178 score = ['0'] + score.split()
|
|
179 read_length = len( score )
|
|
180 tmp_array = []
|
|
181 if not variable_length:
|
|
182 score.pop(0)
|
|
183 score_points.append( score )
|
|
184 tmp_array = score
|
|
185 elif read_length > read_length_threshold:
|
|
186 score_points_tmp = merge_to_20_datapoints( score )
|
|
187 score_points.append( score_points_tmp )
|
|
188 tmp_array = score_points_tmp
|
|
189 score = ''
|
|
190 else:
|
|
191 score = "%s %s" % ( score, line )
|
|
192 if len( score ) > 0:
|
|
193 score = ['0'] + score.split()
|
|
194 read_length = len( score )
|
|
195 if not variable_length:
|
|
196 score.pop(0)
|
|
197 score_points.append( score )
|
|
198 elif read_length > read_length_threshold:
|
|
199 score_points_tmp = merge_to_20_datapoints( score )
|
|
200 score_points.append( score_points_tmp )
|
|
201 tmp_array = score_points_tmp
|
|
202
|
|
203 # reverse the matrix, for R
|
|
204 for i in range( number_of_points - 1 ):
|
|
205 tmp_array = []
|
|
206 for j in range( len( score_points ) ):
|
|
207 try:
|
|
208 tmp_array.append( int( score_points[j][i] ) )
|
|
209 except:
|
|
210 invalid_lines += 1
|
|
211 score_matrix.append( tmp_array )
|
|
212
|
|
213 # generate pdf figures
|
|
214 #outfile_R_pdf = outfile_R_name
|
|
215 #r.pdf( outfile_R_pdf )
|
|
216 outfile_R_png = outfile_R_name
|
|
217 r.bitmap( outfile_R_png )
|
|
218
|
|
219 title = "boxplot of quality scores"
|
|
220 empty_score_matrix_columns = 0
|
|
221 for i, subset in enumerate( score_matrix ):
|
|
222 if not subset:
|
|
223 empty_score_matrix_columns += 1
|
|
224 score_matrix[i] = [0]
|
|
225
|
|
226 if not variable_length:
|
|
227 r.boxplot( score_matrix, xlab="location in read length", main=title )
|
|
228 else:
|
|
229 r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
|
|
230 x_old_range = []
|
|
231 x_new_range = []
|
|
232 step = read_length_threshold / number_of_points
|
|
233 for i in xrange( 0, read_length_threshold, step ):
|
|
234 x_old_range.append( ( i / step ) )
|
|
235 x_new_range.append( i )
|
|
236 r.axis( 1, x_old_range, x_new_range )
|
|
237 r.dev_off()
|
|
238
|
|
239 if invalid_scores > 0:
|
|
240 print 'Skipped %d invalid scores. ' % invalid_scores
|
|
241 if invalid_lines > 0:
|
|
242 print 'Skipped %d invalid lines. ' % invalid_lines
|
|
243 if empty_score_matrix_columns > 0:
|
|
244 print '%d missing scores in score_matrix. ' % empty_score_matrix_columns
|
|
245
|
|
246 r.quit(save = "no")
|
|
247
|
|
248 if __name__=="__main__":__main__()
|