comparison short_reads_figure_score.py @ 0:b52b9c7aabd9 draft default tip

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:35:00 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b52b9c7aabd9
1 #!/usr/bin/env python
2 """
3 boxplot:
4 - box: first quartile and third quartile
5 - line inside the box: median
6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
7 IQR = third quartile - first quartile
8 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
9 """
10
11 import os, sys, math, tempfile, re
12 from rpy import *
13
14 assert sys.version_info[:2] >= ( 2, 4 )
15
16 def stop_err( msg ):
17 sys.stderr.write( "%s\n" % msg )
18 sys.exit()
19
20 def merge_to_20_datapoints( score ):
21 number_of_points = 20
22 read_length = len( score )
23 step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )
24 scores = []
25 point = 1
26 point_sum = 0
27 step_average = 0
28 score_points = 0
29
30 for i in xrange( 1, read_length ):
31 if i < ( point * step ):
32 point_sum += int( score[i] )
33 step_average += 1
34 else:
35 point_avg = point_sum * 1.0 / step_average
36 scores.append( point_avg )
37 point += 1
38 point_sum = 0
39 step_average = 0
40 if step_average > 0:
41 point_avg = point_sum * 1.0 / step_average
42 scores.append( point_avg )
43 if len( scores ) > number_of_points:
44 last_avg = 0
45 for j in xrange( number_of_points - 1, len( scores ) ):
46 last_avg += scores[j]
47 last_avg = last_avg / ( len(scores) - number_of_points + 1 )
48 else:
49 last_avg = scores[-1]
50 score_points = []
51 for k in range( number_of_points - 1 ):
52 score_points.append( scores[k] )
53 score_points.append( last_avg )
54 return score_points
55
56 def __main__():
57
58 invalid_lines = 0
59
60 infile_score_name = sys.argv[1].strip()
61 outfile_R_name = sys.argv[2].strip()
62
63 infile_name = infile_score_name
64
65 # Determine tabular or fasta format within the first 100 lines
66 seq_method = None
67 data_type = None
68 for i, line in enumerate( file( infile_name ) ):
69 line = line.rstrip( '\r\n' )
70 if not line or line.startswith( '#' ):
71 continue
72 if data_type == None:
73 if line.startswith( '>' ):
74 data_type = 'fasta'
75 continue
76 elif len( line.split( '\t' ) ) > 0:
77 fields = line.split()
78 for score in fields:
79 try:
80 int( score )
81 data_type = 'tabular'
82 seq_method = 'solexa'
83 break
84 except:
85 break
86 elif data_type == 'fasta':
87 fields = line.split()
88 for score in fields:
89 try:
90 int( score )
91 seq_method = '454'
92 break
93 except:
94 break
95 if i == 100:
96 break
97
98 if data_type is None:
99 stop_err( 'This tool can only use fasta data or tabular data.' )
100 if seq_method is None:
101 stop_err( 'Invalid data for fasta format.')
102
103 # Determine fixed length or variable length within the first 100 lines
104 read_length = 0
105 variable_length = False
106 if seq_method == 'solexa':
107 for i, line in enumerate( file( infile_name ) ):
108 line = line.rstrip( '\r\n' )
109 if not line or line.startswith( '#' ):
110 continue
111 scores = line.split('\t')
112 if read_length == 0:
113 read_length = len( scores )
114 if read_length != len( scores ):
115 variable_length = True
116 break
117 if i == 100:
118 break
119 elif seq_method == '454':
120 score = ''
121 for i, line in enumerate( file( infile_name ) ):
122 line = line.rstrip( '\r\n' )
123 if not line or line.startswith( '#' ):
124 continue
125 if line.startswith( '>' ):
126 if len( score ) > 0:
127 score = score.split()
128 if read_length == 0:
129 read_length = len( score )
130 if read_length != len( score ):
131 variable_length = True
132 break
133 score = ''
134 else:
135 score = score + ' ' + line
136 if i == 100:
137 break
138
139 if variable_length:
140 number_of_points = 20
141 else:
142 number_of_points = read_length
143 read_length_threshold = 100 # minimal read length for 454 file
144 score_points = []
145 score_matrix = []
146 invalid_scores = 0
147
148 if seq_method == 'solexa':
149 for i, line in enumerate( open( infile_name ) ):
150 line = line.rstrip( '\r\n' )
151 if not line or line.startswith( '#' ):
152 continue
153 tmp_array = []
154 scores = line.split( '\t' )
155 for bases in scores:
156 nuc_errors = bases.split()
157 try:
158 nuc_errors[0] = int( nuc_errors[0] )
159 nuc_errors[1] = int( nuc_errors[1] )
160 nuc_errors[2] = int( nuc_errors[2] )
161 nuc_errors[3] = int( nuc_errors[3] )
162 big = max( nuc_errors )
163 except:
164 #print 'Invalid numbers in the file. Skipped.'
165 invalid_scores += 1
166 big = 0
167 tmp_array.append( big )
168 score_points.append( tmp_array )
169 elif seq_method == '454':
170 # skip the last fasta sequence
171 score = ''
172 for i, line in enumerate( open( infile_name ) ):
173 line = line.rstrip( '\r\n' )
174 if not line or line.startswith( '#' ):
175 continue
176 if line.startswith( '>' ):
177 if len( score ) > 0:
178 score = ['0'] + score.split()
179 read_length = len( score )
180 tmp_array = []
181 if not variable_length:
182 score.pop(0)
183 score_points.append( score )
184 tmp_array = score
185 elif read_length > read_length_threshold:
186 score_points_tmp = merge_to_20_datapoints( score )
187 score_points.append( score_points_tmp )
188 tmp_array = score_points_tmp
189 score = ''
190 else:
191 score = "%s %s" % ( score, line )
192 if len( score ) > 0:
193 score = ['0'] + score.split()
194 read_length = len( score )
195 if not variable_length:
196 score.pop(0)
197 score_points.append( score )
198 elif read_length > read_length_threshold:
199 score_points_tmp = merge_to_20_datapoints( score )
200 score_points.append( score_points_tmp )
201 tmp_array = score_points_tmp
202
203 # reverse the matrix, for R
204 for i in range( number_of_points - 1 ):
205 tmp_array = []
206 for j in range( len( score_points ) ):
207 try:
208 tmp_array.append( int( score_points[j][i] ) )
209 except:
210 invalid_lines += 1
211 score_matrix.append( tmp_array )
212
213 # generate pdf figures
214 #outfile_R_pdf = outfile_R_name
215 #r.pdf( outfile_R_pdf )
216 outfile_R_png = outfile_R_name
217 r.bitmap( outfile_R_png )
218
219 title = "boxplot of quality scores"
220 empty_score_matrix_columns = 0
221 for i, subset in enumerate( score_matrix ):
222 if not subset:
223 empty_score_matrix_columns += 1
224 score_matrix[i] = [0]
225
226 if not variable_length:
227 r.boxplot( score_matrix, xlab="location in read length", main=title )
228 else:
229 r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
230 x_old_range = []
231 x_new_range = []
232 step = read_length_threshold / number_of_points
233 for i in xrange( 0, read_length_threshold, step ):
234 x_old_range.append( ( i / step ) )
235 x_new_range.append( i )
236 r.axis( 1, x_old_range, x_new_range )
237 r.dev_off()
238
239 if invalid_scores > 0:
240 print 'Skipped %d invalid scores. ' % invalid_scores
241 if invalid_lines > 0:
242 print 'Skipped %d invalid lines. ' % invalid_lines
243 if empty_score_matrix_columns > 0:
244 print '%d missing scores in score_matrix. ' % empty_score_matrix_columns
245
246 r.quit(save = "no")
247
248 if __name__=="__main__":__main__()