Mercurial > repos > devteam > short_reads_figure_score
comparison short_reads_figure_score.py @ 0:b52b9c7aabd9 draft default tip
Imported from capsule None
| author | devteam |
|---|---|
| date | Mon, 19 May 2014 12:35:00 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b52b9c7aabd9 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 boxplot: | |
| 4 - box: first quartile and third quartile | |
| 5 - line inside the box: median | |
| 6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile | |
| 7 IQR = third quartile - first quartile | |
| 8 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line. | |
| 9 """ | |
| 10 | |
| 11 import os, sys, math, tempfile, re | |
| 12 from rpy import * | |
| 13 | |
| 14 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 15 | |
| 16 def stop_err( msg ): | |
| 17 sys.stderr.write( "%s\n" % msg ) | |
| 18 sys.exit() | |
| 19 | |
| 20 def merge_to_20_datapoints( score ): | |
| 21 number_of_points = 20 | |
| 22 read_length = len( score ) | |
| 23 step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) | |
| 24 scores = [] | |
| 25 point = 1 | |
| 26 point_sum = 0 | |
| 27 step_average = 0 | |
| 28 score_points = 0 | |
| 29 | |
| 30 for i in xrange( 1, read_length ): | |
| 31 if i < ( point * step ): | |
| 32 point_sum += int( score[i] ) | |
| 33 step_average += 1 | |
| 34 else: | |
| 35 point_avg = point_sum * 1.0 / step_average | |
| 36 scores.append( point_avg ) | |
| 37 point += 1 | |
| 38 point_sum = 0 | |
| 39 step_average = 0 | |
| 40 if step_average > 0: | |
| 41 point_avg = point_sum * 1.0 / step_average | |
| 42 scores.append( point_avg ) | |
| 43 if len( scores ) > number_of_points: | |
| 44 last_avg = 0 | |
| 45 for j in xrange( number_of_points - 1, len( scores ) ): | |
| 46 last_avg += scores[j] | |
| 47 last_avg = last_avg / ( len(scores) - number_of_points + 1 ) | |
| 48 else: | |
| 49 last_avg = scores[-1] | |
| 50 score_points = [] | |
| 51 for k in range( number_of_points - 1 ): | |
| 52 score_points.append( scores[k] ) | |
| 53 score_points.append( last_avg ) | |
| 54 return score_points | |
| 55 | |
| 56 def __main__(): | |
| 57 | |
| 58 invalid_lines = 0 | |
| 59 | |
| 60 infile_score_name = sys.argv[1].strip() | |
| 61 outfile_R_name = sys.argv[2].strip() | |
| 62 | |
| 63 infile_name = infile_score_name | |
| 64 | |
| 65 # Determine tabular or fasta format within the first 100 lines | |
| 66 seq_method = None | |
| 67 data_type = None | |
| 68 for i, line in enumerate( file( infile_name ) ): | |
| 69 line = line.rstrip( '\r\n' ) | |
| 70 if not line or line.startswith( '#' ): | |
| 71 continue | |
| 72 if data_type == None: | |
| 73 if line.startswith( '>' ): | |
| 74 data_type = 'fasta' | |
| 75 continue | |
| 76 elif len( line.split( '\t' ) ) > 0: | |
| 77 fields = line.split() | |
| 78 for score in fields: | |
| 79 try: | |
| 80 int( score ) | |
| 81 data_type = 'tabular' | |
| 82 seq_method = 'solexa' | |
| 83 break | |
| 84 except: | |
| 85 break | |
| 86 elif data_type == 'fasta': | |
| 87 fields = line.split() | |
| 88 for score in fields: | |
| 89 try: | |
| 90 int( score ) | |
| 91 seq_method = '454' | |
| 92 break | |
| 93 except: | |
| 94 break | |
| 95 if i == 100: | |
| 96 break | |
| 97 | |
| 98 if data_type is None: | |
| 99 stop_err( 'This tool can only use fasta data or tabular data.' ) | |
| 100 if seq_method is None: | |
| 101 stop_err( 'Invalid data for fasta format.') | |
| 102 | |
| 103 # Determine fixed length or variable length within the first 100 lines | |
| 104 read_length = 0 | |
| 105 variable_length = False | |
| 106 if seq_method == 'solexa': | |
| 107 for i, line in enumerate( file( infile_name ) ): | |
| 108 line = line.rstrip( '\r\n' ) | |
| 109 if not line or line.startswith( '#' ): | |
| 110 continue | |
| 111 scores = line.split('\t') | |
| 112 if read_length == 0: | |
| 113 read_length = len( scores ) | |
| 114 if read_length != len( scores ): | |
| 115 variable_length = True | |
| 116 break | |
| 117 if i == 100: | |
| 118 break | |
| 119 elif seq_method == '454': | |
| 120 score = '' | |
| 121 for i, line in enumerate( file( infile_name ) ): | |
| 122 line = line.rstrip( '\r\n' ) | |
| 123 if not line or line.startswith( '#' ): | |
| 124 continue | |
| 125 if line.startswith( '>' ): | |
| 126 if len( score ) > 0: | |
| 127 score = score.split() | |
| 128 if read_length == 0: | |
| 129 read_length = len( score ) | |
| 130 if read_length != len( score ): | |
| 131 variable_length = True | |
| 132 break | |
| 133 score = '' | |
| 134 else: | |
| 135 score = score + ' ' + line | |
| 136 if i == 100: | |
| 137 break | |
| 138 | |
| 139 if variable_length: | |
| 140 number_of_points = 20 | |
| 141 else: | |
| 142 number_of_points = read_length | |
| 143 read_length_threshold = 100 # minimal read length for 454 file | |
| 144 score_points = [] | |
| 145 score_matrix = [] | |
| 146 invalid_scores = 0 | |
| 147 | |
| 148 if seq_method == 'solexa': | |
| 149 for i, line in enumerate( open( infile_name ) ): | |
| 150 line = line.rstrip( '\r\n' ) | |
| 151 if not line or line.startswith( '#' ): | |
| 152 continue | |
| 153 tmp_array = [] | |
| 154 scores = line.split( '\t' ) | |
| 155 for bases in scores: | |
| 156 nuc_errors = bases.split() | |
| 157 try: | |
| 158 nuc_errors[0] = int( nuc_errors[0] ) | |
| 159 nuc_errors[1] = int( nuc_errors[1] ) | |
| 160 nuc_errors[2] = int( nuc_errors[2] ) | |
| 161 nuc_errors[3] = int( nuc_errors[3] ) | |
| 162 big = max( nuc_errors ) | |
| 163 except: | |
| 164 #print 'Invalid numbers in the file. Skipped.' | |
| 165 invalid_scores += 1 | |
| 166 big = 0 | |
| 167 tmp_array.append( big ) | |
| 168 score_points.append( tmp_array ) | |
| 169 elif seq_method == '454': | |
| 170 # skip the last fasta sequence | |
| 171 score = '' | |
| 172 for i, line in enumerate( open( infile_name ) ): | |
| 173 line = line.rstrip( '\r\n' ) | |
| 174 if not line or line.startswith( '#' ): | |
| 175 continue | |
| 176 if line.startswith( '>' ): | |
| 177 if len( score ) > 0: | |
| 178 score = ['0'] + score.split() | |
| 179 read_length = len( score ) | |
| 180 tmp_array = [] | |
| 181 if not variable_length: | |
| 182 score.pop(0) | |
| 183 score_points.append( score ) | |
| 184 tmp_array = score | |
| 185 elif read_length > read_length_threshold: | |
| 186 score_points_tmp = merge_to_20_datapoints( score ) | |
| 187 score_points.append( score_points_tmp ) | |
| 188 tmp_array = score_points_tmp | |
| 189 score = '' | |
| 190 else: | |
| 191 score = "%s %s" % ( score, line ) | |
| 192 if len( score ) > 0: | |
| 193 score = ['0'] + score.split() | |
| 194 read_length = len( score ) | |
| 195 if not variable_length: | |
| 196 score.pop(0) | |
| 197 score_points.append( score ) | |
| 198 elif read_length > read_length_threshold: | |
| 199 score_points_tmp = merge_to_20_datapoints( score ) | |
| 200 score_points.append( score_points_tmp ) | |
| 201 tmp_array = score_points_tmp | |
| 202 | |
| 203 # reverse the matrix, for R | |
| 204 for i in range( number_of_points - 1 ): | |
| 205 tmp_array = [] | |
| 206 for j in range( len( score_points ) ): | |
| 207 try: | |
| 208 tmp_array.append( int( score_points[j][i] ) ) | |
| 209 except: | |
| 210 invalid_lines += 1 | |
| 211 score_matrix.append( tmp_array ) | |
| 212 | |
| 213 # generate pdf figures | |
| 214 #outfile_R_pdf = outfile_R_name | |
| 215 #r.pdf( outfile_R_pdf ) | |
| 216 outfile_R_png = outfile_R_name | |
| 217 r.bitmap( outfile_R_png ) | |
| 218 | |
| 219 title = "boxplot of quality scores" | |
| 220 empty_score_matrix_columns = 0 | |
| 221 for i, subset in enumerate( score_matrix ): | |
| 222 if not subset: | |
| 223 empty_score_matrix_columns += 1 | |
| 224 score_matrix[i] = [0] | |
| 225 | |
| 226 if not variable_length: | |
| 227 r.boxplot( score_matrix, xlab="location in read length", main=title ) | |
| 228 else: | |
| 229 r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title ) | |
| 230 x_old_range = [] | |
| 231 x_new_range = [] | |
| 232 step = read_length_threshold / number_of_points | |
| 233 for i in xrange( 0, read_length_threshold, step ): | |
| 234 x_old_range.append( ( i / step ) ) | |
| 235 x_new_range.append( i ) | |
| 236 r.axis( 1, x_old_range, x_new_range ) | |
| 237 r.dev_off() | |
| 238 | |
| 239 if invalid_scores > 0: | |
| 240 print 'Skipped %d invalid scores. ' % invalid_scores | |
| 241 if invalid_lines > 0: | |
| 242 print 'Skipped %d invalid lines. ' % invalid_lines | |
| 243 if empty_score_matrix_columns > 0: | |
| 244 print '%d missing scores in score_matrix. ' % empty_score_matrix_columns | |
| 245 | |
| 246 r.quit(save = "no") | |
| 247 | |
| 248 if __name__=="__main__":__main__() |
