Mercurial > repos > devteam > short_reads_figure_score
diff short_reads_figure_score.py @ 0:b52b9c7aabd9 draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 12:35:00 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/short_reads_figure_score.py Mon May 19 12:35:00 2014 -0400 @@ -0,0 +1,248 @@ +#!/usr/bin/env python +""" +boxplot: +- box: first quartile and third quartile +- line inside the box: median +- outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile + IQR = third quartile - first quartile +- The smallest/largest value that is not an outlier is connected to the box by with a horizontal line. +""" + +import os, sys, math, tempfile, re +from rpy import * + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def merge_to_20_datapoints( score ): + number_of_points = 20 + read_length = len( score ) + step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) + scores = [] + point = 1 + point_sum = 0 + step_average = 0 + score_points = 0 + + for i in xrange( 1, read_length ): + if i < ( point * step ): + point_sum += int( score[i] ) + step_average += 1 + else: + point_avg = point_sum * 1.0 / step_average + scores.append( point_avg ) + point += 1 + point_sum = 0 + step_average = 0 + if step_average > 0: + point_avg = point_sum * 1.0 / step_average + scores.append( point_avg ) + if len( scores ) > number_of_points: + last_avg = 0 + for j in xrange( number_of_points - 1, len( scores ) ): + last_avg += scores[j] + last_avg = last_avg / ( len(scores) - number_of_points + 1 ) + else: + last_avg = scores[-1] + score_points = [] + for k in range( number_of_points - 1 ): + score_points.append( scores[k] ) + score_points.append( last_avg ) + return score_points + +def __main__(): + + invalid_lines = 0 + + infile_score_name = sys.argv[1].strip() + outfile_R_name = sys.argv[2].strip() + + infile_name = infile_score_name + + # Determine tabular or fasta format within the first 100 lines + seq_method = None + data_type = None + for i, line in enumerate( file( infile_name ) ): + line = line.rstrip( '\r\n' ) + if not line or line.startswith( '#' ): + continue + if data_type == None: + if line.startswith( '>' ): + data_type = 'fasta' + continue + elif len( line.split( '\t' ) ) > 0: + fields = line.split() + for score in fields: + try: + int( score ) + data_type = 'tabular' + seq_method = 'solexa' + break + except: + break + elif data_type == 'fasta': + fields = line.split() + for score in fields: + try: + int( score ) + seq_method = '454' + break + except: + break + if i == 100: + break + + if data_type is None: + stop_err( 'This tool can only use fasta data or tabular data.' ) + if seq_method is None: + stop_err( 'Invalid data for fasta format.') + + # Determine fixed length or variable length within the first 100 lines + read_length = 0 + variable_length = False + if seq_method == 'solexa': + for i, line in enumerate( file( infile_name ) ): + line = line.rstrip( '\r\n' ) + if not line or line.startswith( '#' ): + continue + scores = line.split('\t') + if read_length == 0: + read_length = len( scores ) + if read_length != len( scores ): + variable_length = True + break + if i == 100: + break + elif seq_method == '454': + score = '' + for i, line in enumerate( file( infile_name ) ): + line = line.rstrip( '\r\n' ) + if not line or line.startswith( '#' ): + continue + if line.startswith( '>' ): + if len( score ) > 0: + score = score.split() + if read_length == 0: + read_length = len( score ) + if read_length != len( score ): + variable_length = True + break + score = '' + else: + score = score + ' ' + line + if i == 100: + break + + if variable_length: + number_of_points = 20 + else: + number_of_points = read_length + read_length_threshold = 100 # minimal read length for 454 file + score_points = [] + score_matrix = [] + invalid_scores = 0 + + if seq_method == 'solexa': + for i, line in enumerate( open( infile_name ) ): + line = line.rstrip( '\r\n' ) + if not line or line.startswith( '#' ): + continue + tmp_array = [] + scores = line.split( '\t' ) + for bases in scores: + nuc_errors = bases.split() + try: + nuc_errors[0] = int( nuc_errors[0] ) + nuc_errors[1] = int( nuc_errors[1] ) + nuc_errors[2] = int( nuc_errors[2] ) + nuc_errors[3] = int( nuc_errors[3] ) + big = max( nuc_errors ) + except: + #print 'Invalid numbers in the file. Skipped.' + invalid_scores += 1 + big = 0 + tmp_array.append( big ) + score_points.append( tmp_array ) + elif seq_method == '454': + # skip the last fasta sequence + score = '' + for i, line in enumerate( open( infile_name ) ): + line = line.rstrip( '\r\n' ) + if not line or line.startswith( '#' ): + continue + if line.startswith( '>' ): + if len( score ) > 0: + score = ['0'] + score.split() + read_length = len( score ) + tmp_array = [] + if not variable_length: + score.pop(0) + score_points.append( score ) + tmp_array = score + elif read_length > read_length_threshold: + score_points_tmp = merge_to_20_datapoints( score ) + score_points.append( score_points_tmp ) + tmp_array = score_points_tmp + score = '' + else: + score = "%s %s" % ( score, line ) + if len( score ) > 0: + score = ['0'] + score.split() + read_length = len( score ) + if not variable_length: + score.pop(0) + score_points.append( score ) + elif read_length > read_length_threshold: + score_points_tmp = merge_to_20_datapoints( score ) + score_points.append( score_points_tmp ) + tmp_array = score_points_tmp + + # reverse the matrix, for R + for i in range( number_of_points - 1 ): + tmp_array = [] + for j in range( len( score_points ) ): + try: + tmp_array.append( int( score_points[j][i] ) ) + except: + invalid_lines += 1 + score_matrix.append( tmp_array ) + + # generate pdf figures + #outfile_R_pdf = outfile_R_name + #r.pdf( outfile_R_pdf ) + outfile_R_png = outfile_R_name + r.bitmap( outfile_R_png ) + + title = "boxplot of quality scores" + empty_score_matrix_columns = 0 + for i, subset in enumerate( score_matrix ): + if not subset: + empty_score_matrix_columns += 1 + score_matrix[i] = [0] + + if not variable_length: + r.boxplot( score_matrix, xlab="location in read length", main=title ) + else: + r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title ) + x_old_range = [] + x_new_range = [] + step = read_length_threshold / number_of_points + for i in xrange( 0, read_length_threshold, step ): + x_old_range.append( ( i / step ) ) + x_new_range.append( i ) + r.axis( 1, x_old_range, x_new_range ) + r.dev_off() + + if invalid_scores > 0: + print 'Skipped %d invalid scores. ' % invalid_scores + if invalid_lines > 0: + print 'Skipped %d invalid lines. ' % invalid_lines + if empty_score_matrix_columns > 0: + print '%d missing scores in score_matrix. ' % empty_score_matrix_columns + + r.quit(save = "no") + +if __name__=="__main__":__main__()