diff short_reads_figure_score.py @ 0:b52b9c7aabd9 draft default tip

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:35:00 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/short_reads_figure_score.py	Mon May 19 12:35:00 2014 -0400
@@ -0,0 +1,248 @@
+#!/usr/bin/env python
+"""
+boxplot:
+- box: first quartile and third quartile
+- line inside the box: median
+- outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
+           IQR = third quartile - first quartile
+- The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
+"""
+
+import os, sys, math, tempfile, re
+from rpy import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def merge_to_20_datapoints( score ):
+    number_of_points = 20
+    read_length = len( score )
+    step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )
+    scores = []
+    point = 1
+    point_sum = 0
+    step_average = 0
+    score_points = 0
+    
+    for i in xrange( 1, read_length ):
+        if i < ( point * step ):
+            point_sum += int( score[i] )
+            step_average += 1
+        else:
+            point_avg = point_sum * 1.0 / step_average
+            scores.append( point_avg )
+            point += 1
+            point_sum = 0
+            step_average = 0                       
+    if step_average > 0:
+        point_avg = point_sum * 1.0 / step_average
+        scores.append( point_avg )
+    if len( scores ) > number_of_points:
+        last_avg = 0
+        for j in xrange( number_of_points - 1, len( scores ) ):
+            last_avg += scores[j]
+        last_avg = last_avg / ( len(scores) - number_of_points + 1 )
+    else:    
+        last_avg = scores[-1]
+    score_points = []
+    for k in range( number_of_points - 1 ):
+        score_points.append( scores[k] )
+    score_points.append( last_avg )
+    return score_points
+
+def __main__():
+
+    invalid_lines = 0
+
+    infile_score_name = sys.argv[1].strip()
+    outfile_R_name = sys.argv[2].strip()
+
+    infile_name = infile_score_name
+
+    # Determine tabular or fasta format within the first 100 lines
+    seq_method = None
+    data_type = None
+    for i, line in enumerate( file( infile_name ) ):
+        line = line.rstrip( '\r\n' )
+        if not line or line.startswith( '#' ):
+            continue
+        if data_type == None:
+            if line.startswith( '>' ):
+                data_type = 'fasta'
+                continue
+            elif len( line.split( '\t' ) ) > 0:
+                fields = line.split()
+                for score in fields:
+                    try:
+                        int( score )
+                        data_type = 'tabular'
+                        seq_method = 'solexa'
+                        break
+                    except:
+                        break
+        elif data_type == 'fasta':
+            fields = line.split()
+            for score in fields:
+                try: 
+                    int( score )
+                    seq_method = '454'
+                    break
+                except:
+                    break
+        if i == 100:
+            break
+
+    if data_type is None:
+        stop_err( 'This tool can only use fasta data or tabular data.' ) 
+    if seq_method is None:
+        stop_err( 'Invalid data for fasta format.')
+
+    # Determine fixed length or variable length within the first 100 lines
+    read_length = 0
+    variable_length = False
+    if seq_method == 'solexa':
+        for i, line in enumerate( file( infile_name ) ):
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( '#' ):
+                continue
+            scores = line.split('\t')
+            if read_length == 0:
+                read_length = len( scores )
+            if read_length != len( scores ):
+                variable_length = True
+                break
+            if i == 100:
+                break
+    elif seq_method == '454':
+        score = ''
+        for i, line in enumerate( file( infile_name ) ):
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( '#' ):
+                continue
+            if line.startswith( '>' ):
+                if len( score ) > 0:
+                    score = score.split()
+                    if read_length == 0:
+                        read_length = len( score )
+                    if read_length != len( score ):
+                        variable_length = True
+                        break
+                score = ''
+            else:
+                score = score + ' ' + line
+            if i == 100:
+                break
+
+    if variable_length:
+        number_of_points = 20
+    else:
+        number_of_points = read_length
+    read_length_threshold = 100 # minimal read length for 454 file
+    score_points = []   
+    score_matrix = []
+    invalid_scores = 0   
+
+    if seq_method == 'solexa':
+        for i, line in enumerate( open( infile_name ) ):
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( '#' ):
+                continue
+            tmp_array = []
+            scores = line.split( '\t' )
+            for bases in scores:
+                nuc_errors = bases.split()
+                try:
+                    nuc_errors[0] = int( nuc_errors[0] )
+                    nuc_errors[1] = int( nuc_errors[1] )
+                    nuc_errors[2] = int( nuc_errors[2] )
+                    nuc_errors[3] = int( nuc_errors[3] )
+                    big = max( nuc_errors )
+                except:
+                    #print 'Invalid numbers in the file. Skipped.'
+                    invalid_scores += 1
+                    big = 0
+                tmp_array.append( big )                        
+            score_points.append( tmp_array )
+    elif seq_method == '454':
+        # skip the last fasta sequence
+        score = ''
+        for i, line in enumerate( open( infile_name ) ):
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( '#' ):
+                continue
+            if line.startswith( '>' ):
+                if len( score ) > 0:
+                    score = ['0'] + score.split()
+                    read_length = len( score )
+                    tmp_array = []
+                    if not variable_length:
+                        score.pop(0)
+                        score_points.append( score )
+                        tmp_array = score
+                    elif read_length > read_length_threshold:
+                        score_points_tmp = merge_to_20_datapoints( score )
+                        score_points.append( score_points_tmp )
+                        tmp_array = score_points_tmp
+                score = ''
+            else:
+                score = "%s %s" % ( score, line )
+        if len( score ) > 0:
+            score = ['0'] + score.split()
+            read_length = len( score )
+            if not variable_length:
+                score.pop(0)
+                score_points.append( score )
+            elif read_length > read_length_threshold:
+                score_points_tmp = merge_to_20_datapoints( score )
+                score_points.append( score_points_tmp )
+                tmp_array = score_points_tmp
+
+    # reverse the matrix, for R
+    for i in range( number_of_points - 1 ):
+        tmp_array = []
+        for j in range( len( score_points ) ):
+            try:
+                tmp_array.append( int( score_points[j][i] ) )
+            except:
+                invalid_lines += 1
+        score_matrix.append( tmp_array )
+
+    # generate pdf figures
+    #outfile_R_pdf = outfile_R_name 
+    #r.pdf( outfile_R_pdf )
+    outfile_R_png = outfile_R_name
+    r.bitmap( outfile_R_png )
+    
+    title = "boxplot of quality scores"
+    empty_score_matrix_columns = 0
+    for i, subset in enumerate( score_matrix ):
+        if not subset:
+            empty_score_matrix_columns += 1
+            score_matrix[i] = [0]
+            
+    if not variable_length:
+        r.boxplot( score_matrix, xlab="location in read length", main=title )
+    else:
+        r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
+        x_old_range = []
+        x_new_range = []
+        step = read_length_threshold / number_of_points 
+        for i in xrange( 0, read_length_threshold, step ):
+            x_old_range.append( ( i / step ) )
+            x_new_range.append( i )
+        r.axis( 1, x_old_range, x_new_range )
+    r.dev_off()
+
+    if invalid_scores > 0:
+        print 'Skipped %d invalid scores. ' % invalid_scores
+    if invalid_lines > 0:
+        print 'Skipped %d invalid lines. ' % invalid_lines
+    if empty_score_matrix_columns > 0:
+        print '%d missing scores in score_matrix. ' % empty_score_matrix_columns
+
+    r.quit(save = "no")
+
+if __name__=="__main__":__main__()