diff short_reads_figure_high_quality_length.py @ 0:556ceed24699 draft

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:34:37 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/short_reads_figure_high_quality_length.py	Mon May 19 12:34:37 2014 -0400
@@ -0,0 +1,165 @@
+#!/usr/bin/env python
+
+import os, sys, math, tempfile, zipfile, re
+from rpy import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def unzip( filename ):
+    zip_file = zipfile.ZipFile( filename, 'r' )
+    tmpfilename = tempfile.NamedTemporaryFile().name
+    for name in zip_file.namelist():
+        file( tmpfilename, 'a' ).write( zip_file.read( name ) )
+    zip_file.close()
+    return tmpfilename
+
+def __main__():
+    infile_score_name = sys.argv[1].strip()
+    outfile_R_name = sys.argv[2].strip()
+    
+    try:
+        score_threshold = int( sys.argv[3].strip() )
+    except:
+        stop_err( 'Threshold for quality score must be numerical.' )
+
+    infile_is_zipped = False
+    if zipfile.is_zipfile( infile_score_name ):
+        infile_is_zipped = True
+        infile_name = unzip( infile_score_name )
+    else:
+        infile_name = infile_score_name
+
+    # detect whether it's tabular or fasta format
+    seq_method = None
+    data_type = None
+    for i, line in enumerate( file( infile_name ) ):
+        line = line.rstrip( '\r\n' )
+        if not line or line.startswith( '#' ):
+            continue
+        if data_type == None:
+            if line.startswith( '>' ):
+                data_type = 'fasta'
+                continue
+            elif len( line.split( '\t' ) ) > 0:
+                fields = line.split()
+                for score in fields:
+                    try:
+                        int( score )
+                        data_type = 'tabular'
+                        seq_method = 'solexa'
+                        break
+                    except:
+                        break
+        elif data_type == 'fasta':
+            fields = line.split()
+            for score in fields:
+                try: 
+                    int( score )
+                    seq_method = '454'
+                    break
+                except:
+                    break
+        if i == 100:
+            break
+
+    if data_type is None:
+        stop_err( 'This tool can only use fasta data or tabular data.' ) 
+    if seq_method is None:
+        stop_err( 'Invalid data for fasta format.')
+ 
+    cont_high_quality = []
+    invalid_lines = 0
+    invalid_scores = 0                       
+    if seq_method == 'solexa':
+        for i, line in enumerate( open( infile_name ) ):
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( '#' ):
+                continue
+            locs = line.split( '\t' )
+            for j, base in enumerate( locs ):
+                nuc_errors = base.split()
+                try:
+                    nuc_errors[0] = int( nuc_errors[0] )
+                    nuc_errors[1] = int( nuc_errors[1] )
+                    nuc_errors[2] = int( nuc_errors[2] )
+                    nuc_errors[3] = int( nuc_errors[3] )
+                    big = max( nuc_errors )
+                except:
+                    invalid_scores += 1
+                    big = 0
+                if j == 0:
+                    cont_high_quality.append(1)
+                else:
+                    if big >= score_threshold:
+                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
+                    else:
+                        cont_high_quality.append(1)
+    else: # seq_method == '454'
+        tmp_score = ''
+        for i, line in enumerate( open( infile_name ) ):
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( '#' ):
+                continue
+            if line.startswith( '>' ):
+                if len( tmp_score ) > 0:
+                    locs = tmp_score.split()
+                    for j, base in enumerate( locs ):
+                        try:
+                            base = int( base )
+                        except:
+                            invalid_scores += 1
+                            base = 0
+                        if j == 0:
+                            cont_high_quality.append(1)
+                        else:
+                            if base >= score_threshold:
+                                cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
+                            else:
+                                cont_high_quality.append(1)
+                tmp_score = ''
+            else:
+                tmp_score = "%s %s" % ( tmp_score, line )
+        if len( tmp_score ) > 0:
+            locs = tmp_score.split()
+            for j, base in enumerate( locs ):
+                try:
+                    base = int( base )
+                except:
+                    invalid_scores += 1
+                    base = 0
+                if j == 0:
+                    cont_high_quality.append(1)
+                else:
+                    if base >= score_threshold:
+                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
+                    else:
+                        cont_high_quality.append(1)
+
+    # generate pdf figures
+    cont_high_quality = array ( cont_high_quality )
+    outfile_R_pdf = outfile_R_name 
+    r.pdf( outfile_R_pdf )
+    title = "Histogram of continuous high quality scores"
+    xlim_range = [ 1, max( cont_high_quality ) ]
+    nclass = max( cont_high_quality )
+    if nclass > 100:
+        nclass = 100
+    r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
+    r.dev_off()    
+
+    if infile_is_zipped and os.path.exists( infile_name ):
+        # Need to delete temporary file created when we unzipped the infile archive
+        os.remove( infile_name )
+
+    if invalid_lines > 0: 
+        print 'Skipped %d invalid lines. ' % invalid_lines
+    if invalid_scores > 0:
+        print 'Skipped %d invalid scores. ' % invalid_scores
+
+    r.quit( save="no" )
+
+if __name__=="__main__":__main__()
\ No newline at end of file