short_reads_figure_high_quality_length: short_reads_figure_high_quality

comparison short_reads_figure_high_quality_length.py @ 0:556ceed24699 draft

Imported from capsule None

author	devteam
date	Mon, 19 May 2014 12:34:37 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:556ceed24699
+#!/usr/bin/env python
+import os, sys, math, tempfile, zipfile, re
+from rpy import *
+assert sys.version_info[:2] >= ( 2, 4 )
+def stop_err( msg ):
+sys.stderr.write( "%s\n" % msg )
+sys.exit()
+def unzip( filename ):
+zip_file = zipfile.ZipFile( filename, 'r' )
+tmpfilename = tempfile.NamedTemporaryFile().name
+for name in zip_file.namelist():
+file( tmpfilename, 'a' ).write( zip_file.read( name ) )
+zip_file.close()
+return tmpfilename
+def __main__():
+infile_score_name = sys.argv[1].strip()
+outfile_R_name = sys.argv[2].strip()
+try:
+score_threshold = int( sys.argv[3].strip() )
+except:
+stop_err( 'Threshold for quality score must be numerical.' )
+infile_is_zipped = False
+if zipfile.is_zipfile( infile_score_name ):
+infile_is_zipped = True
+infile_name = unzip( infile_score_name )
+else:
+infile_name = infile_score_name
+# detect whether it's tabular or fasta format
+seq_method = None
+data_type = None
+for i, line in enumerate( file( infile_name ) ):
+line = line.rstrip( '\r\n' )
+if not line or line.startswith( '#' ):
+continue
+if data_type == None:
+if line.startswith( '>' ):
+data_type = 'fasta'
+continue
+elif len( line.split( '\t' ) ) > 0:
+fields = line.split()
+for score in fields:
+try:
+int( score )
+data_type = 'tabular'
+seq_method = 'solexa'
+break
+except:
+break
+elif data_type == 'fasta':
+fields = line.split()
+for score in fields:
+try:
+int( score )
+seq_method = '454'
+break
+except:
+break
+if i == 100:
+break
+if data_type is None:
+stop_err( 'This tool can only use fasta data or tabular data.' )
+if seq_method is None:
+stop_err( 'Invalid data for fasta format.')
+cont_high_quality = []
+invalid_lines = 0
+invalid_scores = 0
+if seq_method == 'solexa':
+for i, line in enumerate( open( infile_name ) ):
+line = line.rstrip( '\r\n' )
+if not line or line.startswith( '#' ):
+continue
+locs = line.split( '\t' )
+for j, base in enumerate( locs ):
+nuc_errors = base.split()
+try:
+nuc_errors[0] = int( nuc_errors[0] )
+nuc_errors[1] = int( nuc_errors[1] )
+nuc_errors[2] = int( nuc_errors[2] )
+nuc_errors[3] = int( nuc_errors[3] )
+big = max( nuc_errors )
+except:
+invalid_scores += 1
+big = 0
+if j == 0:
+cont_high_quality.append(1)
+else:
+if big >= score_threshold:
+cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
+else:
+cont_high_quality.append(1)
+else: # seq_method == '454'
+tmp_score = ''
+for i, line in enumerate( open( infile_name ) ):
+line = line.rstrip( '\r\n' )
+if not line or line.startswith( '#' ):
+continue
+if line.startswith( '>' ):
+if len( tmp_score ) > 0:
+locs = tmp_score.split()
+for j, base in enumerate( locs ):
+try:
+base = int( base )
+except:
+invalid_scores += 1
+base = 0
+if j == 0:
+cont_high_quality.append(1)
+else:
+if base >= score_threshold:
+cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
+else:
+cont_high_quality.append(1)
+tmp_score = ''
+else:
+tmp_score = "%s %s" % ( tmp_score, line )
+if len( tmp_score ) > 0:
+locs = tmp_score.split()
+for j, base in enumerate( locs ):
+try:
+base = int( base )
+except:
+invalid_scores += 1
+base = 0
+if j == 0:
+cont_high_quality.append(1)
+else:
+if base >= score_threshold:
+cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
+else:
+cont_high_quality.append(1)
+# generate pdf figures
+cont_high_quality = array ( cont_high_quality )
+outfile_R_pdf = outfile_R_name
+r.pdf( outfile_R_pdf )
+title = "Histogram of continuous high quality scores"
+xlim_range = [ 1, max( cont_high_quality ) ]
+nclass = max( cont_high_quality )
+if nclass > 100:
+nclass = 100
+r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
+r.dev_off()
+if infile_is_zipped and os.path.exists( infile_name ):
+# Need to delete temporary file created when we unzipped the infile archive
+os.remove( infile_name )
+if invalid_lines > 0:
+print 'Skipped %d invalid lines. ' % invalid_lines
+if invalid_scores > 0:
+print 'Skipped %d invalid scores. ' % invalid_scores
+r.quit( save="no" )
+if __name__=="__main__":__main__()

Mercurial > repos > devteam > short_reads_figure_high_quality_length

comparison short_reads_figure_high_quality_length.py @ 0:556ceed24699 draft