view tools/metag_tools/short_reads_figure_high_quality_length.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python

import os, sys, math, tempfile, zipfile, re
from rpy import *

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()

def unzip( filename ):
    zip_file = zipfile.ZipFile( filename, 'r' )
    tmpfilename = tempfile.NamedTemporaryFile().name
    for name in zip_file.namelist():
        file( tmpfilename, 'a' ).write( zip_file.read( name ) )
    zip_file.close()
    return tmpfilename

def __main__():
    infile_score_name = sys.argv[1].strip()
    outfile_R_name = sys.argv[2].strip()
    
    try:
        score_threshold = int( sys.argv[3].strip() )
    except:
        stop_err( 'Threshold for quality score must be numerical.' )

    infile_is_zipped = False
    if zipfile.is_zipfile( infile_score_name ):
        infile_is_zipped = True
        infile_name = unzip( infile_score_name )
    else:
        infile_name = infile_score_name

    # detect whether it's tabular or fasta format
    seq_method = None
    data_type = None
    for i, line in enumerate( file( infile_name ) ):
        line = line.rstrip( '\r\n' )
        if not line or line.startswith( '#' ):
            continue
        if data_type == None:
            if line.startswith( '>' ):
                data_type = 'fasta'
                continue
            elif len( line.split( '\t' ) ) > 0:
                fields = line.split()
                for score in fields:
                    try:
                        int( score )
                        data_type = 'tabular'
                        seq_method = 'solexa'
                        break
                    except:
                        break
        elif data_type == 'fasta':
            fields = line.split()
            for score in fields:
                try: 
                    int( score )
                    seq_method = '454'
                    break
                except:
                    break
        if i == 100:
            break

    if data_type is None:
        stop_err( 'This tool can only use fasta data or tabular data.' ) 
    if seq_method is None:
        stop_err( 'Invalid data for fasta format.')
 
    cont_high_quality = []
    invalid_lines = 0
    invalid_scores = 0                       
    if seq_method == 'solexa':
        for i, line in enumerate( open( infile_name ) ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            locs = line.split( '\t' )
            for j, base in enumerate( locs ):
                nuc_errors = base.split()
                try:
                    nuc_errors[0] = int( nuc_errors[0] )
                    nuc_errors[1] = int( nuc_errors[1] )
                    nuc_errors[2] = int( nuc_errors[2] )
                    nuc_errors[3] = int( nuc_errors[3] )
                    big = max( nuc_errors )
                except:
                    invalid_scores += 1
                    big = 0
                if j == 0:
                    cont_high_quality.append(1)
                else:
                    if big >= score_threshold:
                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
                    else:
                        cont_high_quality.append(1)
    else: # seq_method == '454'
        tmp_score = ''
        for i, line in enumerate( open( infile_name ) ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            if line.startswith( '>' ):
                if len( tmp_score ) > 0:
                    locs = tmp_score.split()
                    for j, base in enumerate( locs ):
                        try:
                            base = int( base )
                        except:
                            invalid_scores += 1
                            base = 0
                        if j == 0:
                            cont_high_quality.append(1)
                        else:
                            if base >= score_threshold:
                                cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
                            else:
                                cont_high_quality.append(1)
                tmp_score = ''
            else:
                tmp_score = "%s %s" % ( tmp_score, line )
        if len( tmp_score ) > 0:
            locs = tmp_score.split()
            for j, base in enumerate( locs ):
                try:
                    base = int( base )
                except:
                    invalid_scores += 1
                    base = 0
                if j == 0:
                    cont_high_quality.append(1)
                else:
                    if base >= score_threshold:
                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
                    else:
                        cont_high_quality.append(1)

    # generate pdf figures
    cont_high_quality = array ( cont_high_quality )
    outfile_R_pdf = outfile_R_name 
    r.pdf( outfile_R_pdf )
    title = "Histogram of continuous high quality scores"
    xlim_range = [ 1, max( cont_high_quality ) ]
    nclass = max( cont_high_quality )
    if nclass > 100:
        nclass = 100
    r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
    r.dev_off()    

    if infile_is_zipped and os.path.exists( infile_name ):
        # Need to delete temporary file created when we unzipped the infile archive
        os.remove( infile_name )

    if invalid_lines > 0: 
        print 'Skipped %d invalid lines. ' % invalid_lines
    if invalid_scores > 0:
        print 'Skipped %d invalid scores. ' % invalid_scores

    r.quit( save="no" )

if __name__=="__main__":__main__()