view SNV/filter_snvmix_somatic.py @ 5:a4975ec34575

Uploaded
author ryanmorin
date Mon, 17 Oct 2011 14:57:09 -0400
parents 74f5ea818cea
children 361d6506850a
line wrap: on
line source

#!/usr/bin/env python

"""
Identifies sites with strong evidence for no variant (to be used on SNVMix output from matched normal samples)

usage: %prog [options]
   -n, --input1=n: raw snvmix output file from normal
   -t, --input2=t: annoated snvmix output from tumour
   -o, --output1=o: sorted list of germline sites
   -p, --posterior=p: threshold for posterior probability of AA
   -d, --max_nonref_depth=d: maximum number of non-reference bases allowed at site 
   
"""

import os, shutil, subprocess, sys, tempfile
from galaxy import eggs
import pkg_resources; pkg_resources.require( "bx-python" )
from bx.cookbook import doc_optparse
import re

def stop_err( msg ):
    sys.stderr.write( '%s\n' % msg )
    sys.exit()

def __main__():
    #Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    input = open(options.input1, 'r')
    tmpDir = tempfile.mkdtemp()
    output = tempfile.NamedTemporaryFile(mode='w',delete=False)
    min_p = options.posterior
    max_d = int(options.max_nonref_depth)
    for line in input:
        cols = line.split()
        #expect input to look like this:
        #1:10143872 C N C:44,N:0,1.0000000000,0.0000000000,0.0000000000,1
        data = cols[3]
        #print data
        data_list = data.split(',')
        nref_val = data_list[1]
        ref_p = data_list[2]
        nref_depth = int(nref_val.split(":")[1])
        #print nref_depth, 'vs', max_d
        if ref_p >= min_p and nref_depth <= max_d:
            #print nref_depth, '<=', max_d
            has_chr = re.match("chr",cols[0])
            if not has_chr:
                output.write("chr")
            output.write(cols[0])
            output.write("\n")
    output_file_nosort = output.name
    
    output.close()
    cmd = 'sort -S 2000M %s | join - %s > %s'
    cmd = cmd % (output_file_nosort,options.input2,options.output1)
    proc = subprocess.Popen( args=cmd, shell=True, cwd=tmpDir)
    
    returncode = proc.wait()
    if os.path.getsize( options.output1 ) > 0:
        sys.stdout.write( 'wrote filtered output' )
    else:
        stop_err( 'The output file is empty. Your input file may have had no matches, or there may be an error with your input file or settings.' )
    #clean up temp files
    if os.path.exists( tmpDir ):
        shutil.rmtree( tmpDir )
                    
if __name__ == "__main__" : __main__()