diff SNV/filter_snvmix_somatic.py @ 0:74f5ea818cea

Uploaded
author ryanmorin
date Wed, 12 Oct 2011 19:50:38 -0400
parents
children 361d6506850a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SNV/filter_snvmix_somatic.py	Wed Oct 12 19:50:38 2011 -0400
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+"""
+Identifies sites with strong evidence for no variant (to be used on SNVMix output from matched normal samples)
+
+usage: %prog [options]
+   -n, --input1=n: raw snvmix output file from normal
+   -t, --input2=t: annoated snvmix output from tumour
+   -o, --output1=o: sorted list of germline sites
+   -p, --posterior=p: threshold for posterior probability of AA
+   -d, --max_nonref_depth=d: maximum number of non-reference bases allowed at site 
+   
+"""
+
+import os, shutil, subprocess, sys, tempfile
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+import re
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    input = open(options.input1, 'r')
+    tmpDir = tempfile.mkdtemp()
+    output = tempfile.NamedTemporaryFile(mode='w',delete=False)
+    min_p = options.posterior
+    max_d = int(options.max_nonref_depth)
+    for line in input:
+        cols = line.split()
+        #expect input to look like this:
+        #1:10143872 C N C:44,N:0,1.0000000000,0.0000000000,0.0000000000,1
+        data = cols[3]
+        #print data
+        data_list = data.split(',')
+        nref_val = data_list[1]
+        ref_p = data_list[2]
+        nref_depth = int(nref_val.split(":")[1])
+        #print nref_depth, 'vs', max_d
+        if ref_p >= min_p and nref_depth <= max_d:
+            #print nref_depth, '<=', max_d
+            has_chr = re.match("chr",cols[0])
+            if not has_chr:
+                output.write("chr")
+            output.write(cols[0])
+            output.write("\n")
+    output_file_nosort = output.name
+    
+    output.close()
+    cmd = 'sort -S 2000M %s | join - %s > %s'
+    cmd = cmd % (output_file_nosort,options.input2,options.output1)
+    proc = subprocess.Popen( args=cmd, shell=True, cwd=tmpDir)
+    
+    returncode = proc.wait()
+    if os.path.getsize( options.output1 ) > 0:
+        sys.stdout.write( 'wrote filtered output' )
+    else:
+        stop_err( 'The output file is empty. Your input file may have had no matches, or there may be an error with your input file or settings.' )
+    #clean up temp files
+    if os.path.exists( tmpDir ):
+        shutil.rmtree( tmpDir )
+                    
+if __name__ == "__main__" : __main__()