diff tools/rgenetics/rgRegion.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgRegion.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+"""
+released under the terms of the LGPL
+copyright ross lazarus August 2007 
+for the rgenetics project
+
+Special galaxy tool for the camp2007 data
+Allows grabbing arbitrary columns from an arbitrary region
+
+Needs a mongo results file in the location hardwired below or could be passed in as
+a library parameter - but this file must have a very specific structure
+rs chrom offset float1...floatn
+
+called as
+    <command interpreter="python">
+        rsRegion.py $infile '$cols' $r $tag $out_file1
+    </command>
+
+cols is a delimited list of chosen column names for the subset
+r is a ucsc location region pasted into the tool
+
+"""
+
+
+import sys,string       
+
+trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
+print >> sys.stdout, '##rgRegion.py started'
+if len(sys.argv) <> 6: 
+  print >> sys.stdout, '##!expected  params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv)
+  sys.exit(1)
+print '##got %d - %s' % (len(sys.argv),sys.argv)
+# quick and dirty for galaxy - we always get something for each parameter
+fname = sys.argv[1]
+wewant = sys.argv[2].split(',')
+region = sys.argv[3].lower()
+tag = sys.argv[4].translate(trantab)
+ofname = sys.argv[5] 
+myname = 'rgRegion'
+if len(wewant) == 0: # no columns selected?
+  print >> sys.stdout, '##!%s:  no columns selected - cannot run' % myname
+  sys.exit(1)
+try:
+  f = open(fname,'r')
+except: # bad input file name?
+  print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname)
+  sys.exit(1)
+try: # TODO make a regexp?
+  c,rest = region.split(':')
+  c = c.replace('chr','') # leave although will break strict genome graphs  
+  rest = rest.replace(',','') # remove commas
+  spos,epos = rest.split('-')
+  spos = int(spos)
+  epos = int(epos)
+except:
+  print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000' % (myname,region)
+  sys.exit(1)
+print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos)
+res = []
+cnames = f.next().strip().split() # column titles for output
+linelen = len(cnames)
+wewant = [int(x) - 1 for x in wewant] # need col numbers base 0
+for n,l in enumerate(f):
+  ll = l.strip().split()
+  thisc = ll[1]
+  thispos = int(ll[2])
+  if (thisc == c) and (thispos >= spos) and (thispos <= epos):
+     if len(ll) == linelen:
+        res.append([ll[x] for x in wewant]) # subset of columns!
+     else:
+        print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll))
+o = file(ofname,'w')
+res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string
+print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res))
+head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names
+o.write('%s\n' % '\t'.join(head)) # header row for output
+o.write(''.join(res))
+o.close()
+f.close()    
+
+