| 0 | 1 """ | 
|  | 2 released under the terms of the LGPL | 
|  | 3 copyright ross lazarus August 2007 | 
|  | 4 for the rgenetics project | 
|  | 5 | 
|  | 6 Special galaxy tool for the camp2007 data | 
|  | 7 Allows grabbing arbitrary columns from an arbitrary region | 
|  | 8 | 
|  | 9 Needs a mongo results file in the location hardwired below or could be passed in as | 
|  | 10 a library parameter - but this file must have a very specific structure | 
|  | 11 rs chrom offset float1...floatn | 
|  | 12 | 
|  | 13 called as | 
|  | 14     <command interpreter="python"> | 
|  | 15         rsRegion.py $infile '$cols' $r $tag $out_file1 | 
|  | 16     </command> | 
|  | 17 | 
|  | 18 cols is a delimited list of chosen column names for the subset | 
|  | 19 r is a ucsc location region pasted into the tool | 
|  | 20 | 
|  | 21 """ | 
|  | 22 | 
|  | 23 | 
|  | 24 import sys,string | 
|  | 25 | 
|  | 26 trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation)) | 
|  | 27 print >> sys.stdout, '##rgRegion.py started' | 
|  | 28 if len(sys.argv) <> 6: | 
|  | 29   print >> sys.stdout, '##!expected  params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv) | 
|  | 30   sys.exit(1) | 
|  | 31 print '##got %d - %s' % (len(sys.argv),sys.argv) | 
|  | 32 # quick and dirty for galaxy - we always get something for each parameter | 
|  | 33 fname = sys.argv[1] | 
|  | 34 wewant = sys.argv[2].split(',') | 
|  | 35 region = sys.argv[3].lower() | 
|  | 36 tag = sys.argv[4].translate(trantab) | 
|  | 37 ofname = sys.argv[5] | 
|  | 38 myname = 'rgRegion' | 
|  | 39 if len(wewant) == 0: # no columns selected? | 
|  | 40   print >> sys.stdout, '##!%s:  no columns selected - cannot run' % myname | 
|  | 41   sys.exit(1) | 
|  | 42 try: | 
|  | 43   f = open(fname,'r') | 
|  | 44 except: # bad input file name? | 
|  | 45   print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname) | 
|  | 46   sys.exit(1) | 
|  | 47 try: # TODO make a regexp? | 
|  | 48   c,rest = region.split(':') | 
|  | 49   c = c.replace('chr','') # leave although will break strict genome graphs | 
|  | 50   rest = rest.replace(',','') # remove commas | 
|  | 51   spos,epos = rest.split('-') | 
|  | 52   spos = int(spos) | 
|  | 53   epos = int(epos) | 
|  | 54 except: | 
|  | 55   print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000' % (myname,region) | 
|  | 56   sys.exit(1) | 
|  | 57 print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos) | 
|  | 58 res = [] | 
|  | 59 cnames = f.next().strip().split() # column titles for output | 
|  | 60 linelen = len(cnames) | 
|  | 61 wewant = [int(x) - 1 for x in wewant] # need col numbers base 0 | 
|  | 62 for n,l in enumerate(f): | 
|  | 63   ll = l.strip().split() | 
|  | 64   thisc = ll[1] | 
|  | 65   thispos = int(ll[2]) | 
|  | 66   if (thisc == c) and (thispos >= spos) and (thispos <= epos): | 
|  | 67      if len(ll) == linelen: | 
|  | 68         res.append([ll[x] for x in wewant]) # subset of columns! | 
|  | 69      else: | 
|  | 70         print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll)) | 
|  | 71 o = file(ofname,'w') | 
|  | 72 res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string | 
|  | 73 print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res)) | 
|  | 74 head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names | 
|  | 75 o.write('%s\n' % '\t'.join(head)) # header row for output | 
|  | 76 o.write(''.join(res)) | 
|  | 77 o.close() | 
|  | 78 f.close() | 
|  | 79 | 
|  | 80 |