0
|
1 """
|
|
2 released under the terms of the LGPL
|
|
3 copyright ross lazarus August 2007
|
|
4 for the rgenetics project
|
|
5
|
|
6 Special galaxy tool for the camp2007 data
|
|
7 Allows grabbing arbitrary columns from an arbitrary region
|
|
8
|
|
9 Needs a mongo results file in the location hardwired below or could be passed in as
|
|
10 a library parameter - but this file must have a very specific structure
|
|
11 rs chrom offset float1...floatn
|
|
12
|
|
13 called as
|
|
14 <command interpreter="python">
|
|
15 rsRegion.py $infile '$cols' $r $tag $out_file1
|
|
16 </command>
|
|
17
|
|
18 cols is a delimited list of chosen column names for the subset
|
|
19 r is a ucsc location region pasted into the tool
|
|
20
|
|
21 """
|
|
22
|
|
23
|
|
24 import sys,string
|
|
25
|
|
26 trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
|
|
27 print >> sys.stdout, '##rgRegion.py started'
|
|
28 if len(sys.argv) <> 6:
|
|
29 print >> sys.stdout, '##!expected params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv)
|
|
30 sys.exit(1)
|
|
31 print '##got %d - %s' % (len(sys.argv),sys.argv)
|
|
32 # quick and dirty for galaxy - we always get something for each parameter
|
|
33 fname = sys.argv[1]
|
|
34 wewant = sys.argv[2].split(',')
|
|
35 region = sys.argv[3].lower()
|
|
36 tag = sys.argv[4].translate(trantab)
|
|
37 ofname = sys.argv[5]
|
|
38 myname = 'rgRegion'
|
|
39 if len(wewant) == 0: # no columns selected?
|
|
40 print >> sys.stdout, '##!%s: no columns selected - cannot run' % myname
|
|
41 sys.exit(1)
|
|
42 try:
|
|
43 f = open(fname,'r')
|
|
44 except: # bad input file name?
|
|
45 print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname)
|
|
46 sys.exit(1)
|
|
47 try: # TODO make a regexp?
|
|
48 c,rest = region.split(':')
|
|
49 c = c.replace('chr','') # leave although will break strict genome graphs
|
|
50 rest = rest.replace(',','') # remove commas
|
|
51 spos,epos = rest.split('-')
|
|
52 spos = int(spos)
|
|
53 epos = int(epos)
|
|
54 except:
|
|
55 print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000' % (myname,region)
|
|
56 sys.exit(1)
|
|
57 print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos)
|
|
58 res = []
|
|
59 cnames = f.next().strip().split() # column titles for output
|
|
60 linelen = len(cnames)
|
|
61 wewant = [int(x) - 1 for x in wewant] # need col numbers base 0
|
|
62 for n,l in enumerate(f):
|
|
63 ll = l.strip().split()
|
|
64 thisc = ll[1]
|
|
65 thispos = int(ll[2])
|
|
66 if (thisc == c) and (thispos >= spos) and (thispos <= epos):
|
|
67 if len(ll) == linelen:
|
|
68 res.append([ll[x] for x in wewant]) # subset of columns!
|
|
69 else:
|
|
70 print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll))
|
|
71 o = file(ofname,'w')
|
|
72 res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string
|
|
73 print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res))
|
|
74 head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names
|
|
75 o.write('%s\n' % '\t'.join(head)) # header row for output
|
|
76 o.write(''.join(res))
|
|
77 o.close()
|
|
78 f.close()
|
|
79
|
|
80
|