annotate tools/filters/axt_to_lav.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 Application to convert AXT file to LAV file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 -------------------------------------------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 :Author: Bob Harris (rsharris@bx.psu.edu)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 :Version: $Revision: $
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 The application reads an AXT file from standard input and writes a LAV file to
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 standard out; some statistics are written to standard error.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 import sys, copy
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 from galaxy import eggs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 import pkg_resources
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 pkg_resources.require( "bx-python" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 import bx.align.axt
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 import bx.align.lav
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 assert sys.version_info[:2] >= ( 2, 4 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 def usage(s=None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 message = """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 Each spec is of the form seq_file[:species_name]:lengths_file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 seq_file should be a format string for the file names for the individual
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 sequences, with %s to be replaced by the alignment's src field. For example,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 etc.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 species_name is optional. If present, it is prepended to the alignment's src
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 field.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 Lengths files provide the length of each chromosome (lav format needs this
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 information but axt file does not contain it). The format is a series of
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 lines of the form
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 <chromosome name> <length>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 The chromosome field in each axt block must match some <chromosome name> in
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 the lengths file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 if (s == None): sys.exit (message)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 else: sys.exit ("%s\n%s" % (s,message))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 def main():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 global debug
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 # parse the command line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 primary = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 secondary = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 silent = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 # pick off options
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 args = sys.argv[1:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 seq_file2 = open(args.pop(-1),'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 seq_file1 = open(args.pop(-1),'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 lav_out = args.pop(-1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 axt_in = args.pop(-1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 while (len(args) > 0):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 arg = args.pop(0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 val = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 fields = arg.split("=",1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 if (len(fields) == 2):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 arg = fields[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 val = fields[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 if (val == ""):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 usage("missing a value in %s=" % arg)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 if (arg == "--silent") and (val == None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 silent = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 elif (primary == None) and (val == None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 primary = arg
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 elif (secondary == None) and (val == None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 secondary = arg
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 usage("unknown argument: %s" % arg)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 if (primary == None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 usage("missing primary file name and length")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 if (secondary == None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 usage("missing secondary file name and length")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 (primaryFile,primary,primaryLengths) = parse_spec(primary)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 usage("bad primary spec (must be seq_file[:species_name]:lengths_file")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 # read the lengths
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 speciesToLengths = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 speciesToLengths[primary] = read_lengths (primaryLengths)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 speciesToLengths[secondary] = read_lengths (secondaryLengths)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 # read the alignments
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 out = bx.align.lav.Writer(open(lav_out,'w'), \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 attributes = { "name_format_1" : primaryFile,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 "name_format_2" : secondaryFile })
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 axtsRead = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 axtsWritten = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 for axtBlock in bx.align.axt.Reader(open(axt_in), \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 species_to_lengths = speciesToLengths,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 species1 = primary,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 species2 = secondary,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 support_ids = True):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 axtsRead += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 out.write (axtBlock)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 primary_c = axtBlock.get_component_by_src_start(primary)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 secondary_c = axtBlock.get_component_by_src_start(secondary)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 print >>seq_file1,primary_c.text
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 print >>seq_file1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 print >>seq_file2,secondary_c.text
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 print >>seq_file2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 axtsWritten += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 out.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 seq_file1.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 seq_file2.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 if (not silent):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 fields = spec.split(":")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 if (len(fields) == 2): return (fields[0],"",fields[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 elif (len(fields) == 3): return (fields[0],fields[1],fields[2])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 else: raise ValueError
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 def read_lengths (fileName):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 chromToLength = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 f = file (fileName, "r")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 for lineNumber,line in enumerate(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 line = line.strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 if (line == ""): continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 if (line.startswith("#")): continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 fields = line.split ()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 if (len(fields) != 2):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 chrom = fields[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 length = int(fields[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164 if (chrom in chromToLength):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165 raise "%s appears more than once (%s:%d): %s" \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 % (chrom,fileName,lineNumber)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 chromToLength[chrom] = length
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170 f.close ()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 return chromToLength
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
173
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
174
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
175 if __name__ == "__main__": main()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
176