comparison extract_genomic_dna_utils.py @ 0:8dd8e89c0603 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/extract_genomic_dna commit b'67cff25a50ba173b0468819204d0999496f68ea9'
author iuc
date Tue, 19 Jan 2016 09:34:23 -0500
parents
children 702970e4a134
comparison
equal deleted inserted replaced
-1:000000000000 0:8dd8e89c0603
1 import copy
2 import os
3 import subprocess
4 import sys
5 import tempfile
6
7 from bx.intervals.io import Comment, Header, GenomicInterval
8 from bx.intervals.io import GenomicIntervalReader, NiceReaderWrapper, ParseError
9
10 # Default chrom, start, end, strand cols for a bed file
11 BED_DEFAULT_COLS = 0, 1, 2, 5
12
13
14 class GFFInterval(GenomicInterval):
15 """
16 A GFF interval, including attributes. If file is strictly a GFF file,
17 only attribute is 'group.'
18 """
19
20 def __init__(self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
21 strand_col=6, score_col=5, default_strand='.', fix_strand=False):
22 # GFF format allows '.' for strand but GenomicInterval does not. To get around this,
23 # temporarily set strand and then unset after initing GenomicInterval.
24 unknown_strand = False
25 if not fix_strand and fields[strand_col] == '.':
26 unknown_strand = True
27 fields[strand_col] = '+'
28 GenomicInterval.__init__(self, reader, fields, chrom_col, start_col, end_col,
29 strand_col, default_strand, fix_strand=fix_strand)
30 if unknown_strand:
31 self.strand = '.'
32 self.fields[strand_col] = '.'
33 # Handle feature, score column.
34 self.feature_col = feature_col
35 if self.feature_col >= self.nfields:
36 stop_err("No field for feature_col (%d)" % feature_col)
37 self.feature = self.fields[self.feature_col]
38 self.score_col = score_col
39 if self.score_col >= self.nfields:
40 stop_err("No field for score_col (%d)" % score_col)
41 self.score = self.fields[self.score_col]
42 # GFF attributes.
43 self.attributes = parse_gff_attributes(fields[8])
44
45 def copy(self):
46 return GFFInterval(self.reader, list(self.fields), self.chrom_col, self.feature_col,
47 self.start_col, self.end_col, self.strand_col, self.score_col, self.strand)
48
49
50 class GFFFeature(GFFInterval):
51 """
52 A GFF feature, which can include multiple intervals.
53 """
54
55 def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6,
56 score_col=5, default_strand='.', fix_strand=False, intervals=[], raw_size=0):
57 # Use copy so that first interval and feature do not share fields.
58 GFFInterval.__init__(self, reader, copy.deepcopy(intervals[0].fields), chrom_col, feature_col,
59 start_col, end_col, strand_col, score_col, default_strand, fix_strand=fix_strand)
60 self.intervals = intervals
61 self.raw_size = raw_size
62 # Use intervals to set feature attributes.
63 for interval in self.intervals:
64 # Error checking. NOTE: intervals need not share the same strand.
65 if interval.chrom != self.chrom:
66 stop_err("interval chrom does not match self chrom: %s != %s" % (interval.chrom, self.chrom))
67 # Set start, end of interval.
68 if interval.start < self.start:
69 self.start = interval.start
70 if interval.end > self.end:
71 self.end = interval.end
72
73 def name(self):
74 """
75 Returns feature's name.
76 """
77 name = None
78 # Preference for name:
79 # GTF: 'gene_id', 'transcript_id'
80 # GFF3: 'ID', 'id'
81 # GFF: 'group'
82 for attr_name in ['gene_id', 'transcript_id', 'ID', 'id', 'group']:
83 name = self.attributes.get(attr_name, None)
84 if name is not None:
85 break
86 return name
87
88 def copy(self):
89 intervals_copy = []
90 for interval in self.intervals:
91 intervals_copy.append(interval.copy())
92 return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col,
93 self.strand_col, self.score_col, self.strand, intervals=intervals_copy)
94
95 def lines(self):
96 lines = []
97 for interval in self.intervals:
98 lines.append('\t'.join(interval.fields))
99 return lines
100
101
102 class GFFReaderWrapper(NiceReaderWrapper):
103 """
104 Reader wrapper for GFF files which has two major functions:
105 1. group entries for GFF file (via group column), GFF3 (via id attribute),
106 or GTF (via gene_id/transcript id);
107 2. convert coordinates from GFF format--starting and ending coordinates
108 are 1-based, closed--to the 'traditional'/BED interval format--0 based,
109 half-open. This is useful when using GFF files as inputs to tools that
110 expect traditional interval format.
111 """
112
113 def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6,
114 score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs):
115 NiceReaderWrapper.__init__(self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
116 strand_col=strand_col, fix_strand=fix_strand, **kwargs)
117 self.feature_col = feature_col
118 self.score_col = score_col
119 self.convert_to_bed_coord = convert_to_bed_coord
120 self.last_line = None
121 self.cur_offset = 0
122 self.seed_interval = None
123 self.seed_interval_line_len = 0
124
125 def parse_row(self, line):
126 interval = GFFInterval(self, line.split("\t"), self.chrom_col, self.feature_col, self.start_col,
127 self.end_col, self.strand_col, self.score_col, self.default_strand,
128 fix_strand=self.fix_strand)
129 return interval
130
131 def next(self):
132 """
133 Returns next GFFFeature.
134 """
135
136 def handle_parse_error(parse_error):
137 """
138 Actions to take when ParseError found.
139 """
140 if self.outstream:
141 if self.print_delegate and hasattr(self.print_delegate, "__call__"):
142 self.print_delegate(self.outstream, e, self)
143 self.skipped += 1
144 # No reason to stuff an entire bad file into memory.
145 if self.skipped < 10:
146 self.skipped_lines.append((self.linenum, self.current_line, str(e)))
147 # Get next GFFFeature
148 raw_size = self.seed_interval_line_len
149 # If there is no seed interval, set one. Also, if there are no more
150 # intervals to read, this is where iterator dies.
151 if not self.seed_interval:
152 while not self.seed_interval:
153 try:
154 self.seed_interval = GenomicIntervalReader.next(self)
155 except ParseError as e:
156 handle_parse_error(e)
157 finally:
158 raw_size += len(self.current_line)
159 # If header or comment, clear seed interval and return it with its size.
160 if isinstance(self.seed_interval, (Header, Comment)):
161 return_val = self.seed_interval
162 return_val.raw_size = len(self.current_line)
163 self.seed_interval = None
164 self.seed_interval_line_len = 0
165 return return_val
166 # Initialize feature identifier from seed.
167 # For GFF.
168 feature_group = self.seed_interval.attributes.get('group', None)
169 # For GFF3
170 feature_id = self.seed_interval.attributes.get('ID', None)
171 # For GTF.
172 feature_transcript_id = self.seed_interval.attributes.get('transcript_id', None)
173 # Read all intervals associated with seed.
174 feature_intervals = []
175 feature_intervals.append(self.seed_interval)
176 while True:
177 try:
178 interval = GenomicIntervalReader.next(self)
179 raw_size += len(self.current_line)
180 except StopIteration as e:
181 # No more intervals to read, but last feature needs to be
182 # returned.
183 interval = None
184 raw_size += len(self.current_line)
185 break
186 except ParseError as e:
187 handle_parse_error(e)
188 raw_size += len(self.current_line)
189 continue
190 # Ignore comments.
191 if isinstance(interval, Comment):
192 continue
193 # Determine if interval is part of feature.
194 part_of = False
195 group = interval.attributes.get('group', None)
196 # GFF test:
197 if group and feature_group == group:
198 part_of = True
199 # GFF3 test:
200 parent_id = interval.attributes.get('Parent', None)
201 cur_id = interval.attributes.get('ID', None)
202 if (cur_id and cur_id == feature_id) or (parent_id and parent_id == feature_id):
203 part_of = True
204 # GTF test:
205 transcript_id = interval.attributes.get('transcript_id', None)
206 if transcript_id and transcript_id == feature_transcript_id:
207 part_of = True
208 # If interval is not part of feature, clean up and break.
209 if not part_of:
210 # Adjust raw size because current line is not part of feature.
211 raw_size -= len(self.current_line)
212 break
213 # Interval associated with feature.
214 feature_intervals.append(interval)
215 # Last interval read is the seed for the next interval.
216 self.seed_interval = interval
217 self.seed_interval_line_len = len(self.current_line)
218 # Return feature.
219 feature = GFFFeature(self, self.chrom_col, self.feature_col, self.start_col,
220 self.end_col, self.strand_col, self.score_col,
221 self.default_strand, fix_strand=self.fix_strand,
222 intervals=feature_intervals, raw_size=raw_size)
223 # Convert to BED coords?
224 if self.convert_to_bed_coord:
225 convert_gff_coords_to_bed(feature)
226 return feature
227
228
229 def convert_bed_coords_to_gff(interval):
230 """
231 Converts an interval object's coordinates from BED format to GFF format.
232 Accepted object types include GenomicInterval and list (where the first
233 element in the list is the interval's start, and the second element is
234 the interval's end).
235 """
236 if isinstance(interval, GenomicInterval):
237 interval.start += 1
238 if isinstance(interval, GFFFeature):
239 for subinterval in interval.intervals:
240 convert_bed_coords_to_gff(subinterval)
241 elif isinstance(interval, list):
242 interval[0] += 1
243 return interval
244
245
246 def convert_gff_coords_to_bed(interval):
247 """
248 Converts an interval object's coordinates from GFF format to BED format.
249 Accepted object types include GFFFeature, GenomicInterval, and list (where
250 the first element in the list is the interval's start, and the second
251 element is the interval's end).
252 """
253 if isinstance(interval, GenomicInterval):
254 interval.start -= 1
255 if isinstance(interval, GFFFeature):
256 for subinterval in interval.intervals:
257 convert_gff_coords_to_bed(subinterval)
258 elif isinstance(interval, list):
259 interval[0] -= 1
260 return interval
261
262
263 def convert_to_twobit(reference_genome):
264 """
265 Create 2bit file history fasta dataset.
266 """
267 try:
268 seq_path = tempfile.NamedTemporaryFile(dir=".").name
269 cmd = "faToTwoBit %s %s" % (reference_genome, seq_path)
270 tmp_name = tempfile.NamedTemporaryFile(dir=".").name
271 tmp_stderr = open(tmp_name, 'wb')
272 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
273 returncode = proc.wait()
274 tmp_stderr.close()
275 if returncode != 0:
276 # Get stderr, allowing for case where it's very large.
277 tmp_stderr = open(tmp_name, 'rb')
278 stderr = ''
279 buffsize = 1048576
280 try:
281 while True:
282 stderr += tmp_stderr.read(buffsize)
283 if not stderr or len(stderr) % buffsize != 0:
284 break
285 except OverflowError:
286 pass
287 tmp_stderr.close()
288 os.remove(tmp_name)
289 stop_err(stderr)
290 return seq_path
291 except Exception, e:
292 stop_err('Error running faToTwoBit. ' + str(e))
293
294
295 def get_lines(feature):
296 # Get feature's line(s).
297 if isinstance(feature, GFFFeature):
298 return feature.lines()
299 else:
300 return [feature.rstrip('\r\n')]
301
302
303 def gff_attributes_to_str(attrs, gff_format):
304 """
305 Convert GFF attributes to string. Supported formats are GFF3, GTF.
306 """
307 if gff_format == 'GTF':
308 format_string = '%s "%s"'
309 # Convert group (GFF) and ID, parent (GFF3) attributes to
310 # transcript_id, gene_id.
311 id_attr = None
312 if 'group' in attrs:
313 id_attr = 'group'
314 elif 'ID' in attrs:
315 id_attr = 'ID'
316 elif 'Parent' in attrs:
317 id_attr = 'Parent'
318 if id_attr:
319 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
320 elif gff_format == 'GFF3':
321 format_string = '%s=%s'
322 attrs_strs = []
323 for name, value in attrs.items():
324 attrs_strs.append(format_string % (name, value))
325 return " ; ".join(attrs_strs)
326
327
328 def parse_cols_arg(cols):
329 """
330 Parse a columns command line argument into a four-tuple.
331 """
332 if cols:
333 # Handle case where no strand column included - in this case, cols
334 # looks something like 1,2,3,
335 if cols.endswith(','):
336 cols += '0'
337 col_list = map(lambda x: int(x) - 1, cols.split(","))
338 return col_list
339 else:
340 return BED_DEFAULT_COLS
341
342
343 def parse_gff_attributes(attr_str):
344 """
345 Parses a GFF/GTF attribute string and returns a dictionary of name-value
346 pairs. The general format for a GFF3 attributes string is
347 name1=value1;name2=value2
348 The general format for a GTF attribute string is
349 name1 "value1" ; name2 "value2"
350 The general format for a GFF attribute string is a single string that
351 denotes the interval's group; in this case, method returns a dictionary
352 with a single key-value pair, and key name is 'group'.
353 """
354 attributes_list = attr_str.split(";")
355 attributes = {}
356 for name_value_pair in attributes_list:
357 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
358 # attribute; next, try double quotes for GTF.
359 pair = name_value_pair.strip().split("=")
360 if len(pair) == 1:
361 pair = name_value_pair.strip().split("\"")
362 if len(pair) == 1:
363 # Could not split for some reason.
364 continue
365 if pair == '':
366 continue
367 name = pair[0].strip()
368 if name == '':
369 continue
370 # Need to strip double quote from values
371 value = pair[1].strip(" \"")
372 attributes[name] = value
373 if len(attributes) == 0:
374 # Could not split attributes string, so entire string must be
375 # 'group' attribute. This is the case for strictly GFF files.
376 attributes['group'] = attr_str
377 return attributes
378
379
380 def reverse_complement(s):
381 complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"}
382 reversed_s = []
383 for i in s:
384 reversed_s.append(complement_dna[i])
385 reversed_s.reverse()
386 return "".join(reversed_s)
387
388
389 def stop_err(msg):
390 sys.stderr.write(msg)
391 sys.exit(1)