Mercurial > repos > nick > dna_visualizer
comparison fastareader.py @ 0:5257ce9d9184
Initial literal.py tool
author | Nick Stoler <nstoler@psu.edu> |
---|---|
date | Sun, 02 Mar 2014 13:51:03 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5257ce9d9184 |
---|---|
1 #!/usr/bin/env python | |
2 import os | |
3 __version__ = '0.8' | |
4 | |
5 | |
class FormatError(Exception):
    """Error type named for format problems.

    The visible code defines this exception but never raises it. The optional
    message is only forwarded to the base class when it is truthy.
    """

    def __init__(self, message=None):
        # Guard clause: with no (or an empty) message there is nothing to
        # forward to the base-class initializer.
        if not message:
            return
        super(FormatError, self).__init__(message)
10 | |
11 | |
class FastaLineGenerator(object):
    """A simple FASTA parser that only reads a line at a time into memory.

    Iterating over an instance yields every non-empty sequence line with its
    surrounding whitespace stripped. Header lines (those starting with ">")
    are not yielded; they update ``self.name`` (the header text after ">")
    and ``self.id`` (the first whitespace-delimited word of the name, or ''
    when the header is empty). Each iteration re-opens the file and starts
    from the top.

    Usage:
      fasta = FastaLineGenerator('/home/user/sequence.fasta')
      for line in fasta:
        print("There is a sequence with this FASTA identifier: "+fasta.id)
        print("It has a line with this sequence: "+line)
    """

    def __init__(self, filepath):
        """Remember the path of the FASTA file.

        Raises IOError if filepath is not an existing regular file.
        """
        if not os.path.isfile(filepath):
            raise IOError('File not found: "%s"' % filepath)
        self.filepath = filepath
        self.name = None
        self.id = None

    def __iter__(self):
        return self.new()

    def _records(self):
        """Yield (record_number, line) for every sequence line in the file.

        record_number counts the header lines seen so far, so it changes
        exactly when a new record starts (sequence lines that precede any
        header get 0). ``self.name`` and ``self.id`` are reset at the start
        of the pass and updated at every header.
        """
        # io.open gives universal-newline text mode on both Python 2 and 3.
        # The old 'rU' mode flag is rejected by Python 3.11+.
        import io
        # A fresh pass has no current record yet; without this reset a stale
        # id from an earlier pass could be attributed to header-less lines.
        self.name = None
        self.id = None
        record = 0
        with io.open(self.filepath, 'r') as filehandle:
            for line_raw in filehandle:
                line = line_raw.strip()
                if not line:
                    continue  # allow empty lines
                if line.startswith('>'):
                    record += 1
                    self.name = line[1:]  # remove ">"
                    self.id = self.name.split()[0] if self.name else ''
                else:
                    yield record, line
        # Falling off the end finishes the generator. An explicit
        # "raise StopIteration" here would turn into RuntimeError on
        # Python 3.7+ (PEP 479).

    def new(self):
        """Return a new generator over the sequence lines, from the top of
        the file. The file is closed when the generator is exhausted or
        closed."""
        for _, line in self._records():
            yield line

    def bases(self):
        """Generator that yields single bases, while still reading a whole
        line at a time underneath.

        It yields one character at a time, but the file is read a line at a
        time, so it avoids per-character file reads."""
        for line in self.new():
            for base in line:
                yield base

    def extract(self, start, end, chrom=None):
        """Extract a subsequence based on a start and end coordinate.

        The start and end are inclusive, 1-based. If chrom is not supplied,
        it defaults to the first chromosome (record) encountered in the FASTA
        file; otherwise the first record whose id equals chrom is used.
        If the end coordinate is beyond the end of the chromosome, the
        returned sequence is truncated to the end of the chromosome. If the
        start coordinate is beyond the end of the chromosome, the chromosome
        is not found, or end < start, an empty string is returned. A start
        below 1 is treated as 1.
        """
        start = max(start, 1)
        if end < start:
            # Also keeps negative coordinates from turning into
            # from-the-end slice indices below.
            return ''
        pieces = []
        target_record = None  # record number of the chromosome being read
        line_start = 1        # 1-based coordinate of the current line's first base
        records = self._records()
        try:
            for record, line in records:
                if target_record is None:
                    if chrom is not None and self.id != chrom:
                        continue  # still looking for the requested chromosome
                    target_record = record
                elif record != target_record:
                    # Ran off the end of the chromosome: stop instead of
                    # reading the next record as if it were a continuation.
                    break
                line_end = line_start + len(line) - 1
                # Only lines that reach the start coordinate contribute.
                if line_end >= start:
                    slice_start = max(start, line_start) - line_start
                    slice_end = min(end, line_end) - line_start + 1
                    pieces.append(line[slice_start:slice_end])
                    if line_end >= end:
                        break  # that was the last line needed
                line_start = line_end + 1
        finally:
            # Close the generator so the file handle is released right away,
            # even when the loop was left early.
            records.close()
        return ''.join(pieces)
87 | |
88 | |
89 #TODO: see 0notes.txt | |
class FastaBaseGenerator(object):
    """For when you absolutely have to read one base at a time. VERY SLOW.

    The file is opened once, in the constructor, and read one character at a
    time. Iteration is single-pass: a second iteration continues from
    wherever the shared file handle currently is. Call close(), or use the
    object as a context manager, to release the file handle.

    While iterating, ``self.name`` holds the header text after ">" and
    ``self.id`` holds the header characters up to the first whitespace.

    Usage:
      fasta = FastaBaseGenerator('/home/user/sequence.fasta')
      for base in fasta:
        print("There is a sequence with this FASTA identifier: "+fasta.id)
        print("This is the next base from it: "+base)
    """

    def __init__(self, filepath):
        """Open filepath for reading; the open call raises IOError if that
        fails."""
        # io.open gives universal-newline text mode on both Python 2 and 3.
        # The old 'rU' mode flag is rejected by Python 3.11+.
        import io
        self.filehandle = io.open(filepath, 'r')
        self.header = False   # True while reading the characters of a header line
        self.name = None
        self.id = None
        self._in_id = None    # True while still inside the id part of a header

    def __iter__(self):
        return self.new()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying file handle. Reading after this raises
        ValueError."""
        self.filehandle.close()

    def new(self):
        """Return a generator that yields the remaining bases one character
        at a time, skipping newlines and header lines."""
        newline = True  # True when the next character starts a line
        while True:
            base = self.filehandle.read(1)
            if not base:
                # End of file. A plain return ends the generator; an explicit
                # "raise StopIteration" would become RuntimeError on
                # Python 3.7+ (PEP 479).
                return
            elif base == '\n':
                newline = True
                self.header = False
            elif newline and base == '>':
                # Start of a header line: reset the record state.
                newline = False
                self.header = True
                self._in_id = True
                self.name = ''
                self.id = ''
            elif self.header:
                if self._in_id:
                    if base.isspace():
                        self._in_id = False  # the id ends at the first whitespace
                    else:
                        self.id += base
                self.name += base
            else:
                newline = False
                yield base
135 |