0
|
1 #!/usr/bin/env python
|
|
2 import os
|
|
3 __version__ = '0.8'
|
|
4
|
|
5
|
|
6 class FormatError(Exception):
|
|
7 def __init__(self, message=None):
|
|
8 if message:
|
|
9 Exception.__init__(self, message)
|
|
10
|
|
11
|
|
12 class FastaLineGenerator(object):
|
|
13 """A simple FASTA parser that only reads a line at a time into memory.
|
|
14 Usage:
|
|
15 fasta = FastaLineGenerator('/home/user/sequence.fasta')
|
|
16 for line in fasta:
|
|
17 print "There is a sequence with this FASTA identifier: "+fasta.id
|
|
18 print "It has a line with this sequence: "+line
|
|
19 """
|
|
20
|
|
21 def __init__(self, filepath):
|
|
22 if not os.path.isfile(filepath):
|
|
23 raise IOError('File not found: "%s"' % filepath)
|
|
24 self.filepath = filepath
|
|
25 self.name = None
|
|
26 self.id = None
|
|
27
|
|
28 def __iter__(self):
|
|
29 return self.new()
|
|
30
|
|
31 def new(self):
|
|
32 filehandle = open(self.filepath, 'rU')
|
|
33 while True:
|
|
34 line_raw = filehandle.readline()
|
|
35 if not line_raw:
|
|
36 raise StopIteration
|
|
37 line = line_raw.strip()
|
|
38 if not line:
|
|
39 continue # allow empty lines
|
|
40 if line[0] == '>':
|
|
41 self.name = line[1:] # remove ">"
|
|
42 if self.name:
|
|
43 self.id = self.name.split()[0]
|
|
44 else:
|
|
45 self.id = ''
|
|
46 continue
|
|
47 else:
|
|
48 yield line
|
|
49
|
|
50
|
|
51 def bases(self):
|
|
52 """Generator that yields single bases, while still reading a whole line at
|
|
53 a time underneath.
|
|
54 This should be the best of both worlds: it yields a base at a time, but it
|
|
55 reads a line at a time from the file so it's not slow as molasses."""
|
|
56 for line in self.new():
|
|
57 for base in line:
|
|
58 yield base
|
|
59
|
|
60
|
|
61 def extract(self, start, end, chrom=None):
|
|
62 """Extract a subsequence based on a start and end coordinate.
|
|
63 The start and end are inclusive, 1-based. If chrom is not supplied, it will
|
|
64 default to the first chromosome (record) encountered in the FASTA file.
|
|
65 If the end coordinate is beyond the end of the chromosome, the returned
|
|
66 sequence will be truncated to the end of the chromosome. If the start
|
|
67 coordinate is beyond the end of the chromosome, an empty string will be
|
|
68 returned."""
|
|
69 outseq = ''
|
|
70 line_start = 1
|
|
71 for line in self:
|
|
72 if chrom is not None and self.id != chrom:
|
|
73 continue
|
|
74 line_end = line_start + len(line) - 1
|
|
75 # if we haven't encountered the start yet, keep searching
|
|
76 if line_end < start:
|
|
77 line_start = line_end + 1
|
|
78 continue
|
|
79 slice_start = max(start, line_start) - line_start
|
|
80 slice_end = min(end, line_end) - line_start + 1
|
|
81 outseq += line[slice_start:slice_end]
|
|
82 # done? (on the last line?)
|
|
83 if line_end >= end:
|
|
84 break
|
|
85 line_start = line_end + 1
|
|
86 return outseq
|
|
87
|
|
88
|
|
89 #TODO: see 0notes.txt
|
|
90 class FastaBaseGenerator(object):
|
|
91 """For when you absolutely have to read one base at a time. VERY SLOW.
|
|
92 Usage:
|
|
93 fasta = FastaBaseGenerator('/home/user/sequence.fasta')
|
|
94 for base in fasta:
|
|
95 print "There is a sequence with this FASTA identifier: "+fasta.id
|
|
96 print "This is the next base from it: "+base
|
|
97 """
|
|
98
|
|
99 def __init__(self, filepath):
|
|
100 self.filehandle = open(filepath, 'rU')
|
|
101 self.header = False
|
|
102 self.name = None
|
|
103 self.id = None
|
|
104 self._in_id = None
|
|
105
|
|
106 def __iter__(self):
|
|
107 return self.new()
|
|
108
|
|
109 def new(self):
|
|
110
|
|
111 newline = True
|
|
112 while True:
|
|
113 base = self.filehandle.read(1)
|
|
114 if not base:
|
|
115 raise StopIteration
|
|
116 elif base == '\n':
|
|
117 newline = True
|
|
118 self.header = False
|
|
119 elif newline and base == '>':
|
|
120 newline = False
|
|
121 self.header = True
|
|
122 self._in_id = True
|
|
123 self.name = ''
|
|
124 self.id = ''
|
|
125 elif self.header:
|
|
126 if self._in_id:
|
|
127 if base.isspace():
|
|
128 self._in_id = False
|
|
129 else:
|
|
130 self.id += base
|
|
131 self.name += base
|
|
132 else:
|
|
133 newline = False
|
|
134 yield base
|
|
135
|