comparison fastareader.py @ 0:5257ce9d9184

Initial literal.py tool
author Nick Stoler <nstoler@psu.edu>
date Sun, 02 Mar 2014 13:51:03 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5257ce9d9184
1 #!/usr/bin/env python
2 import os
3 __version__ = '0.8'
4
5
6 class FormatError(Exception):
7 def __init__(self, message=None):
8 if message:
9 Exception.__init__(self, message)
10
11
12 class FastaLineGenerator(object):
13 """A simple FASTA parser that only reads a line at a time into memory.
14 Usage:
15 fasta = FastaLineGenerator('/home/user/sequence.fasta')
16 for line in fasta:
17 print "There is a sequence with this FASTA identifier: "+fasta.id
18 print "It has a line with this sequence: "+line
19 """
20
21 def __init__(self, filepath):
22 if not os.path.isfile(filepath):
23 raise IOError('File not found: "%s"' % filepath)
24 self.filepath = filepath
25 self.name = None
26 self.id = None
27
28 def __iter__(self):
29 return self.new()
30
31 def new(self):
32 filehandle = open(self.filepath, 'rU')
33 while True:
34 line_raw = filehandle.readline()
35 if not line_raw:
36 raise StopIteration
37 line = line_raw.strip()
38 if not line:
39 continue # allow empty lines
40 if line[0] == '>':
41 self.name = line[1:] # remove ">"
42 if self.name:
43 self.id = self.name.split()[0]
44 else:
45 self.id = ''
46 continue
47 else:
48 yield line
49
50
51 def bases(self):
52 """Generator that yields single bases, while still reading a whole line at
53 a time underneath.
54 This should be the best of both worlds: it yields a base at a time, but it
55 reads a line at a time from the file so it's not slow as molasses."""
56 for line in self.new():
57 for base in line:
58 yield base
59
60
61 def extract(self, start, end, chrom=None):
62 """Extract a subsequence based on a start and end coordinate.
63 The start and end are inclusive, 1-based. If chrom is not supplied, it will
64 default to the first chromosome (record) encountered in the FASTA file.
65 If the end coordinate is beyond the end of the chromosome, the returned
66 sequence will be truncated to the end of the chromosome. If the start
67 coordinate is beyond the end of the chromosome, an empty string will be
68 returned."""
69 outseq = ''
70 line_start = 1
71 for line in self:
72 if chrom is not None and self.id != chrom:
73 continue
74 line_end = line_start + len(line) - 1
75 # if we haven't encountered the start yet, keep searching
76 if line_end < start:
77 line_start = line_end + 1
78 continue
79 slice_start = max(start, line_start) - line_start
80 slice_end = min(end, line_end) - line_start + 1
81 outseq += line[slice_start:slice_end]
82 # done? (on the last line?)
83 if line_end >= end:
84 break
85 line_start = line_end + 1
86 return outseq
87
88
89 #TODO: see 0notes.txt
90 class FastaBaseGenerator(object):
91 """For when you absolutely have to read one base at a time. VERY SLOW.
92 Usage:
93 fasta = FastaBaseGenerator('/home/user/sequence.fasta')
94 for base in fasta:
95 print "There is a sequence with this FASTA identifier: "+fasta.id
96 print "This is the next base from it: "+base
97 """
98
99 def __init__(self, filepath):
100 self.filehandle = open(filepath, 'rU')
101 self.header = False
102 self.name = None
103 self.id = None
104 self._in_id = None
105
106 def __iter__(self):
107 return self.new()
108
109 def new(self):
110
111 newline = True
112 while True:
113 base = self.filehandle.read(1)
114 if not base:
115 raise StopIteration
116 elif base == '\n':
117 newline = True
118 self.header = False
119 elif newline and base == '>':
120 newline = False
121 self.header = True
122 self._in_id = True
123 self.name = ''
124 self.id = ''
125 elif self.header:
126 if self._in_id:
127 if base.isspace():
128 self._in_id = False
129 else:
130 self.id += base
131 self.name += base
132 else:
133 newline = False
134 yield base
135