annotate fastareader.py @ 10:54223991334b default tip

tool_dependencies.xml: Defined PIL dependency
author Nick Stoler <nstoler@psu.edu>
date Mon, 03 Mar 2014 12:54:02 -0500
parents 5257ce9d9184
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
1 #!/usr/bin/env python
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
2 import os
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
3 __version__ = '0.8'
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
4
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
5
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
class FormatError(Exception):
    """Exception type for format problems.

    Nothing in the visible part of this module raises it; presumably it is
    meant for callers that detect malformed input -- confirm against users.

    The message is optional. A falsy message (None, '') is not forwarded to
    Exception.__init__.
    """

    def __init__(self, message=None):
        # Guard clause: nothing to forward when no (or an empty) message.
        if not message:
            return
        super(FormatError, self).__init__(message)
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
10
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
11
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
class FastaLineGenerator(object):
    """A simple FASTA parser that only reads a line at a time into memory.

    Iterating over an instance yields every non-empty sequence line with
    surrounding whitespace stripped. Header lines (those starting with ">")
    are not yielded; instead they update two attributes:
        name: the header text following the ">"
        id:   the first whitespace-delimited word of name ('' for a bare ">")
    Both are None until the first header of the current pass has been read.
    Every iteration re-opens the file and starts from the beginning.

    Usage:
        fasta = FastaLineGenerator('/home/user/sequence.fasta')
        for line in fasta:
            print("There is a sequence with this FASTA identifier: "+fasta.id)
            print("It has a line with this sequence: "+line)
    """

    def __init__(self, filepath):
        """Remember filepath; raise IOError if it is not an existing file."""
        if not os.path.isfile(filepath):
            raise IOError('File not found: "%s"' % filepath)
        self.filepath = filepath
        self.name = None
        self.id = None
        # Number of header lines seen so far in the current pass. Lets
        # extract() tell records apart even when two share the same id.
        self._record_num = 0

    def __iter__(self):
        return self.new()

    def new(self):
        """Return a fresh generator over the sequence lines of the file.

        The file is opened when iteration starts and closed when the
        generator is exhausted, closed, or garbage-collected.
        """
        # Start every pass with clean header state so values left over from
        # a previous pass cannot be attributed to header-less leading lines.
        self.name = None
        self.id = None
        self._record_num = 0
        # 'rU' (universal newlines) is needed on Python 2 only; Python 3 text
        # mode already translates newlines and 3.11+ rejects the 'U' flag.
        # `str is bytes` is true on Python 2 only.
        mode = 'rU' if str is bytes else 'r'
        with open(self.filepath, mode) as filehandle:
            for line_raw in filehandle:
                line = line_raw.strip()
                if not line:
                    continue  # allow empty lines
                if line[0] == '>':
                    self._record_num += 1
                    self.name = line[1:]  # remove ">"
                    if self.name:
                        self.id = self.name.split()[0]
                    else:
                        self.id = ''
                    continue
                yield line
        # Falling off the end finishes the generator. An explicit
        # `raise StopIteration` here would become RuntimeError (PEP 479).

    def bases(self):
        """Generator that yields single bases, while still reading a whole
        line at a time underneath.

        name and id are updated exactly as during line iteration."""
        for line in self.new():
            for base in line:
                yield base

    def extract(self, start, end, chrom=None):
        """Extract a subsequence based on a start and end coordinate.

        The start and end are inclusive, 1-based. If chrom is not supplied,
        the first record that contains sequence lines is used; otherwise the
        first record whose id equals chrom is used. Reading never continues
        into a following record.

        If the end coordinate is beyond the end of the chromosome, the
        returned sequence is truncated to the end of the chromosome. An empty
        string is returned when the start coordinate is beyond the end of the
        chromosome, when chrom is not found, or when the requested range is
        empty (end < start or end < 1).
        """
        if end < start or end < 1:
            # Empty range; without this guard the slice arithmetic below
            # would produce negative indices and return unrelated bases.
            return ''
        pieces = []
        line_start = 1          # 1-based coordinate of the current line's first base
        target_record = None    # record number we locked onto, once found
        lines = iter(self)
        try:
            for line in lines:
                if target_record is None:
                    if chrom is not None and self.id != chrom:
                        continue
                    target_record = self._record_num
                elif self._record_num != target_record:
                    break  # ran past the end of the requested record
                line_end = line_start + len(line) - 1
                if line_end >= start:
                    slice_start = max(start, line_start) - line_start
                    slice_end = min(end, line_end) - line_start + 1
                    pieces.append(line[slice_start:slice_end])
                    if line_end >= end:
                        break  # this line contains the end coordinate
                line_start = line_end + 1
        finally:
            # Close the generator so the file is released right away even
            # when we stop reading early.
            close = getattr(lines, 'close', None)
            if close is not None:
                close()
        return ''.join(pieces)
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
87
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
88
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
89 #TODO: see 0notes.txt
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
class FastaBaseGenerator(object):
    """For when you absolutely have to read one base at a time.

    Every character of the file passes through Python code, so expect this to
    be much slower than line-based reading.

    Iterating yields every character of the sequence lines except the line
    breaks (other whitespace inside a sequence line is yielded as-is). Header
    lines (a ">" at the start of a line) are not yielded; while a header is
    being read these attributes are built up character by character:
        name: everything after the ">" up to the end of the line
        id:   the characters after the ">" up to the first whitespace
    Both are None until the first header of the current pass has been read.

    The file is opened by the constructor and stays open; every iteration
    rewinds it to the beginning. Call close() when finished with the object.

    Usage:
        fasta = FastaBaseGenerator('/home/user/sequence.fasta')
        for base in fasta:
            print("There is a sequence with this FASTA identifier: "+fasta.id)
            print("This is the next base from it: "+base)
    """

    def __init__(self, filepath):
        """Open filepath for reading; open() raises IOError if that fails."""
        # 'rU' (universal newlines) is needed on Python 2 only; Python 3 text
        # mode already translates newlines and 3.11+ rejects the 'U' flag.
        # `str is bytes` is true on Python 2 only.
        mode = 'rU' if str is bytes else 'r'
        self.filehandle = open(filepath, mode)
        self.header = False   # True while inside a header line
        self.name = None
        self.id = None
        self._in_id = None    # True while still inside the id part of a header

    def __iter__(self):
        return self.new()

    def close(self):
        """Close the underlying file. Iterating afterwards raises ValueError."""
        self.filehandle.close()

    def new(self):
        """Return a fresh generator over the bases, starting at the top of
        the file."""
        # Rewind and reset the parser state so that every pass starts from
        # the beginning, like FastaLineGenerator; without the rewind a second
        # pass would find the handle at EOF and yield nothing.
        self.filehandle.seek(0)
        self.header = False
        self._in_id = None
        self.name = None
        self.id = None
        newline = True  # True when the next character starts a line
        # Read a line at a time (buffered) but process it character by
        # character; the character stream is the same as with read(1).
        for line in self.filehandle:
            for base in line:
                if base == '\n':
                    newline = True
                    self.header = False
                elif newline and base == '>':
                    # Start of a header line: begin collecting name and id.
                    newline = False
                    self.header = True
                    self._in_id = True
                    self.name = ''
                    self.id = ''
                elif self.header:
                    if self._in_id:
                        if base.isspace():
                            self._in_id = False  # id ends at first whitespace
                        else:
                            self.id += base
                    self.name += base
                else:
                    newline = False
                    yield base
        # Falling off the end finishes the generator. An explicit
        # `raise StopIteration` here would become RuntimeError (PEP 479).
5257ce9d9184 Initial literal.py tool
Nick Stoler <nstoler@psu.edu>
parents:
diff changeset
135