Mercurial > repos > nick > dna_visualizer
comparison fastareader.py @ 0:5257ce9d9184
Initial literal.py tool
| author | Nick Stoler <nstoler@psu.edu> |
|---|---|
| date | Sun, 02 Mar 2014 13:51:03 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:5257ce9d9184 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import os | |
| 3 __version__ = '0.8' | |
| 4 | |
| 5 | |
| 6 class FormatError(Exception): | |
| 7 def __init__(self, message=None): | |
| 8 if message: | |
| 9 Exception.__init__(self, message) | |
| 10 | |
| 11 | |
| 12 class FastaLineGenerator(object): | |
| 13 """A simple FASTA parser that only reads a line at a time into memory. | |
| 14 Usage: | |
| 15 fasta = FastaLineGenerator('/home/user/sequence.fasta') | |
| 16 for line in fasta: | |
| 17 print "There is a sequence with this FASTA identifier: "+fasta.id | |
| 18 print "It has a line with this sequence: "+line | |
| 19 """ | |
| 20 | |
| 21 def __init__(self, filepath): | |
| 22 if not os.path.isfile(filepath): | |
| 23 raise IOError('File not found: "%s"' % filepath) | |
| 24 self.filepath = filepath | |
| 25 self.name = None | |
| 26 self.id = None | |
| 27 | |
| 28 def __iter__(self): | |
| 29 return self.new() | |
| 30 | |
| 31 def new(self): | |
| 32 filehandle = open(self.filepath, 'rU') | |
| 33 while True: | |
| 34 line_raw = filehandle.readline() | |
| 35 if not line_raw: | |
| 36 raise StopIteration | |
| 37 line = line_raw.strip() | |
| 38 if not line: | |
| 39 continue # allow empty lines | |
| 40 if line[0] == '>': | |
| 41 self.name = line[1:] # remove ">" | |
| 42 if self.name: | |
| 43 self.id = self.name.split()[0] | |
| 44 else: | |
| 45 self.id = '' | |
| 46 continue | |
| 47 else: | |
| 48 yield line | |
| 49 | |
| 50 | |
| 51 def bases(self): | |
| 52 """Generator that yields single bases, while still reading a whole line at | |
| 53 a time underneath. | |
| 54 This should be the best of both worlds: it yields a base at a time, but it | |
| 55 reads a line at a time from the file so it's not slow as molasses.""" | |
| 56 for line in self.new(): | |
| 57 for base in line: | |
| 58 yield base | |
| 59 | |
| 60 | |
| 61 def extract(self, start, end, chrom=None): | |
| 62 """Extract a subsequence based on a start and end coordinate. | |
| 63 The start and end are inclusive, 1-based. If chrom is not supplied, it will | |
| 64 default to the first chromosome (record) encountered in the FASTA file. | |
| 65 If the end coordinate is beyond the end of the chromosome, the returned | |
| 66 sequence will be truncated to the end of the chromosome. If the start | |
| 67 coordinate is beyond the end of the chromosome, an empty string will be | |
| 68 returned.""" | |
| 69 outseq = '' | |
| 70 line_start = 1 | |
| 71 for line in self: | |
| 72 if chrom is not None and self.id != chrom: | |
| 73 continue | |
| 74 line_end = line_start + len(line) - 1 | |
| 75 # if we haven't encountered the start yet, keep searching | |
| 76 if line_end < start: | |
| 77 line_start = line_end + 1 | |
| 78 continue | |
| 79 slice_start = max(start, line_start) - line_start | |
| 80 slice_end = min(end, line_end) - line_start + 1 | |
| 81 outseq += line[slice_start:slice_end] | |
| 82 # done? (on the last line?) | |
| 83 if line_end >= end: | |
| 84 break | |
| 85 line_start = line_end + 1 | |
| 86 return outseq | |
| 87 | |
| 88 | |
| 89 #TODO: see 0notes.txt | |
| 90 class FastaBaseGenerator(object): | |
| 91 """For when you absolutely have to read one base at a time. VERY SLOW. | |
| 92 Usage: | |
| 93 fasta = FastaBaseGenerator('/home/user/sequence.fasta') | |
| 94 for base in fasta: | |
| 95 print "There is a sequence with this FASTA identifier: "+fasta.id | |
| 96 print "This is the next base from it: "+base | |
| 97 """ | |
| 98 | |
| 99 def __init__(self, filepath): | |
| 100 self.filehandle = open(filepath, 'rU') | |
| 101 self.header = False | |
| 102 self.name = None | |
| 103 self.id = None | |
| 104 self._in_id = None | |
| 105 | |
| 106 def __iter__(self): | |
| 107 return self.new() | |
| 108 | |
| 109 def new(self): | |
| 110 | |
| 111 newline = True | |
| 112 while True: | |
| 113 base = self.filehandle.read(1) | |
| 114 if not base: | |
| 115 raise StopIteration | |
| 116 elif base == '\n': | |
| 117 newline = True | |
| 118 self.header = False | |
| 119 elif newline and base == '>': | |
| 120 newline = False | |
| 121 self.header = True | |
| 122 self._in_id = True | |
| 123 self.name = '' | |
| 124 self.id = '' | |
| 125 elif self.header: | |
| 126 if self._in_id: | |
| 127 if base.isspace(): | |
| 128 self._in_id = False | |
| 129 else: | |
| 130 self.id += base | |
| 131 self.name += base | |
| 132 else: | |
| 133 newline = False | |
| 134 yield base | |
| 135 |
