Mercurial > repos > nick > dna_visualizer
diff fastareader.py @ 0:5257ce9d9184
Initial literal.py tool
author | Nick Stoler <nstoler@psu.edu> |
---|---|
date | Sun, 02 Mar 2014 13:51:03 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastareader.py Sun Mar 02 13:51:03 2014 -0500 @@ -0,0 +1,135 @@ +#!/usr/bin/env python +import os +__version__ = '0.8' + + +class FormatError(Exception): + def __init__(self, message=None): + if message: + Exception.__init__(self, message) + + +class FastaLineGenerator(object): + """A simple FASTA parser that only reads a line at a time into memory. + Usage: + fasta = FastaLineGenerator('/home/user/sequence.fasta') + for line in fasta: + print "There is a sequence with this FASTA identifier: "+fasta.id + print "It has a line with this sequence: "+line + """ + + def __init__(self, filepath): + if not os.path.isfile(filepath): + raise IOError('File not found: "%s"' % filepath) + self.filepath = filepath + self.name = None + self.id = None + + def __iter__(self): + return self.new() + + def new(self): + filehandle = open(self.filepath, 'rU') + while True: + line_raw = filehandle.readline() + if not line_raw: + raise StopIteration + line = line_raw.strip() + if not line: + continue # allow empty lines + if line[0] == '>': + self.name = line[1:] # remove ">" + if self.name: + self.id = self.name.split()[0] + else: + self.id = '' + continue + else: + yield line + + + def bases(self): + """Generator that yields single bases, while still reading a whole line at + a time underneath. + This should be the best of both worlds: it yields a base at a time, but it + reads a line at a time from the file so it's not slow as molasses.""" + for line in self.new(): + for base in line: + yield base + + + def extract(self, start, end, chrom=None): + """Extract a subsequence based on a start and end coordinate. + The start and end are inclusive, 1-based. If chrom is not supplied, it will + default to the first chromosome (record) encountered in the FASTA file. + If the end coordinate is beyond the end of the chromosome, the returned + sequence will be truncated to the end of the chromosome. If the start + coordinate is beyond the end of the chromosome, an empty string will be + returned.""" + outseq = '' + line_start = 1 + for line in self: + if chrom is not None and self.id != chrom: + continue + line_end = line_start + len(line) - 1 + # if we haven't encountered the start yet, keep searching + if line_end < start: + line_start = line_end + 1 + continue + slice_start = max(start, line_start) - line_start + slice_end = min(end, line_end) - line_start + 1 + outseq += line[slice_start:slice_end] + # done? (on the last line?) + if line_end >= end: + break + line_start = line_end + 1 + return outseq + + +#TODO: see 0notes.txt +class FastaBaseGenerator(object): + """For when you absolutely have to read one base at a time. VERY SLOW. + Usage: + fasta = FastaBaseGenerator('/home/user/sequence.fasta') + for base in fasta: + print "There is a sequence with this FASTA identifier: "+fasta.id + print "This is the next base from it: "+base + """ + + def __init__(self, filepath): + self.filehandle = open(filepath, 'rU') + self.header = False + self.name = None + self.id = None + self._in_id = None + + def __iter__(self): + return self.new() + + def new(self): + + newline = True + while True: + base = self.filehandle.read(1) + if not base: + raise StopIteration + elif base == '\n': + newline = True + self.header = False + elif newline and base == '>': + newline = False + self.header = True + self._in_id = True + self.name = '' + self.id = '' + elif self.header: + if self._in_id: + if base.isspace(): + self._in_id = False + else: + self.id += base + self.name += base + else: + newline = False + yield base +