# HG changeset patch # User Nick Stoler # Date 1393786263 18000 # Node ID 5257ce9d9184272f2e940f5667a4990b3f408fe4 Initial literal.py tool diff -r 000000000000 -r 5257ce9d9184 .hgignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Sun Mar 02 13:51:03 2014 -0500 @@ -0,0 +1,4 @@ +.git +.gitignore +img +.pyc diff -r 000000000000 -r 5257ce9d9184 fastareader.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastareader.py Sun Mar 02 13:51:03 2014 -0500 @@ -0,0 +1,135 @@ +#!/usr/bin/env python +import os +__version__ = '0.8' + + +class FormatError(Exception): + def __init__(self, message=None): + if message: + Exception.__init__(self, message) + + +class FastaLineGenerator(object): + """A simple FASTA parser that only reads a line at a time into memory. + Usage: + fasta = FastaLineGenerator('/home/user/sequence.fasta') + for line in fasta: + print "There is a sequence with this FASTA identifier: "+fasta.id + print "It has a line with this sequence: "+line + """ + + def __init__(self, filepath): + if not os.path.isfile(filepath): + raise IOError('File not found: "%s"' % filepath) + self.filepath = filepath + self.name = None + self.id = None + + def __iter__(self): + return self.new() + + def new(self): + filehandle = open(self.filepath, 'rU') + while True: + line_raw = filehandle.readline() + if not line_raw: + raise StopIteration + line = line_raw.strip() + if not line: + continue # allow empty lines + if line[0] == '>': + self.name = line[1:] # remove ">" + if self.name: + self.id = self.name.split()[0] + else: + self.id = '' + continue + else: + yield line + + + def bases(self): + """Generator that yields single bases, while still reading a whole line at + a time underneath. + This should be the best of both worlds: it yields a base at a time, but it + reads a line at a time from the file so it's not slow as molasses.""" + for line in self.new(): + for base in line: + yield base + + + def extract(self, start, end, chrom=None): + """Extract a subsequence based on a start and end coordinate. + The start and end are inclusive, 1-based. If chrom is not supplied, it will + default to the first chromosome (record) encountered in the FASTA file. + If the end coordinate is beyond the end of the chromosome, the returned + sequence will be truncated to the end of the chromosome. If the start + coordinate is beyond the end of the chromosome, an empty string will be + returned.""" + outseq = '' + line_start = 1 + for line in self: + if chrom is not None and self.id != chrom: + continue + line_end = line_start + len(line) - 1 + # if we haven't encountered the start yet, keep searching + if line_end < start: + line_start = line_end + 1 + continue + slice_start = max(start, line_start) - line_start + slice_end = min(end, line_end) - line_start + 1 + outseq += line[slice_start:slice_end] + # done? (on the last line?) + if line_end >= end: + break + line_start = line_end + 1 + return outseq + + +#TODO: see 0notes.txt +class FastaBaseGenerator(object): + """For when you absolutely have to read one base at a time. VERY SLOW. + Usage: + fasta = FastaBaseGenerator('/home/user/sequence.fasta') + for base in fasta: + print "There is a sequence with this FASTA identifier: "+fasta.id + print "This is the next base from it: "+base + """ + + def __init__(self, filepath): + self.filehandle = open(filepath, 'rU') + self.header = False + self.name = None + self.id = None + self._in_id = None + + def __iter__(self): + return self.new() + + def new(self): + + newline = True + while True: + base = self.filehandle.read(1) + if not base: + raise StopIteration + elif base == '\n': + newline = True + self.header = False + elif newline and base == '>': + newline = False + self.header = True + self._in_id = True + self.name = '' + self.id = '' + elif self.header: + if self._in_id: + if base.isspace(): + self._in_id = False + else: + self.id += base + self.name += base + else: + newline = False + yield base + diff -r 000000000000 -r 5257ce9d9184 literal.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/literal.py Sun Mar 02 13:51:03 2014 -0500 @@ -0,0 +1,132 @@ +#!/usr/bin/env python +from __future__ import division +import os +import sys +import Image +import argparse +import fastareader + +OPT_DEFAULTS = {'size':'512x512', 'verbose':True, + 'A':'0,255,0', 'T':'255,0,0', 'G':'255,255,255', 'C':'0,0,255'} +USAGE = "%(prog)s [options] genome.fasta" +DESCRIPTION = """Convert DNA sequence into a PNG image by representing each base + with one colored pixel.""" +EPILOG = """""" + +def main(): + + parser = argparse.ArgumentParser( + description=DESCRIPTION, usage=USAGE, epilog=EPILOG) + parser.set_defaults(**OPT_DEFAULTS) + + parser.add_argument('fasta', metavar='genome.fasta', + help="""Input sequence. Can be in FASTA format or a plain text file + containing only the sequence. Any non-ATGC characters (case-insensitive) + will be skipped.""") + parser.add_argument('-s', '--size', + help="""The output image size, in pixels, in the format "widthxheight", e.g. + "640x480". If the sequence is larger than the number of pixels in the + image, it will be cut off. Default size: %(default)s""") + parser.add_argument('-o', '--outfile', metavar='image.png', + help="""Output filename. Overrides the default, which is + to use the input filename base plus .png.""") + parser.add_argument('-d', '--display', action='store_true', + help="""Display the image instead of saving it.""") + parser.add_argument('-c', '--clobber', action='store_true', + help="""If the output filename already exists, overwrite it instead of + throwing an error (the default).""") + parser.add_argument('-v', '--verbose', action='store_true', + help="""Verbose mode. On by default.""") + parser.add_argument('-q', '--quiet', action='store_false', dest='verbose', + help="""Quiet mode.""") + group = parser.add_argument_group('Color customization', """Use these options + to use custom colors for bases. Specify with a comma-delimited RGB value + like "100,150,10".""") + group.add_argument('-A', metavar='R,G,B', + help="""default: %(default)s""") + group.add_argument('-T', metavar='R,G,B', + help="""default: %(default)s""") + group.add_argument('-G', metavar='R,G,B', + help="""default: %(default)s""") + group.add_argument('-C', metavar='R,G,B', + help="""default: %(default)s""") + + args = parser.parse_args() + + try: + size = parse_size(args.size) + except ValueError: + parser.print_help() + fail('\nError: Invalid size string "%s".' % args.size) + + fasta = fastareader.FastaLineGenerator(args.fasta) + bases = fasta.bases() + + if not args.display: + outfile = args.outfile if args.outfile else outfile_name(args.fasta) + if os.path.exists(outfile) and not args.clobber: + fail('Error: Output filename already taken: "%s"' % outfile) + + colors = {} + colors['A'] = parse_rgb(args.A) + colors['T'] = parse_rgb(args.T) + colors['G'] = parse_rgb(args.G) + colors['C'] = parse_rgb(args.C) + + image = Image.new('RGB', size, 'white') + pixels = image.load() + + done = False + for i in range(image.size[1]): + for j in range(image.size[0]): + try: + base = next(bases).upper() + except StopIteration: + done = True + break + if base in colors: + pixels[j,i] = colors[base] + if done: + break + + if args.display: + image.show() + else: + image.save(outfile, 'PNG') + + + +def parse_size(size_str): + """Parse size string, return a tuple of (width, height). + Accepts size strings in the format "640x480". + If not valid, raises ValueError.""" + size = map(int, size_str.split('x')) + if len(size) != 2: + raise ValueError + else: + return tuple(size) + + +def parse_rgb(rgb_str): + """Parse RGB string, return a tuple of (R, G, B). + If not valid, raises ValueError.""" + rgb = map(int, rgb_str.split(',')) + if len(rgb) != 3: + raise ValueError + else: + return tuple(rgb) + + +def outfile_name(infilename): + base = infilename.split('.')[0] + if not base: + base = infilename + return base+'.png' + + +def fail(message): + sys.stderr.write(message+"\n") + sys.exit(1) + +if __name__ == '__main__': + main() diff -r 000000000000 -r 5257ce9d9184 literal.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/literal.xml Sun Mar 02 13:51:03 2014 -0500 @@ -0,0 +1,35 @@ + + Visualize DNA with colored pixels. + literal.py $input -o $output -s ${width}x${height} + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool convert DNA sequence into a PNG image by representing each base with one colored pixel. + +----- + +.. class:: infomark + +**Input Format** + +The input sequence can be in FASTA format or a plain text file containing only the sequence. Any non-ATGC characters (case-insensitive) will be skipped. + + + +