changeset 0:5257ce9d9184

Initial literal.py tool
author Nick Stoler <nstoler@psu.edu>
date Sun, 02 Mar 2014 13:51:03 -0500
parents
children 717a4009675c
files .hgignore fastareader.py literal.py literal.xml
diffstat 4 files changed, 306 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Sun Mar 02 13:51:03 2014 -0500
@@ -0,0 +1,4 @@
+.git
+.gitignore
+img
+.pyc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastareader.py	Sun Mar 02 13:51:03 2014 -0500
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+import os
+__version__ = '0.8'
+
+
+class FormatError(Exception):
+  def __init__(self, message=None):
+    if message:
+      Exception.__init__(self, message)
+
+
+class FastaLineGenerator(object):
+  """A simple FASTA parser that only reads a line at a time into memory.
+  Usage:
+  fasta = FastaLineGenerator('/home/user/sequence.fasta')
+  for line in fasta:
+    print "There is a sequence with this FASTA identifier: "+fasta.id
+    print "It has a line with this sequence: "+line
+  """
+
+  def __init__(self, filepath):
+    if not os.path.isfile(filepath):
+      raise IOError('File not found: "%s"' % filepath)
+    self.filepath = filepath
+    self.name = None
+    self.id = None
+
+  def __iter__(self):
+    return self.new()
+
+  def new(self):
+    filehandle = open(self.filepath, 'rU')
+    while True:
+      line_raw = filehandle.readline()
+      if not line_raw:
+        raise StopIteration
+      line = line_raw.strip()
+      if not line:
+        continue # allow empty lines
+      if line[0] == '>':
+        self.name = line[1:]  # remove ">"
+        if self.name:
+          self.id = self.name.split()[0]
+        else:
+          self.id = ''
+        continue
+      else:
+        yield line
+
+
+  def bases(self):
+    """Generator that yields single bases, while still reading a whole line at
+    a time underneath.
+    This should be the best of both worlds: it yields a base at a time, but it
+    reads a line at a time from the file so it's not slow as molasses."""
+    for line in self.new():
+      for base in line:
+        yield base
+
+
+  def extract(self, start, end, chrom=None):
+    """Extract a subsequence based on a start and end coordinate.
+    The start and end are inclusive, 1-based. If chrom is not supplied, it will
+    default to the first chromosome (record) encountered in the FASTA file.
+    If the end coordinate is beyond the end of the chromosome, the returned
+    sequence will be truncated to the end of the chromosome. If the start
+    coordinate is beyond the end of the chromosome, an empty string will be
+    returned."""
+    outseq = ''
+    line_start = 1
+    for line in self:
+      if chrom is not None and self.id != chrom:
+        continue
+      line_end = line_start + len(line) - 1
+      # if we haven't encountered the start yet, keep searching
+      if line_end < start:
+        line_start = line_end + 1
+        continue
+      slice_start = max(start, line_start) - line_start
+      slice_end = min(end, line_end) - line_start + 1
+      outseq += line[slice_start:slice_end]
+      # done? (on the last line?)
+      if line_end >= end:
+        break
+      line_start = line_end + 1
+    return outseq
+
+
+#TODO: see 0notes.txt
+class FastaBaseGenerator(object):
+  """For when you absolutely have to read one base at a time. VERY SLOW.
+  Usage:
+  fasta = FastaBaseGenerator('/home/user/sequence.fasta')
+  for base in fasta:
+    print "There is a sequence with this FASTA identifier: "+fasta.id
+    print "This is the next base from it: "+base
+  """
+
+  def __init__(self, filepath):
+    self.filehandle = open(filepath, 'rU')
+    self.header = False
+    self.name = None
+    self.id = None
+    self._in_id = None
+
+  def __iter__(self):
+    return self.new()
+
+  def new(self):
+
+    newline = True
+    while True:
+      base = self.filehandle.read(1)
+      if not base:
+        raise StopIteration
+      elif base == '\n':
+        newline = True
+        self.header = False
+      elif newline and base == '>':
+        newline = False
+        self.header = True
+        self._in_id = True
+        self.name = ''
+        self.id = ''
+      elif self.header:
+        if self._in_id:
+          if base.isspace():
+            self._in_id = False
+          else:
+            self.id += base
+        self.name += base
+      else:
+        newline = False
+        yield base
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/literal.py	Sun Mar 02 13:51:03 2014 -0500
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+from __future__ import division
+import os
+import sys
+import Image
+import argparse
+import fastareader
+
+OPT_DEFAULTS = {'size':'512x512', 'verbose':True,
+  'A':'0,255,0', 'T':'255,0,0', 'G':'255,255,255', 'C':'0,0,255'}
+USAGE = "%(prog)s [options] genome.fasta"
+DESCRIPTION = """Convert DNA sequence into a PNG image by representing each base
+  with one colored pixel."""
+EPILOG = """"""
+
+def main():
+
+  parser = argparse.ArgumentParser(
+    description=DESCRIPTION, usage=USAGE, epilog=EPILOG)
+  parser.set_defaults(**OPT_DEFAULTS)
+
+  parser.add_argument('fasta', metavar='genome.fasta',
+    help="""Input sequence. Can be in FASTA format or a plain text file
+      containing only the sequence. Any non-ATGC characters (case-insensitive)
+      will be skipped.""")
+  parser.add_argument('-s', '--size',
+    help="""The output image size, in pixels, in the format "widthxheight", e.g.
+      "640x480". If the sequence is larger than the number of pixels in the
+      image, it will be cut off. Default size: %(default)s""")
+  parser.add_argument('-o', '--outfile', metavar='image.png',
+    help="""Output filename. Overrides the default, which is
+      to use the input filename base plus .png.""")
+  parser.add_argument('-d', '--display', action='store_true',
+    help="""Display the image instead of saving it.""")
+  parser.add_argument('-c', '--clobber', action='store_true',
+    help="""If the output filename already exists, overwrite it instead of
+      throwing an error (the default).""")
+  parser.add_argument('-v', '--verbose', action='store_true',
+    help="""Verbose mode. On by default.""")
+  parser.add_argument('-q', '--quiet', action='store_false', dest='verbose',
+    help="""Quiet mode.""")
+  group = parser.add_argument_group('Color customization', """Use these options
+    to use custom colors for bases. Specify with a comma-delimited RGB value
+    like "100,150,10".""")
+  group.add_argument('-A', metavar='R,G,B',
+    help="""default: %(default)s""")
+  group.add_argument('-T', metavar='R,G,B',
+    help="""default: %(default)s""")
+  group.add_argument('-G', metavar='R,G,B',
+    help="""default: %(default)s""")
+  group.add_argument('-C', metavar='R,G,B',
+    help="""default: %(default)s""")
+
+  args = parser.parse_args()
+
+  try:
+    size = parse_size(args.size)
+  except ValueError:
+    parser.print_help()
+    fail('\nError: Invalid size string "%s".' % args.size)
+
+  fasta = fastareader.FastaLineGenerator(args.fasta)
+  bases = fasta.bases()
+
+  if not args.display:
+    outfile = args.outfile if args.outfile else outfile_name(args.fasta)
+    if os.path.exists(outfile) and not args.clobber:
+      fail('Error: Output filename already taken: "%s"' % outfile)
+
+  colors = {}
+  colors['A'] = parse_rgb(args.A)
+  colors['T'] = parse_rgb(args.T)
+  colors['G'] = parse_rgb(args.G)
+  colors['C'] = parse_rgb(args.C)
+
+  image = Image.new('RGB', size, 'white')
+  pixels = image.load()
+
+  done = False
+  for i in range(image.size[1]):
+    for j in range(image.size[0]):
+      try:
+        base = next(bases).upper()
+      except StopIteration:
+        done = True
+        break
+      if base in colors:
+        pixels[j,i] = colors[base]
+    if done:
+      break
+
+  if args.display:
+    image.show()
+  else:
+    image.save(outfile, 'PNG')
+
+
+
+def parse_size(size_str):
+  """Parse size string, return a tuple of (width, height).
+  Accepts size strings in the format "640x480".
+  If not valid, raises ValueError."""
+  size = map(int, size_str.split('x'))
+  if len(size) != 2:
+    raise ValueError
+  else:
+    return tuple(size)
+
+
+def parse_rgb(rgb_str):
+  """Parse RGB string, return a tuple of (R, G, B).
+  If not valid, raises ValueError."""
+  rgb = map(int, rgb_str.split(','))
+  if len(rgb) != 3:
+    raise ValueError
+  else:
+    return tuple(rgb)
+
+
+def outfile_name(infilename):
+  base = infilename.split('.')[0]
+  if not base:
+    base = infilename
+  return base+'.png'
+
+
+def fail(message):
+  sys.stderr.write(message+"\n")
+  sys.exit(1)
+
+if __name__ == '__main__':
+  main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/literal.xml	Sun Mar 02 13:51:03 2014 -0500
@@ -0,0 +1,35 @@
+<tool id="visualdna" name="DNA visualizer" version="0.1">
+  <description>Visualize DNA with colored pixels.</description>
+  <command interpreter="python">literal.py $input -o $output -s ${width}x${height} </command>
+  <inputs>
+    <param name="input" type="data" format="fasta" label="Input sequence" />
+    <param name="width" type="integer" value="512" min="512" label="Output image width" help="in pixels" />
+    <param name="height" type="integer" value="512" min="512" label="Output image height" help="in pixels" />
+  </inputs>
+  <outputs>
+    <data name="output" format="png"/>
+  </outputs>
+  <stdio>
+    <exit_code range="1:" err_level="fatal"/>
+    <exit_code range=":-1" err_level="fatal"/>
+  </stdio>
+
+  <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool convert DNA sequence into a PNG image by representing each base with one colored pixel.
+
+-----
+
+.. class:: infomark
+
+**Input Format**
+
+The input sequence can be in FASTA format or a plain text file containing only the sequence. Any non-ATGC characters (case-insensitive) will be skipped.
+
+  </help>
+
+</tool>