#! /usr/bin/env python
"""Get the size distribution of a Fasta / BED file"""

import os
from optparse import OptionParser
from parsing.fastaParser import *
from parsing.fastqParser import *
from structure.transcriptContainer import *
from parsing.gffParser import *
from misc.progress import *
from misc.rPlotter import *
from misc import utils


if __name__ == "__main__":
  # Script entry point: read a sequence or transcript file, build the
  # distribution of (sub-)item sizes, then optionally plot it as a PNG and/or
  # dump it as a CSV file, and finally print summary statistics.

  # sys.exit() is needed below, but this file has no explicit "import sys" at
  # its top; import it here so the script does not depend on a wildcard import.
  import sys

  # parse command line
  description = "Get Sizes: Get the sizes of a set of genomic coordinates. [Category: Visualization]"

  optionParser = OptionParser(description = description)
  optionParser.add_option("-i", "--input",     dest="inputFileName",  action="store",                        type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
  optionParser.add_option("-f", "--format",    dest="format",         action="store",                        type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
  # The listed choices are the literal values tested in the loop below.
  optionParser.add_option("-q", "--query",     dest="query",          action="store",                        type="string", help="type to measure [default: size] [format: choice (size, intron, exon, exon1)]")
  optionParser.add_option("-o", "--output",    dest="outputFileName", action="store",                        type="string", help="output file [format: output file in PNG format]")
  optionParser.add_option("-x", "--xMax",      dest="xMax",           action="store",      default=None,     type="int",    help="maximum value on the x-axis to plot [format: int]")
  optionParser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,        type="int",    help="trace level [format: int]")
  optionParser.add_option("-c", "--csv",       dest="csv",            action="store_true", default=False,                   help="write a .csv file [format: bool] [default: false]")
  optionParser.add_option("-l", "--log",       dest="log",            action="store_true", default=False,                   help="write a log file [format: bool] [default: false]")
  (options, args) = optionParser.parse_args()

  # Fail early, with a usage message, instead of crashing later on a None value.
  if options.inputFileName is None:
    optionParser.error("the input file (-i) is compulsory")
  if options.format is None:
    optionParser.error("the input format (-f) is compulsory")
  # The CSV file name is derived from the output name; without it the file
  # would be written as "None.csv".
  if options.csv and options.outputFileName is None:
    optionParser.error("the CSV output (-c) requires an output file name (-o)")

  # pick the reader matching the declared input format
  if options.format == "fasta":
    itemParser = FastaParser(options.inputFileName, options.verbosity)
  elif options.format == "fastq":
    itemParser = FastqParser(options.inputFileName, options.verbosity)
  else:
    itemParser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)

  # The item count is only requested when something will be displayed.
  nbItems = 0
  if options.verbosity > 0:
    nbItems = itemParser.getNbItems()
    print("%i items found" % (nbItems))

  # treat items
  progress   = Progress(nbItems, "Analyzing sequences of " + options.inputFileName, options.verbosity)
  sizes      = {}  # size -> number of sub-items having this size
  names      = {}  # size -> names of these sub-items (only filled with --csv)
  totalSize  = 0
  number     = 0
  nbSubItems = 0
  for item in itemParser.getIterator():
    if options.query == "exon":
      subItems = item.getExons()
    elif options.query == "exon1":
      subItems = []
      # NOTE(review): items with a single exon are skipped here, as in the
      # original code -- confirm whether this is intended for "first exon".
      if len(item.getExons()) > 1:
        item.sortExons()
        subItems = [item.getExons()[0]]
    elif options.query == "intron":
      subItems = item.getIntrons()
    else:
      # any other value (including no -q at all) measures the item itself
      subItems = [item]

    for subItem in subItems:
      size        = subItem.getSize()
      sizes[size] = sizes.get(size, 0) + 1
      if options.csv:
        # names are only needed for the CSV report; keep the first word only
        names.setdefault(size, []).append(subItem.name.split()[0])
      totalSize  += size
      nbSubItems += 1
    number += 1
    progress.inc()
  progress.done()

  # Nothing to plot, export or summarize: stop before handing an empty
  # distribution to the plotter.
  if nbSubItems == 0:
    print("No item found")
    sys.exit(0)

  # plot sequences
  if options.outputFileName is not None:
    plotter = RPlotter("%s.png" % (options.outputFileName), options.verbosity)
    plotter.setFill(0)
    plotter.setMaximumX(options.xMax)
    plotter.setXLabel("Size")
    plotter.setYLabel("# reads")
    plotter.addLine(sizes)
    plotter.plot()

  # One CSV row per integer size between the smallest and the largest observed
  # size; sizes that were never observed get a zero count and no name.
  if options.csv:
    csvHandle = open("%s.csv" % (options.outputFileName), "w")
    try:
      for size in range(min(sizes), max(sizes) + 1):
        if size not in sizes:
          csvHandle.write("%d,0,\n" % (size))
        else:
          csvHandle.write("%d,%d,%s\n" % (size, sizes[size], ";".join(names[size])))
    finally:
      # close the file even if a write fails
      csvHandle.close()

  print("%d items" % (number))
  print("%d sub-items" % (nbSubItems))
  print("%d nucleotides" % (totalSize))
  print("min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % utils.getMinAvgMedMax(sizes))
