#! /usr/bin/env python
"""Get the size distribution of a Fasta / BED file"""

import os
from optparse import OptionParser
from parsing.sequenceListParser import *
from misc.progress import *
from misc.rPlotter import *


def countSequenceLetters(letters, nbPositions, positionCount):
  """Count the letters of one sequence and update the per-position tallies.

  letters:       the sequence, as an indexable string of letters
  nbPositions:   dict {position: number of sequences covering it}, updated in place
  positionCount: dict {letter: {position: count}}, updated in place

  Returns a dict {letter: number of occurrences in this sequence}.
  """
  thisNbLetters = {}
  for pos, letter in enumerate(letters):
    thisNbLetters[letter] = thisNbLetters.get(letter, 0) + 1
    nbPositions[pos]      = nbPositions.get(pos, 0) + 1
    countsForLetter       = positionCount.setdefault(letter, {})
    countsForLetter[pos]  = countsForLetter.get(pos, 0) + 1
  return thisNbLetters


def updateLetterRates(thisNbLetters, thisNbLettersTotal, nbLetters, lettersRate):
  """Merge the letter counts of one sequence into the global statistics.

  thisNbLetters:      dict {letter: count} for the current sequence
  thisNbLettersTotal: size of the current sequence (denominator of the rate)
  nbLetters:          dict {letter: total count}, updated in place
  lettersRate:        dict {letter: {percentage: number of sequences}}, updated in place
  """
  for letter, count in thisNbLetters.items():
    nbLetters[letter] = nbLetters.get(letter, 0) + count
    # the percentage is truncated to an integer so that it can serve as a histogram bin
    rate                  = int(float(count) / thisNbLettersTotal * 100)
    ratesForLetter        = lettersRate.setdefault(letter, {})
    ratesForLetter[rate]  = ratesForLetter.get(rate, 0) + 1


def computePositionRates(positionCount, nbPositions):
  """Convert the per-position tallies into percentages.

  Returns a pair (positionRate, nbPositionRate):
    positionRate:   dict {letter: {position: % of the sequences covering this
                    position which carry this letter there}}
    nbPositionRate: dict {position: % of sequences covering this position,
                    relative to the number of sequences covering position 0}
  Both dicts are empty when no position has been counted.
  """
  positionRate = {}
  for letter, counts in positionCount.items():
    positionRate[letter] = {}
    for pos, count in counts.items():
      positionRate[letter][pos] = count / float(nbPositions[pos]) * 100
  nbPositionRate = {}
  for pos, count in nbPositions.items():
    # position 0 exists as soon as any position exists: every non-empty sequence covers it
    nbPositionRate[pos] = count / float(nbPositions[0]) * 100
  return positionRate, nbPositionRate


def main(argv=None):
  """Read the sequences, compute the letter distributions, plot and print them.

  argv: list of command line arguments (without the program name); when None,
        the arguments are taken from sys.argv by OptionParser.

  Two plots are written: "<output>.png" (distribution of the letter content
  per sequence) and "<output>PerNt.png" (letter percentage per position).
  Exits with a usage error when the input or the output file name is missing.
  """
  # parse command line
  description = "Get Letter Distribution: Compute the distribution of nucleotides of a set of genomic coordinates. [Category: Visualization]"

  parser = OptionParser(description = description)
  parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                        type="string", help="input file [compulsory] [format: file in FASTA format]")
  parser.add_option("-o", "--output",    dest="outputFileName", action="store",                        type="string", help="output file [compulsory] [format: output file in PNG format]")
  parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,        type="int",    help="trace level [format: int]")
  # NOTE: --csv and --log are accepted for compatibility but are never read below
  parser.add_option("-c", "--csv",       dest="csv",            action="store_true", default=False,                   help="write a .csv file [format: bool] [default: false]")
  parser.add_option("-l", "--log",       dest="log",            action="store_true", default=False,                   help="write a log file [format: bool] [default: false]")
  (options, args) = parser.parse_args(argv)

  # the compulsory options have no default: fail early with a usage message
  # instead of crashing later on a None file name (or writing "None.png")
  if not options.inputFileName:
    parser.error("no input file given (option -i/--input is compulsory)")
  if not options.outputFileName:
    parser.error("no output file given (option -o/--output is compulsory)")

  sequenceParser = SequenceListParser(options.inputFileName, options.verbosity)
  nbSequences    = sequenceParser.getNbSequences()
  print("%i sequences read" % (nbSequences))

  # treat items
  progress       = Progress(nbSequences, "Analyzing sequences of " + options.inputFileName, options.verbosity)
  nbLettersTotal = 0
  nbLetters      = {}
  lettersRate    = {}
  nbPositions    = {}
  positionCount  = {}
  for sequence in sequenceParser.getIterator():
    thisNbLettersTotal = sequence.getSize()
    nbLettersTotal    += thisNbLettersTotal
    thisNbLetters      = countSequenceLetters(sequence.sequence, nbPositions, positionCount)
    updateLetterRates(thisNbLetters, thisNbLettersTotal, nbLetters, lettersRate)
    progress.inc()
  progress.done()

  positionRate, nbPositionRate = computePositionRates(positionCount, nbPositions)

  # plot content distributions
  plotter = RPlotter("%s.png" % (options.outputFileName), options.verbosity, True)
  plotter.setFill(0)
  plotter.setLegend(True)
  for letter in lettersRate:
    plotter.addLine(lettersRate[letter], letter)
  plotter.plot()

  # plot distribution per position
  plotter = RPlotter("%sPerNt.png" % (options.outputFileName), options.verbosity, True)
  plotter.setFill(0)
  plotter.setLegend(True)
  plotter.setXLabel("Position on the read")
  plotter.setYLabel("Percentage")
  for letter in positionRate:
    plotter.addLine(positionRate[letter], letter)
  # extra line, labelled "#": share of sequences that reach each position
  plotter.addLine(nbPositionRate, "#")
  plotter.plot()

  # single-argument print(...) behaves identically under the Python 2 print statement
  print("%d sequences" % (nbSequences))
  print("%d letters" % (nbLettersTotal))
  for letter in nbLetters:
    print("%s: %d (%.2f%%)" % (letter, nbLetters[letter], float(nbLetters[letter]) / nbLettersTotal * 100))


if __name__ == "__main__":
  main()
