#! /usr/bin/env python
"""Remove sequences with low reliability"""

from optparse import OptionParser
from parsing.sequenceListParser import *
from writer.fastaWriter import *
from misc.progress import *


# Verdicts returned by classifySequence() for rejected sequences.
LOW_COMPLEXITY       = "low complexity"
TOO_MANY_OCCURRENCES = "too many occurrences"


def classifySequence(nucleotides, letters = ("A", "C", "G", "T")):
  """Tell why a nucleotide string is considered unreliable, if it is.

  Only characters that are members of `letters` are counted; any other
  character is ignored by the counts but still contributes to the length
  of the string.

  Returns:
    LOW_COMPLEXITY       if at least one of `letters` never occurs,
    TOO_MANY_OCCURRENCES if one letter occurs more than half the length
                         (integer half) of the string,
    None                 if the string passes both checks.
  The low-complexity check takes precedence over the occurrence check.
  """
  counts = dict((letter, 0) for letter in letters)
  for char in nucleotides:
    if char in counts:
      counts[char] += 1

  # a zero count means that one of the expected letters is missing
  if not all(counts.values()):
    return LOW_COMPLEXITY

  # floor division keeps the integer threshold on every Python version
  halfSize = len(nucleotides) // 2
  if any(count > halfSize for count in counts.values()):
    return TOO_MANY_OCCURRENCES
  return None


def percentage(part, total):
  """Return `part` as a percentage of `total`, or 0.0 when `total` is zero."""
  if not total:
    # an empty input must not crash the final report
    return 0.0
  return float(part) / total * 100


if __name__ == "__main__":

  # parse command line
  description = "Trim Sequences: Remove sequences with low reliability: low occurrences and highly repeted. [Category: Personnal]"

  optionParser = OptionParser(description = description)
  optionParser.add_option("-i", "--input",     dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in FASTA format]")
  optionParser.add_option("-o", "--output",    dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in FASTA format]")
  optionParser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
  optionParser.add_option("-l", "--log",       dest="log",            action="store_true", default=False,                help="write a log file [format: bool] [default: false]")
  (options, args) = optionParser.parse_args()

  # both files are documented as compulsory: report a missing one as a usage error
  if options.inputFileName is None or options.outputFileName is None:
    optionParser.error("options --input and --output are compulsory")

  sequenceParser = SequenceListParser(options.inputFileName, options.verbosity)
  nbSequences    = sequenceParser.getNbSequences()
  progress       = Progress(nbSequences, "Parsing file %s" % (options.inputFileName), options.verbosity)

  writer    = FastaWriter(options.outputFileName, options.verbosity)
  logHandle = open("log.txt", "w") if options.log else None

  # number of rejected sequences and log line template, per verdict
  nbRejected  = {LOW_COMPLEXITY: 0, TOO_MANY_OCCURRENCES: 0}
  logMessages = {LOW_COMPLEXITY:       "Low complexity for %s\n",
                 TOO_MANY_OCCURRENCES: "Too many occurrences for %s\n"}

  try:
    for sequence in sequenceParser.getIteractor():
      verdict = classifySequence(sequence.sequence)
      if verdict is None:
        # only sequences passing both checks reach the output file
        writer.addSequence(sequence)
      else:
        nbRejected[verdict] += 1
        if logHandle is not None:
          logHandle.write(logMessages[verdict] % (sequence.sequence,))
      progress.inc()
    progress.done()
  finally:
    # close the log even if parsing or writing fails half-way
    if logHandle is not None:
      logHandle.close()

  nbLowComplexity      = nbRejected[LOW_COMPLEXITY]
  nbTooManyOccurrences = nbRejected[TOO_MANY_OCCURRENCES]
  print("%d out of %d have low complexity (%f%%)"       % (nbLowComplexity, nbSequences, percentage(nbLowComplexity, nbSequences)))
  print("%d out of %d have too many occurrences (%f%%)" % (nbTooManyOccurrences, nbSequences, percentage(nbTooManyOccurrences, nbSequences)))
