#! /usr/bin/env python
"""Remove all dirty sequences"""

import os
import sys
from optparse import OptionParser
from parsing.fastaParser import *
from writer.fastaWriter import *
from parsing.fastqParser import *
from writer.fastqWriter import *
from misc.progress import *
from misc.rPlotter import *


def buildOptionParser():
  """Build the command-line option parser of this script.

  Returns:
    An OptionParser whose parsed values expose ``inputFileName``,
    ``format`` (default "fasta"), ``outputFileName``, ``verbosity``
    (default 1) and ``log`` (default False).
  """
  description = "Restrict from nucleotide: Remove the sequences with ambiguous nucleotides. [Category: Data Selection]"

  optionParser = OptionParser(description = description)
  optionParser.add_option("-i", "--input",     dest="inputFileName",  action="store",                        type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
  # the format must be stored in its own destination: it used to overwrite
  # the input file name and left 'options.format' undefined
  optionParser.add_option("-f", "--format",    dest="format",         action="store",      default="fasta",  type="string", help="format of the input and output files [compulsory] [format: sequence file format]")
  optionParser.add_option("-o", "--output",    dest="outputFileName", action="store",                        type="string", help="output file [compulsory] [format: output file in sequence format given by -f]")
  optionParser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,        type="int",    help="trace level [format: int]")
  optionParser.add_option("-l", "--log",       dest="log",            action="store_true", default=False,                   help="write a log file [format: bool] [default: false]")
  return optionParser


def keptPercentage(nbKept, nbSequences):
  """Return the percentage of kept sequences, or 0.0 when there is no sequence.

  Args:
    nbKept: number of sequences written to the output.
    nbSequences: total number of sequences read from the input.
  """
  # an empty input file must not crash the final report
  if not nbSequences:
    return 0.0
  return float(nbKept) / nbSequences * 100


if __name__ == "__main__":

  # parse command line
  optionParser    = buildOptionParser()
  (options, args) = optionParser.parse_args()
  if options.inputFileName is None or options.outputFileName is None:
    # both files are documented as compulsory: fail early with the usage message
    optionParser.error("options -i and -o are compulsory")

  # select the parser/writer pair matching the requested format
  if options.format == "fasta":
    sequenceParser = FastaParser(options.inputFileName, options.verbosity)
    writer         = FastaWriter(options.outputFileName, options.verbosity)
  elif options.format == "fastq":
    sequenceParser = FastqParser(options.inputFileName, options.verbosity)
    writer         = FastqWriter(options.outputFileName, options.verbosity)
  else:
    sys.exit("Do not understand '%s' format." % (options.format))
  nbSequences = sequenceParser.getNbSequences()
  # single-argument parenthesised print behaves the same on Python 2 and 3
  print("sequences: %d" % (nbSequences))

  # copy to the output every sequence that containsAmbiguousNucleotides() does not flag
  progress = Progress(nbSequences, "Analyzing sequences of %s" % (options.inputFileName), options.verbosity)
  nbKept   = 0
  for sequence in sequenceParser.getIterator():
    if not sequence.containsAmbiguousNucleotides():
      writer.addSequence(sequence)
      nbKept += 1
    progress.inc()
  progress.done()

  print("%d items, %d kept (%.2f%%)" % (nbSequences, nbKept, keptPercentage(nbKept, nbSequences)))
