import re
import sys
from structure.interval import *
from parser.transcriptListParser import *


class GffParser(TranscriptListParser):
  """A class that parses a GFF file and creates a transcript list.

  A line whose attribute column carries a "Parent" tag is treated as an exon
  of the transcript currently being built; any other feature line starts a
  new transcript.  Exons must directly follow the transcript they belong to.
  """

  # Same pattern as before, compiled once instead of being looked up per line.
  _LINE_PATTERN = re.compile(r"^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-])\s+\S+\s+(\S.*)$")

  # Attribute keys whose value becomes the interval name instead of a tag.
  _NAME_FIELDS = ("Name", "name", "Sequence", "TE", "SAT")


  def __init__(self, fileName, verbosity = 0):
    """Forward the file name and the verbosity level to the base parser."""
    super(GffParser, self).__init__(fileName, verbosity)


  def __del__(self):
    """Delegate clean-up to the base parser."""
    super(GffParser, self).__del__()


  def skipFirstLines(self):
    """Do nothing: this parser does not skip any leading line here."""
    pass


  def getInfos(self):
    """Scan the whole file and collect summary figures.

    After the call:
      - self.chromosomes   is the set of values found in the first column;
      - self.nbTranscripts is the number of lines whose attribute column does
        not contain "Parent";
      - self.size          is the cumulated length (end - start + 1) of the
        lines whose attribute column contains "Parent".

    Blank lines and lines starting with '#' are ignored.  The parser is reset
    before and after the scan.  The process exits with an error message if a
    data line has fewer than nine tab-separated fields.
    """
    self.chromosomes   = set()
    self.nbTranscripts = 0
    self.size          = 0
    if self.verbosity >= 10:
      print("Getting information on %s." % (self.fileName))
    self.reset()
    for line in self.handle:
      line = line.strip()
      # Comment/directive lines carry no feature and have no ninth column.
      if line == "" or line[0] == "#":
        continue
      parts = line.split("\t")
      if len(parts) < 9:
        sys.exit("\nCannot read GFF line %s" % (line))
      self.chromosomes.add(parts[0])
      if "Parent" not in parts[8]:
        self.nbTranscripts += 1
        # Report progress only when the counter has just reached a multiple,
        # not again on every following exon line.
        if self.verbosity >= 10 and self.nbTranscripts % 100000 == 0:
          sys.stdout.write("  %d transcripts read\r" % (self.nbTranscripts))
          sys.stdout.flush()
      else:
        # Coordinates may be given in either order.
        self.size += abs(int(parts[4]) - int(parts[3])) + 1
    self.reset()
    if self.verbosity >= 10:
      print("  %d transcripts read" % (self.nbTranscripts))
      print("Done.")


  @staticmethod
  def _splitAttribute(attribute):
    """Split one non-empty attribute into a (field, value) pair.

    Both the 'key=value' and the 'key value' forms are accepted; the form is
    chosen according to which separator comes first.  Only the first '=' is
    used as a separator, so a value may itself contain '='.  Surrounding
    spaces and double quotes are removed from the value.
    """
    posSpace = attribute.find(" ")
    posEqual = attribute.find("=")
    if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
      field, _, value = attribute.partition("=")
    else:
      parts = attribute.split()
      field = parts[0]
      value = " ".join(parts[1:])
    return field.strip(), value.strip(" \"")


  def parseLine(self, line):
    """Parse one GFF line and update the transcript being built.

    Returns the previously built transcript when the line starts a new one
    (None if there was no previous transcript).  Returns None when the line
    is an exon, which is added to the current transcript, and also when the
    line is blank or a '#' comment.
    NOTE(review): the last transcript of the file stays in
    self.currentTranscript; presumably the base class hands it out -- confirm.

    The process exits with an error message if the line cannot be read, if an
    exon appears before any transcript, or if an exon does not refer to the
    current transcript.
    """
    stripped = line.strip()
    # Nothing to parse; None is already the "no finished transcript" answer.
    if stripped == "" or stripped[0] == "#":
      return None

    m = self._LINE_PATTERN.search(line)
    if m is None:
      sys.exit("\nCannot read GFF line %s" % (line))

    # Coordinates may be given in either order.
    start = int(m.group(4))
    end   = int(m.group(5))
    interval = Interval()
    interval.setChromosome(m.group(1))
    interval.setName("unnamed transcript")
    interval.setStart(min(start, end))
    interval.setEnd(max(start, end))
    interval.setDirection(m.group(7))
    interval.feature = m.group(3)

    for attribute in m.group(8).split(";"):
      attribute = attribute.strip()
      if attribute == "":
        continue
      field, value = self._splitAttribute(attribute)
      if field in self._NAME_FIELDS:
        interval.setName(value)
        continue
      # Store integer-looking values as integers, everything else as text.
      try:
        value = int(value)
      except ValueError:
        pass
      interval.setTagValue(field, value)

    if "Parent" in interval.getTagNames():
      if self.currentTranscript is None:
        sys.exit("GFF file does not start with a transcript!")
      if interval.getTagValue("Parent") != self.currentTranscript.getTagValue("ID"):
        sys.exit("Exon '%s' is not right after its transcript in GFF file!" % (interval))
      self.currentTranscript.addExon(interval)
      if interval.name is None:
        interval.name = self.currentTranscript.name
      return None

    # A new transcript starts: hand out the finished one and start collecting
    # exons for the new one.
    transcript = self.currentTranscript
    self.currentTranscript = Transcript()
    self.currentTranscript.copy(interval)
    return transcript
