diff SMART/Java/Python/CountReadGCPercent.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/CountReadGCPercent.py	Fri Jan 18 04:54:14 2013 -0500
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import FastaParser
+from commons.core.writer.Gff3Writer import Gff3Writer
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from SMART.Java.Python.misc.Progress import Progress
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from Gnome_tools.CountGCPercentBySlidingWindow import CountGCPercentBySlidingWindow
+
+
+class CountReadGCPercent(object):
+    
+    def __init__(self):
+        self.referenceReader = None
+        self.gffReader = None
+        self.outputWriter = None
+        self.verbose = 0
+        
+    def setInputReferenceFile(self, fileName):
+        self.referenceReader = fileName
+ 
+    def setInputGffFile(self, fileName):
+        self.gffReader = TranscriptContainer(fileName, 'gff3', self.verbose)
+        
+    def setOutputFileName(self, fileName):
+        self.outputWriter = Gff3Writer(fileName, self.verbose)
+
+    def readGffAnnotation(self):
+        self.coveredRegions = {}
+        progress = Progress(self.gffReader.getNbTranscripts(), "Reading gff3 annotation file", self.verbose)
+        for transcript in self.gffReader.getIterator():
+            chromosome = transcript.getChromosome()
+            if chromosome not in self.coveredRegions:
+                self.coveredRegions[chromosome] = {}
+            for exon in transcript.getExons():
+                for position in range(exon.getStart(), exon.getEnd()+1):
+                    self.coveredRegions[chromosome][position] = 1
+            progress.inc()
+        progress.done()
+        
+    def write(self):
+        iParser = FastaParser(self.referenceReader)
+        iParser.setTags()
+        iGetGCPercentBySW = CountGCPercentBySlidingWindow()
+        progress = Progress(self.gffReader.getNbTranscripts(), "Writing output file", self.verbose)
+        for transcript in self.gffReader.getIterator():
+            chromosome = transcript.getChromosome()
+            GCpercent = 0
+            nPercent = 0
+            for exon in transcript.getExons():
+                    for sequenceName in iParser.getTags().keys():
+                        if sequenceName != chromosome:
+                            continue
+                        else:
+                            subSequence = iParser.getSubSequence(sequenceName, exon.getStart() , exon.getEnd(), 1)
+                            GCpercent, nPercent = iGetGCPercentBySW.getGCPercentAccordingToNAndNPercent(subSequence)
+                            print "GCpercent = %f, nPercent = %f" % (GCpercent, nPercent)
+            transcript.setTagValue("GCpercent", GCpercent)
+            transcript.setTagValue("NPercent", nPercent)
+            self.outputWriter.addTranscript(transcript)
+            progress.inc()
+        progress.done()
+ 
+    def run(self):
+        self.readGffAnnotation()
+        if self.outputWriter != None:
+            self.write()
+            
+if __name__ == "__main__":
+        description = "Count GC percent for each read against a genome."
+        usage = "CountReadGCPercent.py -i <fasta file> -j <gff3 file> -o <output gff3 file> -v <verbose> -h]"
+        examples = "\nExample: \n"
+        examples += "\t$ python CountReadGCPercent.py -i file.fasta -j annotation.gff -o output.gff3"
+        examples += "\n\n"
+        parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
+        parser.add_option( '-i', '--inputGenome', dest='fastaFile', help='fasta file [compulsory]', default= None )
+        parser.add_option( '-j', '--inputAnnotation', dest='gffFile', help='gff3 file [compulsory]', default= None)
+        parser.add_option( '-o', '--output', dest='outputFile', help='output gff3 file [compulsory]', default= None )
+        parser.add_option( '-v', '--verbose', dest='verbose', help='verbosity level (default=0/1)',type="int", default= 0 )
+        (options, args) = parser.parse_args()
+    
+        readGCPercent = CountReadGCPercent()
+        readGCPercent.setInputReferenceFile(options.fastaFile)
+        readGCPercent.setInputGffFile(options.gffFile)
+        readGCPercent.setOutputFileName(options.outputFile)
+        readGCPercent.run()
+        
\ No newline at end of file