comparison smart_toolShed/SMART/Java/Python/CountReadGCPercent.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #!/usr/bin/env python
2
3 from optparse import OptionParser
4 from commons.core.parsing.FastaParser import FastaParser
5 from commons.core.writer.Gff3Writer import Gff3Writer
6 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
7 from SMART.Java.Python.misc.Progress import Progress
8 from commons.core.utils.RepetOptionParser import RepetOptionParser
9 from Gnome_tools.CountGCPercentBySlidingWindow import CountGCPercentBySlidingWindow
10
11
12 class CountReadGCPercent(object):
13
14 def __init__(self):
15 self.referenceReader = None
16 self.gffReader = None
17 self.outputWriter = None
18 self.verbose = 0
19
20 def setInputReferenceFile(self, fileName):
21 self.referenceReader = fileName
22
23 def setInputGffFile(self, fileName):
24 self.gffReader = TranscriptContainer(fileName, 'gff3', self.verbose)
25
26 def setOutputFileName(self, fileName):
27 self.outputWriter = Gff3Writer(fileName, self.verbose)
28
29 def readGffAnnotation(self):
30 self.coveredRegions = {}
31 progress = Progress(self.gffReader.getNbTranscripts(), "Reading gff3 annotation file", self.verbose)
32 for transcript in self.gffReader.getIterator():
33 chromosome = transcript.getChromosome()
34 if chromosome not in self.coveredRegions:
35 self.coveredRegions[chromosome] = {}
36 for exon in transcript.getExons():
37 for position in range(exon.getStart(), exon.getEnd()+1):
38 self.coveredRegions[chromosome][position] = 1
39 progress.inc()
40 progress.done()
41
42 def write(self):
43 iParser = FastaParser(self.referenceReader)
44 iParser.setTags()
45 iGetGCPercentBySW = CountGCPercentBySlidingWindow()
46 progress = Progress(self.gffReader.getNbTranscripts(), "Writing output file", self.verbose)
47 for transcript in self.gffReader.getIterator():
48 chromosome = transcript.getChromosome()
49 GCpercent = 0
50 nPercent = 0
51 for exon in transcript.getExons():
52 for sequenceName in iParser.getTags().keys():
53 if sequenceName != chromosome:
54 continue
55 else:
56 subSequence = iParser.getSubSequence(sequenceName, exon.getStart() , exon.getEnd(), 1)
57 GCpercent, nPercent = iGetGCPercentBySW.getGCPercentAccordingToNAndNPercent(subSequence)
58 print "GCpercent = %f, nPercent = %f" % (GCpercent, nPercent)
59 transcript.setTagValue("GCpercent", GCpercent)
60 transcript.setTagValue("NPercent", nPercent)
61 self.outputWriter.addTranscript(transcript)
62 progress.inc()
63 progress.done()
64
65 def run(self):
66 self.readGffAnnotation()
67 if self.outputWriter != None:
68 self.write()
69
70 if __name__ == "__main__":
71 description = "Count GC percent for each read against a genome."
72 usage = "CountReadGCPercent.py -i <fasta file> -j <gff3 file> -o <output gff3 file> -v <verbose> -h]"
73 examples = "\nExample: \n"
74 examples += "\t$ python CountReadGCPercent.py -i file.fasta -j annotation.gff -o output.gff3"
75 examples += "\n\n"
76 parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
77 parser.add_option( '-i', '--inputGenome', dest='fastaFile', help='fasta file [compulsory]', default= None )
78 parser.add_option( '-j', '--inputAnnotation', dest='gffFile', help='gff3 file [compulsory]', default= None)
79 parser.add_option( '-o', '--output', dest='outputFile', help='output gff3 file [compulsory]', default= None )
80 parser.add_option( '-v', '--verbose', dest='verbose', help='verbosity level (default=0/1)',type="int", default= 0 )
81 (options, args) = parser.parse_args()
82
83 readGCPercent = CountReadGCPercent()
84 readGCPercent.setInputReferenceFile(options.fastaFile)
85 readGCPercent.setInputGffFile(options.gffFile)
86 readGCPercent.setOutputFileName(options.outputFile)
87 readGCPercent.run()
88