6
|
1 #!/usr/bin/env python
|
|
2
|
|
3 from optparse import OptionParser
|
|
4 from commons.core.parsing.FastaParser import FastaParser
|
|
5 from commons.core.writer.Gff3Writer import Gff3Writer
|
|
6 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
|
|
7 from SMART.Java.Python.misc.Progress import Progress
|
|
8 from commons.core.utils.RepetOptionParser import RepetOptionParser
|
|
9 from Gnome_tools.CountGCPercentBySlidingWindow import CountGCPercentBySlidingWindow
|
|
10
|
|
11
|
|
12 class CountReadGCPercent(object):
|
|
13
|
|
14 def __init__(self):
|
|
15 self.referenceReader = None
|
|
16 self.gffReader = None
|
|
17 self.outputWriter = None
|
|
18 self.verbose = 0
|
|
19
|
|
20 def setInputReferenceFile(self, fileName):
|
|
21 self.referenceReader = fileName
|
|
22
|
|
23 def setInputGffFile(self, fileName):
|
|
24 self.gffReader = TranscriptContainer(fileName, 'gff3', self.verbose)
|
|
25
|
|
26 def setOutputFileName(self, fileName):
|
|
27 self.outputWriter = Gff3Writer(fileName, self.verbose)
|
|
28
|
|
29 def readGffAnnotation(self):
|
|
30 self.coveredRegions = {}
|
|
31 progress = Progress(self.gffReader.getNbTranscripts(), "Reading gff3 annotation file", self.verbose)
|
|
32 for transcript in self.gffReader.getIterator():
|
|
33 chromosome = transcript.getChromosome()
|
|
34 if chromosome not in self.coveredRegions:
|
|
35 self.coveredRegions[chromosome] = {}
|
|
36 for exon in transcript.getExons():
|
|
37 for position in range(exon.getStart(), exon.getEnd()+1):
|
|
38 self.coveredRegions[chromosome][position] = 1
|
|
39 progress.inc()
|
|
40 progress.done()
|
|
41
|
|
42 def write(self):
|
|
43 iParser = FastaParser(self.referenceReader)
|
|
44 iParser.setTags()
|
|
45 iGetGCPercentBySW = CountGCPercentBySlidingWindow()
|
|
46 progress = Progress(self.gffReader.getNbTranscripts(), "Writing output file", self.verbose)
|
|
47 for transcript in self.gffReader.getIterator():
|
|
48 chromosome = transcript.getChromosome()
|
|
49 GCpercent = 0
|
|
50 nPercent = 0
|
|
51 for exon in transcript.getExons():
|
|
52 for sequenceName in iParser.getTags().keys():
|
|
53 if sequenceName != chromosome:
|
|
54 continue
|
|
55 else:
|
|
56 subSequence = iParser.getSubSequence(sequenceName, exon.getStart() , exon.getEnd(), 1)
|
|
57 GCpercent, nPercent = iGetGCPercentBySW.getGCPercentAccordingToNAndNPercent(subSequence)
|
|
58 print "GCpercent = %f, nPercent = %f" % (GCpercent, nPercent)
|
|
59 transcript.setTagValue("GCpercent", GCpercent)
|
|
60 transcript.setTagValue("NPercent", nPercent)
|
|
61 self.outputWriter.addTranscript(transcript)
|
|
62 progress.inc()
|
|
63 progress.done()
|
|
64
|
|
65 def run(self):
|
|
66 self.readGffAnnotation()
|
|
67 if self.outputWriter != None:
|
|
68 self.write()
|
|
69
|
|
70 if __name__ == "__main__":
|
|
71 description = "Count GC percent for each read against a genome."
|
|
72 usage = "CountReadGCPercent.py -i <fasta file> -j <gff3 file> -o <output gff3 file> -v <verbose> -h]"
|
|
73 examples = "\nExample: \n"
|
|
74 examples += "\t$ python CountReadGCPercent.py -i file.fasta -j annotation.gff -o output.gff3"
|
|
75 examples += "\n\n"
|
|
76 parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
|
|
77 parser.add_option( '-i', '--inputGenome', dest='fastaFile', help='fasta file [compulsory]', default= None )
|
|
78 parser.add_option( '-j', '--inputAnnotation', dest='gffFile', help='gff3 file [compulsory]', default= None)
|
|
79 parser.add_option( '-o', '--output', dest='outputFile', help='output gff3 file [compulsory]', default= None )
|
|
80 parser.add_option( '-v', '--verbose', dest='verbose', help='verbosity level (default=0/1)',type="int", default= 0 )
|
|
81 (options, args) = parser.parse_args()
|
|
82
|
|
83 readGCPercent = CountReadGCPercent()
|
|
84 readGCPercent.setInputReferenceFile(options.fastaFile)
|
|
85 readGCPercent.setInputGffFile(options.gffFile)
|
|
86 readGCPercent.setOutputFileName(options.outputFile)
|
|
87 readGCPercent.run()
|
|
88 |