comparison SMART/Java/Python/cleaning/GtfCleaner.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 769e306b7933
children 97aa2e42bfdf
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 """
32 Clean a GTF file
33 """
34
35 import shlex
36 from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner
37 from SMART.Java.Python.misc.Progress import Progress
38 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
39
# NOTE(review): module-level dictionary that none of the code visible in this
# file reads or writes -- confirm no external module relies on it before
# removing it.
count = {}
41
class ParsedLine(object):
    """
    One line of a GTF file, split into fields, with its attribute field parsed.

    Attributes set by the constructor:
      line          -- the input line, stripped of surrounding whitespace
      cpt           -- the value given by the caller (stored, not used here)
      splittedLine  -- the 9 whitespace-separated fields of the line
      type          -- the third field of the line
      parsedOptions -- dict mapping each attribute key to its value
      transcriptId  -- value of the "transcript_id" attribute; this attribute
                       is only set when the line has a "transcript_id" key
    """

    def __init__(self, line, cpt):
        """
        Store and immediately parse a line.
        @param line: the raw text of the line
        @param cpt:  a counter associated with the line by the caller
        """
        self.line = line
        self.cpt = cpt
        self.parse()

    def parse(self):
        """
        Split the line into fields and parse the attribute field.
        Raises an Exception if the line has fewer than 9 fields.
        """
        self.line = self.line.strip()
        # At most 8 splits: the 9th field keeps its internal whitespace.
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        self.parseOptions()

    def parseOptions(self):
        """
        Parse the 9th field (a list of 'key value;' pairs) into parsedOptions.

        The field is tokenized with shlex.split, so quoted values may contain
        spaces (shlex.split raises ValueError on unbalanced quotes).  The first
        token of a pair is the key; the following tokens, up to the one ending
        with ';', form the value.  Each value token is stored double-quoted and
        preceded by a space (e.g. ' "T1"'), with the terminating ';' removed.
        """
        self.parsedOptions = {}
        key = None
        value = ""
        for option in shlex.split(self.splittedLine[8]):
            option = option.strip()
            if option == "":
                continue
            if key is None:
                key = option
                continue
            endValue = option.endswith(";")
            if endValue:
                # str.rstrip returns a new string: the result must be kept,
                # otherwise the ';' terminator stays inside the stored value.
                option = option.rstrip(";")
            # A standalone ';' token only terminates the pair; it adds nothing.
            if option:
                value = "%s \"%s\"" % (value, option)
            if endValue:
                self.parsedOptions[key] = value
                if key == "transcript_id":
                    self.transcriptId = value
                key = None
                value = ""

    def export(self):
        """
        Return the stripped input line followed by a newline.
        """
        return "%s\n" % (self.line)
80
81
class GtfCleaner(TranscriptListCleaner):
    """
    Clean a GTF file: keep the lines whose type is accepted, group them by
    transcript id, and write them back sorted by transcript id.

    NOTE(review): self.inputHandle, self.outputHandle and self.verbosity are
    not set in this class; presumably they come from TranscriptListCleaner --
    confirm against the base class.
    """

    def __init__(self, verbosity = 1):
        """
        @param verbosity: verbosity level, passed on to the base class
        """
        super(GtfCleaner, self).__init__(verbosity)
        # Only lines whose 3rd field is in this list are kept (None: keep all).
        self.acceptedTypes = ["exon"]
        # Maps a transcript id to the list of its ParsedLine, in input order.
        self.parents = {}

    @staticmethod
    def getFileFormats():
        """
        Return the list of the file formats handled by this cleaner.
        """
        return ["gtf"]

    def setAcceptedTypes(self, types):
        """
        Set the feature types to keep.
        @param types: a collection of type names, or None to keep every type
        """
        self.acceptedTypes = types

    def parse(self):
        """
        Read the input handle and store the accepted lines in self.parents.

        Blank lines and comment lines (starting with '#') are skipped.
        Raises an Exception if an accepted line has no 'transcript_id'
        attribute; exceptions raised by ParsedLine are propagated.
        """
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        for cpt, line in enumerate(self.inputHandle):
            # Lines read from a file keep their newline, so a blank line is
            # never empty: strip before testing.
            stripped = line.strip()
            if not stripped or stripped[0] == "#":
                continue
            parsedLine = ParsedLine(line, cpt)
            if self.acceptedTypes is None or parsedLine.type in self.acceptedTypes:
                # ParsedLine only sets transcriptId when the attribute exists.
                transcriptId = getattr(parsedLine, "transcriptId", None)
                if transcriptId is None:
                    raise Exception("Line %d ('%s') has no 'transcript_id' attribute. Exiting..." % (cpt + 1, parsedLine.line))
                self.parents.setdefault(transcriptId, []).append(parsedLine)
            progress.inc()
        progress.done()

    def write(self):
        """
        Write the stored lines to the output handle, transcript by transcript,
        in sorted order of transcript id; lines of a same transcript keep
        their input order.
        """
        progress = Progress(len(self.parents), "Writing output file", self.verbosity)
        for parent in sorted(self.parents):
            for line in self.parents[parent]:
                self.outputHandle.write(line.export())
            progress.inc()
        progress.done()

    def _clean(self):
        """
        Run the whole cleaning: read the input, then write the output.
        """
        self.parse()
        self.write()
121