comparison Gtf.py @ 19:0152500d9acd draft

Uploaded
author rmarenco
date Thu, 13 Oct 2016 22:49:01 -0400
parents c02720d1afee
children 2677f1899aa8
comparison
equal deleted inserted replaced
18:d786bca6a75d 19:0152500d9acd
3 import os 3 import os
4 import tempfile 4 import tempfile
5 5
6 # Internal dependencies 6 # Internal dependencies
7 from Datatype import Datatype 7 from Datatype import Datatype
8 from Track import Track
9 from TrackDb import TrackDb
10 from util import subtools 8 from util import subtools
11 9
class InfoModifiedGtf(object):
    """
    Report describing whether a GTF had to be altered, and which lines were hit.

    Attributes:
        is_modified: True when at least one line of the GTF was altered.
        array_modified_lines: line numbers of the affected lines, in the
            order they were recorded.
    """

    def __init__(self, is_modified=False, array_modified_lines=None):
        """
        :param is_modified: initial value of the "was modified" flag
        :param array_modified_lines: list of affected line numbers; when
            omitted, a new empty list is created for this instance
        """
        self.is_modified = is_modified
        # A default of [] would be evaluated once and shared by every instance
        # built without an explicit list, so line numbers recorded for one GTF
        # would show up in the report of the next one.
        if array_modified_lines is None:
            array_modified_lines = []
        self.array_modified_lines = array_modified_lines

    def get_str_modified_lines(self):
        """Return the recorded line numbers as a comma-separated string."""
        return ','.join(map(str, self.array_modified_lines))
12 17
13 class Gtf( Datatype ): 18 class Gtf( Datatype ):
14 def __init__( self, input_gtf_false_path, data_gtf): 19 def __init__( self, input_gtf_false_path, data_gtf):
15 20
16 super(Gtf, self).__init__() 21 super(Gtf, self).__init__()
30 genePredFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred") 35 genePredFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred")
31 unsorted_bigGenePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsorted.bigGenePred") 36 unsorted_bigGenePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsorted.bigGenePred")
32 sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred") 37 sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred")
33 38
34 # GtfToGenePred 39 # GtfToGenePred
40 ## Checking the integrity of the inputs
41 modified_gtf = self._checkAndFixGtf()
42
43 ## Processing the gtf
35 subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name) 44 subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name)
36 45
37 # TODO: From there, refactor because common use with Gff3.py 46 # TODO: From there, refactor because common use with Gff3.py
38 # genePredToBigGenePred processing 47 # genePredToBigGenePred processing
39 subtools.genePredToBigGenePred(genePredFile.name, unsorted_bigGenePred_file.name) 48 subtools.genePredToBigGenePred(genePredFile.name, unsorted_bigGenePred_file.name)
64 visibility='dense', priority=self.priority, 73 visibility='dense', priority=self.priority,
65 track_file=myBigBedFilePath, 74 track_file=myBigBedFilePath,
66 track_color=self.track_color, 75 track_color=self.track_color,
67 group_name=self.group_name) 76 group_name=self.group_name)
68 77
69 print("- Gtf %s created" % self.name_gtf) 78 # TODO: Use Logging instead of print
79 if modified_gtf.is_modified:
80 print("- Warning: Gtf %s created with a modified version of your Gtf because of start/end coordinates issues."
81 % self.name_gtf)
82 print("Here are the lines removed: " + modified_gtf.get_str_modified_lines())
83 else:
84 print("- Gtf %s created" % self.name_gtf)
85
def _checkAndFixGtf(self):
    """
    Check the integrity of the gtf file and write a cleaned copy of it.

    Each feature line is checked against the scaffold sizes read from
    self.chromSizesFile: a line is kept only when its start is > 0 and its
    end is <= the size of its scaffold. Lines failing the check, or whose
    start/end fields are missing or not integers, are left out of the copy
    and their 1-based line numbers are recorded. Comment lines (starting
    with '#') and blank lines are copied unchanged.

    Side effect: self.input_gtf_false_path is replaced by the path of the
    cleaned copy, which is not deleted automatically.

    TODO: Get the user choice (remove the line(s) or truncate to the end of
    the scaffold) and use it. For now the line(s) are always removed.

    :return: InfoModifiedGtf telling whether lines were removed, and which
    :raises KeyError: a feature line names a scaffold absent from chrom sizes
    :raises ValueError: a scaffold size in the chrom sizes file is not an integer
    """
    # Pass a fresh list explicitly so the report never shares its list
    # with another InfoModifiedGtf instance.
    modified_gtf = InfoModifiedGtf(is_modified=False, array_modified_lines=[])

    # Get the chrom.sizes into a dictionary to have a faster access
    # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary
    dict_chrom_sizes = {}
    with open(self.chromSizesFile.name, 'r') as chromSizes:
        for line in chromSizes:
            fields = line.split()
            if not fields:
                # Blank line, nothing to read
                continue
            # fields[0] should be the name of the scaffold
            # fields[1] should be the size of the scaffold
            # TODO: Ensure this is true for all lines
            # Stored as int: comparing the raw strings would be lexicographic
            dict_chrom_sizes[fields[0]] = int(fields[1])

    # Text mode so the same code writes str under Python 2 and Python 3.
    # delete=False because the cleaned copy is used after this method returns.
    temp_gtf = tempfile.NamedTemporaryFile(mode='w', suffix=".gtf", delete=False)

    # Parse the GTF and check each line using the chrom sizes dictionary.
    # The with-statement closes (and flushes) the temp file when we are done.
    with temp_gtf as tmp:
        with open(self.input_gtf_false_path, 'r') as gtf:
            for line_number, line in enumerate(gtf, start=1):
                # Lines read from the file keep their newline, only the
                # last one may lack it: never add a second line break.
                if not line.endswith('\n'):
                    line += '\n'

                # Comments and blank lines carry no coordinates to check
                if line.startswith('#') or not line.strip():
                    tmp.write(line)
                    continue

                fields = line.split()
                # We are interested in fields[0] => Seqname (scaffold)
                # We are interested in fields[3] => Start of the feature
                # We are interested in fields[4] => End of the feature
                try:
                    start_position = int(fields[3])
                    end_position = int(fields[4])
                except (IndexError, ValueError):
                    # Missing or non-numeric coordinates: the line cannot be checked
                    is_valid = False
                else:
                    scaffold_size = dict_chrom_sizes[fields[0]]
                    is_valid = start_position > 0 and end_position <= scaffold_size

                if is_valid:
                    # We are good, so we copy this line
                    tmp.write(line)
                else:
                    # TODO: Process the user choice
                    # By default, we are assuming the user choice is to remove the lines: We don't copy it
                    # We save the line number for the feedback to the user
                    modified_gtf.array_modified_lines.append(line_number)
                    modified_gtf.is_modified = True

    # Once the process is completed, we just replace the path of the gtf
    self.input_gtf_false_path = temp_gtf.name

    # TODO: Manage the issue with the fact the dataset is going to still exist on the disk because of delete=False

    return modified_gtf