Mercurial > repos > rmarenco > hubarchivecreator
comparison Gtf.py @ 19:0152500d9acd draft
Uploaded
author | rmarenco |
---|---|
date | Thu, 13 Oct 2016 22:49:01 -0400 |
parents | c02720d1afee |
children | 2677f1899aa8 |
comparison
equal
deleted
inserted
replaced
18:d786bca6a75d | 19:0152500d9acd |
---|---|
3 import os | 3 import os |
4 import tempfile | 4 import tempfile |
5 | 5 |
6 # Internal dependencies | 6 # Internal dependencies |
7 from Datatype import Datatype | 7 from Datatype import Datatype |
8 from Track import Track | |
9 from TrackDb import TrackDb | |
10 from util import subtools | 8 from util import subtools |
11 | 9 |
10 class InfoModifiedGtf(): | |
11 def __init__(self, is_modified=False, array_modified_lines=[]): | |
12 self.is_modified = is_modified | |
13 self.array_modified_lines = array_modified_lines | |
14 | |
15 def get_str_modified_lines(self): | |
16 return ','.join(map(str, self.array_modified_lines)) | |
12 | 17 |
13 class Gtf( Datatype ): | 18 class Gtf( Datatype ): |
14 def __init__( self, input_gtf_false_path, data_gtf): | 19 def __init__( self, input_gtf_false_path, data_gtf): |
15 | 20 |
16 super(Gtf, self).__init__() | 21 super(Gtf, self).__init__() |
30 genePredFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred") | 35 genePredFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred") |
31 unsorted_bigGenePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsorted.bigGenePred") | 36 unsorted_bigGenePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsorted.bigGenePred") |
32 sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred") | 37 sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred") |
33 | 38 |
34 # GtfToGenePred | 39 # GtfToGenePred |
40 ## Checking the integrity of the inputs | |
41 modified_gtf = self._checkAndFixGtf() | |
42 | |
43 ## Processing the gtf | |
35 subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name) | 44 subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name) |
36 | 45 |
37 # TODO: From there, refactor because common use with Gff3.py | 46 # TODO: From there, refactor because common use with Gff3.py |
38 # genePredToBigGenePred processing | 47 # genePredToBigGenePred processing |
39 subtools.genePredToBigGenePred(genePredFile.name, unsorted_bigGenePred_file.name) | 48 subtools.genePredToBigGenePred(genePredFile.name, unsorted_bigGenePred_file.name) |
64 visibility='dense', priority=self.priority, | 73 visibility='dense', priority=self.priority, |
65 track_file=myBigBedFilePath, | 74 track_file=myBigBedFilePath, |
66 track_color=self.track_color, | 75 track_color=self.track_color, |
67 group_name=self.group_name) | 76 group_name=self.group_name) |
68 | 77 |
69 print("- Gtf %s created" % self.name_gtf) | 78 # TODO: Use Logging instead of print |
79 if modified_gtf.is_modified: | |
80 print("- Warning: Gtf %s created with a modified version of your Gtf because of start/end coordinates issues." | |
81 % self.name_gtf) | |
82 print("Here are the lines removed: " + modified_gtf.get_str_modified_lines()) | |
83 else: | |
84 print("- Gtf %s created" % self.name_gtf) | |
85 | |
86 def _checkAndFixGtf(self): | |
87 """ | |
88 _checkAndFixGtf checks the integrity of the gtf file: | |
89 if coordinates exceed the chromosome size, either remove the whole line(s) or truncate them to the end of the scaffold, | |
90 depending on the user choice | |
91 default: remove the whole line(s) | |
92 """ | |
93 # Set the boolean telling if we had to modify the file | |
94 modified_gtf = InfoModifiedGtf() | |
95 | |
96 # Create a temp gtf just in case we have issues | |
97 temp_gtf = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gtf", delete=False) | |
98 | |
99 # TODO: Get the user choice and use it | |
100 # TODO: Check if the start > 0 and the end <= chromosome size | |
101 # Get the chrom.sizes into a dictionary to have a faster access | |
102 # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary | |
103 dict_chrom_sizes = {} | |
104 with open(self.chromSizesFile.name, 'r') as chromSizes: | |
105 lines = chromSizes.readlines() | |
106 for line in lines: | |
107 fields = line.split() | |
108 # fields[0] should be the name of the scaffold | |
109 # fields[1] should be the size of the scaffold | |
110 # TODO: Ensure this is true for all lines | |
111 dict_chrom_sizes[fields[0]] = fields[1] | |
112 | |
113 # Parse the GTF and check each line using the chrom sizes dictionary | |
114 with open(temp_gtf.name, 'a+') as tmp: | |
115 with open(self.input_gtf_false_path, 'r') as gtf: | |
116 lines = gtf.readlines() | |
117 for index, line in enumerate(lines): | |
118 # If this is not a comment, we check the fields | |
119 if not line.startswith('#'): | |
120 fields = line.split() | |
121 # We are interested in fields[0] => Seqname (scaffold) | |
122 # We are interested in fields[3] => Start of the feature on the scaffold | |
123 # We are interested in fields[4] => End of the feature on the scaffold | |
124 scaffold_size = dict_chrom_sizes[fields[0]] | |
125 start_position = fields[3] | |
126 end_position = fields[4] | |
127 | |
128 if start_position > 0 and end_position <= scaffold_size: | |
129 # We are good, so we copy this line | |
130 tmp.write(line) | |
131 tmp.write(os.linesep) | |
132 | |
133 | |
134 # The sequence is not good, we are going to process it regarding the user choice | |
135 # TODO: Process the user choice | |
136 # By default, we are assuming the user choice is to remove the lines: We don't copy it | |
137 | |
138 # If we are here, it means the gtf has been modified | |
139 else: | |
140 # We save the line for the feedback to the user | |
141 modified_gtf.array_modified_lines.append(index + 1) | |
142 | |
143 if modified_gtf.is_modified is False: | |
144 modified_gtf.is_modified = True | |
145 else: | |
146 pass | |
147 else: | |
148 tmp.write(line) | |
149 tmp.write(os.linesep) | |
150 | |
151 # Once the process is completed, we just replace the path of the gtf | |
152 self.input_gtf_false_path = temp_gtf.name | |
153 | |
154 # TODO: Manage the issue with the fact the dataset is going to still exist on the disk because of delete=False | |
155 | |
156 return modified_gtf |