comparison gstf_preparation.py @ 0:28879ca33b5f draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
author earlhaminst
date Wed, 15 Mar 2017 20:18:57 -0400
parents
children a36645976342
comparison
equal deleted inserted replaced
-1:000000000000 0:28879ca33b5f
1 from __future__ import print_function
2
3 import collections
4 import json
5 import optparse
6 import sqlite3
7 import sys
8
9 version = "0.3.0"
10 gene_count = 0
11
12 Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
13
14
15 def FASTAReader_gen(fasta_filename):
16 with open(fasta_filename) as fasta_file:
17 line = fasta_file.readline()
18 while True:
19 if not line:
20 return
21 assert line.startswith('>'), "FASTA headers must start with >"
22 header = line.rstrip()
23 sequence_parts = []
24 line = fasta_file.readline()
25 while line and line[0] != '>':
26 sequence_parts.append(line.rstrip())
27 line = fasta_file.readline()
28 sequence = "\n".join(sequence_parts)
29 yield Sequence(header, sequence)
30
31
32 def create_tables(conn):
33 cur = conn.cursor()
34
35 cur.execute('''CREATE TABLE meta (
36 version VARCHAR PRIMARY KEY NOT NULL)''')
37
38 cur.execute('INSERT INTO meta (version) VALUES (?)',
39 (version, ))
40
41 cur.execute('''CREATE TABLE gene (
42 gene_id VARCHAR PRIMARY KEY NOT NULL,
43 gene_symbol VARCHAR,
44 species VARCHAR NOT NULL,
45 gene_json VARCHAR NOT NULL)''')
46 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')
47
48 cur.execute('''CREATE TABLE transcript (
49 transcript_id VARCHAR PRIMARY KEY NOT NULL,
50 protein_id VARCHAR UNIQUE,
51 protein_sequence VARCHAR,
52 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')
53
54 cur.execute('''CREATE VIEW transcript_species AS
55 SELECT transcript_id, species
56 FROM transcript JOIN gene
57 ON transcript.gene_id = gene.gene_id''')
58
59 conn.commit()
60
61
62 def remove_type_from_list_of_ids(l):
63 return ','.join(remove_type_from_id(_) for _ in l.split(','))
64
65
66 def remove_type_from_id(id_):
67 colon_index = id_.find(':')
68 if colon_index >= 0:
69 return id_[colon_index + 1:]
70 else:
71 return id_
72
73
74 def feature_to_dict(cols, parent_dict=None):
75 d = {
76 'end': int(cols[4]),
77 'start': int(cols[3]),
78 }
79 for attr in cols[8].split(';'):
80 if '=' in attr:
81 (tag, value) = attr.split('=')
82 if tag == 'ID':
83 tag = 'id'
84 value = remove_type_from_id(value)
85 elif tag == 'Parent':
86 value = remove_type_from_list_of_ids(value)
87 d[tag] = value
88 if cols[6] == '+':
89 d['strand'] = 1
90 elif cols[6] == '-':
91 d['strand'] = -1
92 else:
93 raise Exception("Unrecognized strand '%s'" % cols[6])
94 if parent_dict is not None and 'Parent' in d:
95 # a 3' UTR can be split among multiple exons
96 # a 5' UTR can be split among multiple exons
97 # a CDS can be part of multiple transcripts
98 for parent in d['Parent'].split(','):
99 if parent not in parent_dict:
100 parent_dict[parent] = [d]
101 else:
102 parent_dict[parent].append(d)
103 return d
104
105
106 def add_gene_to_dict(cols, species, gene_dict):
107 global gene_count
108 gene = feature_to_dict(cols)
109 gene.update({
110 'member_id': gene_count,
111 'object_type': 'Gene',
112 'seq_region_name': cols[0],
113 'species': species,
114 'Transcript': [],
115 'display_name': gene['Name']
116 })
117 if gene['id']:
118 gene_dict[gene['id']] = gene
119 gene_count = gene_count + 1
120
121
122 def add_transcript_to_dict(cols, species, transcript_dict):
123 transcript = feature_to_dict(cols)
124 transcript.update({
125 'object_type': 'Transcript',
126 'seq_region_name': cols[0],
127 'species': species,
128 })
129 transcript_dict[transcript['id']] = transcript
130
131
132 def add_exon_to_dict(cols, species, exon_parent_dict):
133 exon = feature_to_dict(cols, exon_parent_dict)
134 exon.update({
135 'length': int(cols[4]) - int(cols[3]) + 1,
136 'object_type': 'Exon',
137 'seq_region_name': cols[0],
138 'species': species,
139 })
140 if 'id' not in exon and 'Name' in exon:
141 exon['id'] = exon['Name']
142
143
144 def add_cds_to_dict(cols, cds_parent_dict):
145 cds = feature_to_dict(cols, cds_parent_dict)
146 if 'id' not in cds:
147 if 'Name' in cds:
148 cds['id'] = cds['Name']
149 elif 'Parent' in cds and ',' not in cds['Parent']:
150 cds['id'] = cds['Parent']
151
152
153 def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
154 for parent, exon_list in exon_parent_dict.items():
155 if parent in transcript_dict:
156 exon_list.sort(key=lambda _: _['start'])
157 transcript_dict[parent]['Exon'] = exon_list
158
159 for transcript_id, transcript in transcript_dict.items():
160 translation = {
161 'CDS': [],
162 'id': None,
163 'end': transcript['end'],
164 'object_type': 'Translation',
165 'species': transcript['species'],
166 'start': transcript['start'],
167 }
168 found_cds = False
169 derived_translation_start = None
170 derived_translation_end = None
171 if transcript_id in cds_parent_dict:
172 cds_list = cds_parent_dict[transcript_id]
173 cds_ids = set(_['id'] for _ in cds_list)
174 if len(cds_ids) > 1:
175 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent)
176 translation['id'] = cds_ids.pop()
177 cds_list.sort(key=lambda _: _['start'])
178 translation['CDS'] = cds_list
179 translation['start'] = cds_list[0]['start']
180 translation['end'] = cds_list[-1]['end']
181 found_cds = True
182 if transcript_id in five_prime_utr_parent_dict:
183 five_prime_utr_list = five_prime_utr_parent_dict[transcript_id]
184 five_prime_utr_list.sort(key=lambda _: _['start'])
185 if transcript['strand'] == 1:
186 derived_translation_start = five_prime_utr_list[-1]['end'] + 1
187 else:
188 derived_translation_end = five_prime_utr_list[0]['start'] - 1
189 if transcript_id in three_prime_utr_parent_dict:
190 three_prime_utr_list = three_prime_utr_parent_dict[transcript_id]
191 three_prime_utr_list.sort(key=lambda _: _['start'])
192 if transcript['strand'] == 1:
193 derived_translation_end = three_prime_utr_list[0]['start'] - 1
194 else:
195 derived_translation_start = three_prime_utr_list[-1]['end'] + 1
196 if derived_translation_start is not None:
197 if found_cds:
198 if derived_translation_start > translation['start']:
199 raise Exception("UTR overlaps with CDS")
200 else:
201 translation['start'] = derived_translation_start
202 if derived_translation_end is not None:
203 if found_cds:
204 if derived_translation_end < translation['end']:
205 raise Exception("UTR overlaps with CDS")
206 else:
207 translation['end'] = derived_translation_end
208 if found_cds or derived_translation_start is not None or derived_translation_end is not None:
209 transcript['Translation'] = translation
210
211 for transcript in transcript_dict.values():
212 if 'Parent' in transcript:
213 # A polycistronic transcript can have multiple parents
214 for parent in transcript['Parent'].split(','):
215 if parent in gene_dict:
216 gene_dict[parent]['Transcript'].append(transcript)
217
218
219 def write_gene_dict_to_db(conn, gene_dict):
220 cur = conn.cursor()
221
222 for gene in gene_dict.values():
223 gene_id = gene['id']
224 cur.execute('INSERT INTO gene (gene_id, gene_symbol, species, gene_json) VALUES (?, ?, ?, ?)',
225 (gene_id, gene.get('display_name', None), gene['species'], json.dumps(gene)))
226
227 if "Transcript" in gene:
228 for transcript in gene["Transcript"]:
229 transcript_id = transcript['id']
230 protein_id = transcript.get('Translation', {}).get('id', None)
231 try:
232 cur.execute('INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)',
233 (transcript_id, protein_id, gene_id))
234 except Exception as e:
235 raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e))
236
237 conn.commit()
238
239
240 def fetch_species_for_transcript(conn, transcript_id):
241 cur = conn.cursor()
242
243 cur.execute('SELECT species FROM transcript_species WHERE transcript_id=?',
244 (transcript_id, ))
245 results = cur.fetchone()
246 if not results:
247 return None
248 return results[0]
249
250
251 def remove_id_version(s):
252 """
253 Remove the optional '.VERSION' from an Ensembl id.
254 """
255 if s.startswith('ENS'):
256 return s.split('.')[0]
257 else:
258 return s
259
260
261 def __main__():
262 parser = optparse.OptionParser()
263 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
264 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
265 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
266 parser.add_option('-o', '--output', help='Path of the output SQLite file')
267 parser.add_option('--of', help='Path of the output FASTA file')
268 options, args = parser.parse_args()
269 if args:
270 raise Exception('Use options to provide inputs')
271
272 conn = sqlite3.connect(options.output)
273 conn.execute('PRAGMA foreign_keys = ON')
274 create_tables(conn)
275
276 for gff3_arg in options.gff3:
277 try:
278 (species, filename) = gff3_arg.split(':')
279 except ValueError:
280 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg)
281 gene_dict = dict()
282 transcript_dict = dict()
283 exon_parent_dict = dict()
284 cds_parent_dict = dict()
285 five_prime_utr_parent_dict = dict()
286 three_prime_utr_parent_dict = dict()
287 with open(filename) as f:
288 for i, line in enumerate(f, start=1):
289 line = line.strip()
290 if not line:
291 # skip empty lines
292 continue
293 if line[0] == '#':
294 # skip comment lines
295 continue
296 cols = line.split('\t')
297 if len(cols) != 9:
298 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line))
299 feature_type = cols[2]
300 try:
301 if feature_type == 'gene':
302 add_gene_to_dict(cols, species, gene_dict)
303 elif feature_type in ('mRNA', 'transcript'):
304 add_transcript_to_dict(cols, species, transcript_dict)
305 elif feature_type == 'exon':
306 add_exon_to_dict(cols, species, exon_parent_dict)
307 elif feature_type == 'five_prime_UTR':
308 feature_to_dict(cols, five_prime_utr_parent_dict)
309 elif feature_type == 'three_prime_UTR':
310 feature_to_dict(cols, three_prime_utr_parent_dict)
311 elif feature_type == 'CDS':
312 add_cds_to_dict(cols, cds_parent_dict)
313 else:
314 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr)
315 except Exception as e:
316 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr)
317
318 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
319 write_gene_dict_to_db(conn, gene_dict)
320
321 for json_arg in options.json:
322 with open(json_arg) as f:
323 write_gene_dict_to_db(conn, json.load(f))
324
325 with open(options.of, 'w') as output_fasta_file:
326 for fasta_arg in options.fasta:
327 for entry in FASTAReader_gen(fasta_arg):
328 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
329 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
330 species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
331 if not species_for_transcript:
332 print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr)
333 continue
334 # Remove any underscore in the species
335 species_for_transcript = species_for_transcript.replace('_', '')
336 # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest
337 output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence))
338
339 conn.close()
340
341
342 if __name__ == '__main__':
343 __main__()