Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 13:51a7a2a82902 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
author | earlhaminst |
---|---|
date | Tue, 06 Oct 2020 17:10:37 +0000 |
parents | 99bae410128c |
children | 598e9172b8e7 |
comparison
equal
deleted
inserted
replaced
12:99bae410128c | 13:51a7a2a82902 |
---|---|
1 from __future__ import print_function | |
2 | |
3 import json | 1 import json |
4 import optparse | 2 import optparse |
5 import os | 3 import os |
6 import sqlite3 | 4 import sqlite3 |
7 import sys | 5 import sys |
8 | 6 |
9 version = "0.5.0" | 7 version = "0.5.0" |
10 gene_count = 0 | 8 gene_count = 0 |
11 | 9 |
12 | 10 |
13 class Sequence(object): | 11 def asbool(val): |
12 if isinstance(val, str): | |
13 val_lower = val.strip().lower() | |
14 if val_lower in ('true', '1'): | |
15 return True | |
16 elif val_lower in ('false', '0'): | |
17 return False | |
18 else: | |
19 raise ValueError(f"Cannot convert {val} to bool") | |
20 else: | |
21 return bool(val) | |
22 | |
23 | |
24 class Sequence: | |
14 def __init__(self, header, sequence_parts): | 25 def __init__(self, header, sequence_parts): |
15 self.header = header | 26 self.header = header |
16 self.sequence_parts = sequence_parts | 27 self.sequence_parts = sequence_parts |
17 self._sequence = None | 28 self._sequence = None |
18 | 29 |
202 found_cds = False | 213 found_cds = False |
203 derived_translation_start = None | 214 derived_translation_start = None |
204 derived_translation_end = None | 215 derived_translation_end = None |
205 if transcript_id in cds_parent_dict: | 216 if transcript_id in cds_parent_dict: |
206 cds_list = cds_parent_dict[transcript_id] | 217 cds_list = cds_parent_dict[transcript_id] |
207 cds_ids = set(_['id'] for _ in cds_list) | 218 cds_ids = {_['id'] for _ in cds_list} |
208 if len(cds_ids) > 1: | 219 if len(cds_ids) > 1: |
209 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id) | 220 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id) |
210 cds_id = cds_ids.pop() | 221 cds_id = cds_ids.pop() |
211 translation['id'] = cds_id | 222 translation['id'] = cds_id |
212 cds_list.sort(key=lambda _: _['start']) | 223 cds_list.sort(key=lambda _: _['start']) |
229 else: | 240 else: |
230 derived_translation_start = three_prime_utr_list[-1]['end'] + 1 | 241 derived_translation_start = three_prime_utr_list[-1]['end'] + 1 |
231 if derived_translation_start is not None: | 242 if derived_translation_start is not None: |
232 if found_cds: | 243 if found_cds: |
233 if derived_translation_start > translation['start']: | 244 if derived_translation_start > translation['start']: |
234 raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id)) | 245 raise Exception(f"Transcript {transcript_id} has the start of CDS {cds_id} overlapping with the UTR end") |
235 else: | 246 else: |
236 translation['start'] = derived_translation_start | 247 translation['start'] = derived_translation_start |
237 if derived_translation_end is not None: | 248 if derived_translation_end is not None: |
238 if found_cds: | 249 if found_cds: |
239 if derived_translation_end < translation['end']: | 250 if derived_translation_end < translation['end']: |
240 raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id)) | 251 raise Exception(f"Transcript {transcript_id} has the end of CDS {cds_id} overlapping with the UTR start") |
241 else: | 252 else: |
242 translation['end'] = derived_translation_end | 253 translation['end'] = derived_translation_end |
243 if found_cds or derived_translation_start is not None or derived_translation_end is not None: | 254 if found_cds or derived_translation_start is not None or derived_translation_end is not None: |
244 transcript['Translation'] = translation | 255 transcript['Translation'] = translation |
245 | 256 |
257 for gene in gene_dict.values(): | 268 for gene in gene_dict.values(): |
258 if gene is None: | 269 if gene is None: |
259 # This can happen when loading a JSON file from Ensembl | 270 # This can happen when loading a JSON file from Ensembl |
260 continue | 271 continue |
261 if 'confidence' in gene and gene['confidence'].lower() != 'high': | 272 if 'confidence' in gene and gene['confidence'].lower() != 'high': |
262 print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr) | 273 print("Gene {} has confidence {} (not high), discarding".format(gene['id'], gene['confidence']), file=sys.stderr) |
263 continue | 274 continue |
264 gene_id = gene['id'] | 275 gene_id = gene['id'] |
265 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', | 276 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', |
266 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene))) | 277 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene))) |
267 | 278 |
269 for transcript in gene["Transcript"]: | 280 for transcript in gene["Transcript"]: |
270 transcript_id = transcript['id'] | 281 transcript_id = transcript['id'] |
271 transcript_symbol = transcript.get('display_name') | 282 transcript_symbol = transcript.get('display_name') |
272 protein_id = transcript.get('Translation', {}).get('id') | 283 protein_id = transcript.get('Translation', {}).get('id') |
273 biotype = transcript.get('biotype') | 284 biotype = transcript.get('biotype') |
274 is_canonical = transcript.get('is_canonical', False) | 285 is_canonical = asbool(transcript.get('is_canonical', False)) |
275 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) | 286 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) |
276 try: | 287 try: |
277 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', | 288 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', |
278 to_insert) | 289 to_insert) |
279 except Exception as e: | 290 except Exception as e: |
280 raise Exception("Error while inserting %s into transcript table: %s" % (str(to_insert), e)) | 291 raise Exception("Error while inserting {} into transcript table: {}".format(str(to_insert), e)) |
281 | 292 |
282 conn.commit() | 293 conn.commit() |
283 | 294 |
284 | 295 |
285 def remove_id_version(s, force=False): | 296 def remove_id_version(s, force=False): |
395 if transcript: | 406 if transcript: |
396 force_remove_id_version = True | 407 force_remove_id_version = True |
397 force_remove_id_version_file_list.append(fasta_arg) | 408 force_remove_id_version_file_list.append(fasta_arg) |
398 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) | 409 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) |
399 if not transcript: | 410 if not transcript: |
400 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 411 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) |
401 continue | 412 continue |
402 if options.filter != 'canonical': | 413 if options.filter != 'canonical': |
403 break | 414 break |
404 found_gene_transcript = True | 415 found_gene_transcript = True |
405 | 416 |
432 for entry in FASTAReader_gen(fasta_arg): | 443 for entry in FASTAReader_gen(fasta_arg): |
433 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) | 444 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) |
434 | 445 |
435 transcript = fetch_transcript_and_gene(conn, transcript_id) | 446 transcript = fetch_transcript_and_gene(conn, transcript_id) |
436 if not transcript: | 447 if not transcript: |
437 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 448 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) |
438 continue | 449 continue |
439 | 450 |
440 if options.filter == 'canonical': | 451 if options.filter == 'canonical': |
441 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict | 452 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict |
442 if transcript_id not in selected_transcript_ids: | 453 if transcript_id not in selected_transcript_ids: |
443 continue | 454 continue |
444 elif options.filter == 'coding': | 455 elif options.filter == 'coding': |
445 if len(entry.sequence) % 3 != 0: | 456 if len(entry.sequence) % 3 != 0: |
446 print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr) | 457 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' has a coding sequence length which is not multiple of 3, removing from FASTA output", file=sys.stderr) |
447 continue | 458 continue |
448 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene | 459 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene |
449 if transcript_biotype and transcript_biotype != 'protein_coding': | 460 if transcript_biotype and transcript_biotype != 'protein_coding': |
450 print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr) | 461 print(f"Transcript {transcript_id} has biotype {transcript_biotype} (not protein-coding), removing from FASTA output", file=sys.stderr) |
451 continue | 462 continue |
452 | 463 |
453 if options.headers == "TranscriptId_species": | 464 if options.headers == "TranscriptId_species": |
454 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest | 465 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
455 # Remove any underscore in the species | 466 # Remove any underscore in the species |
456 entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', '')) | 467 entry.header = ">{}_{}".format(transcript_id, transcript['species'].replace('_', '')) |
457 elif options.headers == "TranscriptID-GeneSymbol_species": | 468 elif options.headers == "TranscriptID-GeneSymbol_species": |
458 # Remove any underscore in the species | 469 # Remove any underscore in the species |
459 entry.header = ">%s-%s_%s" % (transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', '')) | 470 entry.header = ">{}-{}_{}".format(transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', '')) |
460 elif options.headers == "TranscriptID-TranscriptSymbol_species": | 471 elif options.headers == "TranscriptID-TranscriptSymbol_species": |
461 # Remove any underscore in the species | 472 # Remove any underscore in the species |
462 entry.header = ">%s-%s_%s" % (transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', '')) | 473 entry.header = ">{}-{}_{}".format(transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', '')) |
463 | 474 |
464 if transcript['seq_region_name'].lower() in regions: | 475 if transcript['seq_region_name'].lower() in regions: |
465 entry.print(filtered_fasta_file) | 476 entry.print(filtered_fasta_file) |
466 else: | 477 else: |
467 entry.print(output_fasta_file) | 478 entry.print(output_fasta_file) |