Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 11:dbe37a658cd2 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
author | earlhaminst |
---|---|
date | Sun, 27 Sep 2020 18:54:31 +0000 |
parents | e8e75a79de59 |
children | 99bae410128c |
comparison
equal
deleted
inserted
replaced
10:e8e75a79de59 | 11:dbe37a658cd2 |
---|---|
4 import optparse | 4 import optparse |
5 import os | 5 import os |
6 import sqlite3 | 6 import sqlite3 |
7 import sys | 7 import sys |
8 | 8 |
9 version = "0.4.0" | 9 version = "0.5.0" |
10 gene_count = 0 | 10 gene_count = 0 |
11 | 11 |
12 | 12 |
13 class Sequence(object): | 13 class Sequence(object): |
14 def __init__(self, header, sequence_parts): | 14 def __init__(self, header, sequence_parts): |
59 seq_region_name VARCHAR NOT NULL, | 59 seq_region_name VARCHAR NOT NULL, |
60 seq_region_start INTEGER NOT NULL, | 60 seq_region_start INTEGER NOT NULL, |
61 seq_region_end INTEGER NOT NULL, | 61 seq_region_end INTEGER NOT NULL, |
62 seq_region_strand INTEGER NOT NULL, | 62 seq_region_strand INTEGER NOT NULL, |
63 species VARCHAR NOT NULL, | 63 species VARCHAR NOT NULL, |
64 biotype VARCHAR, | |
64 gene_json VARCHAR NOT NULL)''') | 65 gene_json VARCHAR NOT NULL)''') |
65 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') | 66 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') |
66 | 67 |
67 cur.execute('''CREATE TABLE transcript ( | 68 cur.execute('''CREATE TABLE transcript ( |
68 transcript_id VARCHAR PRIMARY KEY NOT NULL, | 69 transcript_id VARCHAR PRIMARY KEY NOT NULL, |
70 transcript_symbol VARCHAR, | |
69 protein_id VARCHAR UNIQUE, | 71 protein_id VARCHAR UNIQUE, |
70 protein_sequence VARCHAR, | 72 protein_sequence VARCHAR, |
73 biotype VARCHAR, | |
74 is_canonical BOOLEAN NOT NULL DEFAULT FALSE, | |
71 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') | 75 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') |
72 | 76 |
73 cur.execute('''CREATE VIEW transcript_species AS | 77 # The following temporary view is not used in GAFA, so schema changes to it |
74 SELECT transcript_id, species, seq_region_name | 78 # don't require a meta version upgrade. |
79 cur.execute('''CREATE TEMPORARY VIEW transcript_join_gene AS | |
80 SELECT transcript_id, transcript_symbol, COALESCE(transcript.biotype, gene.biotype) AS biotype, is_canonical, gene_id, gene_symbol, seq_region_name, species | |
75 FROM transcript JOIN gene | 81 FROM transcript JOIN gene |
76 ON transcript.gene_id = gene.gene_id''') | 82 USING (gene_id)''') |
77 | 83 |
78 conn.commit() | 84 conn.commit() |
79 | 85 |
80 | 86 |
81 def remove_type_from_list_of_ids(l): | 87 def fetch_transcript_and_gene(conn, transcript_id): |
82 return ','.join(remove_type_from_id(_) for _ in l.split(',')) | 88 cur = conn.cursor() |
89 | |
90 cur.execute('SELECT * FROM transcript_join_gene WHERE transcript_id=?', | |
91 (transcript_id, )) | |
92 return cur.fetchone() | |
93 | |
94 | |
95 def remove_type_from_list_of_ids(ids): | |
96 return ','.join(remove_type_from_id(id_) for id_ in ids.split(',')) | |
83 | 97 |
84 | 98 |
85 def remove_type_from_id(id_): | 99 def remove_type_from_id(id_): |
86 colon_index = id_.find(':') | 100 colon_index = id_.find(':') |
87 if colon_index >= 0: | 101 if colon_index >= 0: |
101 if tag == 'ID': | 115 if tag == 'ID': |
102 tag = 'id' | 116 tag = 'id' |
103 value = remove_type_from_id(value) | 117 value = remove_type_from_id(value) |
104 elif tag == 'Parent': | 118 elif tag == 'Parent': |
105 value = remove_type_from_list_of_ids(value) | 119 value = remove_type_from_list_of_ids(value) |
120 elif tag == 'representative': | |
121 tag = 'is_canonical' | |
106 d[tag] = value | 122 d[tag] = value |
107 if cols[6] == '+': | 123 if cols[6] == '+': |
108 d['strand'] = 1 | 124 d['strand'] = 1 |
109 elif cols[6] == '-': | 125 elif cols[6] == '-': |
110 d['strand'] = -1 | 126 d['strand'] = -1 |
120 | 136 |
121 | 137 |
122 def add_gene_to_dict(cols, species, gene_dict): | 138 def add_gene_to_dict(cols, species, gene_dict): |
123 global gene_count | 139 global gene_count |
124 gene = feature_to_dict(cols) | 140 gene = feature_to_dict(cols) |
141 if not gene['id']: | |
142 raise Exception("Id not found among column 9 attribute tags: %s" % cols[8]) | |
125 gene.update({ | 143 gene.update({ |
126 'member_id': gene_count, | 144 'member_id': gene_count, |
127 'object_type': 'Gene', | 145 'object_type': 'Gene', |
128 'seq_region_name': cols[0], | 146 'seq_region_name': cols[0], |
129 'species': species, | 147 'species': species, |
130 'Transcript': [], | 148 'Transcript': [], |
131 'display_name': gene.get('Name', None) | 149 'display_name': gene.get('Name'), |
132 }) | 150 }) |
133 if gene['id']: | 151 gene_dict[gene['id']] = gene |
134 gene_dict[gene['id']] = gene | 152 gene_count = gene_count + 1 |
135 gene_count = gene_count + 1 | |
136 | 153 |
137 | 154 |
138 def add_transcript_to_dict(cols, species, transcript_dict): | 155 def add_transcript_to_dict(cols, species, transcript_dict): |
139 transcript = feature_to_dict(cols) | 156 transcript = feature_to_dict(cols) |
140 if 'biotype' in transcript and transcript['biotype'] != 'protein_coding': | |
141 return | |
142 transcript.update({ | 157 transcript.update({ |
143 'object_type': 'Transcript', | 158 'object_type': 'Transcript', |
144 'seq_region_name': cols[0], | 159 'seq_region_name': cols[0], |
145 'species': species, | 160 'species': species, |
161 'display_name': transcript.get('Name'), | |
146 }) | 162 }) |
147 transcript_dict[transcript['id']] = transcript | 163 transcript_dict[transcript['id']] = transcript |
148 | 164 |
149 | 165 |
150 def add_exon_to_dict(cols, species, exon_parent_dict): | 166 def add_exon_to_dict(cols, species, exon_parent_dict): |
240 | 256 |
241 for gene in gene_dict.values(): | 257 for gene in gene_dict.values(): |
242 if gene is None: | 258 if gene is None: |
243 # This can happen when loading a JSON file from Ensembl | 259 # This can happen when loading a JSON file from Ensembl |
244 continue | 260 continue |
261 if 'confidence' in gene and gene['confidence'] != 'high': | |
262 print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr) | |
263 continue | |
245 gene_id = gene['id'] | 264 gene_id = gene['id'] |
246 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)', | 265 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', |
247 (gene_id, gene.get('display_name', None), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], json.dumps(gene))) | 266 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene))) |
248 | 267 |
249 if "Transcript" in gene: | 268 if "Transcript" in gene: |
250 for transcript in gene["Transcript"]: | 269 for transcript in gene["Transcript"]: |
251 transcript_id = transcript['id'] | 270 transcript_id = transcript['id'] |
252 protein_id = transcript.get('Translation', {}).get('id', None) | 271 transcript_symbol = transcript.get('display_name') |
272 protein_id = transcript.get('Translation', {}).get('id') | |
273 biotype = transcript.get('biotype') | |
274 is_canonical = transcript.get('is_canonical', False) | |
275 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) | |
253 try: | 276 try: |
254 cur.execute('INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)', | 277 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', |
255 (transcript_id, protein_id, gene_id)) | 278 to_insert) |
256 except Exception as e: | 279 except Exception as e: |
257 raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e)) | 280 raise Exception("Error while inserting %s into transcript table: %s" % (str(to_insert), e)) |
258 | 281 |
259 conn.commit() | 282 conn.commit() |
260 | |
261 | |
262 def fetch_species_and_seq_region_for_transcript(conn, transcript_id): | |
263 cur = conn.cursor() | |
264 | |
265 cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?', | |
266 (transcript_id, )) | |
267 row = cur.fetchone() | |
268 if not row: | |
269 return (None, None) | |
270 return row | |
271 | |
272 | |
273 def fetch_gene_id_for_transcript(conn, transcript_id): | |
274 cur = conn.cursor() | |
275 | |
276 cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?', | |
277 (transcript_id, )) | |
278 row = cur.fetchone() | |
279 if not row: | |
280 return None | |
281 return row[0] | |
282 | 283 |
283 | 284 |
284 def remove_id_version(s, force=False): | 285 def remove_id_version(s, force=False): |
285 """ | 286 """ |
286 Remove the optional '.VERSION' from an id if it's an Ensembl id or if | 287 Remove the optional '.VERSION' from an id if it's an Ensembl id or if |
295 def __main__(): | 296 def __main__(): |
296 parser = optparse.OptionParser() | 297 parser = optparse.OptionParser() |
297 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') | 298 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') |
298 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') | 299 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') |
299 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') | 300 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') |
300 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') | 301 parser.add_option('--filter', type='choice', choices=['canonical', 'coding', ''], default='', help='Which transcripts to keep') |
301 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') | 302 parser.add_option('--headers', type='choice', |
303 choices=['TranscriptId_species', 'GeneSymbol-TranscriptID_species', 'TranscriptSymbol-TranscriptID_species', ''], | |
304 default='', help='Change the header line of the FASTA sequences to this format') | |
302 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') | 305 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') |
303 parser.add_option('-o', '--output', help='Path of the output SQLite file') | 306 parser.add_option('-o', '--output', help='Path of the output SQLite file') |
304 parser.add_option('--of', help='Path of the output FASTA file') | 307 parser.add_option('--of', help='Path of the output FASTA file') |
305 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file') | 308 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file') |
306 | 309 |
307 options, args = parser.parse_args() | 310 options, args = parser.parse_args() |
308 if args: | 311 if args: |
309 raise Exception('Use options to provide inputs') | 312 raise Exception('Use options to provide inputs') |
310 | 313 |
311 conn = sqlite3.connect(options.output) | 314 conn = sqlite3.connect(options.output) |
315 conn.row_factory = sqlite3.Row | |
312 conn.execute('PRAGMA foreign_keys = ON') | 316 conn.execute('PRAGMA foreign_keys = ON') |
313 create_tables(conn) | 317 create_tables(conn) |
314 | 318 |
315 for gff3_arg in options.gff3: | 319 for gff3_arg in options.gff3: |
316 try: | 320 try: |
369 write_gene_dict_to_db(conn, json.load(f)) | 373 write_gene_dict_to_db(conn, json.load(f)) |
370 | 374 |
371 # Read the FASTA files a first time to: | 375 # Read the FASTA files a first time to: |
372 # - determine for each file if we need to force the removal of the version | 376 # - determine for each file if we need to force the removal of the version |
373 # from the transcript id | 377 # from the transcript id |
374 # - fill gene_transcripts_dict when keeping only the longest CDS per gene | 378 # - fill gene_transcripts_dict when keeping only the canonical transcripts |
375 force_remove_id_version_file_list = [] | 379 force_remove_id_version_file_list = [] |
376 gene_transcripts_dict = dict() | 380 gene_transcripts_dict = dict() |
377 for fasta_arg in options.fasta: | 381 for fasta_arg in options.fasta: |
378 force_remove_id_version = False | 382 force_remove_id_version = False |
379 found_gene_transcript = False | 383 found_gene_transcript = False |
380 for entry in FASTAReader_gen(fasta_arg): | 384 for entry in FASTAReader_gen(fasta_arg): |
381 # Extract the transcript id by removing everything after the first space and then removing the version if needed | 385 # Extract the transcript id by removing everything after the first space and then removing the version if needed |
382 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) | 386 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) |
383 | 387 |
384 if len(entry.sequence) % 3 != 0: | 388 transcript = fetch_transcript_and_gene(conn, transcript_id) |
385 continue | 389 if not transcript and not found_gene_transcript: |
386 | |
387 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) | |
388 if not gene_id and not found_gene_transcript: | |
389 # We have not found a proper gene transcript in this file yet, | 390 # We have not found a proper gene transcript in this file yet, |
390 # try to force the removal of the version from the transcript id | 391 # try to force the removal of the version from the transcript id |
391 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True) | 392 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True) |
392 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) | 393 transcript = fetch_transcript_and_gene(conn, transcript_id) |
393 # Remember that we need to force the removal for this file | 394 # Remember that we need to force the removal for this file |
394 if gene_id: | 395 if transcript: |
395 force_remove_id_version = True | 396 force_remove_id_version = True |
396 force_remove_id_version_file_list.append(fasta_arg) | 397 force_remove_id_version_file_list.append(fasta_arg) |
397 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) | 398 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) |
398 if not gene_id: | 399 if not transcript: |
399 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 400 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) |
400 continue | 401 continue |
401 if options.longestCDS: | 402 if options.filter != 'canonical': |
402 found_gene_transcript = True | 403 break |
404 found_gene_transcript = True | |
405 | |
406 if len(entry.sequence) % 3 != 0: | |
407 continue | |
408 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene | |
409 if transcript_biotype and transcript_biotype != 'protein_coding': | |
410 continue | |
411 gene_transcripts_dict.setdefault(transcript['gene_id'], []).append((transcript_id, transcript['is_canonical'], len(entry.sequence))) | |
412 | |
413 if options.filter == 'canonical': | |
414 selected_transcript_ids = [] | |
415 for gene_id, transcript_tuples in gene_transcripts_dict.items(): | |
416 canonical_transcript_ids = [id_ for (id_, is_canonical, _) in transcript_tuples if is_canonical] | |
417 if not canonical_transcript_ids: | |
418 # Select the transcript with the longest sequence. If more than | |
419 # one transcripts have the same longest sequence for a gene, the | |
420 # first one to appear in the FASTA file is selected. | |
421 selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0] | |
422 elif len(canonical_transcript_ids) > 1: | |
423 raise Exception("Gene %s has more than 1 canonical transcripts" % (gene_id)) | |
403 else: | 424 else: |
404 break | 425 selected_transcript_id = canonical_transcript_ids[0] |
405 | 426 selected_transcript_ids.append(selected_transcript_id) |
406 gene_transcripts_dict.setdefault(gene_id, []).append((transcript_id, len(entry.sequence))) | |
407 | |
408 if options.longestCDS: | |
409 # For each gene, select the transcript with the longest sequence. | |
410 # If more than one transcripts have the same longest sequence for a | |
411 # gene, the first one to appear in the FASTA file is selected | |
412 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] | |
413 | 427 |
414 regions = [_.strip().lower() for _ in options.regions.split(",")] | 428 regions = [_.strip().lower() for _ in options.regions.split(",")] |
415 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: | 429 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: |
416 for fasta_arg in options.fasta: | 430 for fasta_arg in options.fasta: |
417 force_remove_id_version = fasta_arg in force_remove_id_version_file_list | 431 force_remove_id_version = fasta_arg in force_remove_id_version_file_list |
418 for entry in FASTAReader_gen(fasta_arg): | 432 for entry in FASTAReader_gen(fasta_arg): |
419 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) | 433 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) |
420 if options.longestCDS and transcript_id not in selected_transcript_ids: | 434 |
421 continue | 435 transcript = fetch_transcript_and_gene(conn, transcript_id) |
422 | 436 if not transcript: |
423 if len(entry.sequence) % 3 != 0: | |
424 print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) | |
425 continue | |
426 | |
427 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) | |
428 if not species_for_transcript: | |
429 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 437 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) |
430 continue | 438 continue |
431 | 439 |
432 if options.headers: | 440 if options.filter == 'canonical': |
441 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict | |
442 if transcript_id not in selected_transcript_ids: | |
443 continue | |
444 elif options.filter == 'coding': | |
445 if len(entry.sequence) % 3 != 0: | |
446 print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr) | |
447 continue | |
448 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene | |
449 if transcript_biotype and transcript_biotype != 'protein_coding': | |
450 print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr) | |
451 continue | |
452 | |
453 if options.headers == "TranscriptId_species": | |
433 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest | 454 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
434 # Remove any underscore in the species | 455 # Remove any underscore in the species |
435 entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) | 456 entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', '')) |
436 | 457 elif options.headers == "GeneSymbol-TranscriptID_species": |
437 if seq_region_for_transcript.lower() in regions: | 458 # Remove any underscore in the species |
459 entry.header = ">%s-%s_%s" % (transcript['gene_symbol'], transcript_id, transcript['species'].replace('_', '')) | |
460 elif options.headers == "TranscriptSymbol-TranscriptID_species": | |
461 # Remove any underscore in the species | |
462 entry.header = ">%s-%s_%s" % (transcript['transcript_symbol'], transcript_id, transcript['species'].replace('_', '')) | |
463 | |
464 if transcript['seq_region_name'].lower() in regions: | |
438 entry.print(filtered_fasta_file) | 465 entry.print(filtered_fasta_file) |
439 else: | 466 else: |
440 entry.print(output_fasta_file) | 467 entry.print(output_fasta_file) |
441 | 468 |
442 conn.close() | 469 conn.close() |