Mercurial > repos > earlhaminst > gff3_to_json
diff gff3_to_json.py @ 1:befe6021e476 draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
author | earlhaminst |
---|---|
date | Tue, 28 Feb 2017 12:06:04 -0500 |
parents | be6cec883b02 |
children |
line wrap: on
line diff
--- a/gff3_to_json.py Wed Dec 21 10:02:59 2016 -0500 +++ b/gff3_to_json.py Tue Feb 28 12:06:04 2017 -0500 @@ -4,16 +4,22 @@ import optparse import sys -cds_parent_dict = dict() -exon_parent_dict = dict() -five_prime_utr_parent_dict = dict() gene_count = 0 -gene_dict = dict() -transcript_dict = dict() -three_prime_utr_parent_dict = dict() + + +def remove_type_from_list_of_ids(l): + return ','.join(remove_type_from_id(_) for _ in l.split(',')) -def feature_to_json(cols): +def remove_type_from_id(id_): + colon_index = id_.find(':') + if colon_index >= 0: + return id_[colon_index + 1:] + else: + return id_ + + +def feature_to_dict(cols, parent_dict=None): d = { 'end': int(cols[4]), 'start': int(cols[3]), @@ -22,21 +28,32 @@ if '=' in attr: (tag, value) = attr.split('=') if tag == 'ID': - d['id'] = value - else: - d[tag] = value + tag = 'id' + value = remove_type_from_id(value) + elif tag == 'Parent': + value = remove_type_from_list_of_ids(value) + d[tag] = value if cols[6] == '+': d['strand'] = 1 elif cols[6] == '-': d['strand'] = -1 else: raise Exception("Unrecognized strand '%s'" % cols[6]) + if parent_dict is not None and 'Parent' in d: + # a 3' UTR can be split among multiple exons + # a 5' UTR can be split among multiple exons + # a CDS can be part of multiple transcripts + for parent in d['Parent'].split(','): + if parent not in parent_dict: + parent_dict[parent] = [d] + else: + parent_dict[parent].append(d) return d -def gene_to_json(cols, species): +def add_gene_to_dict(cols, species, gene_dict): global gene_count - gene = feature_to_json(cols) + gene = feature_to_dict(cols) gene.update({ 'member_id': gene_count, 'object_type': 'Gene', @@ -48,8 +65,8 @@ gene_count = gene_count + 1 -def transcript_to_json(cols, species): - transcript = feature_to_json(cols) +def add_transcript_to_dict(cols, species, transcript_dict): + transcript = feature_to_dict(cols) transcript.update({ 'object_type': 'Transcript', 'seq_region_name': cols[0], @@ -58,8 +75,8 @@ transcript_dict[transcript['id']] = transcript -def exon_to_json(cols, species): - exon = feature_to_json(cols) +def add_exon_to_dict(cols, species, exon_parent_dict): + exon = feature_to_dict(cols, exon_parent_dict) exon.update({ 'length': int(cols[4]) - int(cols[3]) + 1, 'object_type': 'Exon', @@ -69,56 +86,20 @@ if 'id' not in exon and 'Name' in exon: exon['id'] = exon['Name'] - if 'Parent' in exon: - for parent in exon['Parent'].split(','): - if parent not in exon_parent_dict: - exon_parent_dict[parent] = [exon] - else: - exon_parent_dict[parent].append(exon) - -def five_prime_utr_to_json(cols): - five_prime_utr = feature_to_json(cols) - if 'Parent' in five_prime_utr: - for parent in five_prime_utr['Parent'].split(','): - # the 5' UTR can be split among multiple exons - if parent not in five_prime_utr_parent_dict: - five_prime_utr_parent_dict[parent] = [five_prime_utr] - else: - five_prime_utr_parent_dict[parent].append(five_prime_utr) - - -def three_prime_utr_to_json(cols): - three_prime_utr = feature_to_json(cols) - if 'Parent' in three_prime_utr: - for parent in three_prime_utr['Parent'].split(','): - # the 3' UTR can be split among multiple exons - if parent not in three_prime_utr_parent_dict: - three_prime_utr_parent_dict[parent] = [three_prime_utr] - else: - three_prime_utr_parent_dict[parent].append(three_prime_utr) - - -def cds_to_json(cols): - cds = feature_to_json(cols) +def add_cds_to_dict(cols, cds_parent_dict): + cds = feature_to_dict(cols, cds_parent_dict) if 'id' not in cds: if 'Name' in cds: cds['id'] = cds['Name'] - elif 'Parent' in cds: + elif 'Parent' in cds and ',' not in cds['Parent']: cds['id'] = cds['Parent'] - if 'Parent' in cds: - # At this point we are sure than 'id' is in cds - for parent in cds['Parent'].split(','): - if parent not in cds_parent_dict: - cds_parent_dict[parent] = [cds] - else: - cds_parent_dict[parent].append(cds) -def join_dicts(): +def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict): for parent, exon_list in exon_parent_dict.items(): - exon_list.sort(key=lambda _: _['start']) if parent in transcript_dict: + exon_list.sort(key=lambda _: _['start']) transcript_dict[parent]['Exon'] = exon_list for transcript_id, transcript in transcript_dict.items(): @@ -181,21 +162,19 @@ gene_dict[parent]['Transcript'].append(transcript) -def merge_dicts(json_arg): - with open(json_arg) as f: - dict_from_json = json.load(f) - gene_intersection = set(gene_dict.keys()) & set(dict_from_json.keys()) +def update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict): + gene_intersection = set(full_gene_dict.keys()) & set(gene_dict.keys()) if gene_intersection: - raise Exception("JSON file '%s' contains information for genes '%s', which are also present in other files" % (json_arg, ', '.join(gene_intersection))) - gene_dict.update(dict_from_json) + raise Exception("Information for genes '%s' are present in multiple files" % ', '.join(gene_intersection)) + full_gene_dict.update(gene_dict) -def write_json(outfile=None, sort_keys=False): +def write_json(full_gene_dict, outfile=None, sort_keys=False): if outfile: with open(outfile, 'w') as f: - json.dump(gene_dict, f, sort_keys=sort_keys) + json.dump(full_gene_dict, f, sort_keys=sort_keys) else: - print(json.dumps(gene_dict, indent=3, sort_keys=sort_keys)) + json.dump(full_gene_dict, sys.stdout, sort_keys=sort_keys) def __main__(): @@ -205,16 +184,23 @@ parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output') parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output') options, args = parser.parse_args() - if args: raise Exception('Use options to provide inputs') + + full_gene_dict = dict() for gff3_arg in options.gff3: try: (species, filename) = gff3_arg.split(':') except ValueError: raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) + gene_dict = dict() + transcript_dict = dict() + exon_parent_dict = dict() + cds_parent_dict = dict() + five_prime_utr_parent_dict = dict() + three_prime_utr_parent_dict = dict() with open(filename) as f: - for i, line in enumerate(f): + for i, line in enumerate(f, start=1): line = line.strip() if not line: # skip empty lines @@ -228,27 +214,29 @@ feature_type = cols[2] try: if feature_type == 'gene': - gene_to_json(cols, species) + add_gene_to_dict(cols, species, gene_dict) elif feature_type in ('mRNA', 'transcript'): - transcript_to_json(cols, species) + add_transcript_to_dict(cols, species, transcript_dict) elif feature_type == 'exon': - exon_to_json(cols, species) + add_exon_to_dict(cols, species, exon_parent_dict) elif feature_type == 'five_prime_UTR': - five_prime_utr_to_json(cols) + feature_to_dict(cols, five_prime_utr_parent_dict) elif feature_type == 'three_prime_UTR': - three_prime_utr_to_json(cols) + feature_to_dict(cols, three_prime_utr_parent_dict) elif feature_type == 'CDS': - cds_to_json(cols) + add_cds_to_dict(cols, cds_parent_dict) else: print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) except Exception as e: raise Exception("Line %i in file '%s': %s" % (i, filename, e)) - join_dicts() + join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) + update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict) for json_arg in options.json: - merge_dicts(json_arg) + with open(json_arg) as f: + update_full_gene_dict_no_overwrite(full_gene_dict, json.load(f)) - write_json(options.output, options.sort) + write_json(full_gene_dict, options.output, options.sort) if __name__ == '__main__':