Mercurial > repos > earlhaminst > gafa
diff GAFA.py @ 0:af9f72ddf7f9 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
author | earlhaminst |
---|---|
date | Wed, 21 Dec 2016 07:31:50 -0500 |
parents | |
children | fc8ca4ade638 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GAFA.py Wed Dec 21 07:31:50 2016 -0500 @@ -0,0 +1,120 @@ +from __future__ import print_function + +import json +import optparse +import sqlite3 + +version = "0.1.0" + + +def create_tables(conn): + cur = conn.cursor() + cur.execute('PRAGMA foreign_keys = ON') + cur.execute('''CREATE TABLE meta ( + version VARCHAR)''') + + cur.execute('INSERT INTO meta (version) VALUES (?)', + (version, )) + + cur.execute('''CREATE TABLE gene_family ( + gene_family_id INTEGER PRIMARY KEY, + gene_tree VARCHAR NOT NULL)''') + + cur.execute('''CREATE TABLE gene ( + gene_id VARCHAR PRIMARY KEY NOT NULL, + gene_symbol VARCHAR, + gene_json VARCHAR NOT NULL)''') + cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') + + cur.execute('''CREATE TABLE transcript ( + transcript_id VARCHAR PRIMARY KEY NOT NULL, + protein_id VARCHAR UNIQUE, + gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') + + cur.execute('''CREATE TABLE gene_family_member ( + gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id), + protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id), + alignment VARCHAR NOT NULL, + PRIMARY KEY (gene_family_id, protein_id))''') + conn.commit() + + +def cigar_to_db(conn, i, fname): + cur = conn.cursor() + with open(fname) as f: + for element in f.readlines(): + seq_id, cigar = element.rstrip('\n').split('\t') + # Trim seq_id by removing everything from the first underscore + seq_id = seq_id.split('_', 1)[0] + + cur.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?', + (seq_id, seq_id)) + results = cur.fetchall() + if len(results) == 0: + raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id) + elif len(results) > 1: + raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id) + transcript_id, protein_id = results[0] + if protein_id is None: + print("Skipping transcript '%s' with no protein id" % transcript_id) + else: + cur.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, alignment) VALUES (?, ?, ?)', + (i, protein_id, cigar)) + conn.commit() + + +def newicktree_to_db(conn, i, fname): + with open(fname) as f: + tree = f.read().replace('\n', '') + + cur = conn.cursor() + cur.execute('INSERT INTO gene_family (gene_family_id, gene_tree) VALUES (?, ?)', + (i, tree)) + conn.commit() + + +def gene_json_to_db(conn, fname): + with open(fname) as f: + all_genes_dict = json.load(f) + + cur = conn.cursor() + for gene_dict in all_genes_dict.values(): + gene_id = gene_dict['id'] + gene_symbol = gene_dict.get('display_name', None) + cur.execute("INSERT INTO gene (gene_id, gene_symbol, gene_json) VALUES (?, ?, ?)", + (gene_id, gene_symbol, json.dumps(gene_dict))) + + if "Transcript" in gene_dict: + for transcript in gene_dict["Transcript"]: + transcript_id = transcript['id'] + if 'Translation' in transcript and 'id' in transcript['Translation']: + protein_id = transcript["Translation"]["id"] + else: + protein_id = None + cur.execute("INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)", + (transcript_id, protein_id, gene_id)) + conn.commit() + + +def __main__(): + parser = optparse.OptionParser() + parser.add_option('-t', '--tree', action='append', help='Gene tree files') + parser.add_option('-c', '--cigar', action='append', help='CIGAR alignments of CDS files in tabular format') + parser.add_option('-g', '--gene', help='Gene features file in JSON format') + parser.add_option('-o', '--output', help='Path of the output file') + options, args = parser.parse_args() + if args: + raise Exception('Use options to provide inputs') + + conn = sqlite3.connect(options.output) + create_tables(conn) + + gene_json_to_db(conn, options.gene) + + for i, (tree, cigar) in enumerate(zip(options.tree, options.cigar), start=1): + newicktree_to_db(conn, i, tree) + cigar_to_db(conn, i, cigar) + + +if __name__ == '__main__': + __main__()