comparison GAFA.py @ 0:af9f72ddf7f9 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
author earlhaminst
date Wed, 21 Dec 2016 07:31:50 -0500
parents
children fc8ca4ade638
comparison
equal deleted inserted replaced
-1:000000000000 0:af9f72ddf7f9
1 from __future__ import print_function
2
3 import json
4 import optparse
5 import sqlite3
6
# Version string that create_tables() writes into the `meta` table of the
# output database.
version = "0.1.0"
8
9
def create_tables(conn):
    """Create the GAFA schema in an SQLite database and commit it.

    Creates the ``meta``, ``gene_family``, ``gene``, ``transcript`` and
    ``gene_family_member`` tables plus an index on ``gene.gene_symbol``, and
    records the module-level ``version`` string in ``meta``.

    :param conn: open ``sqlite3`` connection to a database that does not
        contain these tables yet
    :raises sqlite3.OperationalError: if one of the tables already exists
    """
    cur = conn.cursor()
    # SQLite only enforces REFERENCES clauses when this is enabled on the
    # connection.
    cur.execute('PRAGMA foreign_keys = ON')
    cur.execute('''CREATE TABLE meta (
        version VARCHAR)''')

    cur.execute('INSERT INTO meta (version) VALUES (?)',
                (version, ))

    cur.execute('''CREATE TABLE gene_family (
        gene_family_id INTEGER PRIMARY KEY,
        gene_tree VARCHAR NOT NULL)''')

    cur.execute('''CREATE TABLE gene (
        gene_id VARCHAR PRIMARY KEY NOT NULL,
        gene_symbol VARCHAR,
        gene_json VARCHAR NOT NULL)''')
    cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')

    # protein_id is UNIQUE so that gene_family_member can reference it.
    cur.execute('''CREATE TABLE transcript (
        transcript_id VARCHAR PRIMARY KEY NOT NULL,
        protein_id VARCHAR UNIQUE,
        gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')

    # The column type is plain VARCHAR: a stray "KEY" token after the type
    # would be absorbed by SQLite into the declared type name.
    cur.execute('''CREATE TABLE gene_family_member (
        gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id),
        protein_id VARCHAR NOT NULL REFERENCES transcript(protein_id),
        alignment VARCHAR NOT NULL,
        PRIMARY KEY (gene_family_id, protein_id))''')
    conn.commit()
40
41
def cigar_to_db(conn, i, fname):
    """Load the CIGAR alignments of one gene family into the database.

    ``fname`` is a text file with two tab-separated columns per line: a
    sequence id and its CIGAR string. Everything from the first underscore
    of the sequence id onwards is discarded, and the remainder is looked up
    in the ``transcript`` table as either a transcript id or a protein id.
    A row is then inserted into ``gene_family_member`` for family ``i``.
    Transcripts without a protein id are reported on stdout and skipped;
    empty lines are ignored. The inserts are committed once at the end.

    :param conn: open ``sqlite3`` connection with the schema already created
    :param i: id of the gene family the alignments belong to
    :param fname: path of the tabular CIGAR file
    :raises Exception: if a sequence id matches no row, or more than one row,
        of the ``transcript`` table
    :raises ValueError: if a non-empty line does not have exactly two columns
    """
    cur = conn.cursor()
    with open(fname) as f:
        # Stream the file instead of materialising every line with readlines().
        for line in f:
            line = line.rstrip('\n')
            if not line:
                # An empty (e.g. trailing) line carries no record; skip it
                # instead of failing on the two-column unpacking below.
                continue
            seq_id, cigar = line.split('\t')
            # Trim seq_id by removing everything from the first underscore
            seq_id = seq_id.split('_', 1)[0]

            cur.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?',
                        (seq_id, seq_id))
            results = cur.fetchall()
            if not results:
                raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id)
            if len(results) > 1:
                raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id)
            transcript_id, protein_id = results[0]
            if protein_id is None:
                print("Skipping transcript '%s' with no protein id" % transcript_id)
                continue
            cur.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, alignment) VALUES (?, ?, ?)',
                        (i, protein_id, cigar))
    conn.commit()
64
65
def newicktree_to_db(conn, i, fname):
    """Store the gene tree read from ``fname`` as gene family ``i``.

    The whole file is read, every newline character is removed from it, and
    the resulting single-line string is inserted into the ``gene_family``
    table under id ``i``. The insert is committed before returning.

    :param conn: open ``sqlite3`` connection with the schema already created
    :param i: id to give to the new gene family row
    :param fname: path of the gene tree file
    """
    with open(fname) as handle:
        newick = ''.join(handle.read().split('\n'))

    statement = 'INSERT INTO gene_family (gene_family_id, gene_tree) VALUES (?, ?)'
    conn.cursor().execute(statement, (i, newick))
    conn.commit()
74
75
def gene_json_to_db(conn, fname):
    """Fill the ``gene`` and ``transcript`` tables from a JSON file.

    ``fname`` must contain a JSON object whose values are gene dictionaries.
    For every gene, its ``id``, its optional ``display_name`` (stored as the
    gene symbol) and the gene dictionary re-serialised as JSON are inserted
    into ``gene``. For every entry of the gene's optional ``Transcript``
    list, the transcript ``id`` is inserted into ``transcript`` together
    with the ``id`` of its ``Translation`` (NULL when absent) and the gene
    id. Everything is committed once at the end.

    :param conn: open ``sqlite3`` connection with the schema already created
    :param fname: path of the gene features JSON file
    """
    with open(fname) as handle:
        genes = json.load(handle)

    insert_gene = "INSERT INTO gene (gene_id, gene_symbol, gene_json) VALUES (?, ?, ?)"
    insert_transcript = "INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)"

    cursor = conn.cursor()
    for gene in genes.values():
        gene_id = gene['id']
        cursor.execute(insert_gene,
                       (gene_id, gene.get('display_name'), json.dumps(gene)))

        if "Transcript" not in gene:
            continue
        for transcript in gene["Transcript"]:
            transcript_id = transcript['id']
            # Only transcripts with a translation carry a protein id.
            has_protein = 'Translation' in transcript and 'id' in transcript['Translation']
            protein_id = transcript["Translation"]["id"] if has_protein else None
            cursor.execute(insert_transcript,
                           (transcript_id, protein_id, gene_id))
    conn.commit()
97
98
def __main__():
    """Build the SQLite database from the command-line inputs.

    Parses the options, creates the schema in the output database, loads the
    gene features JSON file and then loads each gene tree together with the
    CIGAR file given at the same position; the n-th pair becomes gene family
    n (numbering starts at 1). If no tree/CIGAR files are given, the
    database only contains the genes.

    :raises Exception: if positional arguments are given, if ``--output`` or
        ``--gene`` is missing, or if the numbers of ``--tree`` and
        ``--cigar`` options differ
    """
    parser = optparse.OptionParser()
    parser.add_option('-t', '--tree', action='append', help='Gene tree files')
    parser.add_option('-c', '--cigar', action='append', help='CIGAR alignments of CDS files in tabular format')
    parser.add_option('-g', '--gene', help='Gene features file in JSON format')
    parser.add_option('-o', '--output', help='Path of the output file')
    options, args = parser.parse_args()
    if args:
        raise Exception('Use options to provide inputs')
    if not options.output:
        raise Exception('Use --output to provide the path of the output file')
    if not options.gene:
        raise Exception('Use --gene to provide the gene features file')

    # An 'append' option that was never given is None, not an empty list.
    trees = options.tree or []
    cigars = options.cigar or []
    # zip() would silently drop the unmatched files, so check before any
    # output is written.
    if len(trees) != len(cigars):
        raise Exception('The same number of gene tree (%d) and CIGAR (%d) files must be provided'
                        % (len(trees), len(cigars)))

    conn = sqlite3.connect(options.output)
    try:
        create_tables(conn)

        gene_json_to_db(conn, options.gene)

        for i, (tree, cigar) in enumerate(zip(trees, cigars), start=1):
            newicktree_to_db(conn, i, tree)
            cigar_to_db(conn, i, cigar)
    finally:
        # Release the database file even when loading fails part-way.
        conn.close()
117
118
# Run the command-line entry point only when the file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    __main__()