Mercurial > repos > earlhaminst > gafa
comparison GAFA.py @ 0:af9f72ddf7f9 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
author | earlhaminst |
---|---|
date | Wed, 21 Dec 2016 07:31:50 -0500 |
parents | |
children | fc8ca4ade638 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:af9f72ddf7f9 |
---|---|
1 from __future__ import print_function | |
2 | |
3 import json | |
4 import optparse | |
5 import sqlite3 | |
6 | |
# Version string that create_tables() records in the ``meta`` table of the
# output database.
version = "0.1.0"
8 | |
9 | |
def create_tables(conn):
    """Create the GAFA schema in a SQLite database and commit it.

    Creates the ``meta``, ``gene_family``, ``gene``, ``transcript`` and
    ``gene_family_member`` tables plus an index on ``gene.gene_symbol``, and
    stores the module-level ``version`` string in ``meta``.

    :param conn: open ``sqlite3`` connection to the database to initialise
    :raises sqlite3.OperationalError: if one of the tables already exists
    """
    cur = conn.cursor()
    # SQLite does not enforce foreign keys unless this is enabled on the
    # connection, so turn it on before any table is populated.
    cur.execute('PRAGMA foreign_keys = ON')

    cur.execute('''CREATE TABLE meta (
        version VARCHAR)''')
    cur.execute('INSERT INTO meta (version) VALUES (?)',
                (version, ))

    cur.execute('''CREATE TABLE gene_family (
        gene_family_id INTEGER PRIMARY KEY,
        gene_tree VARCHAR NOT NULL)''')

    cur.execute('''CREATE TABLE gene (
        gene_id VARCHAR PRIMARY KEY NOT NULL,
        gene_symbol VARCHAR,
        gene_json VARCHAR NOT NULL)''')
    cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')

    # protein_id is UNIQUE so that gene_family_member can reference it.
    cur.execute('''CREATE TABLE transcript (
        transcript_id VARCHAR PRIMARY KEY NOT NULL,
        protein_id VARCHAR UNIQUE,
        gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')

    # The column type used to read "VARCHAR KEY": SQLite silently accepted the
    # stray KEY token as part of the declared type name. It is plain VARCHAR
    # now; the column affinity (TEXT) is unchanged.
    cur.execute('''CREATE TABLE gene_family_member (
        gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id),
        protein_id VARCHAR NOT NULL REFERENCES transcript(protein_id),
        alignment VARCHAR NOT NULL,
        PRIMARY KEY (gene_family_id, protein_id))''')
    conn.commit()
40 | |
41 | |
def cigar_to_db(conn, i, fname):
    """Load the CIGAR alignments of one gene family into the database.

    Every non-empty line of ``fname`` must contain a sequence id and a CIGAR
    string separated by a single tab. The sequence id is truncated at its
    first underscore and then looked up in the ``transcript`` table as either
    a transcript id or a protein id. Matches that have a protein id are
    inserted into ``gene_family_member``; transcripts without a protein id
    are reported on standard output and skipped. The inserts are committed
    once the whole file has been processed.

    :param conn: open ``sqlite3`` connection with a populated ``transcript`` table
    :param i: id of the gene family the alignments belong to
    :param fname: path of the tab-separated CIGAR file
    :raises ValueError: if a non-empty line does not have exactly two fields
    :raises Exception: if a sequence id matches no transcript row or several
    """
    cur = conn.cursor()
    with open(fname) as f:
        # Iterate over the file lazily instead of materialising it with
        # readlines(); the line number is only used for error reporting.
        for line_number, line in enumerate(f, start=1):
            line = line.rstrip('\n')
            if not line:
                # Tolerate empty lines (typically a trailing one) instead of
                # failing on them while unpacking the fields.
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError("Line %d of '%s' has %d tab-separated fields instead of 2" % (line_number, fname, len(fields)))
            seq_id, cigar = fields
            # Trim seq_id by removing everything from the first underscore
            seq_id = seq_id.split('_', 1)[0]

            cur.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?',
                        (seq_id, seq_id))
            results = cur.fetchall()
            if not results:
                raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id)
            if len(results) > 1:
                raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id)
            transcript_id, protein_id = results[0]
            if protein_id is None:
                print("Skipping transcript '%s' with no protein id" % transcript_id)
            else:
                cur.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, alignment) VALUES (?, ?, ?)',
                            (i, protein_id, cigar))
    conn.commit()
64 | |
65 | |
def newicktree_to_db(conn, i, fname):
    """Store the gene tree read from ``fname`` as gene family ``i``.

    The file content is collapsed onto a single line by dropping every
    newline character, inserted into the ``gene_family`` table and committed.

    :param conn: open ``sqlite3`` connection
    :param i: id to give to the new gene family row
    :param fname: path of the gene tree file
    """
    with open(fname) as handle:
        newick = ''.join(handle.read().split('\n'))

    insert_sql = 'INSERT INTO gene_family (gene_family_id, gene_tree) VALUES (?, ?)'
    conn.cursor().execute(insert_sql, (i, newick))
    conn.commit()
74 | |
75 | |
def gene_json_to_db(conn, fname):
    """Fill the ``gene`` and ``transcript`` tables from a JSON file.

    ``fname`` must contain a JSON object whose values are gene dictionaries.
    For each gene, its ``id``, its optional ``display_name`` and the gene
    dictionary re-serialised as JSON are inserted into ``gene``. Each entry
    of the gene's optional ``Transcript`` list is inserted into
    ``transcript`` together with the ``id`` of its ``Translation`` (NULL when
    the transcript has no translation id). Everything is committed at the end.

    :param conn: open ``sqlite3`` connection
    :param fname: path of the gene features JSON file
    """
    with open(fname) as handle:
        genes_by_key = json.load(handle)

    gene_sql = "INSERT INTO gene (gene_id, gene_symbol, gene_json) VALUES (?, ?, ?)"
    transcript_sql = "INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)"

    cursor = conn.cursor()
    for gene in genes_by_key.values():
        gene_id = gene['id']
        cursor.execute(gene_sql,
                       (gene_id, gene.get('display_name'), json.dumps(gene)))

        if "Transcript" not in gene:
            continue
        for transcript in gene["Transcript"]:
            transcript_id = transcript['id']
            # Only transcripts with a translation id have a protein id.
            protein_id = None
            if 'Translation' in transcript:
                translation = transcript['Translation']
                if 'id' in translation:
                    protein_id = translation["id"]
            cursor.execute(transcript_sql,
                           (transcript_id, protein_id, gene_id))
    conn.commit()
97 | |
98 | |
def __main__():
    """Command-line entry point: build the output SQLite database.

    Parses the command-line options, creates the schema in the output file,
    loads the gene features JSON file and then loads each gene tree together
    with the CIGAR file given at the same position; families are numbered
    from 1 in the order the files were passed.

    :raises Exception: if positional arguments are given
    :raises SystemExit: (via ``parser.error``) if ``--gene`` or ``--output``
        is missing, or if the numbers of ``--tree`` and ``--cigar`` files
        differ
    """
    parser = optparse.OptionParser()
    parser.add_option('-t', '--tree', action='append', help='Gene tree files')
    parser.add_option('-c', '--cigar', action='append', help='CIGAR alignments of CDS files in tabular format')
    parser.add_option('-g', '--gene', help='Gene features file in JSON format')
    parser.add_option('-o', '--output', help='Path of the output file')
    options, args = parser.parse_args()
    if args:
        raise Exception('Use options to provide inputs')
    # Report missing inputs as usage errors instead of letting them surface
    # later as a TypeError from open() or sqlite3.connect().
    if not options.gene:
        parser.error('The gene features file (--gene) is required')
    if not options.output:
        parser.error('The path of the output file (--output) is required')

    # "append" options are None when never given on the command line.
    trees = options.tree or []
    cigars = options.cigar or []
    # zip() would silently drop the unmatched files, so refuse a mismatch.
    if len(trees) != len(cigars):
        parser.error('The number of gene tree files (%d) and of CIGAR files (%d) must be the same' % (len(trees), len(cigars)))

    conn = sqlite3.connect(options.output)
    try:
        create_tables(conn)

        gene_json_to_db(conn, options.gene)

        for i, (tree, cigar) in enumerate(zip(trees, cigars), start=1):
            newicktree_to_db(conn, i, tree)
            cigar_to_db(conn, i, cigar)
    finally:
        # Release the database file even when one of the loaders fails.
        conn.close()
117 | |
118 | |
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    __main__()