Mercurial > repos > earlhaminst > gafa
comparison GAFA.py @ 4:117fc7414307 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 651fae48371f845578753052c6fe173e3bb35670
author | earlhaminst |
---|---|
date | Wed, 15 Mar 2017 20:20:58 -0400 |
parents | e17a3470c70a |
children | b9f1bcf5ee59 |
comparison
equal
deleted
inserted
replaced
3:e17a3470c70a | 4:117fc7414307 |
---|---|
1 from __future__ import print_function | 1 from __future__ import print_function |
2 | 2 |
3 import collections | 3 import collections |
4 import json | |
5 import optparse | 4 import optparse |
6 import re | 5 import re |
| | 6 import shutil |
7 import sqlite3 | 7 import sqlite3 |
8 | 8 |
9 version = "0.2.0" | 9 version = "0.3.0" |
10 | 10 |
11 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) | 11 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) |
12 | 12 |
13 | 13 |
14 def FASTAReader_gen(fasta_filename): | 14 def FASTAReader_gen(fasta_filename): |
48 return cigar | 48 return cigar |
49 | 49 |
50 | 50 |
51 def create_tables(conn): | 51 def create_tables(conn): |
52 cur = conn.cursor() | 52 cur = conn.cursor() |
53 cur.execute('PRAGMA foreign_keys = ON') | 53 # Check that the version of the input database is compatible |
54 cur.execute('''CREATE TABLE meta ( | 54 cur.execute('SELECT version FROM meta') |
55 version VARCHAR)''') | 55 result = cur.fetchone() |
56 | 56 input_meta_version = result[0] |
57 cur.execute('INSERT INTO meta (version) VALUES (?)', | 57 if input_meta_version != '0.3.0': |
| | 58 raise Exception("Incompatible input meta version '%s'" % input_meta_version) |
| | 59 cur.execute('UPDATE meta SET version=?', |
58 (version, )) | 60 (version, )) |
59 | 61 |
60 cur.execute('''CREATE TABLE gene_family ( | 62 cur.execute('''CREATE TABLE gene_family ( |
61 gene_family_id INTEGER PRIMARY KEY, | 63 gene_family_id INTEGER PRIMARY KEY, |
62 gene_tree VARCHAR NOT NULL)''') | 64 gene_tree VARCHAR NOT NULL)''') |
63 | |
64 cur.execute('''CREATE TABLE gene ( | |
65 gene_id VARCHAR PRIMARY KEY NOT NULL, | |
66 gene_symbol VARCHAR, | |
67 gene_json VARCHAR NOT NULL)''') | |
68 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') | |
69 | |
70 cur.execute('''CREATE TABLE transcript ( | |
71 transcript_id VARCHAR PRIMARY KEY NOT NULL, | |
72 protein_id VARCHAR UNIQUE, | |
73 protein_sequence VARCHAR, | |
74 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') | |
75 | 65 |
76 cur.execute('''CREATE TABLE gene_family_member ( | 66 cur.execute('''CREATE TABLE gene_family_member ( |
77 gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id), | 67 gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id), |
78 protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id), | 68 protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id), |
79 protein_alignment VARCHAR NOT NULL, | 69 protein_alignment VARCHAR NOT NULL, |
115 cur.execute('INSERT INTO gene_family (gene_family_id, gene_tree) VALUES (?, ?)', | 105 cur.execute('INSERT INTO gene_family (gene_family_id, gene_tree) VALUES (?, ?)', |
116 (i, tree)) | 106 (i, tree)) |
117 conn.commit() | 107 conn.commit() |
118 | 108 |
119 | 109 |
120 def gene_json_to_db(conn, fname): | |
121 with open(fname) as f: | |
122 all_genes_dict = json.load(f) | |
123 | |
124 cur = conn.cursor() | |
125 for gene_dict in all_genes_dict.values(): | |
126 gene_id = gene_dict['id'] | |
127 gene_symbol = gene_dict.get('display_name', None) | |
128 cur.execute("INSERT INTO gene (gene_id, gene_symbol, gene_json) VALUES (?, ?, ?)", | |
129 (gene_id, gene_symbol, json.dumps(gene_dict))) | |
130 | |
131 if "Transcript" in gene_dict: | |
132 for transcript in gene_dict["Transcript"]: | |
133 transcript_id = transcript['id'] | |
134 if 'Translation' in transcript and 'id' in transcript['Translation']: | |
135 protein_id = transcript["Translation"]["id"] | |
136 else: | |
137 protein_id = None | |
138 cur.execute("INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)", | |
139 (transcript_id, protein_id, gene_id)) | |
140 conn.commit() | |
141 | |
142 | |
143 def __main__(): | 110 def __main__(): |
144 parser = optparse.OptionParser() | 111 parser = optparse.OptionParser() |
145 parser.add_option('-t', '--tree', action='append', help='Gene tree files') | 112 parser.add_option('-t', '--tree', action='append', help='Gene tree files') |
146 parser.add_option('-a', '--align', action='append', help='Protein alignments in fasta_aln format') | 113 parser.add_option('-a', '--align', action='append', help='Protein alignments in fasta_aln format') |
147 parser.add_option('-g', '--gene', help='Gene features file in JSON format') | 114 parser.add_option('-g', '--gene', help='Gene features file in SQLite format') |
148 parser.add_option('-o', '--output', help='Path of the output file') | 115 parser.add_option('-o', '--output', help='Path of the output file') |
149 options, args = parser.parse_args() | 116 options, args = parser.parse_args() |
150 if args: | 117 if args: |
151 raise Exception('Use options to provide inputs') | 118 raise Exception('Use options to provide inputs') |
152 | 119 |
| | 120 if options.gene != options.output: |
| | 121 shutil.copyfile(options.gene, options.output) |
| | 122 |
153 conn = sqlite3.connect(options.output) | 123 conn = sqlite3.connect(options.output) |
| | 124 conn.execute('PRAGMA foreign_keys = ON') |
154 create_tables(conn) | 125 create_tables(conn) |
155 | |
156 gene_json_to_db(conn, options.gene) | |
157 | 126 |
158 for i, (tree, align) in enumerate(zip(options.tree, options.align), start=1): | 127 for i, (tree, align) in enumerate(zip(options.tree, options.align), start=1): |
159 newicktree_to_db(conn, i, tree) | 128 newicktree_to_db(conn, i, tree) |
160 align_to_db(conn, i, align) | 129 align_to_db(conn, i, align) |
161 | 130 |