comparison GAFA.py @ 4:117fc7414307 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 651fae48371f845578753052c6fe173e3bb35670
author earlhaminst
date Wed, 15 Mar 2017 20:20:58 -0400
parents e17a3470c70a
children b9f1bcf5ee59
comparison
legend: equal | deleted | inserted | replaced
left column: 3:e17a3470c70a (parent) | right column: 4:117fc7414307 (this changeset)
1 from __future__ import print_function 1 from __future__ import print_function
2 2
3 import collections 3 import collections
4 import json
5 import optparse 4 import optparse
6 import re 5 import re
6 import shutil
7 import sqlite3 7 import sqlite3
8 8
9 version = "0.2.0" 9 version = "0.3.0"
10 10
11 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) 11 Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
12 12
13 13
14 def FASTAReader_gen(fasta_filename): 14 def FASTAReader_gen(fasta_filename):
48 return cigar 48 return cigar
49 49
50 50
51 def create_tables(conn): 51 def create_tables(conn):
52 cur = conn.cursor() 52 cur = conn.cursor()
53 cur.execute('PRAGMA foreign_keys = ON') 53 # Check that the version of the input database is compatible
54 cur.execute('''CREATE TABLE meta ( 54 cur.execute('SELECT version FROM meta')
55 version VARCHAR)''') 55 result = cur.fetchone()
56 56 input_meta_version = result[0]
57 cur.execute('INSERT INTO meta (version) VALUES (?)', 57 if input_meta_version != '0.3.0':
58 raise Exception("Incompatible input meta version '%s'" % input_meta_version)
59 cur.execute('UPDATE meta SET version=?',
58 (version, )) 60 (version, ))
59 61
60 cur.execute('''CREATE TABLE gene_family ( 62 cur.execute('''CREATE TABLE gene_family (
61 gene_family_id INTEGER PRIMARY KEY, 63 gene_family_id INTEGER PRIMARY KEY,
62 gene_tree VARCHAR NOT NULL)''') 64 gene_tree VARCHAR NOT NULL)''')
63
64 cur.execute('''CREATE TABLE gene (
65 gene_id VARCHAR PRIMARY KEY NOT NULL,
66 gene_symbol VARCHAR,
67 gene_json VARCHAR NOT NULL)''')
68 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')
69
70 cur.execute('''CREATE TABLE transcript (
71 transcript_id VARCHAR PRIMARY KEY NOT NULL,
72 protein_id VARCHAR UNIQUE,
73 protein_sequence VARCHAR,
74 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')
75 65
76 cur.execute('''CREATE TABLE gene_family_member ( 66 cur.execute('''CREATE TABLE gene_family_member (
77 gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id), 67 gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id),
78 protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id), 68 protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id),
79 protein_alignment VARCHAR NOT NULL, 69 protein_alignment VARCHAR NOT NULL,
115 cur.execute('INSERT INTO gene_family (gene_family_id, gene_tree) VALUES (?, ?)', 105 cur.execute('INSERT INTO gene_family (gene_family_id, gene_tree) VALUES (?, ?)',
116 (i, tree)) 106 (i, tree))
117 conn.commit() 107 conn.commit()
118 108
119 109
120 def gene_json_to_db(conn, fname):
121 with open(fname) as f:
122 all_genes_dict = json.load(f)
123
124 cur = conn.cursor()
125 for gene_dict in all_genes_dict.values():
126 gene_id = gene_dict['id']
127 gene_symbol = gene_dict.get('display_name', None)
128 cur.execute("INSERT INTO gene (gene_id, gene_symbol, gene_json) VALUES (?, ?, ?)",
129 (gene_id, gene_symbol, json.dumps(gene_dict)))
130
131 if "Transcript" in gene_dict:
132 for transcript in gene_dict["Transcript"]:
133 transcript_id = transcript['id']
134 if 'Translation' in transcript and 'id' in transcript['Translation']:
135 protein_id = transcript["Translation"]["id"]
136 else:
137 protein_id = None
138 cur.execute("INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)",
139 (transcript_id, protein_id, gene_id))
140 conn.commit()
141
142
143 def __main__(): 110 def __main__():
144 parser = optparse.OptionParser() 111 parser = optparse.OptionParser()
145 parser.add_option('-t', '--tree', action='append', help='Gene tree files') 112 parser.add_option('-t', '--tree', action='append', help='Gene tree files')
146 parser.add_option('-a', '--align', action='append', help='Protein alignments in fasta_aln format') 113 parser.add_option('-a', '--align', action='append', help='Protein alignments in fasta_aln format')
147 parser.add_option('-g', '--gene', help='Gene features file in JSON format') 114 parser.add_option('-g', '--gene', help='Gene features file in SQLite format')
148 parser.add_option('-o', '--output', help='Path of the output file') 115 parser.add_option('-o', '--output', help='Path of the output file')
149 options, args = parser.parse_args() 116 options, args = parser.parse_args()
150 if args: 117 if args:
151 raise Exception('Use options to provide inputs') 118 raise Exception('Use options to provide inputs')
152 119
120 if options.gene != options.output:
121 shutil.copyfile(options.gene, options.output)
122
153 conn = sqlite3.connect(options.output) 123 conn = sqlite3.connect(options.output)
124 conn.execute('PRAGMA foreign_keys = ON')
154 create_tables(conn) 125 create_tables(conn)
155
156 gene_json_to_db(conn, options.gene)
157 126
158 for i, (tree, align) in enumerate(zip(options.tree, options.align), start=1): 127 for i, (tree, align) in enumerate(zip(options.tree, options.align), start=1):
159 newicktree_to_db(conn, i, tree) 128 newicktree_to_db(conn, i, tree)
160 align_to_db(conn, i, align) 129 align_to_db(conn, i, align)
161 130