annotate GAFA.py @ 3:e17a3470c70a draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
author earlhaminst
date Fri, 03 Mar 2017 07:20:23 -0500
parents fc8ca4ade638
children 117fc7414307
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
1 from __future__ import print_function
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
2
1
fc8ca4ade638 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents: 0
diff changeset
3 import collections
0
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
4 import json
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
5 import optparse
1
fc8ca4ade638 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents: 0
diff changeset
6 import re
0
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
7 import sqlite3
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
8
1
fc8ca4ade638 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents: 0
diff changeset
# Version string that create_tables() stores in the ``meta`` table.
version = "0.2.0"

# One FASTA record: ``header`` is the full header line (leading '>' kept,
# trailing whitespace stripped); ``sequence`` is the concatenation of the
# record's sequence lines.
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    """Lazily yield the records of a FASTA file as ``Sequence`` tuples.

    The file is read line by line. Every record must begin with a header
    line starting with '>'; all following lines up to the next header (or
    end of file) are right-stripped and joined into the sequence.

    :param fasta_filename: path of the FASTA file to read
    :return: generator of ``Sequence(header, sequence)``; an empty file
        yields nothing
    :raises AssertionError: if a record does not start with '>'
    """
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        # readline() returns '' only at end of file; blank lines are '\n'.
        while line:
            # Raised explicitly rather than via an ``assert`` statement so
            # the validation is not stripped when Python runs with -O.
            # The exception type and message are unchanged.
            if not line.startswith('>'):
                raise AssertionError("FASTA headers must start with >")
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            while line and line[0] != '>':
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            yield Sequence(header, "".join(sequence_parts))
1
fc8ca4ade638 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents: 0
diff changeset
29
fc8ca4ade638 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents: 0
diff changeset
30
fc8ca4ade638 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents: 0
diff changeset
# Matches every character of an aligned sequence that is not a gap ('-').
FASTA_MATCH_RE = re.compile(r'[^-]')
# Matches one maximal run of identical CIGAR operations.
_CIGAR_RUN_RE = re.compile(r'M+|D+')


def fasta_aln2cigar(sequence):
    """Convert an aligned (gapped) sequence into a compact CIGAR string.

    Every non-gap character counts as a match (M) and every '-' as a
    deletion (D). Consecutive identical operations are run-length encoded,
    with the count omitted for runs of length 1, e.g. ``'AC--G'`` becomes
    ``'2M2DM'``.

    :param sequence: aligned sequence string, gaps written as '-'
    :return: the CIGAR string; ``''`` for an empty sequence
    """
    # Converts each match into M and each gap into D
    operations = FASTA_MATCH_RE.sub('M', sequence).replace('-', 'D')
    # Condense each run, e.g. DDDD in 4D. Collecting the pieces in a list
    # and joining once avoids repeated string concatenation, and an empty
    # input simply produces no runs instead of failing on s[0].
    cigar_parts = []
    for run in _CIGAR_RUN_RE.finditer(operations):
        run_length = run.end() - run.start()
        if run_length > 1:
            cigar_parts.append(str(run_length))
        cigar_parts.append(run.group()[0])
    return ''.join(cigar_parts)
0
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
49
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
50
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
def create_tables(conn):
    """Create the database schema on ``conn`` and record the version.

    Enables foreign-key enforcement, creates the ``meta`` table and stores
    the module-level ``version`` in it, then creates the ``gene_family``,
    ``gene`` (plus an index on ``gene_symbol``), ``transcript`` and
    ``gene_family_member`` tables, and finally commits.

    :param conn: open database connection providing ``cursor()`` and
        ``commit()``
    """
    cursor = conn.cursor()
    cursor.execute('PRAGMA foreign_keys = ON')

    cursor.execute('''CREATE TABLE meta (
        version VARCHAR)''')
    cursor.execute('INSERT INTO meta (version) VALUES (?)',
                   (version, ))

    # Remaining schema objects, in dependency order: referenced tables are
    # created before the tables that point at them.
    schema_statements = (
        '''CREATE TABLE gene_family (
        gene_family_id INTEGER PRIMARY KEY,
        gene_tree VARCHAR NOT NULL)''',

        '''CREATE TABLE gene (
        gene_id VARCHAR PRIMARY KEY NOT NULL,
        gene_symbol VARCHAR,
        gene_json VARCHAR NOT NULL)''',
        'CREATE INDEX gene_symbol_index ON gene (gene_symbol)',

        '''CREATE TABLE transcript (
        transcript_id VARCHAR PRIMARY KEY NOT NULL,
        protein_id VARCHAR UNIQUE,
        protein_sequence VARCHAR,
        gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''',

        # NOTE(review): the `KEY` token after VARCHAR below looks like a
        # leftover; SQLite presumably parses it as part of the type name.
        # Kept unchanged - confirm before removing.
        '''CREATE TABLE gene_family_member (
        gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id),
        protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id),
        protein_alignment VARCHAR NOT NULL,
        PRIMARY KEY (gene_family_id, protein_id))''',
    )
    for statement in schema_statements:
        cursor.execute(statement)
    conn.commit()
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
82
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
83
1
fc8ca4ade638 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents: 0
diff changeset
def align_to_db(conn, i, fname):
    """Load one gene family's protein alignment from a FASTA file.

    For every record in ``fname`` the sequence id is taken from the header
    (without the leading '>' and truncated at the first underscore) and
    looked up in the ``transcript`` table as either a transcript id or a
    protein id. For the matching transcript, the alignment is stored as a
    CIGAR string in ``gene_family_member`` and the ungapped sequence is
    written to ``transcript.protein_sequence``. Transcripts without a
    protein id are reported on stdout and skipped. Changes are committed
    after all records have been processed.

    :param conn: open database connection
    :param i: gene family id the alignment belongs to
    :param fname: path of the aligned FASTA file
    :raises Exception: if a sequence id matches no transcript, or more
        than one
    """
    cursor = conn.cursor()
    for record in FASTAReader_gen(fname):
        # Trim the id by dropping '>' and everything from the first underscore
        seq_id = record.header[1:].split('_', 1)[0]

        cursor.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?',
                       (seq_id, seq_id))
        matches = cursor.fetchall()
        if not matches:
            raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id)
        if len(matches) > 1:
            raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id)

        transcript_id, protein_id = matches[0]
        if protein_id is None:
            print("Skipping transcript '%s' with no protein id" % transcript_id)
            continue

        aligned_sequence = record.sequence
        cursor.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, protein_alignment) VALUES (?, ?, ?)',
                       (i, protein_id, fasta_aln2cigar(aligned_sequence)))
        # The stored protein sequence is the alignment with gaps removed
        cursor.execute('UPDATE transcript SET protein_sequence=? WHERE protein_id=?',
                       (aligned_sequence.replace('-', ''), protein_id))
    conn.commit()
0
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
108
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
109
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
def newicktree_to_db(conn, i, fname):
    """Store the tree read from ``fname`` as gene family ``i``.

    The whole file is read, all newline characters are removed, and the
    resulting single-line string is inserted into the ``gene_family``
    table under id ``i``. The insert is committed before returning.

    :param conn: open database connection
    :param i: gene family id to store the tree under
    :param fname: path of the tree file
    """
    with open(fname) as tree_file:
        # Splitting on '\n' and re-joining removes every newline
        single_line_tree = ''.join(tree_file.read().split('\n'))

    row = (i, single_line_tree)
    cursor = conn.cursor()
    cursor.execute('INSERT INTO gene_family (gene_family_id, gene_tree) VALUES (?, ?)',
                   row)
    conn.commit()
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
118
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
119
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
def gene_json_to_db(conn, fname):
    """Populate the ``gene`` and ``transcript`` tables from a JSON file.

    The file must contain a JSON object whose values are gene dictionaries.
    Each gene is inserted with its ``id``, its optional ``display_name``
    (stored as the gene symbol) and its full JSON serialisation. Each entry
    of the gene's optional ``Transcript`` list is inserted with its ``id``,
    the gene id, and the ``id`` of its ``Translation`` when present
    (otherwise NULL). Everything is committed once at the end.

    :param conn: open database connection
    :param fname: path of the JSON file
    """
    with open(fname) as json_file:
        genes_by_key = json.load(json_file)

    cursor = conn.cursor()
    for gene in genes_by_key.values():
        gene_id = gene['id']
        cursor.execute("INSERT INTO gene (gene_id, gene_symbol, gene_json) VALUES (?, ?, ?)",
                       (gene_id, gene.get('display_name'), json.dumps(gene)))

        if "Transcript" not in gene:
            continue
        for transcript in gene["Transcript"]:
            # Only transcripts with a translation carry a protein id
            has_protein = 'Translation' in transcript and 'id' in transcript['Translation']
            protein_id = transcript["Translation"]["id"] if has_protein else None
            cursor.execute("INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)",
                           (transcript['id'], protein_id, gene_id))
    conn.commit()
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
141
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
142
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
def __main__():
    """Build the output SQLite database from the command-line inputs.

    Options:
        -t/--tree    Gene tree file (may be repeated).
        -a/--align   Protein alignment in fasta_aln format (may be repeated).
        -g/--gene    Gene features file in JSON format.
        -o/--output  Path of the output SQLite file.

    The tables are created first, then the gene features are loaded, and
    finally each (tree, alignment) pair is stored under a 1-based index
    that follows the order in which the options were given.

    Raises:
        Exception: if positional arguments are supplied, or if the number
            of --tree options differs from the number of --align options.
    """
    parser = optparse.OptionParser()
    parser.add_option('-t', '--tree', action='append', help='Gene tree files')
    parser.add_option('-a', '--align', action='append', help='Protein alignments in fasta_aln format')
    parser.add_option('-g', '--gene', help='Gene features file in JSON format')
    parser.add_option('-o', '--output', help='Path of the output file')
    options, args = parser.parse_args()
    if args:
        raise Exception('Use options to provide inputs')

    # Options declared with action='append' stay None when never given;
    # normalise them to empty lists so that zip() below does not fail with
    # an opaque TypeError.
    trees = options.tree or []
    aligns = options.align or []
    # zip() silently truncates to the shorter input, which would drop
    # unpaired trees or alignments without any warning. Fail before the
    # output file is touched instead.
    if len(trees) != len(aligns):
        raise Exception('The number of --tree options (%d) must match the number of --align options (%d)'
                        % (len(trees), len(aligns)))

    conn = sqlite3.connect(options.output)
    try:
        create_tables(conn)

        gene_json_to_db(conn, options.gene)

        for i, (tree, align) in enumerate(zip(trees, aligns), start=1):
            newicktree_to_db(conn, i, tree)
            align_to_db(conn, i, align)
    finally:
        # Release the database handle even when one of the loaders raises.
        conn.close()
0
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
163
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
164
af9f72ddf7f9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    __main__()