comparison GAFA.py @ 1:fc8ca4ade638 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
author earlhaminst
date Mon, 20 Feb 2017 06:25:33 -0500
parents af9f72ddf7f9
children e17a3470c70a
comparison
equal deleted inserted replaced
0:af9f72ddf7f9 1:fc8ca4ade638
1 from __future__ import print_function 1 from __future__ import print_function
2 2
3 import collections
3 import json 4 import json
4 import optparse 5 import optparse
6 import re
5 import sqlite3 7 import sqlite3
6 8
7 version = "0.1.0" 9 version = "0.2.0"
10
11 Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
12
13
14 def FASTAReader_gen(fasta_filename):
15 fasta_file = open(fasta_filename)
16 line = fasta_file.readline()
17 while True:
18 if not line:
19 return
20 assert line.startswith('>'), "FASTA headers must start with >"
21 header = line.rstrip()
22 sequence_parts = []
23 line = fasta_file.readline()
24 while line and line[0] != '>':
25 sequence_parts.append(line.rstrip())
26 line = fasta_file.readline()
27 sequence = "".join(sequence_parts)
28 yield Sequence(header, sequence)
29
30
31 FASTA_MATCH_RE = re.compile(r'[^-]')
32
33
34 def fasta_aln2cigar(sequence):
35 # Converts each match into M and each gap into D
36 tmp_seq = FASTA_MATCH_RE.sub('M', sequence)
37 tmp_seq = tmp_seq.replace('-', 'D')
38 # Split the sequence in substrings composed by the same letter
39 tmp_seq = tmp_seq.replace('DM', 'D,M')
40 tmp_seq = tmp_seq.replace('MD', 'M,D')
41 cigar_list = tmp_seq.split(',')
42 # Condense each substring, e.g. DDDD in 4D, and concatenate them again
43 cigar = ''
44 for s in cigar_list:
45 if len(s) > 1:
46 cigar += str(len(s))
47 cigar += s[0]
48 return cigar
8 49
9 50
10 def create_tables(conn): 51 def create_tables(conn):
11 cur = conn.cursor() 52 cur = conn.cursor()
12 cur.execute('PRAGMA foreign_keys = ON') 53 cur.execute('PRAGMA foreign_keys = ON')
27 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') 68 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')
28 69
29 cur.execute('''CREATE TABLE transcript ( 70 cur.execute('''CREATE TABLE transcript (
30 transcript_id VARCHAR PRIMARY KEY NOT NULL, 71 transcript_id VARCHAR PRIMARY KEY NOT NULL,
31 protein_id VARCHAR UNIQUE, 72 protein_id VARCHAR UNIQUE,
73 protein_sequence VARCHAR,
32 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') 74 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')
33 75
34 cur.execute('''CREATE TABLE gene_family_member ( 76 cur.execute('''CREATE TABLE gene_family_member (
35 gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id), 77 gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id),
36 protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id), 78 protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id),
37 alignment VARCHAR NOT NULL, 79 protein_alignment VARCHAR NOT NULL,
38 PRIMARY KEY (gene_family_id, protein_id))''') 80 PRIMARY KEY (gene_family_id, protein_id))''')
39 conn.commit() 81 conn.commit()
40 82
41 83
42 def cigar_to_db(conn, i, fname): 84 def align_to_db(conn, i, fname):
43 cur = conn.cursor() 85 cur = conn.cursor()
44 with open(fname) as f: 86 for fasta_seq_align in FASTAReader_gen(fname):
45 for element in f.readlines(): 87 seq_id = fasta_seq_align.header[1:]
46 seq_id, cigar = element.rstrip('\n').split('\t') 88 # Trim seq_id by removing everything from the first underscore
47 # Trim seq_id by removing everything from the first underscore 89 seq_id = seq_id.split('_', 1)[0]
48 seq_id = seq_id.split('_', 1)[0]
49 90
50 cur.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?', 91 cur.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?',
51 (seq_id, seq_id)) 92 (seq_id, seq_id))
52 results = cur.fetchall() 93 results = cur.fetchall()
53 if len(results) == 0: 94 if len(results) == 0:
54 raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id) 95 raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id)
55 elif len(results) > 1: 96 elif len(results) > 1:
56 raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id) 97 raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id)
57 transcript_id, protein_id = results[0] 98 transcript_id, protein_id = results[0]
58 if protein_id is None: 99 if protein_id is None:
59 print("Skipping transcript '%s' with no protein id" % transcript_id) 100 print("Skipping transcript '%s' with no protein id" % transcript_id)
60 else: 101 else:
61 cur.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, alignment) VALUES (?, ?, ?)', 102 cigar = fasta_aln2cigar(fasta_seq_align.sequence)
62 (i, protein_id, cigar)) 103 cur.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, protein_alignment) VALUES (?, ?, ?)',
63 conn.commit() 104 (i, protein_id, cigar))
105 protein_sequence = fasta_seq_align.sequence.replace('-', '')
106 cur.execute('UPDATE transcript SET protein_sequence=? WHERE protein_id=?', (protein_sequence, protein_id))
107 conn.commit()
64 108
65 109
66 def newicktree_to_db(conn, i, fname): 110 def newicktree_to_db(conn, i, fname):
67 with open(fname) as f: 111 with open(fname) as f:
68 tree = f.read().replace('\n', '') 112 tree = f.read().replace('\n', '')
97 141
98 142
99 def __main__(): 143 def __main__():
100 parser = optparse.OptionParser() 144 parser = optparse.OptionParser()
101 parser.add_option('-t', '--tree', action='append', help='Gene tree files') 145 parser.add_option('-t', '--tree', action='append', help='Gene tree files')
102 parser.add_option('-c', '--cigar', action='append', help='CIGAR alignments of CDS files in tabular format') 146 parser.add_option('-a', '--align', action='append', help='Protein alignments in fasta_aln format')
103 parser.add_option('-g', '--gene', help='Gene features file in JSON format') 147 parser.add_option('-g', '--gene', help='Gene features file in JSON format')
104 parser.add_option('-o', '--output', help='Path of the output file') 148 parser.add_option('-o', '--output', help='Path of the output file')
105 options, args = parser.parse_args() 149 options, args = parser.parse_args()
106 if args: 150 if args:
107 raise Exception('Use options to provide inputs') 151 raise Exception('Use options to provide inputs')
109 conn = sqlite3.connect(options.output) 153 conn = sqlite3.connect(options.output)
110 create_tables(conn) 154 create_tables(conn)
111 155
112 gene_json_to_db(conn, options.gene) 156 gene_json_to_db(conn, options.gene)
113 157
114 for i, (tree, cigar) in enumerate(zip(options.tree, options.cigar), start=1): 158 for i, (tree, align) in enumerate(zip(options.tree, options.align), start=1):
115 newicktree_to_db(conn, i, tree) 159 newicktree_to_db(conn, i, tree)
116 cigar_to_db(conn, i, cigar) 160 align_to_db(conn, i, align)
161
162 conn.close()
117 163
118 164
119 if __name__ == '__main__': 165 if __name__ == '__main__':
120 __main__() 166 __main__()