Mercurial > repos > earlhaminst > gafa
comparison GAFA.py @ 1:fc8ca4ade638 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
author | earlhaminst |
---|---|
date | Mon, 20 Feb 2017 06:25:33 -0500 |
parents | af9f72ddf7f9 |
children | e17a3470c70a |
comparison
equal
deleted
inserted
replaced
0:af9f72ddf7f9 | 1:fc8ca4ade638 |
---|---|
1 from __future__ import print_function | 1 from __future__ import print_function |
2 | 2 |
3 import collections | |
3 import json | 4 import json |
4 import optparse | 5 import optparse |
6 import re | |
5 import sqlite3 | 7 import sqlite3 |
6 | 8 |
7 version = "0.1.0" | 9 version = "0.2.0" |
10 | |
11 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) | |
12 | |
13 | |
14 def FASTAReader_gen(fasta_filename): | |
15 fasta_file = open(fasta_filename) | |
16 line = fasta_file.readline() | |
17 while True: | |
18 if not line: | |
19 return | |
20 assert line.startswith('>'), "FASTA headers must start with >" | |
21 header = line.rstrip() | |
22 sequence_parts = [] | |
23 line = fasta_file.readline() | |
24 while line and line[0] != '>': | |
25 sequence_parts.append(line.rstrip()) | |
26 line = fasta_file.readline() | |
27 sequence = "".join(sequence_parts) | |
28 yield Sequence(header, sequence) | |
29 | |
30 | |
31 FASTA_MATCH_RE = re.compile(r'[^-]') | |
32 | |
33 | |
34 def fasta_aln2cigar(sequence): | |
35 # Converts each match into M and each gap into D | |
36 tmp_seq = FASTA_MATCH_RE.sub('M', sequence) | |
37 tmp_seq = tmp_seq.replace('-', 'D') | |
38 # Split the sequence in substrings composed by the same letter | |
39 tmp_seq = tmp_seq.replace('DM', 'D,M') | |
40 tmp_seq = tmp_seq.replace('MD', 'M,D') | |
41 cigar_list = tmp_seq.split(',') | |
42 # Condense each substring, e.g. DDDD in 4D, and concatenate them again | |
43 cigar = '' | |
44 for s in cigar_list: | |
45 if len(s) > 1: | |
46 cigar += str(len(s)) | |
47 cigar += s[0] | |
48 return cigar | |
8 | 49 |
9 | 50 |
10 def create_tables(conn): | 51 def create_tables(conn): |
11 cur = conn.cursor() | 52 cur = conn.cursor() |
12 cur.execute('PRAGMA foreign_keys = ON') | 53 cur.execute('PRAGMA foreign_keys = ON') |
27 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') | 68 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') |
28 | 69 |
29 cur.execute('''CREATE TABLE transcript ( | 70 cur.execute('''CREATE TABLE transcript ( |
30 transcript_id VARCHAR PRIMARY KEY NOT NULL, | 71 transcript_id VARCHAR PRIMARY KEY NOT NULL, |
31 protein_id VARCHAR UNIQUE, | 72 protein_id VARCHAR UNIQUE, |
73 protein_sequence VARCHAR, | |
32 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') | 74 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') |
33 | 75 |
34 cur.execute('''CREATE TABLE gene_family_member ( | 76 cur.execute('''CREATE TABLE gene_family_member ( |
35 gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id), | 77 gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id), |
36 protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id), | 78 protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id), |
37 alignment VARCHAR NOT NULL, | 79 protein_alignment VARCHAR NOT NULL, |
38 PRIMARY KEY (gene_family_id, protein_id))''') | 80 PRIMARY KEY (gene_family_id, protein_id))''') |
39 conn.commit() | 81 conn.commit() |
40 | 82 |
41 | 83 |
42 def cigar_to_db(conn, i, fname): | 84 def align_to_db(conn, i, fname): |
43 cur = conn.cursor() | 85 cur = conn.cursor() |
44 with open(fname) as f: | 86 for fasta_seq_align in FASTAReader_gen(fname): |
45 for element in f.readlines(): | 87 seq_id = fasta_seq_align.header[1:] |
46 seq_id, cigar = element.rstrip('\n').split('\t') | 88 # Trim seq_id by removing everything from the first underscore |
47 # Trim seq_id by removing everything from the first underscore | 89 seq_id = seq_id.split('_', 1)[0] |
48 seq_id = seq_id.split('_', 1)[0] | |
49 | 90 |
50 cur.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?', | 91 cur.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?', |
51 (seq_id, seq_id)) | 92 (seq_id, seq_id)) |
52 results = cur.fetchall() | 93 results = cur.fetchall() |
53 if len(results) == 0: | 94 if len(results) == 0: |
54 raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id) | 95 raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id) |
55 elif len(results) > 1: | 96 elif len(results) > 1: |
56 raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id) | 97 raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id) |
57 transcript_id, protein_id = results[0] | 98 transcript_id, protein_id = results[0] |
58 if protein_id is None: | 99 if protein_id is None: |
59 print("Skipping transcript '%s' with no protein id" % transcript_id) | 100 print("Skipping transcript '%s' with no protein id" % transcript_id) |
60 else: | 101 else: |
61 cur.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, alignment) VALUES (?, ?, ?)', | 102 cigar = fasta_aln2cigar(fasta_seq_align.sequence) |
62 (i, protein_id, cigar)) | 103 cur.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, protein_alignment) VALUES (?, ?, ?)', |
63 conn.commit() | 104 (i, protein_id, cigar)) |
105 protein_sequence = fasta_seq_align.sequence.replace('-', '') | |
106 cur.execute('UPDATE transcript SET protein_sequence=? WHERE protein_id=?', (protein_sequence, protein_id)) | |
107 conn.commit() | |
64 | 108 |
65 | 109 |
66 def newicktree_to_db(conn, i, fname): | 110 def newicktree_to_db(conn, i, fname): |
67 with open(fname) as f: | 111 with open(fname) as f: |
68 tree = f.read().replace('\n', '') | 112 tree = f.read().replace('\n', '') |
97 | 141 |
98 | 142 |
99 def __main__(): | 143 def __main__(): |
100 parser = optparse.OptionParser() | 144 parser = optparse.OptionParser() |
101 parser.add_option('-t', '--tree', action='append', help='Gene tree files') | 145 parser.add_option('-t', '--tree', action='append', help='Gene tree files') |
102 parser.add_option('-c', '--cigar', action='append', help='CIGAR alignments of CDS files in tabular format') | 146 parser.add_option('-a', '--align', action='append', help='Protein alignments in fasta_aln format') |
103 parser.add_option('-g', '--gene', help='Gene features file in JSON format') | 147 parser.add_option('-g', '--gene', help='Gene features file in JSON format') |
104 parser.add_option('-o', '--output', help='Path of the output file') | 148 parser.add_option('-o', '--output', help='Path of the output file') |
105 options, args = parser.parse_args() | 149 options, args = parser.parse_args() |
106 if args: | 150 if args: |
107 raise Exception('Use options to provide inputs') | 151 raise Exception('Use options to provide inputs') |
109 conn = sqlite3.connect(options.output) | 153 conn = sqlite3.connect(options.output) |
110 create_tables(conn) | 154 create_tables(conn) |
111 | 155 |
112 gene_json_to_db(conn, options.gene) | 156 gene_json_to_db(conn, options.gene) |
113 | 157 |
114 for i, (tree, cigar) in enumerate(zip(options.tree, options.cigar), start=1): | 158 for i, (tree, align) in enumerate(zip(options.tree, options.align), start=1): |
115 newicktree_to_db(conn, i, tree) | 159 newicktree_to_db(conn, i, tree) |
116 cigar_to_db(conn, i, cigar) | 160 align_to_db(conn, i, align) |
161 | |
162 conn.close() | |
117 | 163 |
118 | 164 |
119 if __name__ == '__main__': | 165 if __name__ == '__main__': |
120 __main__() | 166 __main__() |