Mercurial > repos > earlhaminst > gafa
diff GAFA.py @ 1:fc8ca4ade638 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/GAFA/ commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
author | earlhaminst |
---|---|
date | Mon, 20 Feb 2017 06:25:33 -0500 |
parents | af9f72ddf7f9 |
children | e17a3470c70a |
line wrap: on
line diff
--- a/GAFA.py Wed Dec 21 07:31:50 2016 -0500 +++ b/GAFA.py Mon Feb 20 06:25:33 2017 -0500 @@ -1,10 +1,51 @@ from __future__ import print_function +import collections import json import optparse +import re import sqlite3 -version = "0.1.0" +version = "0.2.0" + +Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) + + +def FASTAReader_gen(fasta_filename): + fasta_file = open(fasta_filename) + line = fasta_file.readline() + while True: + if not line: + return + assert line.startswith('>'), "FASTA headers must start with >" + header = line.rstrip() + sequence_parts = [] + line = fasta_file.readline() + while line and line[0] != '>': + sequence_parts.append(line.rstrip()) + line = fasta_file.readline() + sequence = "".join(sequence_parts) + yield Sequence(header, sequence) + + +FASTA_MATCH_RE = re.compile(r'[^-]') + + +def fasta_aln2cigar(sequence): + # Converts each match into M and each gap into D + tmp_seq = FASTA_MATCH_RE.sub('M', sequence) + tmp_seq = tmp_seq.replace('-', 'D') + # Split the sequence in substrings composed by the same letter + tmp_seq = tmp_seq.replace('DM', 'D,M') + tmp_seq = tmp_seq.replace('MD', 'M,D') + cigar_list = tmp_seq.split(',') + # Condense each substring, e.g. DDDD in 4D, and concatenate them again + cigar = '' + for s in cigar_list: + if len(s) > 1: + cigar += str(len(s)) + cigar += s[0] + return cigar def create_tables(conn): @@ -29,38 +70,41 @@ cur.execute('''CREATE TABLE transcript ( transcript_id VARCHAR PRIMARY KEY NOT NULL, protein_id VARCHAR UNIQUE, + protein_sequence VARCHAR, gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') cur.execute('''CREATE TABLE gene_family_member ( gene_family_id INTEGER NOT NULL REFERENCES gene_family(gene_family_id), protein_id VARCHAR KEY NOT NULL REFERENCES transcript(protein_id), - alignment VARCHAR NOT NULL, + protein_alignment VARCHAR NOT NULL, PRIMARY KEY (gene_family_id, protein_id))''') conn.commit() -def cigar_to_db(conn, i, fname): +def align_to_db(conn, i, fname): cur = conn.cursor() - with open(fname) as f: - for element in f.readlines(): - seq_id, cigar = element.rstrip('\n').split('\t') - # Trim seq_id by removing everything from the first underscore - seq_id = seq_id.split('_', 1)[0] + for fasta_seq_align in FASTAReader_gen(fname): + seq_id = fasta_seq_align.header[1:] + # Trim seq_id by removing everything from the first underscore + seq_id = seq_id.split('_', 1)[0] - cur.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?', - (seq_id, seq_id)) - results = cur.fetchall() - if len(results) == 0: - raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id) - elif len(results) > 1: - raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id) - transcript_id, protein_id = results[0] - if protein_id is None: - print("Skipping transcript '%s' with no protein id" % transcript_id) - else: - cur.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, alignment) VALUES (?, ?, ?)', - (i, protein_id, cigar)) - conn.commit() + cur.execute('SELECT transcript_id, protein_id FROM transcript WHERE transcript_id=? OR protein_id=?', + (seq_id, seq_id)) + results = cur.fetchall() + if len(results) == 0: + raise Exception("Sequence id '%s' could not be found among the transcript and protein ids" % seq_id) + elif len(results) > 1: + raise Exception("Searching sequence id '%s' among the transcript and protein ids returned multiple results" % seq_id) + transcript_id, protein_id = results[0] + if protein_id is None: + print("Skipping transcript '%s' with no protein id" % transcript_id) + else: + cigar = fasta_aln2cigar(fasta_seq_align.sequence) + cur.execute('INSERT INTO gene_family_member (gene_family_id, protein_id, protein_alignment) VALUES (?, ?, ?)', + (i, protein_id, cigar)) + protein_sequence = fasta_seq_align.sequence.replace('-', '') + cur.execute('UPDATE transcript SET protein_sequence=? WHERE protein_id=?', (protein_sequence, protein_id)) + conn.commit() def newicktree_to_db(conn, i, fname): @@ -99,7 +143,7 @@ def __main__(): parser = optparse.OptionParser() parser.add_option('-t', '--tree', action='append', help='Gene tree files') - parser.add_option('-c', '--cigar', action='append', help='CIGAR alignments of CDS files in tabular format') + parser.add_option('-a', '--align', action='append', help='Protein alignments in fasta_aln format') parser.add_option('-g', '--gene', help='Gene features file in JSON format') parser.add_option('-o', '--output', help='Path of the output file') options, args = parser.parse_args() @@ -111,9 +155,11 @@ gene_json_to_db(conn, options.gene) - for i, (tree, cigar) in enumerate(zip(options.tree, options.cigar), start=1): + for i, (tree, align) in enumerate(zip(options.tree, options.align), start=1): newicktree_to_db(conn, i, tree) - cigar_to_db(conn, i, cigar) + align_to_db(conn, i, align) + + conn.close() if __name__ == '__main__':