Mercurial > repos > earlhaminst > t_coffee
changeset 1:b3833e5b50d4 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 011cabb2a2b3237bbbc4850ed26972816702a2ba-dirty
author | earlhaminst |
---|---|
date | Mon, 19 Dec 2016 17:47:31 -0500 |
parents | 794a6e864a96 |
children | df6527887a18 |
files | filter_by_fasta_ids.py t_coffee.xml t_coffee_to_cigar.pl t_coffee_to_cigar.py |
diffstat | 4 files changed, 51 insertions(+), 53 deletions(-) [+] |
line wrap: on
line diff
--- a/filter_by_fasta_ids.py Thu Dec 15 11:04:25 2016 -0500 +++ b/filter_by_fasta_ids.py Mon Dec 19 17:47:31 2016 -0500 @@ -82,10 +82,9 @@ work_summary = {'wanted': 0, 'found': 0, 'duplicates': 0} targets = [] - f_target = open(sys.argv[1]) - for line in f_target.readlines(): - targets.append(">%s" % line.strip().upper()) - f_target.close() + with open(sys.argv[1]) as f_target: + for line in f_target.readlines(): + targets.append(">%s" % line.strip().upper()) work_summary['wanted'] = len(targets) homd_db = FASTAReader(sys.argv[2])
--- a/t_coffee.xml Thu Dec 15 11:04:25 2016 -0500 +++ b/t_coffee.xml Mon Dec 19 17:47:31 2016 -0500 @@ -51,7 +51,7 @@ t_coffee '$input' $method_opt $output_opt -n_core \${GALAXY_SLOTS:-1} -run_name t_coffee_out -quiet #if 'cigar' in $outputs_arr - && perl '$__tool_directory__/t_coffee_to_cigar.pl' t_coffee_out.fasta_aln > '$cigar' + && python '$__tool_directory__/t_coffee_to_cigar.py' t_coffee_out.fasta_aln > '$cigar' #end if ]]> </command>
```diff
--- a/t_coffee_to_cigar.pl Thu Dec 15 11:04:25 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,48 +0,0 @@
-#!/usr/bin/perl
-#
-use strict;
-use warnings;
-
-# A simple Perl script to convert FASTA sequence alignments into 2-column output where first column is FASTA id and second is CIGAR line
-# TCoffee_to_cigar.pl <file>
-
-sub convert_and_print {
-    my ($header, $sequence) = @_;
-    # Converts each match into M and each gap into D
-    $sequence =~ s/[^-]/M/g;
-    $sequence =~ s/-/D/g;
-
-    # Split the sequence in substrings composed by the same letter
-    $sequence =~ s/DM/D,M/g;
-    $sequence =~ s/MD/M,D/g;
-    my @cigar_array = split(',', $sequence);
-
-    # Condense each substring, e.g. DDDD in 4D, and concatenate them again
-    my $cigar = '';
-    foreach my $str (@cigar_array) {
-        if (length($str) > 1) {
-            $cigar .= length($str);
-        }
-        $cigar .= substr($str, 0, 1);
-    }
-    print "$header\t$cigar\n";
-}
-
-my $file1 = $ARGV[0];
-open my $fh1, '<', $file1;
-
-my $header = '', my $sequence = '';
-while (my $line = <$fh1>) {
-    chomp $line;
-    if (substr($line, 0, 1) eq '>') {
-        if ($header) {
-            convert_and_print($header, $sequence);
-        }
-        $header = substr($line, 1);
-        $sequence = '';
-    } else {
-        $sequence .= $line;
-    }
-}
-close $fh1;
-convert_and_print($header, $sequence);
```
```diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/t_coffee_to_cigar.py Mon Dec 19 17:47:31 2016 -0500
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+""" A script to build specific fasta databases """
+from __future__ import print_function
+
+import re
+import sys
+
+
+FASTA_MATCH_RE = re.compile(r'[^-]')
+
+
+def convert_and_print(header, sequence):
+    # Converts each match into M and each gap into D
+    tmp_seq = FASTA_MATCH_RE.sub('M', sequence)
+    tmp_seq = tmp_seq.replace('-', 'D')
+    # Split the sequence in substrings composed by the same letter
+    tmp_seq = tmp_seq.replace('DM', 'D,M')
+    tmp_seq = tmp_seq.replace('MD', 'M,D')
+    cigar_list = tmp_seq.split(',')
+    # Condense each substring, e.g. DDDD in 4D, and concatenate them again
+    cigar = ''
+    for s in cigar_list:
+        if len(s) > 1:
+            cigar += str(len(s))
+        cigar += s[0]
+    print("%s\t%s" % (header, cigar))
+
+
+def main():
+    with open(sys.argv[1]) as fh:
+        header = None
+        sequence = None
+        for line in fh:
+            line = line.strip()
+            if line and line[0] == '>':
+                if header:
+                    convert_and_print(header, sequence)
+                header = line[1:]
+                sequence = ''
+            else:
+                sequence += line
+    if header:
+        convert_and_print(header, sequence)
+
+
+if __name__ == "__main__":
+    main()
```